Hindi
* Added Hindi language * Removed the hardcoded special characters from language validation. Now reading them from the .yml * improved method of hiding the letters on 0 and 1, when needed * virtual keypad adjustments * improved the single-letter validation during build time * improved Devanagari validation script * improved sorting when filters are on
This commit is contained in:
parent
622a954633
commit
f8e6668281
18 changed files with 1305176 additions and 103 deletions
15
app/languages/definitions/Hindi.yml
Normal file
15
app/languages/definitions/Hindi.yml
Normal file
|
|
@ -0,0 +1,15 @@
|
|||
locale: hi-IN
|
||||
dictionaryFile: hi-utf8.csv
|
||||
abcString: कखग
|
||||
hasUpperCase: no
|
||||
layout:
|
||||
- [SPECIAL, ॐ] # 0 ==> [SPECIAL, Om]
|
||||
- [्, ़, ऽ, ः, PUNCTUATION_IN] # 1 ==> [halant (cancel vowel sign), nuqta (converts k -> q, ph -> f, etc), avagraha, visarga]
|
||||
- [अ, आ, ब, भ, च, छ, ा] # 2 ==> [a, ā, b, bh, c, ch, ā (combining)]
|
||||
- [द, ध, ड, ढ, ए, ऐ, फ, े, ै, ॆ, ॅ] # 3 ==> [d, dh, ḍ, ḍh, ē, ai, ph, ē (combining), ai (combining), e (combining), ê (ae-combining)]
|
||||
- [ग, घ, ह, इ, ई, ि, ी] # 4 ==> [g, gh, h, i, ī, i (combining), ī (combining)]
|
||||
- [ज, झ, क, ख, ल, ळ, ऌ, ॢ] # 5 ==> [j, jh, k, kh, l, ḷ, l̥, l̥ (combining)]
|
||||
- [म, ङ, ञ, ण, न, ओ, औ, ऑ, ं, ँ, ो, ौ, ॊ, ॉ] # 6 ==> [m, ṅ, ñ, ṇ, n, o, au, ŏ, ṁ (n-combining with consonant, e.g. "ng", "nd"), m̐ (m-combining with vowel, e.g. "am", "em"), ō (combining), au (combining), o (combining), ŏ (combining)]
|
||||
- [प, र, ऋ, स, श, ष, ृ] # 7 ==> [p, r, r̥ (ri), s, sh, rsh (like "norsk"), r̥ (ri-combining)]
|
||||
- [त, थ, ट, ठ, उ, ऊ, व, ु, ू] # 8 ==> [t, th, ṭ, ṭh, u, ū, v, u (combining), ū (combining)]
|
||||
- [य] # 9 ==> [y]
|
||||
1304915
app/languages/dictionaries/hi-utf8.csv
Normal file
1304915
app/languages/dictionaries/hi-utf8.csv
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -9,6 +9,7 @@ import io.github.sspanak.tt9.languages.Language;
|
|||
import io.github.sspanak.tt9.languages.NaturalLanguage;
|
||||
import io.github.sspanak.tt9.preferences.settings.SettingsStore;
|
||||
import io.github.sspanak.tt9.util.Characters;
|
||||
import io.github.sspanak.tt9.util.TextTools;
|
||||
|
||||
class Mode123 extends ModePassthrough {
|
||||
@Override public int getId() { return MODE_123; }
|
||||
|
|
@ -49,8 +50,12 @@ class Mode123 extends ModePassthrough {
|
|||
* use the default list, but reorder it a bit for convenience.
|
||||
*/
|
||||
private void setDefaultSpecialCharacters() {
|
||||
KEY_CHARACTERS.add(applyNumericFieldCharacterOrder(settings.getOrderedKeyChars(language, 0)));
|
||||
KEY_CHARACTERS.add(applyNumericFieldCharacterOrder(settings.getOrderedKeyChars(language, 1)));
|
||||
KEY_CHARACTERS.add(
|
||||
TextTools.removeLettersFromList(applyNumericFieldCharacterOrder(settings.getOrderedKeyChars(language, 0)))
|
||||
);
|
||||
KEY_CHARACTERS.add(
|
||||
TextTools.removeLettersFromList(applyNumericFieldCharacterOrder(settings.getOrderedKeyChars(language, 1)))
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -18,6 +18,7 @@ import io.github.sspanak.tt9.languages.LanguageKind;
|
|||
import io.github.sspanak.tt9.languages.NaturalLanguage;
|
||||
import io.github.sspanak.tt9.preferences.settings.SettingsStore;
|
||||
import io.github.sspanak.tt9.util.Characters;
|
||||
import io.github.sspanak.tt9.util.TextTools;
|
||||
|
||||
class ModeCheonjiin extends InputMode {
|
||||
// used when we want to display a different set of characters for a given key, for example
|
||||
|
|
@ -68,14 +69,23 @@ class ModeCheonjiin extends InputMode {
|
|||
}
|
||||
|
||||
|
||||
/**
|
||||
* setCustomSpecialCharacters
|
||||
* Filter out the letters from the 0-key list and add "0", because there is no other way of
|
||||
* typing it.
|
||||
*/
|
||||
protected void setCustomSpecialCharacters() {
|
||||
// special
|
||||
KEY_CHARACTERS.add(TextTools.removeLettersFromList(applyPunctuationOrder(Characters.Special, 0)));
|
||||
if (settings.holdForPunctuationInKorean()) {
|
||||
ArrayList<String> specialChars = new ArrayList<>(applyPunctuationOrder(Characters.Special, 0));
|
||||
specialChars.add(0, "0");
|
||||
KEY_CHARACTERS.add(specialChars);
|
||||
KEY_CHARACTERS.get(0).add(0, "0");
|
||||
}
|
||||
}
|
||||
|
||||
// punctuation
|
||||
KEY_CHARACTERS.add(
|
||||
TextTools.removeLettersFromList(applyPunctuationOrder(Characters.PunctuationKorean, 1))
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
protected void setSpecialCharacterConstants() {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,59 @@
|
|||
package io.github.sspanak.tt9.ime.modes.predictions;
|
||||
|
||||
import androidx.annotation.NonNull;
|
||||
import androidx.annotation.Nullable;
|
||||
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.regex.Matcher;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import io.github.sspanak.tt9.languages.Language;
|
||||
import io.github.sspanak.tt9.languages.LanguageKind;
|
||||
|
||||
|
||||
class LocaleWordsSorter {
|
||||
private final Pattern sortingPattern;
|
||||
|
||||
|
||||
LocaleWordsSorter(@Nullable Language language) {
|
||||
if (LanguageKind.isHindi(language)) {
|
||||
sortingPattern = Pattern.compile("[\\u0904-\\u0939\\u0958-\\u0961][\\u0900-\\u0904\\u093A-\\u094F\\u0962\\u0963]+");
|
||||
} else {
|
||||
sortingPattern = null;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* Reduces the length of a word using the sortingRegex. Usually meant to consider a
|
||||
* base letter + modifiers as a single character.
|
||||
*/
|
||||
private int reduceLength(String word) {
|
||||
Matcher matcher = sortingPattern.matcher(word);
|
||||
|
||||
int length = word.length();
|
||||
while (matcher.find()) {
|
||||
length -= matcher.end() - matcher.start() - 1;
|
||||
}
|
||||
|
||||
return length;
|
||||
}
|
||||
|
||||
|
||||
ArrayList<String> sort(ArrayList<String> words) {
|
||||
if (sortingPattern == null || words == null) {
|
||||
return words;
|
||||
}
|
||||
|
||||
ArrayList<String> wordsCopy = new ArrayList<>(words);
|
||||
Collections.sort(wordsCopy, (a, b) -> reduceLength(a) - reduceLength(b));
|
||||
|
||||
return wordsCopy;
|
||||
}
|
||||
|
||||
|
||||
boolean shouldSort(@Nullable Language language, @NonNull String stem, @NonNull String digitSequence) {
|
||||
return LanguageKind.isIndic(language) && !stem.isEmpty() && stem.length() == digitSequence.length() - 1;
|
||||
}
|
||||
}
|
||||
|
|
@ -5,12 +5,14 @@ import java.util.ArrayList;
|
|||
import io.github.sspanak.tt9.db.DataStore;
|
||||
import io.github.sspanak.tt9.ime.helpers.TextField;
|
||||
import io.github.sspanak.tt9.languages.EmojiLanguage;
|
||||
import io.github.sspanak.tt9.languages.Language;
|
||||
import io.github.sspanak.tt9.preferences.settings.SettingsStore;
|
||||
import io.github.sspanak.tt9.util.Characters;
|
||||
import io.github.sspanak.tt9.util.TextTools;
|
||||
|
||||
public class WordPredictions extends Predictions {
|
||||
private final TextField textField;
|
||||
private LocaleWordsSorter localeWordsSorter;
|
||||
|
||||
private String inputWord;
|
||||
private boolean isStemFuzzy;
|
||||
|
|
@ -21,11 +23,21 @@ public class WordPredictions extends Predictions {
|
|||
/**
 * Creates the predictions engine with an empty stem, no enforced top word and a words sorter
 * built for no language (null), which is replaced when a language is set.
 *
 * @param settings  passed on to the parent class
 * @param textField the text field stored for later use by this instance
 */
public WordPredictions(SettingsStore settings, TextField textField) {
    super(settings);
    this.textField = textField;
    this.stem = "";
    this.lastEnforcedTopWord = "";
    this.localeWordsSorter = new LocaleWordsSorter(null);
}
|
||||
|
||||
|
||||
@Override
|
||||
public Predictions setLanguage(Language language) {
|
||||
super.setLanguage(language);
|
||||
localeWordsSorter = new LocaleWordsSorter(language);
|
||||
|
||||
return this;
|
||||
}
|
||||
|
||||
|
||||
public WordPredictions setIsStemFuzzy(boolean yes) {
|
||||
this.isStemFuzzy = yes;
|
||||
return this;
|
||||
|
|
@ -89,6 +101,7 @@ public class WordPredictions extends Predictions {
|
|||
words.addAll(dbWords);
|
||||
} else {
|
||||
suggestStem();
|
||||
dbWords = localeWordsSorter.shouldSort(language, stem, digitSequence) ? localeWordsSorter.sort(dbWords) : dbWords;
|
||||
dbWords = rearrangeByPairFrequency(dbWords);
|
||||
suggestMissingWords(generatePossibleStemVariations(dbWords));
|
||||
suggestMissingWords(dbWords.isEmpty() ? generateWordVariations(inputWord) : dbWords);
|
||||
|
|
|
|||
|
|
@ -6,12 +6,13 @@ public class LanguageKind {
|
|||
public static final int KOREAN = 601579;
|
||||
|
||||
public static boolean isArabic(Language language) { return language != null && language.getId() == 502337; }
|
||||
public static boolean isBulgarian(Language language) { return language != null && language.getId() == 231650; }
|
||||
public static boolean isCyrillic(Language language) { return language != null && language.getKeyCharacters(2).contains("а"); }
|
||||
public static boolean isEnglish(Language language) { return language != null && language.getLocale().equals(Locale.ENGLISH); }
|
||||
public static boolean isFrench(Language language) { return language != null && language.getId() == 596550; }
|
||||
public static boolean isGreek(Language language) { return language != null && language.getId() == 597381; }
|
||||
public static boolean isHebrew(Language language) { return language != null && (language.getId() == 305450 || language.getId() == 403177); }
|
||||
public static boolean isHindi(Language language) { return language != null && language.getId() == 468264; }
|
||||
public static boolean isIndic(Language language) { return isHindi(language); }
|
||||
public static boolean isHinglish(Language language) { return language != null && language.getId() == 468421; }
|
||||
public static boolean isKorean(Language language) { return language != null && language.getId() == KOREAN; }
|
||||
public static boolean isLatinBased(Language language) { return language != null && language.getKeyCharacters(2).contains("a"); }
|
||||
|
|
|
|||
|
|
@ -4,7 +4,9 @@ import androidx.annotation.NonNull;
|
|||
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
import java.util.Map;
|
||||
|
||||
import io.github.sspanak.tt9.languages.exceptions.InvalidLanguageCharactersException;
|
||||
import io.github.sspanak.tt9.util.Characters;
|
||||
|
|
@ -76,39 +78,24 @@ public class NaturalLanguage extends Language implements Comparable<NaturalLangu
|
|||
private ArrayList<String> generateSpecialChars(ArrayList<String> definitionChars) {
|
||||
final String SPECIAL_CHARS_PLACEHOLDER = "SPECIAL";
|
||||
final String PUNCTUATION_PLACEHOLDER = "PUNCTUATION";
|
||||
final String ARABIC_PUNCTUATION_STYLE = PUNCTUATION_PLACEHOLDER + "_AR";
|
||||
final String FRENCH_PUNCTUATION_STYLE = PUNCTUATION_PLACEHOLDER + "_FR";
|
||||
final String GERMAN_PUNCTUATION_STYLE = PUNCTUATION_PLACEHOLDER + "_DE";
|
||||
final String GREEK_PUNCTUATION_STYLE = PUNCTUATION_PLACEHOLDER + "_GR";
|
||||
final String KOREAN_PUNCTUATION_STYLE = PUNCTUATION_PLACEHOLDER + "_KR";
|
||||
|
||||
final Map<String, List<String>> specialChars = new HashMap<>();
|
||||
specialChars.put(SPECIAL_CHARS_PLACEHOLDER, Characters.Special);
|
||||
specialChars.put(PUNCTUATION_PLACEHOLDER, Characters.PunctuationEnglish);
|
||||
specialChars.put(PUNCTUATION_PLACEHOLDER + "_AR", Characters.PunctuationArabic);
|
||||
specialChars.put(PUNCTUATION_PLACEHOLDER + "_FR", Characters.PunctuationFrench);
|
||||
specialChars.put(PUNCTUATION_PLACEHOLDER + "_DE", Characters.PunctuationGerman);
|
||||
specialChars.put(PUNCTUATION_PLACEHOLDER + "_GR", Characters.PunctuationGreek);
|
||||
specialChars.put(PUNCTUATION_PLACEHOLDER + "_IN", Characters.PunctuationIndic);
|
||||
specialChars.put(PUNCTUATION_PLACEHOLDER + "_KR", Characters.PunctuationKorean);
|
||||
|
||||
ArrayList<String> keyChars = new ArrayList<>();
|
||||
for (String defChar : definitionChars) {
|
||||
switch (defChar) {
|
||||
case SPECIAL_CHARS_PLACEHOLDER:
|
||||
keyChars.addAll(Characters.Special);
|
||||
break;
|
||||
case PUNCTUATION_PLACEHOLDER:
|
||||
keyChars.addAll(Characters.PunctuationEnglish);
|
||||
break;
|
||||
case ARABIC_PUNCTUATION_STYLE:
|
||||
keyChars.addAll(Characters.PunctuationArabic);
|
||||
break;
|
||||
case FRENCH_PUNCTUATION_STYLE:
|
||||
keyChars.addAll(Characters.PunctuationFrench);
|
||||
break;
|
||||
case GERMAN_PUNCTUATION_STYLE:
|
||||
keyChars.addAll(Characters.PunctuationGerman);
|
||||
break;
|
||||
case GREEK_PUNCTUATION_STYLE:
|
||||
keyChars.addAll(Characters.PunctuationGreek);
|
||||
break;
|
||||
case KOREAN_PUNCTUATION_STYLE:
|
||||
keyChars.addAll(Characters.PunctuationKorean);
|
||||
break;
|
||||
default:
|
||||
keyChars.add(defChar);
|
||||
break;
|
||||
List<String> keySpecialChars = specialChars.containsKey(defChar) ? specialChars.get(defChar) : null;
|
||||
if (keySpecialChars != null) {
|
||||
keyChars.addAll(keySpecialChars);
|
||||
} else {
|
||||
keyChars.add(defChar);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -88,10 +88,6 @@ class SettingsPunctuation extends SettingsInput {
|
|||
orderedChars = language.getKeyCharacters(number);
|
||||
}
|
||||
|
||||
if (number < 2) {
|
||||
orderedChars = removeLettersFromList(orderedChars);
|
||||
}
|
||||
|
||||
return orderedChars;
|
||||
}
|
||||
|
||||
|
|
@ -118,16 +114,4 @@ class SettingsPunctuation extends SettingsInput {
|
|||
|
||||
return charsList;
|
||||
}
|
||||
|
||||
|
||||
private ArrayList<String> removeLettersFromList(ArrayList<String> list) {
|
||||
ArrayList<String> cleanList = new ArrayList<>();
|
||||
for (String s : list) {
|
||||
if (!Character.isAlphabetic(s.codePointAt(0))) {
|
||||
cleanList.add(s);
|
||||
}
|
||||
}
|
||||
|
||||
return cleanList;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -25,6 +25,7 @@ public class SettingsStore extends SettingsUI {
|
|||
public final static int SOFT_KEY_DOUBLE_CLICK_DELAY = 500; // ms
|
||||
public final static int SOFT_KEY_REPEAT_DELAY = 40; // ms
|
||||
public final static int SOFT_KEY_TITLE_MAX_CHARS = 5;
|
||||
public final static int SOFT_KEY_TITLE_MAX_CHARS_INDIC = 3;
|
||||
public final static int SOFT_KEY_TITLE_SIZE = 18; // sp
|
||||
public final static float SOFT_KEY_COMPLEX_LABEL_TITLE_RELATIVE_SIZE = 0.55f;
|
||||
public final static float SOFT_KEY_COMPLEX_LABEL_ARABIC_TITLE_RELATIVE_SIZE = 0.72f;
|
||||
|
|
|
|||
|
|
@ -144,22 +144,17 @@ public class SoftKeyNumber extends SoftKey {
|
|||
}
|
||||
|
||||
ArrayList<String> chars = language.getKeyCharacters(number);
|
||||
boolean isBulgarian = LanguageKind.isBulgarian(language);
|
||||
boolean isGreek = LanguageKind.isGreek(language);
|
||||
boolean isLatinBased = LanguageKind.isLatinBased(language);
|
||||
boolean isUkrainian = LanguageKind.isUkrainian(language);
|
||||
boolean isUppercase = tt9.getTextCase() == InputMode.CASE_UPPER;
|
||||
final int maxChars = LanguageKind.isIndic(language) ? SettingsStore.SOFT_KEY_TITLE_MAX_CHARS_INDIC : SettingsStore.SOFT_KEY_TITLE_MAX_CHARS;
|
||||
|
||||
if (
|
||||
isBulgarian
|
||||
|| isGreek
|
||||
|| isLatinBased
|
||||
|| (isUkrainian && number == 2)
|
||||
|| chars.size() < SettingsStore.SOFT_KEY_TITLE_MAX_CHARS) {
|
||||
return getDefaultCharList(chars, language.getLocale(), isGreek, isLatinBased, isUppercase);
|
||||
} else {
|
||||
return abbreviateCharList(chars, language.getLocale(), isUppercase);
|
||||
String displayChars = getDefaultCharList(chars, language.getLocale(), isGreek, isLatinBased, isUppercase);
|
||||
if (displayChars.length() > maxChars) {
|
||||
displayChars = abbreviateCharList(displayChars, language.getLocale(), isUppercase);
|
||||
}
|
||||
|
||||
return displayChars;
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -188,27 +183,37 @@ public class SoftKeyNumber extends SoftKey {
|
|||
* on one key. As suggested by the community, we could display them as "A-Z".
|
||||
* @see <a href="https://github.com/sspanak/tt9/issues/628">Issue #628</a>
|
||||
*/
|
||||
private String abbreviateCharList(ArrayList<String> chars, Locale locale, boolean isUppercase) {
|
||||
boolean containsCombiningChars = TextTools.isCombining(chars.get(0)) || TextTools.isCombining(chars.get(chars.size() - 1));
|
||||
private String abbreviateCharList(String chars, Locale locale, boolean isUppercase) {
|
||||
String firstLetter = chars.substring(0, 1);
|
||||
String lastLetter = chars.substring(chars.length() - 1);
|
||||
boolean containsCombiningChars = TextTools.isCombining(firstLetter) || TextTools.isCombining(lastLetter);
|
||||
|
||||
return
|
||||
(isUppercase ? chars.get(0).toUpperCase(locale) : chars.get(0))
|
||||
(isUppercase ? firstLetter.toUpperCase(locale) : firstLetter)
|
||||
+ (containsCombiningChars ? "– " : "–")
|
||||
+ (isUppercase ? chars.get(chars.size() - 1).toUpperCase(locale) : chars.get(chars.size() - 1));
|
||||
+ (isUppercase ? lastLetter.toUpperCase(locale) : lastLetter);
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* As suggested by the community, there is no need to display the accented letters.
|
||||
* People are used to seeing just "ABC", "DEF", etc. In the case of Korean, the keypad looks too
|
||||
* cluttered, so we skip the double consonants, like on phones with a physical keypad.
|
||||
* Reduces the number of displayed characters by leaving the most descriptive ones. This prevents
|
||||
* the visual clutter on the keys.
|
||||
*/
|
||||
private boolean shouldSkipAccents(char currentLetter, boolean isGreek, boolean isLatinBased) {
|
||||
return
|
||||
currentLetter == 'ѝ'
|
||||
|| currentLetter == 'ґ'
|
||||
// Latin. As suggested by the community, there is no need to display the accented letters. People are
|
||||
// used to seeing just "ABC", "DEF", etc.
|
||||
(isLatinBased && currentLetter > 'z')
|
||||
// Cyrillic. Same as above.
|
||||
|| currentLetter == 'ѝ' || currentLetter == 'ґ'
|
||||
// Korean double consonants
|
||||
|| (currentLetter == 'ㄲ' || currentLetter == 'ㄸ' || currentLetter == 'ㅃ' || currentLetter == 'ㅆ' || currentLetter == 'ㅉ')
|
||||
|| (isLatinBased && currentLetter > 'z')
|
||||
// Greek diacritics and ending sigma
|
||||
|| currentLetter == 'ς'
|
||||
|| (isGreek && (currentLetter < 'α' || currentLetter > 'ω'));
|
||||
|| (isGreek && (currentLetter < 'α' || currentLetter > 'ω'))
|
||||
// Hindi matras
|
||||
|| (currentLetter >= 0x0900 && currentLetter <= 0x0903) || (currentLetter >= 0x093A && currentLetter <= 0x094F)
|
||||
|| (currentLetter >= 0x0951 && currentLetter <= 0x0957) || currentLetter == 0x0962 || currentLetter == 0x0963
|
||||
;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -125,15 +125,21 @@ public class SuggestionsBar {
|
|||
return "";
|
||||
}
|
||||
|
||||
if (suggestions.get(id).endsWith(STEM_SUFFIX)) {
|
||||
String suggestion = suggestions.get(id);
|
||||
|
||||
if (suggestion.endsWith(STEM_SUFFIX)) {
|
||||
return stem;
|
||||
} else if (suggestions.get(id).startsWith(STEM_VARIATION_PREFIX)) {
|
||||
return stem + suggestions.get(id).substring(STEM_VARIATION_PREFIX.length());
|
||||
} else if (suggestions.get(id).startsWith(STEM_PUNCTUATION_VARIATION_PREFIX)) {
|
||||
return stem + suggestions.get(id).substring(STEM_PUNCTUATION_VARIATION_PREFIX.length());
|
||||
} else if (suggestion.startsWith(STEM_VARIATION_PREFIX)) {
|
||||
return stem + suggestion.substring(STEM_VARIATION_PREFIX.length());
|
||||
} else if (suggestion.startsWith(STEM_PUNCTUATION_VARIATION_PREFIX)) {
|
||||
return stem + suggestion.substring(STEM_PUNCTUATION_VARIATION_PREFIX.length());
|
||||
}
|
||||
|
||||
return suggestions.get(id).equals(Characters.NEW_LINE) ? "\n" : suggestions.get(id);
|
||||
return switch (suggestion) {
|
||||
case Characters.ZWJ_GRAPHIC -> Characters.ZWJ;
|
||||
case Characters.ZWNJ_GRAPHIC -> Characters.ZWNJ;
|
||||
default -> suggestion.equals(Characters.NEW_LINE) ? "\n" : suggestion;
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
|
|
@ -195,14 +201,15 @@ public class SuggestionsBar {
|
|||
String trimmedSuggestion = suggestion.substring(stem.length());
|
||||
trimmedSuggestion = Character.isAlphabetic(trimmedSuggestion.charAt(0)) ? STEM_VARIATION_PREFIX + trimmedSuggestion : STEM_PUNCTUATION_VARIATION_PREFIX + trimmedSuggestion;
|
||||
suggestions.add(trimmedSuggestion);
|
||||
return;
|
||||
}
|
||||
// make the new line better readable
|
||||
else if (suggestion.equals("\n")) {
|
||||
suggestions.add(Characters.NEW_LINE);
|
||||
}
|
||||
// or add any other suggestion as is
|
||||
else {
|
||||
suggestions.add(suggestion);
|
||||
|
||||
// convert the unreadable special characters to their readable form or add the readable ones
|
||||
switch (suggestion) {
|
||||
case "\n" -> suggestions.add(Characters.NEW_LINE);
|
||||
case Characters.ZWJ -> suggestions.add(Characters.ZWJ_GRAPHIC);
|
||||
case Characters.ZWNJ -> suggestions.add(Characters.ZWNJ_GRAPHIC);
|
||||
default -> suggestions.add(suggestion);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -12,6 +12,10 @@ import io.github.sspanak.tt9.languages.LanguageKind;
|
|||
public class Characters {
|
||||
public static final String GR_QUESTION_MARK = ";";
|
||||
public static final String NEW_LINE = Build.VERSION.SDK_INT >= Build.VERSION_CODES.M && new Paint().hasGlyph("⏎") ? "⏎" : "\\n";
|
||||
public static final String ZWJ = "\u200D";
|
||||
public static final String ZWJ_GRAPHIC = "ZWJ";
|
||||
public static final String ZWNJ = "\u200C";
|
||||
public static final String ZWNJ_GRAPHIC = "ZWNJ";
|
||||
|
||||
final public static ArrayList<String> ArabicNumbers = new ArrayList<>(Arrays.asList(
|
||||
"٠", "١", "٢", "٣", "٤", "٥", "٦", "٧", "٨", "٩"
|
||||
|
|
@ -47,6 +51,10 @@ public class Characters {
|
|||
",", ".", "-", "«", "»", "(", ")", "&", "~", "`", "'", "\"", "·", ":", "!", GR_QUESTION_MARK
|
||||
));
|
||||
|
||||
final public static ArrayList<String> PunctuationIndic = new ArrayList<>(Arrays.asList(
|
||||
",", ".", "-", ZWJ, ZWNJ, "(", ")", "।", "॰", "॥", "&", "~", "`", ";", ":", "'", "\"", "!", "?"
|
||||
));
|
||||
|
||||
final public static ArrayList<String> PunctuationKorean = new ArrayList<>(Arrays.asList(
|
||||
",", ".", "~", "1", "(", ")", "&", "-", "`", ";", ":", "'", "\"", "!", "?"
|
||||
));
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
package io.github.sspanak.tt9.util;
|
||||
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.List;
|
||||
import java.util.Locale;
|
||||
|
|
@ -88,4 +89,16 @@ public class TextTools {
|
|||
|
||||
return sdf.format(new Date(timestamp));
|
||||
}
|
||||
|
||||
|
||||
public static ArrayList<String> removeLettersFromList(ArrayList<String> list) {
|
||||
ArrayList<String> cleanList = new ArrayList<>();
|
||||
for (String ch : list) {
|
||||
if (!Character.isAlphabetic(ch.codePointAt(0))) {
|
||||
cleanList.add(ch);
|
||||
}
|
||||
}
|
||||
|
||||
return cleanList;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -43,7 +43,7 @@ ext.validateLanguageFiles = { definitionsDir, dictionariesDir, validationDir ->
|
|||
|
||||
|
||||
ext.parseLanguageDefintion = { File languageFile, String dictionariesDir ->
|
||||
String alphabet = ""
|
||||
String alphabet = ''
|
||||
int layoutKey = 0
|
||||
HashMap<String, String> sounds = new HashMap<>()
|
||||
HashMap<String, String> layoutSounds = new HashMap<>()
|
||||
|
|
@ -58,10 +58,6 @@ ext.parseLanguageDefintion = { File languageFile, String dictionariesDir ->
|
|||
String localeString = ""
|
||||
String dictionaryFileName = ""
|
||||
|
||||
alphabet = languageFile.name.contains("Catalan") ? '·' : alphabet
|
||||
alphabet = languageFile.name.contains("Hebrew") || languageFile.name.contains("Yiddish") ? '"' : alphabet
|
||||
alphabet = languageFile.name.contains("Korean") ? ':' : alphabet
|
||||
|
||||
for (String line : languageFile.readLines()) {
|
||||
if (
|
||||
line.matches("^[a-zA-Z].*")
|
||||
|
|
@ -110,19 +106,15 @@ ext.parseLanguageDefintion = { File languageFile, String dictionariesDir ->
|
|||
|
||||
// alphabet string
|
||||
def lineCharacters = extractAlphabetCharsFromLine(line)
|
||||
lineCharacters = lineCharacters.isEmpty() ? extractAlphabetExtraCharsFromLine(languageFile.name, line) : lineCharacters
|
||||
|
||||
alphabet += lineCharacters
|
||||
|
||||
// sounds, single letters
|
||||
// sounds, single letters or special characters that are treated as letters
|
||||
if (lineCharacters) {
|
||||
lineCharacters.each { letter ->
|
||||
layoutSounds.put(letter, layoutKey.toString())
|
||||
}
|
||||
} else if (line.contains("PUNCTUATION")) {
|
||||
layoutSounds.put("-", layoutKey.toString())
|
||||
layoutSounds.put(".", layoutKey.toString())
|
||||
layoutSounds.put("'", layoutKey.toString())
|
||||
layoutSounds.put('"', layoutKey.toString())
|
||||
layoutSounds.put('·', layoutKey.toString())
|
||||
}
|
||||
|
||||
if (isLayoutLine(line)) {
|
||||
|
|
@ -178,7 +170,8 @@ ext.parseLanguageDefintion = { File languageFile, String dictionariesDir ->
|
|||
|
||||
// this cannot be static, because DictionaryTools will not be visible
|
||||
def validateDictionary(File dictionaryFile, String alphabet, HashMap<String, String> sounds, boolean isAlphabeticLanguage, Locale locale, int maxErrors, String csvDelimiter, int maxWordFrequency) {
|
||||
final VALID_CHARS = alphabet.toUpperCase(locale) == alphabet ? "^[${alphabet}\\-\\.']+\$" : "^[${alphabet}${alphabet.toUpperCase(locale)}\\-\\.']+\$"
|
||||
String regexSafeAlphabet = alphabet.replaceAll("([\\[\\]\\-\\.])", "")
|
||||
final VALID_CHARS = alphabet.toUpperCase(locale) == alphabet ? "^[${regexSafeAlphabet}\\.\\-]+\$" : "^[${regexSafeAlphabet}${regexSafeAlphabet.toUpperCase(locale)}\\.\\-]+\$"
|
||||
|
||||
int errorCount = 0
|
||||
String errorMsg = ''
|
||||
|
|
@ -238,6 +231,30 @@ def validateDictionary(File dictionaryFile, String alphabet, HashMap<String, Str
|
|||
|
||||
//////////////////// PARSING ////////////////////
|
||||
|
||||
/**
 * Returns the non-letter characters that must be accepted as part of the alphabet, based on a
 * layout line containing a PUNCTUATION placeholder. For any other line, or when there is no
 * language name, the result is an empty string.
 *
 * The result always starts with the default "'-." set. Korean gets only the defaults; Hebrew and
 * Yiddish additionally get the double quote. For all other languages, the characters listed on
 * the line next to the placeholder are appended.
 */
static def extractAlphabetExtraCharsFromLine(String languageName, String line) {
    final DEFAULT = "'-."

    boolean isPunctuationLayoutLine = languageName != null && line.contains('PUNCTUATION') && isLayoutLine(line)
    if (!isPunctuationLayoutLine) {
        return ''
    }

    if (languageName.contains('Korean')) {
        return DEFAULT
    }

    if (languageName.contains("Hebrew") || languageName.contains("Yiddish")) {
        return DEFAULT + '"'
    }

    // keep only what is between the "- [" prefix and the closing bracket
    String keyContents = line.replaceFirst('\\].*', '').replaceFirst('^\\s+- \\[', '')

    // drop the placeholder itself and the list separators, leaving just the characters
    String extraChars = keyContents.replaceFirst("PUNCTUATION[^,\\s]*", '').replace(',', '').replace(' ', '')

    return DEFAULT + extraChars
}
|
||||
|
||||
|
||||
static def extractAlphabetCharsFromLine(String line) {
|
||||
if (line.contains('PUNCTUATION') || line.contains('SPECIAL') || !isLayoutLine(line)) {
|
||||
return ''
|
||||
|
|
@ -298,7 +315,7 @@ static def validateWord(String word, String validCharacters, boolean isAlphabeti
|
|||
errors += "${errorMsgPrefix}. Found a garbage word: '${word}' on line ${lineNumber}.\n"
|
||||
}
|
||||
|
||||
if (isAlphabeticLanguage && word.matches("^(.|\\p{L}\\p{M}?)\$")) {
|
||||
if (isAlphabeticLanguage && word.trim().length() == 1) {
|
||||
errorCount++
|
||||
errors += "${errorMsgPrefix}. Found a single letter: '${word}' on line ${lineNumber}. Only uppercase single letters are allowed. The rest of the alphabet will be added automatically.\n"
|
||||
}
|
||||
|
|
|
|||
17
docs/dictionaries/hiWordlistReadme.txt
Normal file
17
docs/dictionaries/hiWordlistReadme.txt
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
Hindi word list 1 by: FreeDict
|
||||
Version: 2017-12-02
|
||||
Sources: http://freedict.org/, http://www.iiit.net/ltrc/Dictionaries/Dict_Frame.html
|
||||
License: GPL
|
||||
|
||||
Conjunct consonants list and some more common words obtained from Wikipedia
|
||||
Version: 2024-12-05
|
||||
Sources: https://en.wiktionary.org/wiki/Appendix:Common_Hindi_words, https://en.wikipedia.org/wiki/Devanagari_conjuncts
|
||||
License: Creative Commons Attribution-ShareAlike 4.0 License
|
||||
|
||||
Hindi and Sanskrit word list and frequencies by: CC-100
|
||||
Version: 2020
|
||||
Source: https://data.statmt.org/cc-100/
|
||||
References (PDF links are available in the source URL):
|
||||
- Unsupervised Cross-lingual Representation Learning at Scale, Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer, Veselin Stoyanov, Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics (ACL), p. 8440-8451, July 2020.
|
||||
- CCNet: Extracting High Quality Monolingual Datasets from Web Crawl Data, Guillaume Wenzek, Marie-Anne Lachaux, Alexis Conneau, Vishrav Chaudhary, Francisco Guzmán, Armand Joulin, Edouard Grave, Proceedings of the 12th Language Resources and Evaluation Conference (LREC), p. 4003-4012, May 2020.
|
||||
Remark: Only the words that appear 3 times or more in each list were used.
|
||||
BIN
downloads/hi-utf8.zip
Normal file
BIN
downloads/hi-utf8.zip
Normal file
Binary file not shown.
|
|
@ -65,12 +65,26 @@ function containsMultipleMatraNasalizations(word) {
|
|||
.test(word);
|
||||
}
|
||||
|
||||
/**
 * Tells whether the word contains a sign from U+0900–U+0903 or a halant (U+094D) that is
 * immediately followed by a dependent vowel sign (matra).
 */
function containsModifierMatra(word) {
  const modifierThenMatra = /[\u{900}-\u{903}\u{94d}][\u{93E}-\u{944}\u{947}\u{948}\u{94B}\u{94C}\u{962}\u{963}]/u;
  return modifierThenMatra.test(word);
}
|
||||
|
||||
|
||||
/**
 * Tells whether the word contains the same character three or more times in a row.
 * The "u" flag makes "." match whole code points, like the other validators in this script,
 * so repeated characters outside the Basic Multilingual Plane are detected too.
 */
function containsTooManyRepeatedLetters(word) {
  return /(.)\1{2,}/u.test(word);
}
|
||||
|
||||
|
||||
/**
 * Tells whether the word contains any of the letters this script treats as foreign:
 * the vowel sign U+0944, or one of ऑ, ऍ, ऎ, ऒ, ॠ.
 */
function containsForeignLetters(word) {
  const foreignLetters = /[\u{944}ऑऍऎऒॠ]+[\u{900}-\u{903}\u{94d}\u{93E}-\u{944}\u{947}\u{948}\u{94B}\u{94C}\u{962}\u{963}]?/u;
  return foreignLetters.test(word);
}
|
||||
|
||||
|
||||
/**
 * Rewrites every "ऴ" in the word as the two-character sequence U+0933 U+093C
 * (the letter followed by a separate nuqta sign). Other characters are left as they are.
 */
function fixNuqta(word) {
  const pieces = word.split('ऴ');
  return pieces.join('\u{933}\u{93c}');
}
|
||||
|
||||
|
||||
/**
|
||||
* isValid
|
||||
*
|
||||
|
|
@ -83,13 +97,15 @@ function isValid(word) {
|
|||
&& !containsInvalidZWJ(word)
|
||||
&& !containsMultipleNasalizations(word)
|
||||
&& !containsMultipleMatraNasalizations(word)
|
||||
&& !containsModifierMatra(word)
|
||||
&& !containsTooManyRepeatedLetters(word)
|
||||
&& !containsForeignLetters(word)
|
||||
}
|
||||
|
||||
|
||||
function work({ file }) {
|
||||
Array.from(getWordsFromFile(file)).forEach(w => {
|
||||
if (isValid(w)) print(w);
|
||||
if (isValid(w)) print(fixNuqta(w));
|
||||
});
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue