1
0
Fork 0
* Added Hindi language

	* Removed the hardcoded special characters from language validation. Now reading them from the .yml

	* improved method of hiding the letters on 0 and 1, when needed

	* virtual keypad adjustments

	* improved the single-letter validation during build time

	* improved Devanagari validation script

	* improved sorting when filters are on
This commit is contained in:
sspanak 2024-12-01 18:47:41 +02:00 committed by Dimo Karaivanov
parent 622a954633
commit f8e6668281
18 changed files with 1305176 additions and 103 deletions

View file

@ -0,0 +1,15 @@
locale: hi-IN
dictionaryFile: hi-utf8.csv
abcString: कखग
hasUpperCase: no
layout:
- [SPECIAL, ॐ] # 0 ==> [SPECIAL, Om]
- [्, ़, ऽ, ः, PUNCTUATION_IN] # 1 ==> [halant (cancel vowel sign), nuqta (converts k -> q, ph -> f, etc), avagraha, visarga]
- [अ, आ, ब, भ, च, छ, ा] # 2 ==> [a, ā, b, bh, c, ch, ā (combining)]
- [द, ध, ड, ढ, ए, ऐ, फ, े, ै, ॆ, ॅ] # 3 ==> [d, dh, ḍ, ḍh, ē, ai, ph, ē (combining), ai (combining), e (combining), ê (ae-combining)];
- [ग, घ, ह, इ, ई, ि, ी] # 4 ==> [g, gh, h, i, ī, i (combining), ī (combining)]
- [ज, झ, क, ख, ल, ळ, ऌ, ॢ] # 5 ==> [j, jh, k, kh, l, ḷ, l̥, l̥ (combining)]
- [म, ङ, ञ, ण, न, ओ, औ, ऑ, ं, ँ, ो, ौ, ॊ, ॉ] # 6 ==> [m, ṅ, ñ, ṇ, n, o, au, ŏ, ṁ (n-combining with consonant, e.g. "ng", "nd"), m̐ (m-combining with vowel, e.g. "am", "em"), ō (combining), au (combining), o (combining), ŏ (combining)]
- [प, र, ऋ, स, श, ष, ृ] # 7 ==> [p, r, r̥ (ri), s, sh, rsh (like "norsk"), r̥ (ri-combining)]
- [त, थ, ट, ठ, उ, ऊ, व, ु, ू] # 8 ==> [t, th, ṭ, ṭh, u, ū, v, u (combining), ū (combining)]
- [य] # 9 ==> [y]

File diff suppressed because it is too large Load diff

View file

@ -9,6 +9,7 @@ import io.github.sspanak.tt9.languages.Language;
import io.github.sspanak.tt9.languages.NaturalLanguage; import io.github.sspanak.tt9.languages.NaturalLanguage;
import io.github.sspanak.tt9.preferences.settings.SettingsStore; import io.github.sspanak.tt9.preferences.settings.SettingsStore;
import io.github.sspanak.tt9.util.Characters; import io.github.sspanak.tt9.util.Characters;
import io.github.sspanak.tt9.util.TextTools;
class Mode123 extends ModePassthrough { class Mode123 extends ModePassthrough {
@Override public int getId() { return MODE_123; } @Override public int getId() { return MODE_123; }
@ -49,8 +50,12 @@ class Mode123 extends ModePassthrough {
* use the default list, but reorder it a bit for convenience. * use the default list, but reorder it a bit for convenience.
*/ */
private void setDefaultSpecialCharacters() { private void setDefaultSpecialCharacters() {
KEY_CHARACTERS.add(applyNumericFieldCharacterOrder(settings.getOrderedKeyChars(language, 0))); KEY_CHARACTERS.add(
KEY_CHARACTERS.add(applyNumericFieldCharacterOrder(settings.getOrderedKeyChars(language, 1))); TextTools.removeLettersFromList(applyNumericFieldCharacterOrder(settings.getOrderedKeyChars(language, 0)))
);
KEY_CHARACTERS.add(
TextTools.removeLettersFromList(applyNumericFieldCharacterOrder(settings.getOrderedKeyChars(language, 1)))
);
} }

View file

@ -18,6 +18,7 @@ import io.github.sspanak.tt9.languages.LanguageKind;
import io.github.sspanak.tt9.languages.NaturalLanguage; import io.github.sspanak.tt9.languages.NaturalLanguage;
import io.github.sspanak.tt9.preferences.settings.SettingsStore; import io.github.sspanak.tt9.preferences.settings.SettingsStore;
import io.github.sspanak.tt9.util.Characters; import io.github.sspanak.tt9.util.Characters;
import io.github.sspanak.tt9.util.TextTools;
class ModeCheonjiin extends InputMode { class ModeCheonjiin extends InputMode {
// used when we want do display a different set of characters for a given key, for example // used when we want do display a different set of characters for a given key, for example
@ -68,14 +69,23 @@ class ModeCheonjiin extends InputMode {
} }
/**
* setCustomSpecialCharacters
* Filter out the letters from the 0-key list and add "0", because there is no other way of
* typing it.
*/
protected void setCustomSpecialCharacters() { protected void setCustomSpecialCharacters() {
// special
KEY_CHARACTERS.add(TextTools.removeLettersFromList(applyPunctuationOrder(Characters.Special, 0)));
if (settings.holdForPunctuationInKorean()) { if (settings.holdForPunctuationInKorean()) {
ArrayList<String> specialChars = new ArrayList<>(applyPunctuationOrder(Characters.Special, 0)); KEY_CHARACTERS.get(0).add(0, "0");
specialChars.add(0, "0");
KEY_CHARACTERS.add(specialChars);
}
} }
// punctuation
KEY_CHARACTERS.add(
TextTools.removeLettersFromList(applyPunctuationOrder(Characters.PunctuationKorean, 1))
);
}
protected void setSpecialCharacterConstants() { protected void setSpecialCharacterConstants() {

View file

@ -0,0 +1,59 @@
package io.github.sspanak.tt9.ime.modes.predictions;
import androidx.annotation.NonNull;
import androidx.annotation.Nullable;
import java.util.ArrayList;
import java.util.Collections;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import io.github.sspanak.tt9.languages.Language;
import io.github.sspanak.tt9.languages.LanguageKind;
/**
 * Reorders word suggestions by their "visual" length for languages where a base letter and the
 * signs attached to it should count as one character. Only Hindi has a pattern at the moment;
 * for every other language (or {@code null}) the sorter is a no-op.
 */
class LocaleWordsSorter {
	/**
	 * Matches one character from the first class followed by one or more characters from the
	 * second class (per the original author: a base letter plus its modifiers).
	 * {@code null} when the language does not need locale-specific sorting.
	 */
	private final Pattern sortingPattern;


	/**
	 * @param language the language whose words will be sorted; may be {@code null}, in which case
	 *                 no sorting pattern is set up.
	 */
	LocaleWordsSorter(@Nullable Language language) {
		if (LanguageKind.isHindi(language)) {
			sortingPattern = Pattern.compile("[\\u0904-\\u0939\\u0958-\\u0961][\\u0900-\\u0904\\u093A-\\u094F\\u0962\\u0963]+");
		} else {
			sortingPattern = null;
		}
	}


	/**
	 * Returns the length of the word after collapsing every match of {@code sortingPattern} into a
	 * single character. Usually meant to consider a base letter + modifiers as a single character.
	 * When there is no pattern, the plain {@link String#length()} is returned.
	 */
	private int reduceLength(String word) {
		int length = word.length();
		if (sortingPattern == null) {
			return length;
		}

		Matcher matcher = sortingPattern.matcher(word);
		while (matcher.find()) {
			// a match of N chars is counted as 1, so drop the remaining N - 1
			length -= matcher.end() - matcher.start() - 1;
		}

		return length;
	}


	/**
	 * Returns a copy of the list, ordered by ascending reduced length. The sort is stable, so
	 * words of equal reduced length keep their relative order. When there is no sorting pattern
	 * or the list is {@code null}, the argument itself is returned unchanged.
	 */
	ArrayList<String> sort(ArrayList<String> words) {
		if (sortingPattern == null || words == null) {
			return words;
		}

		ArrayList<String> wordsCopy = new ArrayList<>(words);
		// Integer.compare() instead of subtraction: the idiomatic, overflow-proof way to order ints
		Collections.sort(wordsCopy, (a, b) -> Integer.compare(reduceLength(a), reduceLength(b)));

		return wordsCopy;
	}


	/**
	 * Sorting applies only to Indic languages, and only when a non-empty stem is exactly one
	 * character shorter than the digit sequence.
	 */
	boolean shouldSort(@Nullable Language language, @NonNull String stem, @NonNull String digitSequence) {
		return LanguageKind.isIndic(language) && !stem.isEmpty() && stem.length() == digitSequence.length() - 1;
	}
}

View file

@ -5,12 +5,14 @@ import java.util.ArrayList;
import io.github.sspanak.tt9.db.DataStore; import io.github.sspanak.tt9.db.DataStore;
import io.github.sspanak.tt9.ime.helpers.TextField; import io.github.sspanak.tt9.ime.helpers.TextField;
import io.github.sspanak.tt9.languages.EmojiLanguage; import io.github.sspanak.tt9.languages.EmojiLanguage;
import io.github.sspanak.tt9.languages.Language;
import io.github.sspanak.tt9.preferences.settings.SettingsStore; import io.github.sspanak.tt9.preferences.settings.SettingsStore;
import io.github.sspanak.tt9.util.Characters; import io.github.sspanak.tt9.util.Characters;
import io.github.sspanak.tt9.util.TextTools; import io.github.sspanak.tt9.util.TextTools;
public class WordPredictions extends Predictions { public class WordPredictions extends Predictions {
private final TextField textField; private final TextField textField;
private LocaleWordsSorter localeWordsSorter;
private String inputWord; private String inputWord;
private boolean isStemFuzzy; private boolean isStemFuzzy;
@ -21,11 +23,21 @@ public class WordPredictions extends Predictions {
public WordPredictions(SettingsStore settings, TextField textField) { public WordPredictions(SettingsStore settings, TextField textField) {
super(settings); super(settings);
lastEnforcedTopWord = ""; lastEnforcedTopWord = "";
localeWordsSorter = new LocaleWordsSorter(null);
stem = ""; stem = "";
this.textField = textField; this.textField = textField;
} }
@Override
public Predictions setLanguage(Language language) {
super.setLanguage(language);
localeWordsSorter = new LocaleWordsSorter(language);
return this;
}
public WordPredictions setIsStemFuzzy(boolean yes) { public WordPredictions setIsStemFuzzy(boolean yes) {
this.isStemFuzzy = yes; this.isStemFuzzy = yes;
return this; return this;
@ -89,6 +101,7 @@ public class WordPredictions extends Predictions {
words.addAll(dbWords); words.addAll(dbWords);
} else { } else {
suggestStem(); suggestStem();
dbWords = localeWordsSorter.shouldSort(language, stem, digitSequence) ? localeWordsSorter.sort(dbWords) : dbWords;
dbWords = rearrangeByPairFrequency(dbWords); dbWords = rearrangeByPairFrequency(dbWords);
suggestMissingWords(generatePossibleStemVariations(dbWords)); suggestMissingWords(generatePossibleStemVariations(dbWords));
suggestMissingWords(dbWords.isEmpty() ? generateWordVariations(inputWord) : dbWords); suggestMissingWords(dbWords.isEmpty() ? generateWordVariations(inputWord) : dbWords);

View file

@ -6,12 +6,13 @@ public class LanguageKind {
public static final int KOREAN = 601579; public static final int KOREAN = 601579;
public static boolean isArabic(Language language) { return language != null && language.getId() == 502337; } public static boolean isArabic(Language language) { return language != null && language.getId() == 502337; }
public static boolean isBulgarian(Language language) { return language != null && language.getId() == 231650; }
public static boolean isCyrillic(Language language) { return language != null && language.getKeyCharacters(2).contains("а"); } public static boolean isCyrillic(Language language) { return language != null && language.getKeyCharacters(2).contains("а"); }
public static boolean isEnglish(Language language) { return language != null && language.getLocale().equals(Locale.ENGLISH); } public static boolean isEnglish(Language language) { return language != null && language.getLocale().equals(Locale.ENGLISH); }
public static boolean isFrench(Language language) { return language != null && language.getId() == 596550; } public static boolean isFrench(Language language) { return language != null && language.getId() == 596550; }
public static boolean isGreek(Language language) { return language != null && language.getId() == 597381; } public static boolean isGreek(Language language) { return language != null && language.getId() == 597381; }
public static boolean isHebrew(Language language) { return language != null && (language.getId() == 305450 || language.getId() == 403177); } public static boolean isHebrew(Language language) { return language != null && (language.getId() == 305450 || language.getId() == 403177); }
public static boolean isHindi(Language language) { return language != null && language.getId() == 468264; }
public static boolean isIndic(Language language) { return isHindi(language); }
public static boolean isHinglish(Language language) { return language != null && language.getId() == 468421; } public static boolean isHinglish(Language language) { return language != null && language.getId() == 468421; }
public static boolean isKorean(Language language) { return language != null && language.getId() == KOREAN; } public static boolean isKorean(Language language) { return language != null && language.getId() == KOREAN; }
public static boolean isLatinBased(Language language) { return language != null && language.getKeyCharacters(2).contains("a"); } public static boolean isLatinBased(Language language) { return language != null && language.getKeyCharacters(2).contains("a"); }

View file

@ -4,7 +4,9 @@ import androidx.annotation.NonNull;
import java.util.ArrayList; import java.util.ArrayList;
import java.util.HashMap; import java.util.HashMap;
import java.util.List;
import java.util.Locale; import java.util.Locale;
import java.util.Map;
import io.github.sspanak.tt9.languages.exceptions.InvalidLanguageCharactersException; import io.github.sspanak.tt9.languages.exceptions.InvalidLanguageCharactersException;
import io.github.sspanak.tt9.util.Characters; import io.github.sspanak.tt9.util.Characters;
@ -76,39 +78,24 @@ public class NaturalLanguage extends Language implements Comparable<NaturalLangu
private ArrayList<String> generateSpecialChars(ArrayList<String> definitionChars) { private ArrayList<String> generateSpecialChars(ArrayList<String> definitionChars) {
final String SPECIAL_CHARS_PLACEHOLDER = "SPECIAL"; final String SPECIAL_CHARS_PLACEHOLDER = "SPECIAL";
final String PUNCTUATION_PLACEHOLDER = "PUNCTUATION"; final String PUNCTUATION_PLACEHOLDER = "PUNCTUATION";
final String ARABIC_PUNCTUATION_STYLE = PUNCTUATION_PLACEHOLDER + "_AR";
final String FRENCH_PUNCTUATION_STYLE = PUNCTUATION_PLACEHOLDER + "_FR"; final Map<String, List<String>> specialChars = new HashMap<>();
final String GERMAN_PUNCTUATION_STYLE = PUNCTUATION_PLACEHOLDER + "_DE"; specialChars.put(SPECIAL_CHARS_PLACEHOLDER, Characters.Special);
final String GREEK_PUNCTUATION_STYLE = PUNCTUATION_PLACEHOLDER + "_GR"; specialChars.put(PUNCTUATION_PLACEHOLDER, Characters.PunctuationEnglish);
final String KOREAN_PUNCTUATION_STYLE = PUNCTUATION_PLACEHOLDER + "_KR"; specialChars.put(PUNCTUATION_PLACEHOLDER + "_AR", Characters.PunctuationArabic);
specialChars.put(PUNCTUATION_PLACEHOLDER + "_FR", Characters.PunctuationFrench);
specialChars.put(PUNCTUATION_PLACEHOLDER + "_DE", Characters.PunctuationGerman);
specialChars.put(PUNCTUATION_PLACEHOLDER + "_GR", Characters.PunctuationGreek);
specialChars.put(PUNCTUATION_PLACEHOLDER + "_IN", Characters.PunctuationIndic);
specialChars.put(PUNCTUATION_PLACEHOLDER + "_KR", Characters.PunctuationKorean);
ArrayList<String> keyChars = new ArrayList<>(); ArrayList<String> keyChars = new ArrayList<>();
for (String defChar : definitionChars) { for (String defChar : definitionChars) {
switch (defChar) { List<String> keySpecialChars = specialChars.containsKey(defChar) ? specialChars.get(defChar) : null;
case SPECIAL_CHARS_PLACEHOLDER: if (keySpecialChars != null) {
keyChars.addAll(Characters.Special); keyChars.addAll(keySpecialChars);
break; } else {
case PUNCTUATION_PLACEHOLDER:
keyChars.addAll(Characters.PunctuationEnglish);
break;
case ARABIC_PUNCTUATION_STYLE:
keyChars.addAll(Characters.PunctuationArabic);
break;
case FRENCH_PUNCTUATION_STYLE:
keyChars.addAll(Characters.PunctuationFrench);
break;
case GERMAN_PUNCTUATION_STYLE:
keyChars.addAll(Characters.PunctuationGerman);
break;
case GREEK_PUNCTUATION_STYLE:
keyChars.addAll(Characters.PunctuationGreek);
break;
case KOREAN_PUNCTUATION_STYLE:
keyChars.addAll(Characters.PunctuationKorean);
break;
default:
keyChars.add(defChar); keyChars.add(defChar);
break;
} }
} }

View file

@ -88,10 +88,6 @@ class SettingsPunctuation extends SettingsInput {
orderedChars = language.getKeyCharacters(number); orderedChars = language.getKeyCharacters(number);
} }
if (number < 2) {
orderedChars = removeLettersFromList(orderedChars);
}
return orderedChars; return orderedChars;
} }
@ -118,16 +114,4 @@ class SettingsPunctuation extends SettingsInput {
return charsList; return charsList;
} }
private ArrayList<String> removeLettersFromList(ArrayList<String> list) {
ArrayList<String> cleanList = new ArrayList<>();
for (String s : list) {
if (!Character.isAlphabetic(s.codePointAt(0))) {
cleanList.add(s);
}
}
return cleanList;
}
} }

View file

@ -25,6 +25,7 @@ public class SettingsStore extends SettingsUI {
public final static int SOFT_KEY_DOUBLE_CLICK_DELAY = 500; // ms public final static int SOFT_KEY_DOUBLE_CLICK_DELAY = 500; // ms
public final static int SOFT_KEY_REPEAT_DELAY = 40; // ms public final static int SOFT_KEY_REPEAT_DELAY = 40; // ms
public final static int SOFT_KEY_TITLE_MAX_CHARS = 5; public final static int SOFT_KEY_TITLE_MAX_CHARS = 5;
public final static int SOFT_KEY_TITLE_MAX_CHARS_INDIC = 3;
public final static int SOFT_KEY_TITLE_SIZE = 18; // sp public final static int SOFT_KEY_TITLE_SIZE = 18; // sp
public final static float SOFT_KEY_COMPLEX_LABEL_TITLE_RELATIVE_SIZE = 0.55f; public final static float SOFT_KEY_COMPLEX_LABEL_TITLE_RELATIVE_SIZE = 0.55f;
public final static float SOFT_KEY_COMPLEX_LABEL_ARABIC_TITLE_RELATIVE_SIZE = 0.72f; public final static float SOFT_KEY_COMPLEX_LABEL_ARABIC_TITLE_RELATIVE_SIZE = 0.72f;

View file

@ -144,22 +144,17 @@ public class SoftKeyNumber extends SoftKey {
} }
ArrayList<String> chars = language.getKeyCharacters(number); ArrayList<String> chars = language.getKeyCharacters(number);
boolean isBulgarian = LanguageKind.isBulgarian(language);
boolean isGreek = LanguageKind.isGreek(language); boolean isGreek = LanguageKind.isGreek(language);
boolean isLatinBased = LanguageKind.isLatinBased(language); boolean isLatinBased = LanguageKind.isLatinBased(language);
boolean isUkrainian = LanguageKind.isUkrainian(language);
boolean isUppercase = tt9.getTextCase() == InputMode.CASE_UPPER; boolean isUppercase = tt9.getTextCase() == InputMode.CASE_UPPER;
final int maxChars = LanguageKind.isIndic(language) ? SettingsStore.SOFT_KEY_TITLE_MAX_CHARS_INDIC : SettingsStore.SOFT_KEY_TITLE_MAX_CHARS;
if ( String displayChars = getDefaultCharList(chars, language.getLocale(), isGreek, isLatinBased, isUppercase);
isBulgarian if (displayChars.length() > maxChars) {
|| isGreek displayChars = abbreviateCharList(displayChars, language.getLocale(), isUppercase);
|| isLatinBased
|| (isUkrainian && number == 2)
|| chars.size() < SettingsStore.SOFT_KEY_TITLE_MAX_CHARS) {
return getDefaultCharList(chars, language.getLocale(), isGreek, isLatinBased, isUppercase);
} else {
return abbreviateCharList(chars, language.getLocale(), isUppercase);
} }
return displayChars;
} }
@ -188,27 +183,37 @@ public class SoftKeyNumber extends SoftKey {
* on one key. As suggested by the community, we could display them as "A-Z". * on one key. As suggested by the community, we could display them as "A-Z".
* @see <a href="https://github.com/sspanak/tt9/issues/628">Issue #628</a> * @see <a href="https://github.com/sspanak/tt9/issues/628">Issue #628</a>
*/ */
private String abbreviateCharList(ArrayList<String> chars, Locale locale, boolean isUppercase) { private String abbreviateCharList(String chars, Locale locale, boolean isUppercase) {
boolean containsCombiningChars = TextTools.isCombining(chars.get(0)) || TextTools.isCombining(chars.get(chars.size() - 1)); String firstLetter = chars.substring(0, 1);
String lastLetter = chars.substring(chars.length() - 1);
boolean containsCombiningChars = TextTools.isCombining(firstLetter) || TextTools.isCombining(lastLetter);
return return
(isUppercase ? chars.get(0).toUpperCase(locale) : chars.get(0)) (isUppercase ? firstLetter.toUpperCase(locale) : firstLetter)
+ (containsCombiningChars ? " " : "") + (containsCombiningChars ? " " : "")
+ (isUppercase ? chars.get(chars.size() - 1).toUpperCase(locale) : chars.get(chars.size() - 1)); + (isUppercase ? lastLetter.toUpperCase(locale) : lastLetter);
} }
/** /**
* As suggested by the community, there is no need to display the accented letters. * Reduces the number of displayed characters by leaving the most descriptive ones. This prevents
* People are used to seeing just "ABC", "DEF", etc. In the case of Korean, the keypad looks too * the visual clutter on the keys.
* cluttered, so we skip the double consonants, like on phones with a physical keypad.
*/ */
private boolean shouldSkipAccents(char currentLetter, boolean isGreek, boolean isLatinBased) { private boolean shouldSkipAccents(char currentLetter, boolean isGreek, boolean isLatinBased) {
return return
currentLetter == 'ѝ' // Latin. As suggested by the community, there is no need to display the accented letters. People are
|| currentLetter == 'ґ' // used to seeing just "ABC", "DEF", etc.
(isLatinBased && currentLetter > 'z')
// Cyrillic. Same as above.
|| currentLetter == 'ѝ' || currentLetter == 'ґ'
// Korean double consonants
|| (currentLetter == 'ㄲ' || currentLetter == 'ㄸ' || currentLetter == 'ㅃ' || currentLetter == 'ㅆ' || currentLetter == 'ㅉ') || (currentLetter == 'ㄲ' || currentLetter == 'ㄸ' || currentLetter == 'ㅃ' || currentLetter == 'ㅆ' || currentLetter == 'ㅉ')
|| (isLatinBased && currentLetter > 'z') // Greek diacritics and ending sigma
|| currentLetter == 'ς' || currentLetter == 'ς'
|| (isGreek && (currentLetter < 'α' || currentLetter > 'ω')); || (isGreek && (currentLetter < 'α' || currentLetter > 'ω'))
// Hindi matras
|| (currentLetter >= 0x0900 && currentLetter <= 0x0903) || (currentLetter >= 0x093A && currentLetter <= 0x094F)
|| (currentLetter >= 0x0951 && currentLetter <= 0x0957) || currentLetter == 0x0962 || currentLetter == 0x0963
;
} }
} }

View file

@ -125,15 +125,21 @@ public class SuggestionsBar {
return ""; return "";
} }
if (suggestions.get(id).endsWith(STEM_SUFFIX)) { String suggestion = suggestions.get(id);
if (suggestion.endsWith(STEM_SUFFIX)) {
return stem; return stem;
} else if (suggestions.get(id).startsWith(STEM_VARIATION_PREFIX)) { } else if (suggestion.startsWith(STEM_VARIATION_PREFIX)) {
return stem + suggestions.get(id).substring(STEM_VARIATION_PREFIX.length()); return stem + suggestion.substring(STEM_VARIATION_PREFIX.length());
} else if (suggestions.get(id).startsWith(STEM_PUNCTUATION_VARIATION_PREFIX)) { } else if (suggestion.startsWith(STEM_PUNCTUATION_VARIATION_PREFIX)) {
return stem + suggestions.get(id).substring(STEM_PUNCTUATION_VARIATION_PREFIX.length()); return stem + suggestion.substring(STEM_PUNCTUATION_VARIATION_PREFIX.length());
} }
return suggestions.get(id).equals(Characters.NEW_LINE) ? "\n" : suggestions.get(id); return switch (suggestion) {
case Characters.ZWJ_GRAPHIC -> Characters.ZWJ;
case Characters.ZWNJ_GRAPHIC -> Characters.ZWNJ;
default -> suggestion.equals(Characters.NEW_LINE) ? "\n" : suggestion;
};
} }
@ -195,14 +201,15 @@ public class SuggestionsBar {
String trimmedSuggestion = suggestion.substring(stem.length()); String trimmedSuggestion = suggestion.substring(stem.length());
trimmedSuggestion = Character.isAlphabetic(trimmedSuggestion.charAt(0)) ? STEM_VARIATION_PREFIX + trimmedSuggestion : STEM_PUNCTUATION_VARIATION_PREFIX + trimmedSuggestion; trimmedSuggestion = Character.isAlphabetic(trimmedSuggestion.charAt(0)) ? STEM_VARIATION_PREFIX + trimmedSuggestion : STEM_PUNCTUATION_VARIATION_PREFIX + trimmedSuggestion;
suggestions.add(trimmedSuggestion); suggestions.add(trimmedSuggestion);
return;
} }
// make the new line better readable
else if (suggestion.equals("\n")) { // convert the unreadable special characters to their readable form or add the readable ones
suggestions.add(Characters.NEW_LINE); switch (suggestion) {
} case "\n" -> suggestions.add(Characters.NEW_LINE);
// or add any other suggestion as is case Characters.ZWJ -> suggestions.add(Characters.ZWJ_GRAPHIC);
else { case Characters.ZWNJ -> suggestions.add(Characters.ZWNJ_GRAPHIC);
suggestions.add(suggestion); default -> suggestions.add(suggestion);
} }
} }

View file

@ -12,6 +12,10 @@ import io.github.sspanak.tt9.languages.LanguageKind;
public class Characters { public class Characters {
public static final String GR_QUESTION_MARK = ";"; public static final String GR_QUESTION_MARK = ";";
public static final String NEW_LINE = Build.VERSION.SDK_INT >= Build.VERSION_CODES.M && new Paint().hasGlyph("") ? "" : "\\n"; public static final String NEW_LINE = Build.VERSION.SDK_INT >= Build.VERSION_CODES.M && new Paint().hasGlyph("") ? "" : "\\n";
public static final String ZWJ = "\u200D";
public static final String ZWJ_GRAPHIC = "ZWJ";
public static final String ZWNJ = "\u200C";
public static final String ZWNJ_GRAPHIC = "ZWNJ";
final public static ArrayList<String> ArabicNumbers = new ArrayList<>(Arrays.asList( final public static ArrayList<String> ArabicNumbers = new ArrayList<>(Arrays.asList(
"٠", "١", "٢", "٣", "٤", "٥", "٦", "٧", "٨", "٩" "٠", "١", "٢", "٣", "٤", "٥", "٦", "٧", "٨", "٩"
@ -47,6 +51,10 @@ public class Characters {
",", ".", "-", "«", "»", "(", ")", "&", "~", "`", "'", "\"", "·", ":", "!", GR_QUESTION_MARK ",", ".", "-", "«", "»", "(", ")", "&", "~", "`", "'", "\"", "·", ":", "!", GR_QUESTION_MARK
)); ));
final public static ArrayList<String> PunctuationIndic = new ArrayList<>(Arrays.asList(
",", ".", "-", ZWJ, ZWNJ, "(", ")", "", "", "", "&", "~", "`", ";", ":", "'", "\"", "!", "?"
));
final public static ArrayList<String> PunctuationKorean = new ArrayList<>(Arrays.asList( final public static ArrayList<String> PunctuationKorean = new ArrayList<>(Arrays.asList(
",", ".", "~", "1", "(", ")", "&", "-", "`", ";", ":", "'", "\"", "!", "?" ",", ".", "~", "1", "(", ")", "&", "-", "`", ";", ":", "'", "\"", "!", "?"
)); ));

View file

@ -1,6 +1,7 @@
package io.github.sspanak.tt9.util; package io.github.sspanak.tt9.util;
import java.text.SimpleDateFormat; import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date; import java.util.Date;
import java.util.List; import java.util.List;
import java.util.Locale; import java.util.Locale;
@ -88,4 +89,16 @@ public class TextTools {
return sdf.format(new Date(timestamp)); return sdf.format(new Date(timestamp));
} }
/**
 * Returns a new list containing only the entries of {@code list} that do not start with a letter,
 * as defined by {@link Character#isAlphabetic(int)} applied to the first code point. The order
 * of the remaining entries is preserved and the input list is not modified.
 *
 * @param list the characters to filter; {@code null} is treated as an empty list
 * @return a new, mutable list without the letters; never {@code null}
 */
public static ArrayList<String> removeLettersFromList(ArrayList<String> list) {
	ArrayList<String> cleanList = new ArrayList<>();
	if (list == null) {
		return cleanList;
	}

	for (String ch : list) {
		// null and empty entries have no first code point, so codePointAt(0) would throw;
		// they carry no character, so they are dropped
		if (ch == null || ch.isEmpty()) {
			continue;
		}

		if (!Character.isAlphabetic(ch.codePointAt(0))) {
			cleanList.add(ch);
		}
	}

	return cleanList;
}
} }

View file

@ -43,7 +43,7 @@ ext.validateLanguageFiles = { definitionsDir, dictionariesDir, validationDir ->
ext.parseLanguageDefintion = { File languageFile, String dictionariesDir -> ext.parseLanguageDefintion = { File languageFile, String dictionariesDir ->
String alphabet = "" String alphabet = ''
int layoutKey = 0 int layoutKey = 0
HashMap<String, String> sounds = new HashMap<>() HashMap<String, String> sounds = new HashMap<>()
HashMap<String, String> layoutSounds = new HashMap<>() HashMap<String, String> layoutSounds = new HashMap<>()
@ -58,10 +58,6 @@ ext.parseLanguageDefintion = { File languageFile, String dictionariesDir ->
String localeString = "" String localeString = ""
String dictionaryFileName = "" String dictionaryFileName = ""
alphabet = languageFile.name.contains("Catalan") ? '·' : alphabet
alphabet = languageFile.name.contains("Hebrew") || languageFile.name.contains("Yiddish") ? '"' : alphabet
alphabet = languageFile.name.contains("Korean") ? '' : alphabet
for (String line : languageFile.readLines()) { for (String line : languageFile.readLines()) {
if ( if (
line.matches("^[a-zA-Z].*") line.matches("^[a-zA-Z].*")
@ -110,19 +106,15 @@ ext.parseLanguageDefintion = { File languageFile, String dictionariesDir ->
// alphabet string // alphabet string
def lineCharacters = extractAlphabetCharsFromLine(line) def lineCharacters = extractAlphabetCharsFromLine(line)
lineCharacters = lineCharacters.isEmpty() ? extractAlphabetExtraCharsFromLine(languageFile.name, line) : lineCharacters
alphabet += lineCharacters alphabet += lineCharacters
// sounds, single letters // sounds, single letters or special characters that are treated as letters
if (lineCharacters) { if (lineCharacters) {
lineCharacters.each { letter -> lineCharacters.each { letter ->
layoutSounds.put(letter, layoutKey.toString()) layoutSounds.put(letter, layoutKey.toString())
} }
} else if (line.contains("PUNCTUATION")) {
layoutSounds.put("-", layoutKey.toString())
layoutSounds.put(".", layoutKey.toString())
layoutSounds.put("'", layoutKey.toString())
layoutSounds.put('"', layoutKey.toString())
layoutSounds.put('·', layoutKey.toString())
} }
if (isLayoutLine(line)) { if (isLayoutLine(line)) {
@ -178,7 +170,8 @@ ext.parseLanguageDefintion = { File languageFile, String dictionariesDir ->
// this cannot be static, because DictionaryTools will not be visible // this cannot be static, because DictionaryTools will not be visible
def validateDictionary(File dictionaryFile, String alphabet, HashMap<String, String> sounds, boolean isAlphabeticLanguage, Locale locale, int maxErrors, String csvDelimiter, int maxWordFrequency) { def validateDictionary(File dictionaryFile, String alphabet, HashMap<String, String> sounds, boolean isAlphabeticLanguage, Locale locale, int maxErrors, String csvDelimiter, int maxWordFrequency) {
final VALID_CHARS = alphabet.toUpperCase(locale) == alphabet ? "^[${alphabet}\\-\\.']+\$" : "^[${alphabet}${alphabet.toUpperCase(locale)}\\-\\.']+\$" String regexSafeAlphabet = alphabet.replaceAll("([\\[\\]\\-\\.])", "")
final VALID_CHARS = alphabet.toUpperCase(locale) == alphabet ? "^[${regexSafeAlphabet}\\.\\-]+\$" : "^[${regexSafeAlphabet}${regexSafeAlphabet.toUpperCase(locale)}\\.\\-]+\$"
int errorCount = 0 int errorCount = 0
String errorMsg = '' String errorMsg = ''
@ -238,6 +231,30 @@ def validateDictionary(File dictionaryFile, String alphabet, HashMap<String, Str
//////////////////// PARSING //////////////////// //////////////////// PARSING ////////////////////
/**
 * Collects the non-letter characters that count as part of the alphabet for a layout line
 * containing a PUNCTUATION placeholder.
 *
 * Returns '' when the language name is null, the line has no PUNCTUATION placeholder, or the line
 * is not a layout line. Korean gets only the base set ('-.), Hebrew and Yiddish get the base set
 * plus the double quote. For any other language, the base set is followed by whatever is
 * explicitly listed on the line next to the placeholder.
 */
static def extractAlphabetExtraCharsFromLine(String languageName, String line) {
	boolean isPunctuationLayoutLine = languageName != null && line.contains('PUNCTUATION') && isLayoutLine(line)
	if (!isPunctuationLayoutLine) {
		return ''
	}

	final baseChars = "'-."

	if (languageName.contains('Korean')) {
		return baseChars
	}

	if (languageName.contains("Hebrew") || languageName.contains("Yiddish")) {
		return baseChars + '"'
	}

	// strip everything after the closing bracket (incl. the comment), then the list opening
	String listBody = line.replaceFirst('\\].*', '').replaceFirst('^\\s+- \\[', '')
	// remove the placeholder itself, then the separators, leaving only the characters
	String lineChars = listBody.replaceFirst("PUNCTUATION[^,\\s]*", '').replace(',', '').replace(' ', '')

	return baseChars + lineChars
}
static def extractAlphabetCharsFromLine(String line) { static def extractAlphabetCharsFromLine(String line) {
if (line.contains('PUNCTUATION') || line.contains('SPECIAL') || !isLayoutLine(line)) { if (line.contains('PUNCTUATION') || line.contains('SPECIAL') || !isLayoutLine(line)) {
return '' return ''
@ -298,7 +315,7 @@ static def validateWord(String word, String validCharacters, boolean isAlphabeti
errors += "${errorMsgPrefix}. Found a garbage word: '${word}' on line ${lineNumber}.\n" errors += "${errorMsgPrefix}. Found a garbage word: '${word}' on line ${lineNumber}.\n"
} }
if (isAlphabeticLanguage && word.matches("^(.|\\p{L}\\p{M}?)\$")) { if (isAlphabeticLanguage && word.trim().length() == 1) {
errorCount++ errorCount++
errors += "${errorMsgPrefix}. Found a single letter: '${word}' on line ${lineNumber}. Only uppercase single letters are allowed. The rest of the alphabet will be added automatically.\n" errors += "${errorMsgPrefix}. Found a single letter: '${word}' on line ${lineNumber}. Only uppercase single letters are allowed. The rest of the alphabet will be added automatically.\n"
} }

View file

@ -0,0 +1,17 @@
Hindi word list 1 by: FreeDict
Version: 2017-12-02
Sources: http://freedict.org/, http://www.iiit.net/ltrc/Dictionaries/Dict_Frame.html
License: GPL
Conjunct consonants list and some more common words obtained from Wikipedia
Version: 2024-12-05
Sources: https://en.wiktionary.org/wiki/Appendix:Common_Hindi_words, https://en.wikipedia.org/wiki/Devanagari_conjuncts
License: Creative Commons Attribution-ShareAlike 4.0 License
Hindi and Sanskrit word list and frequencies by: CC-100;
Version: 2020
Source: https://data.statmt.org/cc-100/
References (PDF links are available in the source URL):
- Unsupervised Cross-lingual Representation Learning at Scale, Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer, Veselin Stoyanov, Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics (ACL), p. 8440-8451, July 2020.
- CCNet: Extracting High Quality Monolingual Datasets from Web Crawl Data, Guillaume Wenzek, Marie-Anne Lachaux, Alexis Conneau, Vishrav Chaudhary, Francisco Guzmán, Armand Joulin, Edouard Grave, Proceedings of the 12th Language Resources and Evaluation Conference (LREC), p. 4003-4012, May 2020.
Remark: Only the words that appear 3 times or more in each list were used.

BIN
downloads/hi-utf8.zip Normal file

Binary file not shown.

View file

@ -65,12 +65,26 @@ function containsMultipleMatraNasalizations(word) {
.test(word); .test(word);
} }
/**
 * containsModifierMatra
 *
 * Reports whether the word contains a code point in U+0900-U+0903 or U+094D that is immediately
 * followed by one of the vowel-sign code points listed in the second character class.
 */
function containsModifierMatra(word) {
	const modifierFollowedByMatra = /[\u{900}-\u{903}\u{94d}][\u{93E}-\u{944}\u{947}\u{948}\u{94B}\u{94C}\u{962}\u{963}]/u;
	return modifierFollowedByMatra.test(word);
}
function containsTooManyRepeatedLetters(word) { function containsTooManyRepeatedLetters(word) {
return /(.)\1{2,}/.test(word); return /(.)\1{2,}/.test(word);
} }
/**
 * containsForeignLetters
 *
 * Reports whether the word contains at least one of the characters in the first character class
 * (U+0944 or one of the five listed letters), optionally followed by a sign from the second class.
 */
function containsForeignLetters(word) {
	const foreignLetters = /[\u{944}ऑऍऎऒॠ]+[\u{900}-\u{903}\u{94d}\u{93E}-\u{944}\u{947}\u{948}\u{94B}\u{94C}\u{962}\u{963}]?/u;
	return foreignLetters.test(word);
}
/**
 * fixNuqta
 *
 * Returns the word with every precomposed 'ऴ' replaced by the two-code-point sequence
 * U+0933 U+093C. Words without that letter are returned unchanged.
 */
function fixNuqta(word) {
	const decomposed = '\u{933}\u{93c}';
	return word.split('ऴ').join(decomposed);
}
/** /**
* isValid * isValid
* *
@ -83,13 +97,15 @@ function isValid(word) {
&& !containsInvalidZWJ(word) && !containsInvalidZWJ(word)
&& !containsMultipleNasalizations(word) && !containsMultipleNasalizations(word)
&& !containsMultipleMatraNasalizations(word) && !containsMultipleMatraNasalizations(word)
&& !containsModifierMatra(word)
&& !containsTooManyRepeatedLetters(word) && !containsTooManyRepeatedLetters(word)
&& !containsForeignLetters(word)
} }
function work({ file }) { function work({ file }) {
Array.from(getWordsFromFile(file)).forEach(w => { Array.from(getWordsFromFile(file)).forEach(w => {
if (isValid(w)) print(w); if (isValid(w)) print(fixNuqta(w));
}); });
} }