Farsi language
This commit is contained in:
parent
ffae563b95
commit
2098b1a994
9 changed files with 562357 additions and 2 deletions
17
app/languages/definitions/Farsi.yml
Normal file
17
app/languages/definitions/Farsi.yml
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
locale: fa-IR
|
||||
currency: ﷼
|
||||
dictionaryFile: fa-utf8.csv
|
||||
abcString: أﺏﺕ
|
||||
hasUpperCase: no
|
||||
numerals: [۰,۱,۲,۳,۴,۵,۶,۷,۸,۹]
|
||||
layout:
|
||||
- [SPECIAL] # 0
|
||||
- [ً,PUNCTUATION_FA] # 1
|
||||
- [ب,پ,ت,ث,ة] # 2
|
||||
- [ا,أ,إ,آ,ى,ؤ,ئ,ء] # 3
|
||||
- [س,ش,ص,ض] # 4
|
||||
- [د,ذ,ر,ز,ژ] # 5
|
||||
- [ج,چ,ح,خ] # 6
|
||||
- [ن,ه,و,ي,ی] # 7
|
||||
- [ف,ق,ك,ک,گ,ل,م] # 8
|
||||
- [ط,ظ,ع,غ] # 9
|
||||
562295
app/languages/dictionaries/fa-utf8.csv
Normal file
562295
app/languages/dictionaries/fa-utf8.csv
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -7,10 +7,11 @@ public class LanguageKind {
|
|||
|
||||
/**
 * Tells whether a language uses the Cyrillic script, judged by the characters on key 2.
 *
 * @param language the language to inspect; may be {@code null}
 * @return {@code false} for {@code null}; otherwise whether {@code getKeyCharacters(2)}
 *         contains the lowercase Cyrillic "а" (not the look-alike Latin "a")
 */
public static boolean isCyrillic(Language language) {
    if (language == null) {
        return false;
    }

    return language.getKeyCharacters(2).contains("а");
}
|
||||
/**
 * Tells whether a language is Latin-based, judged by the characters on key 2.
 *
 * @param language the language to inspect; may be {@code null}
 * @return {@code true} only when {@code language} is non-null and
 *         {@code getKeyCharacters(2)} contains the Latin letter "a"
 */
public static boolean isLatinBased(Language language) {
    if (language != null) {
        return language.getKeyCharacters(2).contains("a");
    }

    return false;
}
|
||||
public static boolean isRTL(Language language) { return isArabic(language) || isHebrew(language); }
|
||||
/**
 * Tells whether a language is written right-to-left.
 *
 * @param language the language to inspect; passed as-is to the per-language checks
 * @return {@code true} when {@code isArabic}, {@code isFarsi} or {@code isHebrew}
 *         (evaluated in that order) accepts the language
 */
public static boolean isRTL(Language language) {
    if (isArabic(language) || isFarsi(language)) {
        return true;
    }

    return isHebrew(language);
}
|
||||
|
||||
/**
 * Tells whether the given language is Arabic, identified by its numeric ID.
 *
 * @param language the language to inspect; may be {@code null}
 * @return {@code true} only when {@code language} is non-null and its ID is 502337
 */
public static boolean isArabic(Language language) {
    if (language == null) {
        return false;
    }

    return language.getId() == 502337;
}
|
||||
/**
 * Tells whether the given language is English, identified by its locale.
 *
 * @param language the language to inspect; may be {@code null}
 * @return {@code true} only when {@code language} is non-null and its locale
 *         equals {@link Locale#ENGLISH}
 */
public static boolean isEnglish(Language language) {
    if (language != null) {
        return language.getLocale().equals(Locale.ENGLISH);
    }

    return false;
}
|
||||
/**
 * Tells whether the given language is Farsi, identified by its numeric ID.
 *
 * @param language the language to inspect; may be {@code null}
 * @return {@code true} only when {@code language} is non-null and its ID is 599078
 */
public static boolean isFarsi(Language language) {
    if (language == null) {
        return false;
    }

    return language.getId() == 599078;
}
|
||||
/**
 * Tells whether the given language is French, identified by its numeric ID.
 *
 * @param language the language to inspect; may be {@code null}
 * @return {@code true} only when {@code language} is non-null and its ID is 596550
 */
public static boolean isFrench(Language language) {
    if (language != null) {
        return language.getId() == 596550;
    }

    return false;
}
|
||||
/**
 * Tells whether the given language is Greek, identified by its numeric ID.
 *
 * @param language the language to inspect; may be {@code null}
 * @return {@code true} only when {@code language} is non-null and its ID is 597381
 */
public static boolean isGreek(Language language) {
    if (language == null) {
        return false;
    }

    return language.getId() == 597381;
}
|
||||
/**
 * Tells whether the given language is Gujarati, identified by its numeric ID.
 *
 * @param language the language to inspect; may be {@code null}
 * @return {@code true} only when {@code language} is non-null and its ID is 468647
 */
public static boolean isGujarati(Language language) {
    if (language != null) {
        return language.getId() == 468647;
    }

    return false;
}
|
||||
|
|
|
|||
|
|
@ -86,6 +86,7 @@ public class NaturalLanguage extends Language implements Comparable<NaturalLangu
|
|||
specialChars.put(SPECIAL_CHARS_PLACEHOLDER, Characters.Special);
|
||||
specialChars.put(PUNCTUATION_PLACEHOLDER, Characters.PunctuationEnglish);
|
||||
specialChars.put(PUNCTUATION_PLACEHOLDER + "_AR", Characters.PunctuationArabic);
|
||||
specialChars.put(PUNCTUATION_PLACEHOLDER + "_FA", Characters.PunctuationFarsi);
|
||||
specialChars.put(PUNCTUATION_PLACEHOLDER + "_FR", Characters.PunctuationFrench);
|
||||
specialChars.put(PUNCTUATION_PLACEHOLDER + "_DE", Characters.PunctuationGerman);
|
||||
specialChars.put(PUNCTUATION_PLACEHOLDER + "_GR", Characters.PunctuationGreek);
|
||||
|
|
|
|||
|
|
@ -229,6 +229,7 @@ public class SuggestionsBar {
|
|||
char firstChar = trimmedSuggestion.charAt(0);
|
||||
|
||||
String prefix = Character.isAlphabetic(firstChar) && !Characters.isCombiningPunctuation(firstChar) ? STEM_VARIATION_PREFIX : STEM_PUNCTUATION_VARIATION_PREFIX;
|
||||
prefix = Characters.isFathatan(firstChar) ? " " : prefix; // Fix incorrect display of fathatan without a base character. It is a combining character, but since it is a letter, we must include a base character not to break it, with a "..." prefix
|
||||
suggestions.add(prefix + formatUnreadableSuggestion(trimmedSuggestion));
|
||||
return;
|
||||
}
|
||||
|
|
@ -251,7 +252,6 @@ public class SuggestionsBar {
|
|||
}
|
||||
|
||||
|
||||
|
||||
private void setSuggestionsOnScreen() {
|
||||
if (mView != null) {
|
||||
mSuggestionsAdapter.resetItems(selectedIndex);
|
||||
|
|
|
|||
|
|
@ -48,4 +48,8 @@ public class Characters extends Emoji {
|
|||
public static boolean isCurrency(Language language, String c) {
|
||||
return Currency.contains(c) || (language != null && language.getCurrency().equals(c));
|
||||
}
|
||||
|
||||
public static boolean isFathatan(char ch) {
|
||||
return ch == 0x064B;
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -21,6 +21,10 @@ class Punctuation {
|
|||
',', '-', '\'', ':', ';', '!', '?', '.'
|
||||
));
|
||||
|
||||
final public static ArrayList<Character> CombiningPunctuationFarsi = new ArrayList<>(Arrays.asList(
|
||||
'،', ZWNJ.charAt(0), '-', '\'', ':', '؛', '!', '؟', '.'
|
||||
));
|
||||
|
||||
final private static ArrayList<Character> CombiningPunctuationGujarati = new ArrayList<>(Arrays.asList(
|
||||
'્', '઼', 'ઽ', 'ઃ', '।', '॰', '॥' // Indic combining chars look the same, but have different Unicode values
|
||||
));
|
||||
|
|
@ -41,6 +45,11 @@ class Punctuation {
|
|||
",", ".", "-", "(", ")", "&", "~", "`", ";", ":", "'", "\"", "!", "?"
|
||||
));
|
||||
|
||||
// the same as Arabic + ZWNJ
|
||||
final public static ArrayList<String> PunctuationFarsi = new ArrayList<>(Arrays.asList(
|
||||
"،", ".", "-", ZWNJ, "(", ")", "&", "~", "`", "'", "\"", "؛", ":", "!", "؟"
|
||||
));
|
||||
|
||||
final public static ArrayList<String> PunctuationFrench = new ArrayList<>(Arrays.asList(
|
||||
",", ".", "-", "«", "»", "(", ")", "&", "`", "~", ";", ":", "'", "\"", "!", "?"
|
||||
));
|
||||
|
|
@ -64,6 +73,7 @@ class Punctuation {
|
|||
public static boolean isCombiningPunctuation(Language language, char ch) {
|
||||
return
|
||||
CombiningPunctuation.contains(ch)
|
||||
|| (LanguageKind.isFarsi(language) && CombiningPunctuationFarsi.contains(ch))
|
||||
|| (LanguageKind.isGujarati(language) && CombiningPunctuationGujarati.contains(ch))
|
||||
|| (LanguageKind.isHindi(language) && CombiningPunctuationHindi.contains(ch))
|
||||
|| (LanguageKind.isHebrew(language) && CombiningPunctuationHebrew.contains(ch));
|
||||
|
|
@ -72,6 +82,7 @@ class Punctuation {
|
|||
public static boolean isCombiningPunctuation(char ch) {
|
||||
return
|
||||
CombiningPunctuation.contains(ch)
|
||||
|| CombiningPunctuationFarsi.contains(ch)
|
||||
|| CombiningPunctuationGujarati.contains(ch)
|
||||
|| CombiningPunctuationHindi.contains(ch)
|
||||
|| CombiningPunctuationHebrew.contains(ch);
|
||||
|
|
|
|||
|
|
@ -263,6 +263,11 @@ static def extractAlphabetExtraCharsFromLine(String languageName, String line) {
|
|||
.replace(',', '')
|
||||
.replace(' ', '')
|
||||
|
||||
// add Zero-width non-joiner for some languages
|
||||
if (line.contains("PUNCTUATION_FA") || line.contains("PUNCTUATION_IN")) {
|
||||
allChars += '\u200C'
|
||||
}
|
||||
|
||||
return DEFAULT + allChars
|
||||
}
|
||||
|
||||
|
|
|
|||
21
docs/dictionaries/faWordlistReadme.txt
Normal file
21
docs/dictionaries/faWordlistReadme.txt
Normal file
|
|
@ -0,0 +1,21 @@
|
|||
Farsi wordlist 1 by: shahind
|
||||
Version: 1a9c953f6edaebaff5d434b319e1dd4b700a5fc3 (2022-03-04)
|
||||
Source: https://github.com/shahind/Persian-Words-Database
|
||||
License: Public Domain
|
||||
|
||||
Farsi wordlist 2 by: Semnan University
|
||||
Version: 48bcda68e1b89d6d259e326e2c5985e9e87d5493 (2022-08-15)
|
||||
Source: https://github.com/semnan-university-ai/English-Persian-Word-Database
|
||||
License: (Apache 2.0) https://github.com/semnan-university-ai/English-Persian-Word-Database/blob/master/LICENSE
|
||||
|
||||
Farsi wordlist 3 by: AnySoft Keyboard
|
||||
Version: f7b0b74c043259dc69b037f71f5109974cf15d59 (2020-04-20)
|
||||
Source: https://github.com/AnySoftKeyboard/AnySoftKeyboard
|
||||
License: (Apache 2.0) https://github.com/AnySoftKeyboard/AnySoftKeyboard/blob/main/LICENSE
|
||||
|
||||
All wordlists were manually cleaned of duplicate and invalid words (as far as was practical).
|
||||
|
||||
Word frequencies by: Hermit Dave
|
||||
Version: 525f9b560de45753a5ea01069454e72e9aa541c6 (2022-02-07)
|
||||
Source: https://github.com/hermitdave/FrequencyWords
|
||||
License: (MIT) https://github.com/hermitdave/FrequencyWords/blob/master/LICENSE
|
||||
Loading…
Add table
Add a link
Reference in a new issue