1
0
Fork 0

Farsi language

This commit is contained in:
sspanak 2025-02-12 17:53:14 +02:00 committed by Dimo Karaivanov
parent ffae563b95
commit 2098b1a994
9 changed files with 562357 additions and 2 deletions

View file

@ -0,0 +1,17 @@
locale: fa-IR
currency:
dictionaryFile: fa-utf8.csv
abcString: أﺏﺕ
hasUpperCase: no
numerals: [۰,۱,۲,۳,۴,۵,۶,۷,۸,۹]
layout:
- [SPECIAL] # 0
- [ً,PUNCTUATION_FA] # 1
- [ب,پ,ت,ث,ة] # 2
- [ا,أ,إ,آ,ى,ؤ,ئ,ء] # 3
- [س,ش,ص,ض] # 4
- [د,ذ,ر,ز,ژ] # 5
- [ج,چ,ح,خ] # 6
- [ن,ه,و,ي,ی] # 7
- [ف,ق,ك,ک,گ,ل,م] # 8
- [ط,ظ,ع,غ] # 9

File diff suppressed because it is too large Load diff

View file

@ -7,10 +7,11 @@ public class LanguageKind {
public static boolean isCyrillic(Language language) { return language != null && language.getKeyCharacters(2).contains("а"); } public static boolean isCyrillic(Language language) { return language != null && language.getKeyCharacters(2).contains("а"); }
public static boolean isLatinBased(Language language) { return language != null && language.getKeyCharacters(2).contains("a"); } public static boolean isLatinBased(Language language) { return language != null && language.getKeyCharacters(2).contains("a"); }
public static boolean isRTL(Language language) { return isArabic(language) || isHebrew(language); } public static boolean isRTL(Language language) { return isArabic(language) || isFarsi(language) || isHebrew(language); }
public static boolean isArabic(Language language) { return language != null && language.getId() == 502337; } public static boolean isArabic(Language language) { return language != null && language.getId() == 502337; }
public static boolean isEnglish(Language language) { return language != null && language.getLocale().equals(Locale.ENGLISH); } public static boolean isEnglish(Language language) { return language != null && language.getLocale().equals(Locale.ENGLISH); }
public static boolean isFarsi(Language language) { return language != null && language.getId() == 599078; }
public static boolean isFrench(Language language) { return language != null && language.getId() == 596550; } public static boolean isFrench(Language language) { return language != null && language.getId() == 596550; }
public static boolean isGreek(Language language) { return language != null && language.getId() == 597381; } public static boolean isGreek(Language language) { return language != null && language.getId() == 597381; }
public static boolean isGujarati(Language language) { return language != null && language.getId() == 468647; } public static boolean isGujarati(Language language) { return language != null && language.getId() == 468647; }

View file

@ -86,6 +86,7 @@ public class NaturalLanguage extends Language implements Comparable<NaturalLangu
specialChars.put(SPECIAL_CHARS_PLACEHOLDER, Characters.Special); specialChars.put(SPECIAL_CHARS_PLACEHOLDER, Characters.Special);
specialChars.put(PUNCTUATION_PLACEHOLDER, Characters.PunctuationEnglish); specialChars.put(PUNCTUATION_PLACEHOLDER, Characters.PunctuationEnglish);
specialChars.put(PUNCTUATION_PLACEHOLDER + "_AR", Characters.PunctuationArabic); specialChars.put(PUNCTUATION_PLACEHOLDER + "_AR", Characters.PunctuationArabic);
specialChars.put(PUNCTUATION_PLACEHOLDER + "_FA", Characters.PunctuationFarsi);
specialChars.put(PUNCTUATION_PLACEHOLDER + "_FR", Characters.PunctuationFrench); specialChars.put(PUNCTUATION_PLACEHOLDER + "_FR", Characters.PunctuationFrench);
specialChars.put(PUNCTUATION_PLACEHOLDER + "_DE", Characters.PunctuationGerman); specialChars.put(PUNCTUATION_PLACEHOLDER + "_DE", Characters.PunctuationGerman);
specialChars.put(PUNCTUATION_PLACEHOLDER + "_GR", Characters.PunctuationGreek); specialChars.put(PUNCTUATION_PLACEHOLDER + "_GR", Characters.PunctuationGreek);

View file

@ -229,6 +229,7 @@ public class SuggestionsBar {
char firstChar = trimmedSuggestion.charAt(0); char firstChar = trimmedSuggestion.charAt(0);
String prefix = Character.isAlphabetic(firstChar) && !Characters.isCombiningPunctuation(firstChar) ? STEM_VARIATION_PREFIX : STEM_PUNCTUATION_VARIATION_PREFIX; String prefix = Character.isAlphabetic(firstChar) && !Characters.isCombiningPunctuation(firstChar) ? STEM_VARIATION_PREFIX : STEM_PUNCTUATION_VARIATION_PREFIX;
prefix = Characters.isFathatan(firstChar) ? " " : prefix; // Fix the incorrect display of a fathatan without a base character. It is a combining character, but since it is also a letter, we must give it a base character (a space) instead of the "..." prefix, which would break it.
suggestions.add(prefix + formatUnreadableSuggestion(trimmedSuggestion)); suggestions.add(prefix + formatUnreadableSuggestion(trimmedSuggestion));
return; return;
} }
@ -251,7 +252,6 @@ public class SuggestionsBar {
} }
private void setSuggestionsOnScreen() { private void setSuggestionsOnScreen() {
if (mView != null) { if (mView != null) {
mSuggestionsAdapter.resetItems(selectedIndex); mSuggestionsAdapter.resetItems(selectedIndex);

View file

@ -48,4 +48,8 @@ public class Characters extends Emoji {
public static boolean isCurrency(Language language, String c) { public static boolean isCurrency(Language language, String c) {
return Currency.contains(c) || (language != null && language.getCurrency().equals(c)); return Currency.contains(c) || (language != null && language.getCurrency().equals(c));
} }
/**
 * Checks whether the given character is the Arabic fathatan (U+064B).
 *
 * @param ch the character to test
 * @return true only when {@code ch} is U+064B
 */
public static boolean isFathatan(char ch) {
return ch == 0x064B;
}
} }

View file

@ -21,6 +21,10 @@ class Punctuation {
',', '-', '\'', ':', ';', '!', '?', '.' ',', '-', '\'', ':', ';', '!', '?', '.'
)); ));
final public static ArrayList<Character> CombiningPunctuationFarsi = new ArrayList<>(Arrays.asList(
'،', ZWNJ.charAt(0), '-', '\'', ':', '؛', '!', '؟', '.'
));
final private static ArrayList<Character> CombiningPunctuationGujarati = new ArrayList<>(Arrays.asList( final private static ArrayList<Character> CombiningPunctuationGujarati = new ArrayList<>(Arrays.asList(
'્', '઼', 'ઽ', '', '।', '॰', '॥' // Indic combining chars look the same, but have different Unicode values '્', '઼', 'ઽ', '', '।', '॰', '॥' // Indic combining chars look the same, but have different Unicode values
)); ));
@ -41,6 +45,11 @@ class Punctuation {
",", ".", "-", "(", ")", "&", "~", "`", ";", ":", "'", "\"", "!", "?" ",", ".", "-", "(", ")", "&", "~", "`", ";", ":", "'", "\"", "!", "?"
)); ));
// the same as Arabic + ZWNJ
final public static ArrayList<String> PunctuationFarsi = new ArrayList<>(Arrays.asList(
"،", ".", "-", ZWNJ, "(", ")", "&", "~", "`", "'", "\"", "؛", ":", "!", "؟"
));
final public static ArrayList<String> PunctuationFrench = new ArrayList<>(Arrays.asList( final public static ArrayList<String> PunctuationFrench = new ArrayList<>(Arrays.asList(
",", ".", "-", "«", "»", "(", ")", "&", "`", "~", ";", ":", "'", "\"", "!", "?" ",", ".", "-", "«", "»", "(", ")", "&", "`", "~", ";", ":", "'", "\"", "!", "?"
)); ));
@ -64,6 +73,7 @@ class Punctuation {
public static boolean isCombiningPunctuation(Language language, char ch) { public static boolean isCombiningPunctuation(Language language, char ch) {
return return
CombiningPunctuation.contains(ch) CombiningPunctuation.contains(ch)
|| (LanguageKind.isFarsi(language) && CombiningPunctuationFarsi.contains(ch))
|| (LanguageKind.isGujarati(language) && CombiningPunctuationGujarati.contains(ch)) || (LanguageKind.isGujarati(language) && CombiningPunctuationGujarati.contains(ch))
|| (LanguageKind.isHindi(language) && CombiningPunctuationHindi.contains(ch)) || (LanguageKind.isHindi(language) && CombiningPunctuationHindi.contains(ch))
|| (LanguageKind.isHebrew(language) && CombiningPunctuationHebrew.contains(ch)); || (LanguageKind.isHebrew(language) && CombiningPunctuationHebrew.contains(ch));
@ -72,6 +82,7 @@ class Punctuation {
public static boolean isCombiningPunctuation(char ch) { public static boolean isCombiningPunctuation(char ch) {
return return
CombiningPunctuation.contains(ch) CombiningPunctuation.contains(ch)
|| CombiningPunctuationFarsi.contains(ch)
|| CombiningPunctuationGujarati.contains(ch) || CombiningPunctuationGujarati.contains(ch)
|| CombiningPunctuationHindi.contains(ch) || CombiningPunctuationHindi.contains(ch)
|| CombiningPunctuationHebrew.contains(ch); || CombiningPunctuationHebrew.contains(ch);

View file

@ -263,6 +263,11 @@ static def extractAlphabetExtraCharsFromLine(String languageName, String line) {
.replace(',', '') .replace(',', '')
.replace(' ', '') .replace(' ', '')
// add Zero-width non-joiner for some languages
if (line.contains("PUNCTUATION_FA") || line.contains("PUNCTUATION_IN")) {
allChars += '\u200C'
}
return DEFAULT + allChars return DEFAULT + allChars
} }

View file

@ -0,0 +1,21 @@
Farsi wordlist 1 by: shahind
Version: 1a9c953f6edaebaff5d434b319e1dd4b700a5fc3 (2022-03-04)
Source: https://github.com/shahind/Persian-Words-Database
License: Public Domain
Farsi wordlist 2 by: Semnan University
Version: 48bcda68e1b89d6d259e326e2c5985e9e87d5493 (2022-08-15)
Source: https://github.com/semnan-university-ai/English-Persian-Word-Database
License: (Apache 2.0) https://github.com/semnan-university-ai/English-Persian-Word-Database/blob/master/LICENSE
Farsi wordlist 3 by: AnySoft Keyboard
Version: f7b0b74c043259dc69b037f71f5109974cf15d59 (2020-04-20)
Source: https://github.com/AnySoftKeyboard/AnySoftKeyboard
License: (Apache 2.0) https://github.com/AnySoftKeyboard/AnySoftKeyboard/blob/main/LICENSE
All wordlists were manually cleaned of duplicate and invalid words (as much as possible).
Word frequencies by: Hermit Dave
Version: 525f9b560de45753a5ea01069454e72e9aa541c6 (2022-02-07)
Source: https://github.com/hermitdave/FrequencyWords
License: (MIT) https://github.com/hermitdave/FrequencyWords/blob/master/LICENSE