Irish (#756)
This commit is contained in:
parent
7e3d2c0062
commit
91d2476dc6
6 changed files with 180542 additions and 4 deletions
13
app/languages/definitions/Irish.yml
Normal file
13
app/languages/definitions/Irish.yml
Normal file
|
|
@ -0,0 +1,13 @@
|
||||||
|
locale: ga-IE
|
||||||
|
dictionaryFile: ga-utf8.csv
|
||||||
|
layout:
|
||||||
|
- [SPECIAL] # 0
|
||||||
|
- [PUNCTUATION_IE] # 1
|
||||||
|
- [a, b, c, á] # 2
|
||||||
|
- [d, e, f, é] # 3
|
||||||
|
- [g, h, i, í] # 4
|
||||||
|
- [j, k, l] # 5
|
||||||
|
- [m, n, o, ó] # 6
|
||||||
|
- [p, q, r, s] # 7
|
||||||
|
- [t, u, v, ú] # 8
|
||||||
|
- [w, x, y, z] # 9
|
||||||
180507
app/languages/dictionaries/ga-utf8.csv
Normal file
180507
app/languages/dictionaries/ga-utf8.csv
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -90,6 +90,7 @@ public class NaturalLanguage extends Language implements Comparable<NaturalLangu
|
||||||
specialChars.put(PUNCTUATION_PLACEHOLDER + "_FR", Characters.PunctuationFrench);
|
specialChars.put(PUNCTUATION_PLACEHOLDER + "_FR", Characters.PunctuationFrench);
|
||||||
specialChars.put(PUNCTUATION_PLACEHOLDER + "_DE", Characters.PunctuationGerman);
|
specialChars.put(PUNCTUATION_PLACEHOLDER + "_DE", Characters.PunctuationGerman);
|
||||||
specialChars.put(PUNCTUATION_PLACEHOLDER + "_GR", Characters.PunctuationGreek);
|
specialChars.put(PUNCTUATION_PLACEHOLDER + "_GR", Characters.PunctuationGreek);
|
||||||
|
specialChars.put(PUNCTUATION_PLACEHOLDER + "_IE", Characters.PunctuationIrish);
|
||||||
specialChars.put(PUNCTUATION_PLACEHOLDER + "_IN", Characters.PunctuationIndic);
|
specialChars.put(PUNCTUATION_PLACEHOLDER + "_IN", Characters.PunctuationIndic);
|
||||||
specialChars.put(PUNCTUATION_PLACEHOLDER + "_KR", Characters.PunctuationKorean);
|
specialChars.put(PUNCTUATION_PLACEHOLDER + "_KR", Characters.PunctuationKorean);
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -45,10 +45,7 @@ class Punctuation {
|
||||||
",", ".", "-", "(", ")", "&", "~", "`", ";", ":", "'", "\"", "!", "?"
|
",", ".", "-", "(", ")", "&", "~", "`", ";", ":", "'", "\"", "!", "?"
|
||||||
));
|
));
|
||||||
|
|
||||||
// the same as Arabic + ZWNJ
|
final public static ArrayList<String> PunctuationFarsi = insertChar(PunctuationArabic, ZWNJ, "-");
|
||||||
final public static ArrayList<String> PunctuationFarsi = new ArrayList<>(Arrays.asList(
|
|
||||||
"،", ".", "-", ZWNJ, "(", ")", "&", "~", "`", "'", "\"", "؛", ":", "!", "؟"
|
|
||||||
));
|
|
||||||
|
|
||||||
final public static ArrayList<String> PunctuationFrench = new ArrayList<>(Arrays.asList(
|
final public static ArrayList<String> PunctuationFrench = new ArrayList<>(Arrays.asList(
|
||||||
",", ".", "-", "«", "»", "(", ")", "&", "`", "~", ";", ":", "'", "\"", "!", "?"
|
",", ".", "-", "«", "»", "(", ")", "&", "`", "~", ";", ":", "'", "\"", "!", "?"
|
||||||
|
|
@ -62,6 +59,8 @@ class Punctuation {
|
||||||
",", ".", "-", "«", "»", "(", ")", "&", "~", "`", "'", "\"", "·", ":", "!", GR_QUESTION_MARK
|
",", ".", "-", "«", "»", "(", ")", "&", "~", "`", "'", "\"", "·", ":", "!", GR_QUESTION_MARK
|
||||||
));
|
));
|
||||||
|
|
||||||
|
final public static ArrayList<String> PunctuationIrish = insertChar(PunctuationEnglish, "⁊", "&");
|
||||||
|
|
||||||
final public static ArrayList<String> PunctuationIndic = new ArrayList<>(Arrays.asList(
|
final public static ArrayList<String> PunctuationIndic = new ArrayList<>(Arrays.asList(
|
||||||
",", ".", "-", ZWJ, ZWNJ, "(", ")", "।", "॰", "॥", "&", "~", "`", ";", ":", "'", "\"", "!", "?"
|
",", ".", "-", ZWJ, ZWNJ, "(", ")", "।", "॰", "॥", "&", "~", "`", ";", ":", "'", "\"", "!", "?"
|
||||||
));
|
));
|
||||||
|
|
@ -87,4 +86,10 @@ class Punctuation {
|
||||||
|| CombiningPunctuationHindi.contains(ch)
|
|| CombiningPunctuationHindi.contains(ch)
|
||||||
|| CombiningPunctuationHebrew.contains(ch);
|
|| CombiningPunctuationHebrew.contains(ch);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
private static ArrayList<String> insertChar(ArrayList<String> list, String newChar, String afterChar) {
|
||||||
|
ArrayList<String> newList = new ArrayList<>(list);
|
||||||
|
newList.add(list.indexOf(afterChar) + 1, newChar);
|
||||||
|
return newList;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
12
docs/dictionaries/gaWordlistReadme.txt
Normal file
12
docs/dictionaries/gaWordlistReadme.txt
Normal file
|
|
@ -0,0 +1,12 @@
|
||||||
|
Irish word list by: Teanglann.ie
|
||||||
|
Version: March 2025
|
||||||
|
Source: https://www.teanglann.ie, https://github.com/michmech/BuNaMo
|
||||||
|
License: (Open Database License) https://github.com/michmech/BuNaMo/blob/master/LICENCE
|
||||||
|
|
||||||
|
Irish word list and frequencies by: CC-100
|
||||||
|
Version: 2020
|
||||||
|
Source: https://data.statmt.org/cc-100/
|
||||||
|
References (PDF links are available in the source URL):
|
||||||
|
- Unsupervised Cross-lingual Representation Learning at Scale, Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer, Veselin Stoyanov, Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics (ACL), p. 8440-8451, July 2020.
|
||||||
|
- CCNet: Extracting High Quality Monolingual Datasets from Web Crawl Data, Guillaume Wenzek, Marie-Anne Lachaux, Alexis Conneau, Vishrav Chaudhary, Francisco Guzmán, Armand Joulin, Edouard Grave, Proceedings of the 12th Language Resources and Evaluation Conference (LREC), p. 4003-4012, May 2020.
|
||||||
|
Remark: Only the words that appear at least 10 times were used.
|
||||||
BIN
downloads/ga-utf8.zip
Normal file
BIN
downloads/ga-utf8.zip
Normal file
Binary file not shown.
Loading…
Add table
Add a link
Reference in a new issue