Dictionaries update (#80)

* added missing words to the Bulgarian dictionary * English dictionary update * removed repeating words from the Italian and Bulgarian dictionaries * fixed incorrectly broken words and removed repeating ones from the Ukrainian dictionary * Russian dictionary update * documentation update * made it possible to type words with apostrophes (Dutch, English and Ukrainian)
2022-10-24 13:32:31 +03:00 · 2022-10-24 13:32:31 +03:00 · 8b67929a07
commit 8b67929a07
parent 6c19edc8a3
23 changed files with 187613 additions and 57933 deletions
--- a/README.md
+++ b/README.md
@ -50,7 +50,9 @@ To support a new language one needs to:
    - The text must be white and the background must be transparent as per the [official Android guide](https://android-doc.github.io/guide/practices/ui_guidelines/icon_design_status_bar.html).
    - To simplify the process, you could use Android Studio. It has a built-in icon generator accessible by right-clicking on "drawable" folder -> New -> Image Asset. Then choose "Icon Type": "Notification Icons", "Asset Type": Text, "Trim": No, "Padding": 0%.
 - Find a suitable dictionary and add it to `assets` folder.
- Create a new language class in `languages/definitions/`. Make sure to set all properties. The ID must be the next available one. Currently, the range is limited between 1 and 31, so there can be 31 languages in total.
+- Create a new language class in `languages/definitions/`. Make sure to set all properties.
+  - `ID` must be the next available number. Currently, the range is limited between 1 and 31, so there can be 31 languages in total.
+  - Set `isPunctuationPartOfWords` to `true`, if you need to use the 1-key for typing words, such as: `it's`, `a'tje` or `п'ят`. Otherwise, it would not be possible to type them, nor will they appear as suggestions. `false` is recommended when apostrophes or other punctuation are not part of the words, to allow faster typing.
 - Add the new language to the list in `LanguageCollection.java`. You only need to add it in one place, in the constructor. Please, be nice and maintain the alphabetical order.
 - Add a new entry in `res/values/const.xml`. Make sure the new ID matches the one in the language class.
 - Add new entries in `res/values/arrays.xml`.
--- a/assets/bg-utf8.txt
+++ b/assets/bg-utf8.txt
@ -44,6 +44,7 @@
 ок
 ос
 от
+оф
 ох
 па
 пи
@ -84,8 +85,8 @@
 яз
 ял
 ям
-де
 аба
+абе
 аби
 абу
 ага
@ -97,6 +98,7 @@
 але
 ало
 алт
+алф
 ама
 ами
 ана
@ -440,6 +442,7 @@
 мри
 мря
 мсе
+мхм
 мъж
 мър
 мъх
@ -650,8 +653,9 @@
 тих
 тиф
 тия
-ток
+тоз
 той
+ток
 том
 тон
 топ
@ -1005,6 +1009,7 @@
 блей
 блея
 блок
+блус
 блъф
 блян
 боаз
@ -2291,6 +2296,7 @@
 маят
 мвае
 мваи
+мега
 меди
 мезе
 мека
@ -3132,6 +3138,8 @@
 сена
 сено
 сент
+сера
+сере
 сери
 серт
 сета
@ -3662,6 +3670,7 @@
 хари
 харч
 хасе
+хаха
 хвощ
 хека
 херц
@ -4027,7 +4036,6 @@
 ячуа
 ящен
 току-що
-ли
 току-виж
 току-тъй
 горе-долу
@ -6167,6 +6175,7 @@
 дюйма
 дюкян
 дюлев
+дюнер
 дюшек
 дявол
 дявам
@ -8301,6 +8310,7 @@
 метра
 метро
 метри
+метъл
 метър
 метял
 мехме
@ -10336,6 +10346,8 @@
 първа
 първи
 първо
+пърди
+пърдя
 пържа
 пържи
 пърли
@ -10993,7 +11005,10 @@
 сепия
 сепна
 сепне
+серат
+серем
 серен
+сереш
 серив
 серии
 серия
@ -12095,6 +12110,7 @@
 тупне
 тупти
 туптя
+турбо
 турен
 турел
 турим
@ -17334,6 +17350,8 @@
 дюлите
 дюлята
 дюната
+дюнера
+дюнери
 дюните
 дюшеме
 дюшека
@ -18178,6 +18196,7 @@
 зарека
 зареша
 зарече
+зариби
 зарибя
 зарива
 зариеш
@ -18382,6 +18401,7 @@
 звънът
 звънял
 звънят
+звънях
 звярът
 здание
 здания
@ -23138,6 +23158,7 @@
 обточа
 обтяга
 обувай
+обувал
 обуват
 обувка
 обувам
@ -25889,6 +25910,7 @@
 пруски
 пруско
 пръдла
+пръдна
 пръдни
 пръдня
 пръжка
@ -26055,6 +26077,9 @@
 първак
 първия
 пъргав
+пърдим
+пърдиш
+пърдят
 пържат
 пържел
 пържен
@ -27158,6 +27183,7 @@
 сергей
 сергии
 сергия
+серете
 сериен
 серист
 серите
@ -27738,6 +27764,7 @@
 смукач
 смукна
 смутен
+смутих
 смутни
 смутня
 смучат
@ -29941,6 +29968,7 @@
 усещах
 усилен
 усилие
+усилих
 усилва
 усещаш
 усилил
@ -36767,6 +36795,7 @@
 дюдюкам
 дюкянче
 дюкянът
+дюнерът
 дюлгери
 дяволит
 дяволии
@ -38642,6 +38671,7 @@
 звънчев
 звъняла
 звъняло
+звъняха
 звъняща
 звънящи
 здравей
@ -44209,6 +44239,8 @@
 накацат
 наквася
 накваси
+накефен
+накефил
 накисна
 накипря
 накисва
@ -50893,6 +50925,7 @@
 присвия
 присвои
 присвоя
+присети
 присипя
 присипи
 прислон
@ -51275,6 +51308,7 @@
 прусаци
 пруския
 пръдльо
+пръднах
 пръкват
 пръсвам
 пръскам
@ -51505,6 +51539,7 @@
 пъргава
 пъргави
 пъргаво
+пърдите
 пържели
 пържена
 пържене
@ -54103,6 +54138,7 @@
 смутено
 смутили
 смутило
+смутиха
 смутове
 смучене
 смучещи
@ -55645,6 +55681,7 @@
 сюртука
 сюрприз
 сядайки
+сядайте
 сяклата
 сяклото
 сякохте
@ -57216,6 +57253,7 @@
 усилила
 усилили
 усилило
+усилиха
 усилния
 ускорен
 ускорим
@ -65170,6 +65208,7 @@
 дюлгеров
 дюлевото
 дюлевата
+дюнерите
 дюшемето
 дюшеклък
 дюшеците
@ -67829,6 +67868,8 @@
 звънчеви
 звънчето
 звънчета
+звъняхме
+звъняхте
 зданието
 зданията
 здравата
@ -74384,6 +74425,12 @@
 накацане
 наквасям
 накачуля
+накефена
+накефени
+накефено
+накефила
+накефило
+накефили
 накисвам
 накипели
 накипрям
@ -79725,6 +79772,7 @@
 петролно
 петролът
 петромир
+петрохан
 петрунов
 петрушев
 петстаен
@ -84110,6 +84158,8 @@
 присвоят
 присегна
 приседна
+присетил
+присетих
 прислони
 прислоня
 прислуга
@ -84897,6 +84947,7 @@
 пруските
 пруският
 пруското
+пръднаха
 пръднята
 пръжките
 пръкване
@ -88121,6 +88172,8 @@
 смукване
 смутения
 смутител
+смутихме
+смутихте
 смутната
 смутните
 смутното
@ -91605,6 +91658,8 @@
 усилващо
 усиления
 усилието
+усилихме
+усилихте
 усилията
 усилната
 усилните
@ -99610,6 +99665,7 @@
 еднородно
 едноръкия
 едноселец
+еднослоен
 едностаен
 еднотипен
 еднотипна
@ -101711,6 +101767,7 @@
 зарибявам
 зарибяван
 зарибяват
+зарибяващ
 заридавам
 заричания
 заробване
@ -109152,6 +109209,8 @@
 наквасвам
 наквасено
 наквасяне
+накефения
+накефилия
 накипряне
 накипявам
 накисване
@ -120438,6 +120497,10 @@
 присвоили
 присвоиха
 присвоява
+присетила
+присетили
+присетило
+присетиха
 присипвам
 присламча
 присламчи
@ -121514,6 +121577,8 @@
 пружинира
 пружините
 прусаците
+пръднахме
+пръднахте
 пръждосам
 пръскалка
 пръскания
@ -124593,7 +124658,9 @@
 сканиращи
 сканиращо
 скапалата
+скапаната
 скапаните
+скапаният
 скапаното
 скапулите
 скараните
@ -134829,6 +134896,7 @@
 доносничка
 дооглаждам
 дооздравея
+дооправяне
 дооформена
 дооформени
 дооформяне
@ -135391,6 +135459,9 @@
 еднорелсов
 еднородния
 едносложен
+еднослойна
+еднослойни
+еднослойно
 едносменен
 едносменно
 едносричен
@ -137062,6 +137133,10 @@
 зарежещото
 зарибяване
 зарибявани
+зарибяваме
+зарибявате
+зарибяваща
+зарибяващи
 зарибяващо
 заридаване
 заробените
@ -143285,6 +143360,14 @@
 накачилите
 накачулвам
 наквасване
+накефената
+накефените
+накефеният
+накефеното
+накефилата
+накефилите
+накефилият
+накефилото
 накипяване
 накирливям
 накичената
@ -153937,6 +154020,8 @@
 присвояващ
 приседнала
 приседнали
+присетихме
+присетихте
 присипване
 прискърбен
 прискърбие
@ -166423,6 +166508,7 @@
 дообработва
 дообяснения
 дообяснявам
+дооправяйки
 дооформявам
 дооценяване
 допечатване
@ -166833,6 +166919,7 @@
 едносеменен
 едносеменна
 едносеменно
+еднослойния
 едносмислен
 едносричния
 едностайния
@ -168236,6 +168323,7 @@
 зарежданите
 зареждащата
 зареждащото
+зарибяващия
 зарзаватчия
 заробването
 зародишната
@ -191054,6 +191142,7 @@
 дообясняваме
 дообясняване
 дооздравявам
+дооправянето
 допирателния
 допитванията
 допринасяйки
@ -191368,6 +191457,10 @@
 едноседмична
 едноседмични
 едноседмично
+еднослойната
+еднослойните
+еднослойният
+еднослойното
 едносмислено
 едносричните
 едностайната
@ -192104,6 +192197,10 @@
 зарежданията
 зарибяването
 зарибяваните
+зарибяващата
+зарибяващите
+зарибяващият
+зарибяващото
 заробителите
 зарозовяване
 заруменяване
--- a/assets/en-utf8.txt
+++ b/assets/en-utf8.txt
--- a/assets/it-utf8.txt
+++ b/assets/it-utf8.txt
@ -4670,7 +4670,6 @@ acutezza
 acuti
 acutissimo
 acuto
-ad
 adagerai
 adageranno
 adagerebbe
@ -7568,7 +7567,6 @@ aguzzò
 agì
 ahi
 ahimè
-ai
 aitante
 aitanti
 aiuola
@ -7634,7 +7632,6 @@ aizzasse
 aizzata
 aizzatori
 aizzava
-al
 ala
 alabardieri
 alabastro
@ -16070,9 +16067,7 @@ beffarlo
 beffato
 beffatore
 beffe
-beh
 bei
-bel
 belano
 belare
 belati
@ -16100,7 +16095,6 @@ beltà
 belva
 belve
 bemolle
-ben
 benché
 benda
 bendai
@ -16467,7 +16461,6 @@ birra
 birre
 birreria
 birrerie
-bis
 bisacce
 bisaccia
 bisava
@ -16626,7 +16619,6 @@ blocchiate
 blocchino
 blocco
 bloccò
-blu
 bobina
 bocca
 boccacce
@ -17631,7 +17623,6 @@ buttino
 butto
 buttò
 buzzurri
-c
 cabala
 cabalistiche
 cabina
@ -19331,7 +19322,6 @@ cavò
 cazzata
 cazzo
 cazzotti
-ce
 cecino
 cecità
 ceco
@ -20152,7 +20142,6 @@ chiuso
 chiusura
 chiusure
 ché
-ci
 ciabattino
 cialda
 cialde
@ -20800,7 +20789,6 @@ coinvolgimento
 coinvolgono
 coinvolse
 coinvolti
-col
 cola
 colai
 colammo
@ -21222,7 +21210,6 @@ coltura
 colui
 colà
 colò
-com
 comanda
 comandai
 comandamenti
@ -22413,7 +22400,6 @@ comunisti
 comunitari
 comunità
 comunque
-con
 conca
 concatena
 concatenai
@ -25871,7 +25857,6 @@ corto
 corvaccio
 corvi
 corvo
-cos
 cosa
 cosce
 coscia
@ -27181,8 +27166,6 @@ custodivi
 custodivo
 custodì
 cute
-d
-da
 dabbene
 daccapo
 dacceli
@ -27196,7 +27179,6 @@ daglielo
 dai
 daini
 daino
-dal
 dall
 dalla
 dalle
@ -27211,7 +27193,6 @@ dammele
 dammelo
 dammene
 dammi
-dan
 danaro
 danarose
 dando
@ -27373,7 +27354,6 @@ dappocaggine
 dappoco
 dappresso
 dapprima
-dar
 darai
 daranno
 darcelo
@ -28319,13 +28299,11 @@ degradiate
 degradino
 degrado
 degradò
-deh
 dei
 deiezione
 deificare
 deificati
 deità
-del
 delato
 delatore
 delatori
@ -29838,7 +29816,6 @@ deturpiate
 deturpino
 deturpo
 deturpò
-dev
 devasta
 devastai
 devastammo
@ -29906,7 +29883,6 @@ devote
 devoti
 devoto
 devozione
-di
 dia
 diabete
 diabolica
@ -31280,7 +31256,6 @@ diplomazie
 diplomi
 dipolo
 diporto
-dir
 dirada
 diradai
 diradammo
@ -33482,7 +33457,6 @@ divulgò
 dizionari
 dizionario
 dizione
-do
 dobbiamo
 docce
 doccia
@ -33699,7 +33673,6 @@ domino
 dominò
 domo
 domò
-don
 dona
 donaci
 donai
@ -34012,7 +33985,6 @@ dottrinale
 dottrinalmente
 dottrine
 dotò
-dov
 dove
 dovendo
 dovendosi
@ -34500,7 +34472,6 @@ economizza
 economizzi
 economizzo
 ecumenica
-ed
 edera
 edere
 edicola
@ -34744,10 +34715,8 @@ eguagliò
 eguale
 eguali
 egualmente
-eh
 ehi
 ehm
-ei
 elabora
 elaborai
 elaborammo
@ -37304,7 +37273,6 @@ evolverà
 evviva
 extraterrestri
 eziandio
-fa
 fabbri
 fabbrica
 fabbricai
@ -37585,7 +37553,6 @@ famosissima
 famosissimi
 famosissimo
 famoso
-fan
 fanale
 fanali
 fanatica
@ -37629,7 +37596,6 @@ fantine
 fantini
 fantino
 fantocciata
-far
 fara
 farabutti
 farai
@ -38564,7 +38530,6 @@ filtri
 filtro
 filza
 filò
-fin
 fina
 finale
 finali
@ -39553,7 +39518,6 @@ fotografo
 fotogramma
 fotolitografica
 fottuto
-fra
 fracassa
 fracassai
 fracassammo
@ -40228,7 +40192,6 @@ fruttiere
 fruttino
 frutto
 fruttò
-fu
 fucila
 fucilai
 fucilammo
@ -40787,7 +40750,6 @@ garze
 garzoncello
 garzone
 garzoni
-gas
 gasati
 gassose
 gatta
@ -41679,7 +41641,6 @@ glaciali
 gladiatore
 gladiatori
 gleba
-gli
 gliceridi
 glicerina
 gliel
@ -42506,7 +42467,6 @@ grossolano
 grotta
 grotte
 groviera
-gru
 grucce
 gruccia
 grugnire
@ -42951,14 +42911,11 @@ gustose
 gustosi
 gustoso
 gustò
-ha
 hacker
 hai
 hamiltoniana
-han
 hanno
 hardware
-ho
 iattura
 ibrida
 ibridi
@ -43173,7 +43130,6 @@ ignuda
 ignude
 ignudi
 igroscopica
-il
 ilari
 ilarità
 illanguidire
@ -45026,7 +44982,6 @@ imputino
 imputo
 imputridiscono
 imputò
-in
 inabile
 inabili
 inabilità
@ -50363,7 +50318,6 @@ inzuppiate
 inzuppino
 inzuppo
 inzuppò
-io
 iodio
 ione
 ionizzato
@ -50911,8 +50865,6 @@ itinerario
 itterizia
 ivi
 kg
-l
-la
 labbra
 labbreggiava
 labbro
@ -51612,7 +51564,6 @@ lavorò
 lavò
 lazzaretto
 lazzi
-le
 leale
 leali
 lealtà
@ -52101,7 +52052,6 @@ lezioni
 leziosaggine
 lezioso
 lezzo
-li
 liana
 libazioni
 libbra
@ -52606,7 +52556,6 @@ livornese
 livornesi
 livrea
 livree
-lo
 lobbia
 locale
 locali
@ -53041,8 +52990,6 @@ luttuose
 luttuosissima
 là
 lì
-m
-ma
 maccherone
 maccheroni
 macchia
@ -53222,7 +53169,6 @@ mais
 maiuscola
 maiuscole
 maiuscolo
-mal
 mala
 malagevole
 malagevoli
@ -54270,7 +54216,6 @@ mazzetti
 mazzetto
 mazzi
 mazzo
-me
 meccanica
 meccanicamente
 meccaniche
@ -54437,7 +54382,6 @@ memorizziate
 memorizzino
 memorizzo
 memorizzò
-men
 mena
 menadito
 menai
@ -54909,7 +54853,6 @@ mezzo
 mezzodì
 mezzogiorno
 mezzosangue
-mi
 mia
 miagola
 miagolai
@ -56528,7 +56471,6 @@ mutualmente
 mutuamente
 mutui
 mutò
-n
 nacque
 nacquero
 nacqui
@ -56821,7 +56763,6 @@ nazionalizzate
 nazionalizzazione
 nazione
 nazioni
-ne
 neanchio
 neanche
 nebbia
@ -56959,7 +56900,6 @@ negri
 negro
 negò
 nei
-nel
 nell
 nella
 nelle
@ -57089,7 +57029,6 @@ nipote
 nipoti
 nitrati
 nitrato
-no
 nobile
 nobili
 nobiliare
@ -57264,7 +57203,6 @@ nominiate
 nominino
 nomino
 nominò
-non
 nona
 nonché
 noncurante
@ -58053,7 +57991,6 @@ oculati
 oculato
 oculista
 oculisti
-od
 ode
 odi
 odia
@ -60412,7 +60349,6 @@ pappagalli
 pappagallo
 pappagorgia
 pappe
-par
 para
 parabola
 parabole
@ -61853,7 +61789,6 @@ penò
 pepe
 peperone
 peperoni
-per
 pera
 peraltro
 perbacco
@ -62811,7 +62746,6 @@ pezzettini
 pezzetto
 pezzi
 pezzo
-pi
 pia
 piaccia
 piacciamo
@ -66165,7 +66099,6 @@ privilegiati
 privilegiato
 privilegio
 privo
-pro
 probabile
 probabili
 probabilistici
@ -68489,7 +68422,6 @@ pupulliate
 pupullino
 pupullo
 pupullò
-pur
 pura
 puramente
 purché
@ -70851,7 +70783,6 @@ razzoliate
 razzolino
 razzolo
 razzolò
-re
 rea
 reagente
 reagenti
@ -78790,8 +78721,6 @@ ruzzoliate
 ruzzolino
 ruzzolo
 ruzzolò
-s
-sa
 sabatico
 sabato
 sabbia
@ -79322,7 +79251,6 @@ salvo
 salvò
 salì
 salò
-san
 sana
 sanai
 sanammo
@ -84367,7 +84295,6 @@ sdrucciolare
 sdrucciolava
 sdrucciolò
 sdrucito
-se
 sebbene
 secca
 seccaggine
@ -87379,7 +87306,6 @@ sguinzaglino
 sguinzaglio
 sguinzagliò
 sgusciavano
-si
 sia
 siamo
 siano
@ -87806,7 +87732,6 @@ simultanee
 simultanei
 simultaneo
 simulò
-sin
 sinagoga
 sinagoghe
 sincera
@ -89128,7 +89053,6 @@ snodiate
 snodino
 snodo
 snodò
-so
 soave
 soavemente
 soavi
@ -89601,7 +89525,6 @@ sognino
 sogno
 sognò
 soia
-sol
 sola
 solai
 solaio
@ -89980,7 +89903,6 @@ sommo
 sommossa
 sommosse
 sommò
-son
 sonagli
 sonaglio
 sonanti
@ -91031,7 +90953,6 @@ sotto
 sottoargomenti
 sottocapitoli
 sottocapitolo
-sottocchio
 sottocoppa
 sottocosto
 sottocutanei
@ -97608,11 +97529,9 @@ stuzzichiate
 stuzzichino
 stuzzico
 stuzzicò
-su
 sua
 suadente
 suadenti
-sub
 subacquea
 subacquee
 subacquei
@ -97832,7 +97751,6 @@ succulenti
 succulento
 succursale
 succursali
-sud
 suda
 sudai
 sudammo
@ -98116,7 +98034,6 @@ suicidio
 suindicato
 suini
 suino
-sul
 sulfurea
 sulfuree
 sulfurei
@ -99634,7 +99551,6 @@ svuoto
 svuotò
 sé
 sì
-t
 tabaccai
 tabaccaio
 tabacchi
@ -99830,7 +99746,6 @@ tagliuzzino
 tagliuzzo
 tagliuzzò
 tagliò
-tal
 talamo
 talari
 talché
@ -100395,7 +100310,6 @@ tavolozza
 tavolozze
 tazza
 tazze
-te
 teatrale
 teatrali
 teatri
@ -101470,7 +101384,6 @@ tetto
 tettoia
 tettoie
 tettuccio
-ti
 tibia
 tibie
 ticchettii
@ -102532,7 +102445,6 @@ tozza
 tozze
 tozzi
 tozzo
-tra
 traballa
 traballai
 traballammo
@ -104841,7 +104753,6 @@ travolti
 travolto
 trazione
 trazioni
-tre
 trebbi
 trebbia
 trebbiai
@ -105797,7 +105708,6 @@ truffo
 truffò
 truppa
 truppe
-tu
 tua
 tuba
 tubai
@ -106199,7 +106109,6 @@ tuttavia
 tutte
 tutti
 tutto
-tuttora
 tè
 ubbidiente
 ubbidienti
@ -106630,7 +106539,6 @@ umori
 umorismi
 umorismo
 umoristico
-un
 una
 unanime
 unanimi
@ -107251,8 +107159,6 @@ uve
 uxoricida
 uxoricidi
 uxoricidio
-v
-va
 vacante
 vacanti
 vacanza
@ -107535,7 +107441,6 @@ vagò
 vai
 vaioli
 vaiolo
-val
 valanga
 valanghe
 vale
@ -107545,7 +107450,6 @@ valente
 valenti
 valentissimo
 valentuomini
-valentuomo
 valenza
 valere
 valersi
@ -107696,7 +107600,6 @@ vammi
 vampa
 vampiri
 vampiro
-van
 vana
 vanagloria
 vanamente
@ -107997,7 +107900,6 @@ vaticano
 vaticinare
 vattene
 vatti
-ve
 vecchi
 vecchia
 vecchiacci
@ -108931,7 +108833,6 @@ vezzosa
 vezzose
 vezzosi
 vezzoso
-vi
 via
 viadotti
 viadotto
@ -110178,7 +110079,6 @@ xenofobia
 xilofono
 zabaione
 zabaioni
-zac
 zacchere
 zaffata
 zaffate
@ -110271,7 +110171,6 @@ zero
 zeta
 zia
 zibellino
-zic
 zie
 zigomo
 zii
--- a/assets/ru-utf8.txt
+++ b/assets/ru-utf8.txt
--- a/assets/uk-utf8.txt
+++ b/assets/uk-utf8.txt
--- a/docs/bgWordlistReadme.txt
+++ b/docs/bgWordlistReadme.txt
@ -1,5 +1,6 @@
 Bulgarian wordlist by: Miglen Georgiev
 Version: f46eff1 (2022-04-26)
-Words Count: 234114
 Source: https://github.com/miglen/bulgarian-wordlists/blob/master/wordlists/bg-words-validated-cyrillic.txt
 License: https://github.com/miglen/bulgarian-wordlists/blob/master/LICENSE
+
+Additionally cleaned up repeating words and added some missing ones.
--- a/docs/enWordlistReadme.txt
+++ b/docs/enWordlistReadme.txt
@ -1,15 +1,33 @@
-// Source for English dictionary: http://wordlist.sourceforge.net/
+Custom wordlist generated from http://app.aspell.net/create using SCOWL
+
+with parameters (words with 2-3 letters):
+  diacritic: strip
+  max_size: 50
+  max_variant: 0
+  special: <none>
+  spelling: US
+
+with parameters (words with 4 or more letters):
+  diacritic: strip
+  max_size: 70
+  max_variant: 2
+  special: hacker
+  spelling: US GBz
+
+Using Git Commit From: Mon Dec 7 20:14:35 2020 -0500 [5ef55f9]
+
+=====

 Spell Checking Oriented Word Lists (SCOWL)
-Revision 7.1 (SVN Revision 161)
-January 6, 2011
+
+Mon Dec 7 20:14:35 2020 -0500 [5ef55f9]
 by Kevin Atkinson (kevina@gnu.org)

 The SCOWL is a collection of word lists split up in various sizes, and
 other categories, intended to be suitable for use in spell checkers.
 However, I am sure it will have numerous other uses as well.

-The latest version can be found at http://wordlist.sourceforge.net/.
+The latest version can be found at http://wordlist.aspell.net/.

 The directory final/ contains the actual word lists broken up into
 various sizes and categories.  The r/ directory contains Readmes from
@ -29,10 +47,11 @@ Except for the special word lists the files follow the following
 naming convention:
  <spelling category>-<sub-category>.<size>
 Where the spelling category is one of
-  english, american, british, british_z, canadian, 
-  variant_0, varaint_1, variant_2,
-  british_variant_0, british_variant_1,
-  canadian_variant_0, canadian_variant_1,
+  english, american, british, british_z, canadian, australian
+  variant_1, variant_2, variant_3,
+  british_variant_1, british_variant_2,
+  canadian_variant_1, canadian_variant_2,
+  australian_variant_1, australian_variant_2
 Sub-category is one of
  abbreviations, contractions, proper-names, upper, words
 And size is one of
@ -44,131 +63,273 @@ Where description is one of:
  roman-numerals, hacker

 The perl script "mk-list" can be used to create a word list of the
-desired size, it usage is:
+desired size, its usage is:
  ./mk-list [-f] [-v#] <spelling categories> <size>
 where <spelling categories> is one of the above spelling categories
 (the english and special categories are automatically included as well
-as all sub-categories) and <size> is the desired desired size.  The
-"-v" option can be used to used to also include the appropriate
+as all sub-categories) and <size> is the desired size.  The
+"-v" option can be used to also include the appropriate
 variants file up to level '#'.  The normal output will be a sorted
 word list.  If you would rather see what files will be included, use the
 "-f" option.

 When manually combining the word lists the "english" spelling
 category should be used as well as one of "american", "british",
-"british_z" (british with ize spelling), or "canadian".  Great care
-has been taken so that that only one spelling for any particular word
-is included in the main list (with some minor exceptions).  When two
-variants were considered equal I randomly picked one for inclusion in
-the main word list.  Unfortunately this means that my choice in how to
-spell a word may not match your choice.  If this is the case you can
-try including one of the "variant_0" spelling categories which
+"british_z" (british with ize spelling), "canadian" or "australian".
+Great care has been taken so that only one spelling for any particular
+word is included in the main list (with some minor exceptions).  When
+two variants were considered equal I randomly picked one for inclusion
+in the main word list.  Unfortunately this means that my choice in how
+to spell a word may not match your choice.  If this is the case you
+can try including one of the "variant_1" spelling categories which
 includes most variants which are considered almost equal.  The
-"variant_0" spelling category corresponds mostly to American variants,
-while the "british_variant_0" and "canadian_variant_0" are for British
-and Canadian variants, respectively.  The "variant_1" spelling
-categories include variants which are also generally considered
-acceptable, and "variant_2" contains variants which are seldom used
-and may now even be considered correct.  There is no
-"british_variant_2" or "canadian_variant_2" spelling category since
+"variant_1" spelling category corresponds mostly to American variants,
+while the "british_variant_1", "canadian_variant_1" and
+"australian_variant_1" are for British, Canadian and Australian
+variants, respectively.  The "variant_2" spelling categories include
+variants which are also generally considered acceptable, and
+"variant_3" contains variants which are seldom used and may not even
+be considered correct.  There is no "british_variant_3",
+"canadian_variant_3" or "australian_variant_3" spelling category since
 the distinction would be almost meaningless.

 The "abbreviation" category includes abbreviations and acronyms which
 are not also normal words. The "contractions" category should be self
 explanatory. The "upper" category includes upper case words and proper
 names which are common enough to appear in a typical dictionary. The
-"proper-names" category included all the additional uppercase words.
-Final the "words" category contains all the normal English words.
+"proper-names" category includes all the additional uppercase words.
+Finally the "words" category contains all the normal English words.

 To give you an idea of what the words in the various sizes look like
 here is a sample of 25 random words found only in that size:

-10: advertised agreeing artificial bucket changes closest currently finding
-    implications learning liable obvious partial peace planet preparing
-    produced regulations shortly tries under unnecessary vacations vast wind 
+10: blow convert delete enables flow hot individual job maintains occurred
+    pointless political population provided quits recovering results settles
+    simultaneous situation source tickets uncertain uses why

-20: accomplishes addict baffles blink chapel corrections depresses dripping
-    erased infant interfere launch nicking novels paranoid passport pursued
-    recruitment rectifying relaxed sixteen sundry tab undergone withdraws 
+20: additions advertisement akin applicants appoints celebrated contracts
+    crime degradation discriminate enforcing escapes fabric funeral
+    genetically inconsistencies initialized innovative lodge lurking
+    photographic punches tiring trumpet wary

-35: adores affixes brisks caking conciliates decimates discretionary
-    dispatches forensics glorify gridiron healed hurling kelp massacring
-    necks pits placarding pyramids ratting recreates renovated sandals shirks
-    subtract 
+35: bagel brewed bushel charting commutative consigning dabbed displacements
+    fatties flotillas flung gunshots harrow hull hungriest kangaroos math
+    memoirs negatives nonresident rampages ranchers submissive subtractions
+    tipped

-40: demoed dichotomy dilapidation disheveled ebullience estimable finagling
-    hemorrhoid lazily medalists mintiest motherboards ostracism pornographers
-    predilections remarries southbound steamrolled sympathizers tads tampons
-    tattletale upchucked vainly viscous 
+40: astrologers bedraggles buzzword cupcakes eyeglass gridlock grungy
+    hairpiece hallucinates hotcakes inebriated leakier nymphomania papergirls
+    patchier patrolman predisposed reshuffled sasses snowmobiling
+    southeasterly teargas testiest topographer wimpy

-50: bootless brawler bulkhead canoeist declassifying farthings hake hectors
-    helpmate hermitage humanoid kitsch mercerize pawnshops pleasingly
-    retrorockets scurrilously solemnizes superficiality symbiosis tangelo
-    timetabling unenviable unmoral unreconstructed 
+50: apiaries besmirching boozier caducei communicant drainpipe ductile
+    exigencies gammas grouted harbinger hyphenations licentiate lynxes
+    maidenly malingerer palmettos pinwheeled prepackage propellant scrimmaged
+    sculleries senselessly unscrambled viburnums

-55: beachfront bicarbonate caff campanologists execrably fab fightback
-    firebricks insipidity laboriousness megawatts mirthlessly misnames
-    nymphos photocell potholed psychoactive psychoanalytically schoolmarmish
-    simulacra subeditors supremo sweated turbocharges yogic 
+55: bloodstock bodge bruiting bumbag carthorse clumpy dandifying etiolated
+    fleabite guestrooms marge moi overdeveloped owlishly perisher plebes
+    pseudy pukka putzes sangria splodges stocktaking subspecies tiebreaks
+    touchpapers

-60: assayer banteringly besmeared brazer chromatin cremes deciliters
-    doubtfulness enshrinement ephemerally fibular globalist gypper
-    legitimatized mensch mopers oversea pantyliner paratyphoid redivide
-    rehabilitative salesladies sensualists superposition univalves 
+60: autobiographic cytologist fellowman footraces gypsters hardihood
+    headshrinker homo interfile nonoperational nonsupporting outdraw
+    profligately readopted revetments semanticist stagnantly tapper thanes
+    thetas uncloaking uncross versifiers wasabi xylene

-70: adactylous anticapitalist bezant bister boraginaceous civically cossacks
-    cousinly curricle dekaliter grippingly grugrus gurging hermaphroditism
-    levanted magnetizer nonapplicable panegyrists parametrize radomes
-    refilter ruinations teths truistic uts 
+70: biltongs bookcraft bouilli bouse bronchiole cirrostrati coenurus
+    desorption feculence hackbuts heterolysis hylophagous ichthyosaur
+    iguanodon jillion lapidated mistranslating pullulating redd shylock skink
+    storaxes thalluses vermiculations voiture

-80: bodikin buhrs covetiveness diarch disaccharidases drumbeater empusas
-    flyings hyperexcitability hyperpolarizations janizaries overwash
-    physiocrats postform postsecondary preambulate puzzlehead remixer
-    snoutier tetrathlons toothdrawing triff unaffectionate wearish yawy 
+80: cellulolytic chomper costrels ditheistic doddard dwarfest fellwalkers
+    fernless gammoners gasolinic introductive labrets macaber
+    perspicaciousness pharmacodynamics pitchwomen pleuritical protore
+    repurifies ristras rolamite rumping sedimenting smithereening tolans

-95: actinophone aerobious anadenia biochemics chromatopathia ciclatouns
-    gaspiest guapinol hagigah interdorsal melanotekite minnicking
-    nonretrenchment overloftily oystriges peltandra retromaxillary
-    subterraqueous transphysically unconfidential unvalidating upspew
-    verminlike vetiveria yerth 
+95: amherstite appropinquations arsefoot assur commodate craspedia cutitis
+    disciferous endeavourments endocondensation glyoxalase hatherlite
+    interreticular interspicular lipothymy prieved reconvergence rousette
+    septerium superdonation tenaim topepo trachelitis transgeneses
+    ultraenthusiastic

-And here is a count on the number of in each spelling category
+
+And here is a count on the number of words in each spelling category
 (american + english spelling category):

  Size   Words       Names    Running Total  %
-   10    4,427          15        4,442     0.7
-   20    8,122           0       12,564     1.9
-   35   37,251         224       50,039     7.7
-   40    6,802         503       57,344     8.8
-   50   24,505      15,455       97,304    14.9
-   55    6,555           0      103,859    15.9
-   60   13,633         775      118,267    18.1
-   70   35,507       7,747      161,521    24.8
-   80  143,791      33,293      338,605    51.9
-   95  227,056      86,814      652,475   100.0
+   10    4,425          13        4,438     0.7
+   20    8,126           0       12,564     1.9
+   35   37,260         220       50,044     7.6
+   40    6,858         489       57,391     8.7
+   50   25,289      18,683      101,363    15.4
+   55    6,487           0      107,850    16.4
+   60   14,551         850      123,251    18.7
+   70   35,294       7,897      166,442    25.3
+   80  144,158      33,368      343,968    52.3
+   95  227,633      86,630      658,231   100.0
+

 (The "Words" column does not include the name count.)

 Size 35 is the recommended small size, 50 the medium and 70 the large.
-For spell checking I recommend using 60.  Sizes 70 and below contain
-words found in most dictionaries while the 80 size contains all the
-strange and unusual words people like to use in word games such as
-Scrabble (TM).  While a lot of the the words in the 80 size are not
-used very often, they are all generally considered valid words in the
-English language.  The 95 contains just about every English word in
-existence and then some.  Many of the words at the 95 level will
-probably not be considered valid English words by most people.  I use
-the 60 size for the English dictionary for Aspell, and I don't
-recommend anyone use levels above 70 for spell checking.  Levels above
-70 contain rarely used words which can hide misspellings of similar
-more commonly used words.  For example the word "ort" can hide a
-common typo of "or".  No one should need to use a size larger than 80,
-the 95 size is labeled insane for a reason.
+Sizes 70 and below contain words found in most dictionaries while the
+80 size contains all the strange and unusual words people like to use
+in word games such as Scrabble (TM).  While a lot of the words in the
+80 size are not used very often, they are all generally considered
+valid words in the English language.  The 95 contains just about every
+English word in existence and then some.  Many of the words at the 95
+level will probably not be considered valid English words by most
+people.

-Accents are present on certain words such as caf顩n iso8859-1 format.
+For spell checking I recommend using size 60.  This size is the
+largest size that I am fairly confident does not contain any
+misspellings or invalid words.  In addition an effort is made to
+exclude valid yet problematic words (such as "calender") from the 60
+size that are likely to be a misspelling of a more common word.  The
+70 size is reasonable for those who want a larger list and don't mind a
+few errors.  The 80 or larger sizes are not reasonable for spell
+checking.
+
+Accents are present on certain words such as café in iso8859-1 format.

 CHANGES:

+From Version 2019.10.06 to 2020.12.07
+
+  Various new words.
+
+  Variant cleanups.
+
+  Bump irregardless, froward (+ derivatives) and perpend to level 70.
+
+From Version 2018.04.16 to 2019.10.06
+
+  Various new words.
+
+  Remove compare's and fail's.
+
+From Version 2017.08.24 to 2018.04.16
+
+  Various new words.
+
+  Fix build problems on macOS.
+
+From Version 2017.01.22 to 2017.08.24
+
+  Various new words.
+
+From Version 2016.11.20 to 2017.01.22
+
+  Various new words.
+
+From Version 2016.06.26 to 2016.11.20
+
+  New Australian spelling category thanks to the work of Benjamin
+  Titze (btitze@protonmail.ch)
+
+  Various new words.
+
+From Version 2016.01.19 to 2016.06.26
+
+  Various new words.
+
+  Updated to Version 6.0.2 of 12dicts
+
+  Other minor changes.
+
+From Version 2015.08.24 to 2016.01.19
+
+  Various new words.
+
+  Clarified README to indicate why the 60 size is the preferred size
+  for spell checking.
+
+  Remove some very uncommon possessive forms.
+
+  Change "SET UTF8" to "SET UTF-8" in hunspell affix file.
+
+From Version 2015.05.18 to 2015.08.24 (Aug 24, 2015)
+
+  Various new words.
+
+From Version 2015.04.24 to 2015.05.18 (May 18, 2015)
+
+  Added some new words found to have a high frequency in the COCA
+  corpus.  (http://corpus.byu.edu/coca/).
+
+  Fix en spelling suggestions for 'alot' and 'exersize' in hunspell
+  dictionary (upstreamed from the changes made in Firefox).
+
+From Version 2015.02.15 to 2015.04.24 (April 24, 2015)
+
+  Added some new words.
+
+  Convert hunspell dictionary to UTF-8 in order to handle smart
+  quotes correctly.
+
+From Version 2015.01.28 to 2015.02.15 (February 15, 2015)
+
+  Added a large number of neologisms (newly invented words)
+  such as "selfie" and "smartwatch" thanks to Alan Beale.
+
+  Various other new words.
+
+  Clean up the special-hacker category by removing some words that
+  didn't exist in the Google Book's Corpus (1980 - 2008) and
+  originated from the "Unofficial Jargon File Word Lists".
+
+From Version 2014.11.17 to 2015.01.28 (January 28, 2015)
+
+  Various new words, many from analyzing the Google Book's Corpus
+  (1980 - 2008).  See http://app.aspell.net/lookup-freq.
+
+  Moved some uncommon words that can easily hide a misspelling of a
+  more common word to level 70.  (calender, adrenalin and Joesph)
+
+  Removed several -er and -est forms from adjectives that were so
+  uncommon that they were not found anywhere in the Google Book's
+  Corpus (1980 - 2008).
+
+From Version 2014.08.11.1 to 2014.11.17 (November 17, 2014)
+
+  Various new words.
+
+  Fix typo in Hunspell readme.
+
+From Version 2014.08.11 to 2014.08.11.1 (August 13, 2014)
+
+  Forgot to mention this important change from 7.1 to 2014.08.11:
+
+    Shifted the variant levels up by one: variant_0 is now variant_1,
+    variant_1 is now variant_2, and variant_2 is now variant_3.
+
+  Other minor fixes in this README.
+
+  No changes to the contents of the lists.
+
+From Revision 7.1 to Version 2014.08.11 (August 11, 2014)
+
+  Added some missing possessive forms.
+
+  Added some new words and proper names.
+
+  Clean up the categories (words, upper, proper-names etc) so that they
+  are more accurate.
+
+  Convert documentation to UTF-8.  For now, the wordlists are still in
+  ISO-8859-1 to prevent compatibility problems.
+
+  Add schema and scripts for creating a SQLite database from SCOWL.
+  Add some utility and library functions using them.  This database is
+  used by the new web apps (http://app.aspell.net/lookup & create).
+
+  Enhance speller/make-hunspell-dict.  The biggest improvement is that
+  it now generates several more dictionaries in addition to
+  the official ones.  These additional dictionaries are ones for
+  British English and larger dictionaries that include up to SCOWL
+  size 70.
+
 From Revision 7 to 7.1 (January 6, 2011)

  Updated to revision 5.1 of Varcon which corrected several errors.
@ -179,7 +340,7 @@ From Revision 7 to 7.1 (January 6, 2011)
  Added several now common proper names and some other words now
  in common use.

-  Include misc/ and speller/ directory which where in SVN but left
+  Include misc/ and speller/ directory which were in SVN but left
  out of the release tarball.

  Other minor fixes, including some fixes to the taboo word lists.
@ -216,7 +377,7 @@ From Revision 5 to 6 (August 10, 2004)

  Updated to version 4.1 of VarCon.

-  Added the "british_z" spelling category which it British using the
+  Added the "british_z" spelling category which is British using the
  "ize" spelling.

 From Revision 4a to 5 (January 3, 2002)
@ -254,7 +415,7 @@ From Revision 3 to 4 (January 28, 2001)
  Added words in the Ispell word list at the 65 level.

  Other changes due to using more recent versions of various sources
-  included a more accurate version of AGID thanks to the word of
+  included a more accurate version of AGID thanks to the work of
  Alan Beale

 From Revision 2 to 3 (August 18, 2000)
@ -285,10 +446,10 @@ From Revision 1 to 2 (August 5, 2000)

 COPYRIGHT, SOURCES, and CREDITS:

-The collective work is Copyright 2000-2011 by Kevin Atkinson as well
+The collective work is Copyright 2000-2018 by Kevin Atkinson as well
 as any of the copyrights mentioned below:

-  Copyright 2000-2011 by Kevin Atkinson
+  Copyright 2000-2018 by Kevin Atkinson

  Permission to use, copy, modify, distribute and sell these word
  lists, the associated scripts, the output created from the scripts,
@ -399,7 +560,7 @@ The 40 level includes words from Alan's 3esl list found in version 4.0
 of his 12dicts package.  Like his other stuff the 3esl list is also in the
 public domain.

-The 50 level includes Brian's frequency class 1, words words appearing
+The 50 level includes Brian's frequency class 1, words appearing
 in at least 5 of 12 of the dictionaries as indicated in the 12Dicts
 package, and uppercase words in at least 4 of the previous 12
 dictionaries.  A decent number of proper names is also included: The
@ -428,11 +589,11 @@ The 70 level includes Brian's frequency class 0 and the 74,550 common
 dictionary words from the MWords package.  The common dictionary words,
 like those from the 12Dicts package, have had all likely inflections
 added.  The 70 level also included the 5desk list from version 4.0 of
-the 12Dics package which is the public domain.
+the 12Dicts package which is in the public domain.

 The 80 level includes the ENABLE word list, all the lists in the
 ENABLE supplement package (except for ABLE), the "UK Advanced Cryptics
-Dictionary" (UKACD), the list of signature words in from YAWL package,
+Dictionary" (UKACD), the list of signature words from the YAWL package,
 and the 10,196 places list from the MWords package.

 The ENABLE package, mainted by M\Cooper <thegrendel@theriver.com>,
@ -476,11 +637,30 @@ found anywhere else.

 Accent information was taken from UKACD.

-My VARCON package was used to create the American, British, and
-Canadian word list. 
+The VarCon package was used to create the American, British, Canadian,
+and Australian word list.  It is under the following copyright:

-Since the original word lists used used in the VARCON package came
-from the Ispell distribution they are under the Ispell copyright:
+  Copyright 2000-2016 by Kevin Atkinson
+
+  Permission to use, copy, modify, distribute and sell this array, the
+  associated software, and its documentation for any purpose is hereby
+  granted without fee, provided that the above copyright notice appears
+  in all copies and that both that copyright notice and this permission
+  notice appear in supporting documentation. Kevin Atkinson makes no
+  representations about the suitability of this array for any
+  purpose. It is provided "as is" without express or implied warranty.
+
+  Copyright 2016 by Benjamin Titze
+
+  Permission to use, copy, modify, distribute and sell this array, the
+  associated software, and its documentation for any purpose is hereby
+  granted without fee, provided that the above copyright notice appears
+  in all copies and that both that copyright notice and this permission
+  notice appear in supporting documentation. Benjamin Titze makes no
+  representations about the suitability of this array for any
+  purpose. It is provided "as is" without express or implied warranty.
+
+  Since the original word lists come from the Ispell distribution:

  Copyright 1993, Geoff Kuenning, Granada Hills, CA
  All rights reserved.
@ -503,18 +683,18 @@ from the Ispell distribution they are under the Ispell copyright:
     products derived from this software without specific prior
     written permission.

-  THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS
-  IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
-  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
-  FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL GEOFF
-  KUENNING OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
-  INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
-  BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
-  LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
-  ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
-  POSSIBILITY OF SUCH DAMAGE.
+  THIS SOFTWARE IS PROVIDED BY GEOFF KUENNING AND CONTRIBUTORS ``AS IS'' AND
+  ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED.  IN NO EVENT SHALL GEOFF KUENNING OR CONTRIBUTORS BE LIABLE
+  FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+  OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+  OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+  SUCH DAMAGE.
+

 The variant word lists were created from a list of variants found in
 the 12dicts supplement package as well as a list of variants I created
@ -536,7 +716,7 @@ giant perl script.  With the amount of memory available these days (at
 least 2 GB, often 4 GB or more) this should not really be a problem.

 In addition, there is a very nice frequency analyze of the BNC corpus
-done by Adam Kilgarriff.  Unlike Brain's word lists the BNC lists
+done by Adam Kilgarriff.  Unlike Brian's word lists the BNC lists
 include part of speech information.  I plan on somehow using these
 lists as Adam Kilgarriff has given me the OK to use it in SCOWL.
 These lists will greatly reduce the problem of inflected forms of a
@ -545,7 +725,7 @@ information.

 There is frequency information for some other corpus such as COCA
 (Corpus of Contemporary American English) and ANS (American National
-Corpus) which I might also be able to use.  The formal will require
+Corpus) which I might also be able to use.  The former will require
 permission, and the latter is of questionable quality.

 RECREATING THE WORD LISTS:
@ -553,17 +733,17 @@ RECREATING THE WORD LISTS:
 In order to recreate the word lists you need a modern version of Perl,
 bash, the traditional set of shell utilities, a system that supports
 symbolic links, and quite possibly GNU Make.  The easiest way to
-recreate the word lists is to checkout SVN revision 161 (or tag
-scowl-7.1) and simply type "make" (see http://wordlist.sourceforge.net).
-You can try to download all the pieces manually, but you may not get
-the same result since the latest version of some parts used to create
-SCOWL may not have been released yet.
+recreate the word lists is to check out the corresponding Git version
+(see the version string at the start of the file) and simply type
+"make" (see http://wordlist.aspell.net).  You can try to download all
+the pieces manually, but this method is no longer tested or
+supported.

 The src/ directory contains the numerous scripts used in the creation
 of the final product.

 The r/ directory contains the raw data used to create the final
-product.  If you checkout from SVN this directory should be populated
+product.  If you check out from Git this directory should be populated
 automatically for you.  If you insist on doing it the hard way see the
 README file in the r/ directory for more information.

--- a/docs/ruWordlistReadme.txt
+++ b/docs/ruWordlistReadme.txt
@ -1 +1,6 @@
-Source for Russian dictionary: Various sources from Russian user
+Russian wordlist by: William Hingston
+Version: 5481cb8 (2018-09-13)
+Source: https://github.com/hingston/russian/blob/master/100000-russian-words.txt
+License: https://github.com/hingston/russian/blob/master/LICENSE.md
+
+Additionally cleaned up repeating and nonsense words.
--- a/scripts/remove-dictionary-repeating-words.js
+++ b/scripts/remove-dictionary-repeating-words.js
@ -0,0 +1,60 @@
+const { basename } = require('path');
+const { createReadStream, existsSync } = require('fs');
+
+
+
+function printHelp() {
+	console.log(`Usage ${basename(process.argv[1])} LOCALE FILENAME.txt `);
+	console.log('Removes repeating words from a word list');
+	console.log('\nLocale could be any valid JS locale, for example: en, en-US, etc...');
+}
+
+
+
+function validateInput() {
+	if (process.argv.length < 4) {
+		printHelp();
+		process.exit(1);
+	}
+
+
+	if (!existsSync(process.argv[3])) {
+		console.error(`Failure! Could not find file "${process.argv[3]}."`);
+		process.exit(2);
+	}
+
+	return { fileName: process.argv[3], locale: process.argv[2] };
+}
+
+
+
+async function removeRepeatingWords({ fileName, locale }) {
+	const lineReader = require('readline').createInterface({
+	  input: createReadStream(fileName)
+	});
+
+	const geographicalName = /[A-Z]\w+\-[^\n]+/;
+	const wordMap = {};
+
+	for await (const line of lineReader) {
+		const wordKey = geographicalName.test(line) ? line : line.toLocaleLowerCase(locale);
+		wordMap[wordKey] = true
+	}
+
+	return Object.keys(wordMap);
+}
+
+
+
+function printWords(wordList) {
+	if (!Array.isArray(wordList)) {
+		return;
+	}
+
+	wordList.forEach(w => console.log(w));
+}
+
+
+
+/** main **/
+removeRepeatingWords(validateInput()).then(words => printWords(words));
--- a/src/io/github/sspanak/tt9/ime/TraditionalT9.java
+++ b/src/io/github/sspanak/tt9/ime/TraditionalT9.java
@ -183,7 +183,7 @@ public class TraditionalT9 extends KeyPadHandler {
 	 * @return boolean
 	 */
 	protected boolean onNumber(int key, boolean hold, boolean repeat) {
-		if (mInputMode.shouldAcceptCurrentSuggestion(key, hold, repeat)) {
+		if (mInputMode.shouldAcceptCurrentSuggestion(mLanguage, key, hold, repeat)) {
 			mInputMode.onAcceptSuggestion(mLanguage, getComposingText());
 			commitCurrentSuggestion(false);
 			determineNextTextCase();
--- a/src/io/github/sspanak/tt9/ime/modes/InputMode.java
+++ b/src/io/github/sspanak/tt9/ime/modes/InputMode.java
@ -83,6 +83,6 @@ abstract public class InputMode {
 	public boolean shouldTrackNumPress() { return true; }
 	public boolean shouldTrackUpDown() { return false; }
 	public boolean shouldTrackLeftRight() { return false; }
-	public boolean shouldAcceptCurrentSuggestion(int key, boolean hold, boolean repeat) { return false; }
+	public boolean shouldAcceptCurrentSuggestion(Language language, int key, boolean hold, boolean repeat) { return false; }
 	public boolean shouldSelectNextSuggestion() { return false; }
 }
--- a/src/io/github/sspanak/tt9/ime/modes/ModeABC.java
+++ b/src/io/github/sspanak/tt9/ime/modes/ModeABC.java
@ -33,7 +33,7 @@ public class ModeABC extends InputMode {
 	final public boolean isABC() { return true; }
 	public int getSequenceLength() { return 1; }

-	public boolean shouldAcceptCurrentSuggestion(int key, boolean hold, boolean repeat) {	return hold || !repeat;	}
+	public boolean shouldAcceptCurrentSuggestion(Language l, int key, boolean hold, boolean repeat) { return hold || !repeat; }
 	public boolean shouldTrackUpDown() { return true; }
 	public boolean shouldTrackLeftRight() { return true; }
 	public boolean shouldSelectNextSuggestion() {
--- a/src/io/github/sspanak/tt9/ime/modes/ModePredictive.java
+++ b/src/io/github/sspanak/tt9/ime/modes/ModePredictive.java
@ -105,16 +105,16 @@ public class ModePredictive extends InputMode {
 	 * In this mode, In addition to confirming the suggestion in the input field,
 	 * we also increase its' priority. This function determines whether we want to do all this or not.
 	 */
-	public boolean shouldAcceptCurrentSuggestion(int key, boolean hold, boolean repeat) {
+	public boolean shouldAcceptCurrentSuggestion(Language language, int key, boolean hold, boolean repeat) {
 		return
 			hold
 			// Quickly accept suggestions using "space" instead of pressing "ok" then "space"
 			|| key == 0
 			// Punctuation is considered "a word", so that we can increase the priority as needed
 			// Also, it must break the current word.
-			|| (key == 1 && digitSequence.length() > 0 && !digitSequence.endsWith("1"))
+			|| (!language.isPunctuationPartOfWords() && key == 1 && digitSequence.length() > 0 && !digitSequence.endsWith("1"))
 			// On the other hand, letters also "break" punctuation.
-			|| (key != 1 && digitSequence.endsWith("1"));
+			|| (!language.isPunctuationPartOfWords() && key != 1 && digitSequence.endsWith("1"));
 	}


--- a/src/io/github/sspanak/tt9/languages/Language.java
+++ b/src/io/github/sspanak/tt9/languages/Language.java
@ -8,6 +8,7 @@ public class Language {
 	protected int id;
 	protected String name;
 	protected Locale locale;
+	protected boolean isPunctuationPartOfWords; // see the getter for more info
 	protected int icon;
 	protected String dictionaryFile;
 	protected int abcLowerCaseIcon;
@ -30,6 +31,24 @@ public class Language {
 		return icon;
 	}

+	/**
+	 * isPunctuationPartOfWords
+	 * This plays a role in Predictive mode only.
+	 *
+	 * Return "true", if you need to use the 1-key for typing words, such as:
+	 * "it's" (English), "a'tje" (Dutch), "п'ят" (Ukrainian).
+	 *
+	 * Return "false" to:
+	 * 		- hide words like the above from the suggestions.
+	 *		- make the 1-key commit the current word, then display the punctuation list.
+	 * 			For example, pressing 1-key after "it" would accept "it" as a separate word,
+	 * 			then display only: | , | . | ! | ? | ...
+	 *
+	 * "false" is recommended when apostrophes or other punctuation are not part of the words,
+	 * because it would allow faster typing.
+	 */
+	final public boolean isPunctuationPartOfWords() { return isPunctuationPartOfWords; }
+
 	final public String getDictionaryFile() {
 		return dictionaryFile;
 	}
--- a/src/io/github/sspanak/tt9/languages/definitions/Bulgarian.java
+++ b/src/io/github/sspanak/tt9/languages/definitions/Bulgarian.java
@ -14,6 +14,7 @@ public class Bulgarian extends Language {
 		name = "български";
 		locale = new Locale("bg","BG");
 		dictionaryFile = "bg-utf8.txt";
+		isPunctuationPartOfWords = false;
 		icon = R.drawable.ime_lang_bg;
 		abcLowerCaseIcon = R.drawable.ime_lang_cyrillic_lower;
 		abcUpperCaseIcon = R.drawable.ime_lang_cyrillic_upper;
--- a/src/io/github/sspanak/tt9/languages/definitions/Dutch.java
+++ b/src/io/github/sspanak/tt9/languages/definitions/Dutch.java
@ -12,13 +12,14 @@ public class Dutch extends English {
 		id = 8;
 		name = "Nederlands";
 		locale = new Locale("nl","NL");
+		isPunctuationPartOfWords = true;
 		dictionaryFile = "nl-utf8.txt";
 		icon = R.drawable.ime_lang_nl;

 		characterMap.get(2).addAll(Arrays.asList("à", "ä", "ç"));
 		characterMap.get(3).addAll(Arrays.asList("é", "è", "ê", "ë"));
 		characterMap.get(4).addAll(Arrays.asList("î", "ï"));
-		characterMap.get(6).addAll(Arrays.asList("ö"));
+		characterMap.get(6).add("ö");
 		characterMap.get(8).addAll(Arrays.asList("û", "ü"));
 	}
 }
--- a/src/io/github/sspanak/tt9/languages/definitions/English.java
+++ b/src/io/github/sspanak/tt9/languages/definitions/English.java
@ -14,6 +14,7 @@ public class English extends Language {
 		name = "English";
 		locale = Locale.ENGLISH;
 		dictionaryFile = "en-utf8.txt";
+		isPunctuationPartOfWords = true;
 		icon = R.drawable.ime_lang_en;
 		abcLowerCaseIcon = R.drawable.ime_lang_latin_lower;
 		abcUpperCaseIcon = R.drawable.ime_lang_latin_upper;
--- a/src/io/github/sspanak/tt9/languages/definitions/French.java
+++ b/src/io/github/sspanak/tt9/languages/definitions/French.java
@ -14,6 +14,7 @@ public class French extends English {
 		locale = Locale.FRENCH;
 		dictionaryFile = "fr-utf8.txt";
 		icon = R.drawable.ime_lang_fr;
+		isPunctuationPartOfWords = false;

 		characterMap.get(2).addAll(Arrays.asList("à", "â", "æ", "ç"));
 		characterMap.get(3).addAll(Arrays.asList("é", "è", "ê", "ë"));
--- a/src/io/github/sspanak/tt9/languages/definitions/German.java
+++ b/src/io/github/sspanak/tt9/languages/definitions/German.java
@ -13,6 +13,7 @@ public class German extends English {
 		locale = Locale.GERMAN;
 		dictionaryFile = "de-utf8.txt";
 		icon = R.drawable.ime_lang_de;
+		isPunctuationPartOfWords = false;

 		characterMap.get(2).add("ä");
 		characterMap.get(6).add("ö");
--- a/src/io/github/sspanak/tt9/languages/definitions/Italian.java
+++ b/src/io/github/sspanak/tt9/languages/definitions/Italian.java
@ -14,6 +14,7 @@ public class Italian extends English {
 		locale = Locale.ITALIAN;
 		dictionaryFile = "it-utf8.txt";
 		icon = R.drawable.ime_lang_it;
+		isPunctuationPartOfWords = false;

 		characterMap.get(2).add("à");
 		characterMap.get(3).addAll(Arrays.asList("é", "è"));
--- a/src/io/github/sspanak/tt9/languages/definitions/Russian.java
+++ b/src/io/github/sspanak/tt9/languages/definitions/Russian.java
@ -14,6 +14,7 @@ public class Russian extends Language {
 		name = "русский";
 		locale = new Locale("ru","RU");
 		dictionaryFile = "ru-utf8.txt";
+		isPunctuationPartOfWords = false;
 		icon = R.drawable.ime_lang_ru;
 		abcLowerCaseIcon = R.drawable.ime_lang_cyrillic_lower;
 		abcUpperCaseIcon = R.drawable.ime_lang_cyrillic_upper;
--- a/src/io/github/sspanak/tt9/languages/definitions/Ukrainian.java
+++ b/src/io/github/sspanak/tt9/languages/definitions/Ukrainian.java
@ -14,6 +14,7 @@ public class Ukrainian extends Language {
 		name = "українська";
 		locale = new Locale("uk","UA");
 		dictionaryFile = "uk-utf8.txt";
+		isPunctuationPartOfWords = true;
 		icon = R.drawable.ime_lang_uk;
 		abcLowerCaseIcon = R.drawable.ime_lang_cyrillic_lower;
 		abcUpperCaseIcon = R.drawable.ime_lang_cyrillic_upper;