From 546d20a9bc0e01ab2368649759dca09aaa556a9f Mon Sep 17 00:00:00 2001 From: sspanak Date: Wed, 16 Apr 2025 17:15:19 +0300 Subject: [PATCH] Fixed the word pair predictions not working when one of the words contains an apostrophe. This will result in significantly more accurate suggestions in Ukrainian and Hebrew, and slightly better in French, Italian and other languages that use apostrophes a lot --- .../sspanak/tt9/db/wordPairs/WordPair.java | 6 ++-- .../modes/predictions/WordPredictions.java | 21 ++++++++++- .../java/io/github/sspanak/tt9/util/Text.java | 36 ++++++++++++++++++- 3 files changed, 58 insertions(+), 5 deletions(-) diff --git a/app/src/main/java/io/github/sspanak/tt9/db/wordPairs/WordPair.java b/app/src/main/java/io/github/sspanak/tt9/db/wordPairs/WordPair.java index 97efa9ce..bc2cf100 100644 --- a/app/src/main/java/io/github/sspanak/tt9/db/wordPairs/WordPair.java +++ b/app/src/main/java/io/github/sspanak/tt9/db/wordPairs/WordPair.java @@ -24,8 +24,8 @@ public class WordPair { boolean isInvalid() { - Text w1 = new Text(word1); - Text w2 = new Text(word2); + Text w1 = new Text(language, word1); + Text w2 = new Text(language, word2); return language == null @@ -33,7 +33,7 @@ public class WordPair { || word1.equals(word2) || sequence2 == null || !(new Text(sequence2).isNumeric()) || (w1.codePointLength() > SettingsStore.WORD_PAIR_MAX_WORD_LENGTH && w2.codePointLength() > SettingsStore.WORD_PAIR_MAX_WORD_LENGTH) - || !w1.isAlphabetic() || !w2.isAlphabetic(); + || !w1.isWord() || !w2.isWord(); } diff --git a/app/src/main/java/io/github/sspanak/tt9/ime/modes/predictions/WordPredictions.java b/app/src/main/java/io/github/sspanak/tt9/ime/modes/predictions/WordPredictions.java index c3ffd364..220e2d5a 100644 --- a/app/src/main/java/io/github/sspanak/tt9/ime/modes/predictions/WordPredictions.java +++ b/app/src/main/java/io/github/sspanak/tt9/ime/modes/predictions/WordPredictions.java @@ -8,7 +8,9 @@ import io.github.sspanak.tt9.db.DataStore; import 
io.github.sspanak.tt9.ime.helpers.TextField; import io.github.sspanak.tt9.languages.EmojiLanguage; import io.github.sspanak.tt9.languages.Language; +import io.github.sspanak.tt9.languages.LanguageKind; import io.github.sspanak.tt9.preferences.settings.SettingsStore; +import io.github.sspanak.tt9.util.Text; import io.github.sspanak.tt9.util.TextTools; import io.github.sspanak.tt9.util.chars.Characters; @@ -348,6 +350,23 @@ public class WordPredictions extends Predictions { */ @NonNull protected String getPenultimateWord(@NonNull String currentWord) { - return textField.getWordBeforeCursor(language, 1, true); + // When the text after the cursor starts with a word, we are in the middle of a word or at the beginning of a new one. Pairing makes no sense, so return an empty string. + Text after = textField.getTextAfterCursor(1); + if (after.startsWithWord()) { + return ""; + } + + Text before = textField.getTextBeforeCursor(); + + // When the text before the cursor ends with the current word and the character right before that word is alphabetic, we are at the end of a longer word. The user is probably typing a compound word. We do not want to + // pair with the first part of the compound word, so return an empty string in this case too. 
+ if (before.length() > currentWord.length() && before.toString().endsWith(currentWord) && Character.isAlphabetic(before.toString().charAt(before.length() - currentWord.length() - 1))) { + return ""; + } + + return before.getPreviousWord( + !currentWord.isEmpty(), + LanguageKind.isUkrainian(language) || LanguageKind.isHebrew(language) + ); } } diff --git a/app/src/main/java/io/github/sspanak/tt9/util/Text.java b/app/src/main/java/io/github/sspanak/tt9/util/Text.java index fd5942f4..12e10be5 100644 --- a/app/src/main/java/io/github/sspanak/tt9/util/Text.java +++ b/app/src/main/java/io/github/sspanak/tt9/util/Text.java @@ -16,7 +16,11 @@ public class Text extends TextTools { private final Language language; private final String text; - private final static Pattern QUICK_DELETE_GROUP = Pattern.compile("(?:([\\s\\u3000]{2,})|([.,、。,،]{2,})|([^、。,\\s\\u3000]*.))$"); + private static final Pattern QUICK_DELETE_GROUP = Pattern.compile("(?:([\\s\\u3000]{2,})|([.,、。,،]{2,})|([^、。,\\s\\u3000]*.))$"); + private static final Pattern PREVIOUS_WORD = Pattern.compile("(?<=\\s|^)([\\p{L}\\p{Mc}\\p{Mn}\\p{Me}\\x{200D}\\x{200C}]+)$"); + private static final Pattern PREVIOUS_WORD_WITH_APOSTROPHES = Pattern.compile("(?<=\\s|^)([\\p{L}\\p{Mc}\\p{Mn}\\p{Me}\\x{200D}\\x{200C}']+)$"); + private static final Pattern PENULTIMATE_WORD = Pattern.compile("(?<=\\s|^)([\\p{L}\\p{Mc}\\p{Mn}\\p{Me}\\x{200D}\\x{200C}]+)[\\s'][^\\s']*$"); + private static final Pattern PENULTIMATE_WORD_WITH_APOSTROPHES = Pattern.compile("(?<=\\s|^)([\\p{L}\\p{Mc}\\p{Mn}\\p{Me}\\x{200D}\\x{200C}']+)\\s\\S*$"); public Text(Language language, String text) { @@ -58,6 +62,24 @@ public class Text extends TextTools { } + @NonNull + public String getPreviousWord(boolean skipOne, boolean isLanguageWithApostrophes) { + if (text == null || text.isEmpty()) { + return ""; + } + + Matcher matcher; + if (isLanguageWithApostrophes) { + matcher = skipOne ? 
PENULTIMATE_WORD_WITH_APOSTROPHES.matcher(text) : PREVIOUS_WORD_WITH_APOSTROPHES.matcher(text); + } else { + matcher = skipOne ? PENULTIMATE_WORD.matcher(text) : PREVIOUS_WORD.matcher(text); + } + + String word = matcher.find() ? matcher.group(1) : null; + return word == null ? "" : word; + } + + public int getTextCase() { if (isUpperCase()) { return InputMode.CASE_UPPER; @@ -97,6 +119,18 @@ public class Text extends TextTools { } + public boolean isWord() { + boolean isApostropheAllowed = LanguageKind.isUkrainian(language) || LanguageKind.isHebrew(language); + for (int i = 0, end = text == null ? 0 : text.length(); i < end; i++) { + if (!Character.isAlphabetic(text.charAt(i)) && !(isApostropheAllowed && text.charAt(i) == '\'')) { + return false; + } + } + + return true; + } + + public boolean isEmpty() { return text == null || text.isEmpty(); }