From ec73b244ee7b0fa73388e6499e39c5778385ee6e Mon Sep 17 00:00:00 2001 From: mormegil Date: Tue, 28 Nov 2023 13:25:58 +0100 Subject: [PATCH] Improve ASCII transliterator We should use NFKD instead of NFD since we are flattening to US-ASCII afterwards anyway. This allows various Unicode characters which would otherwise end up as a question mark to be represented by their compatibility decomposition. This applies, for example, to ligatures (U+FB01 LATIN SMALL LIGATURE FI will now be replaced with plain fi instead of a question mark) and to U+00A0 NO-BREAK SPACE [NBSP], which will now be replaced by a normal space instead of a question mark. +Add Czech fancy quotes to the Czech transliterator +Add a unit test for Multitransliterator --- .../language/impl/CzechTransliterator.java | 1 + .../impl/FlattenToAsciiTransliterator.java | 4 +-- .../util/language/LanguageUtilsTest.java | 27 ++++++++++++++----- 3 files changed, 24 insertions(+), 8 deletions(-) diff --git a/app/src/main/java/nodomain/freeyourgadget/gadgetbridge/util/language/impl/CzechTransliterator.java b/app/src/main/java/nodomain/freeyourgadget/gadgetbridge/util/language/impl/CzechTransliterator.java index 47098327d..e7c87c139 100644 --- a/app/src/main/java/nodomain/freeyourgadget/gadgetbridge/util/language/impl/CzechTransliterator.java +++ b/app/src/main/java/nodomain/freeyourgadget/gadgetbridge/util/language/impl/CzechTransliterator.java @@ -27,6 +27,7 @@ public class CzechTransliterator extends SimpleTransliterator { super(new HashMap() {{ put('ř',"r"); put('ě',"e"); put('ý',"y"); put('á',"a"); put('í',"i"); put('é',"e"); put('ó',"o"); put('ú',"u"); put('ů',"u"); put('ď',"d"); put('ť',"t"); put('ň',"n"); + put('„', "\""); put('“', "\""); put('‚', "'"); put('‘', "'"); }}); } } diff --git a/app/src/main/java/nodomain/freeyourgadget/gadgetbridge/util/language/impl/FlattenToAsciiTransliterator.java b/app/src/main/java/nodomain/freeyourgadget/gadgetbridge/util/language/impl/FlattenToAsciiTransliterator.java index 
01ca0e7e0..4478eda2b 100644 --- a/app/src/main/java/nodomain/freeyourgadget/gadgetbridge/util/language/impl/FlattenToAsciiTransliterator.java +++ b/app/src/main/java/nodomain/freeyourgadget/gadgetbridge/util/language/impl/FlattenToAsciiTransliterator.java @@ -32,8 +32,8 @@ public class FlattenToAsciiTransliterator implements Transliterator { return txt; } - // Decompose the string into its canonical decomposition (splits base characters from accents/marks) - txt = Normalizer.normalize(txt, Normalizer.Form.NFD); + // Decompose the string into its compatibility decomposition (splits base characters from accents/marks, and replaces compatibility characters such as ligatures with their plain equivalents) + txt = Normalizer.normalize(txt, Normalizer.Form.NFKD); // Remove all marks (characters intended to be combined with another character), keeping the base glyphs txt = txt.replaceAll("\\p{M}", ""); // Flatten the resulting string to ASCII diff --git a/app/src/test/java/nodomain/freeyourgadget/gadgetbridge/util/language/LanguageUtilsTest.java b/app/src/test/java/nodomain/freeyourgadget/gadgetbridge/util/language/LanguageUtilsTest.java index 9375451aa..08a359860 100644 --- a/app/src/test/java/nodomain/freeyourgadget/gadgetbridge/util/language/LanguageUtilsTest.java +++ b/app/src/test/java/nodomain/freeyourgadget/gadgetbridge/util/language/LanguageUtilsTest.java @@ -4,9 +4,13 @@ import android.content.SharedPreferences; import org.junit.Test; +import java.util.Arrays; + import nodomain.freeyourgadget.gadgetbridge.GBApplication; import nodomain.freeyourgadget.gadgetbridge.impl.GBDevice; import nodomain.freeyourgadget.gadgetbridge.test.TestBase; +import nodomain.freeyourgadget.gadgetbridge.util.language.impl.CzechTransliterator; +import nodomain.freeyourgadget.gadgetbridge.util.language.impl.ExtendedAsciiTransliterator; import nodomain.freeyourgadget.gadgetbridge.util.language.impl.FlattenToAsciiTransliterator; import static org.junit.Assert.assertEquals; @@ -96,9 +100,9 @@ public class LanguageUtilsTest 
extends TestBase { final Transliterator transliterator = LanguageUtils.getTransliterator("bengali"); // input with cyrillic and diacritic letters - String[] inputs = { "অনিরুদ্ধ", "বিজ্ঞানযাত্রা চলছে চলবে।", "আমি সব দেখেশুনে ক্ষেপে গিয়ে করি বাঙলায় চিৎকার!", - "আমার জাভা কোড is so bad! কী আর বলবো!" }; - String[] outputs = { "oniruddho", "biggaanJaatraa cholchhe cholbe.", + String[] inputs = {"অনিরুদ্ধ", "বিজ্ঞানযাত্রা চলছে চলবে।", "আমি সব দেখেশুনে ক্ষেপে গিয়ে করি বাঙলায় চিৎকার!", + "আমার জাভা কোড is so bad! কী আর বলবো!"}; + String[] outputs = {"oniruddho", "biggaanJaatraa cholchhe cholbe.", "aami sob dekheshune kkhepe giye kori baanglaay chitkaar!", "aamaar jaabhaa koD is so bad! kii aar bolbo!"}; @@ -189,7 +193,7 @@ public class LanguageUtilsTest extends TestBase { assertEquals("georgian transliteration failed", expected, output); } - @Test + @Test public void testStringTransliterateHungarian() { final Transliterator transliterator = LanguageUtils.getTransliterator("hungarian"); @@ -227,12 +231,23 @@ public class LanguageUtilsTest extends TestBase { @Test public void testFlattenToAscii() throws Exception { final FlattenToAsciiTransliterator transliterator = new FlattenToAsciiTransliterator(); - String input = "ä ș ț ă"; + String input = "ä ș ț ă fine"; String output = transliterator.transliterate(input); - String expected = "a s t a"; + String expected = "a s t a fine"; assertEquals("flatten to ascii transliteration failed", expected, output); } + @Test + public void testMultitransliterator() throws Exception { + final MultiTransliterator multiTransliterator = new MultiTransliterator(Arrays.asList( + new CzechTransliterator(), + new ExtendedAsciiTransliterator(), + new FlattenToAsciiTransliterator() + )); + assertEquals("Zlutoucky kun upel \"dabelske\" \"ody\"", multiTransliterator.transliterate("Žluťoučký kůň úpěl »ďábelské« „ódy“")); + assertEquals("300 Kc", multiTransliterator.transliterate("300\u00A0Kč")); + } + @Test public void testTransliterateOption() 
throws Exception { enableTransliteration(false);