From 973e2bd34e4000dea719b05cb7e7b81239201121 Mon Sep 17 00:00:00 2001
From: Ted Stein
Date: Sat, 4 Apr 2020 00:24:32 -0700
Subject: [PATCH] Implement transliteration for Korean.

---
 .../util/KoreanLanguageUtils.java             | 342 ++++++++++++++++++
 .../gadgetbridge/util/LanguageUtils.java      |  13 +-
 .../gadgetbridge/test/LanguageUtilsTest.java  |  38 ++
 3 files changed, 388 insertions(+), 5 deletions(-)
 create mode 100644 app/src/main/java/nodomain/freeyourgadget/gadgetbridge/util/KoreanLanguageUtils.java

diff --git a/app/src/main/java/nodomain/freeyourgadget/gadgetbridge/util/KoreanLanguageUtils.java b/app/src/main/java/nodomain/freeyourgadget/gadgetbridge/util/KoreanLanguageUtils.java
new file mode 100644
index 000000000..eccdbb8cf
--- /dev/null
+++ b/app/src/main/java/nodomain/freeyourgadget/gadgetbridge/util/KoreanLanguageUtils.java
@@ -0,0 +1,342 @@
+/*  Copyright (C) 2020 Ted Stein
+
+    This file is part of Gadgetbridge.
+
+    Gadgetbridge is free software: you can redistribute it and/or modify
+    it under the terms of the GNU Affero General Public License as published
+    by the Free Software Foundation, either version 3 of the License, or
+    (at your option) any later version.
+
+    Gadgetbridge is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU Affero General Public License for more details.
+
+    You should have received a copy of the GNU Affero General Public License
+    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
+package nodomain.freeyourgadget.gadgetbridge.util;
+
+import java.util.Optional;
+import java.text.Normalizer;
+import java.text.Normalizer.Form;
+
+// Implements Revised Romanization of Korean as well as we can without understanding any grammar.
+//
+// https://en.wikipedia.org/wiki/Revised_Romanization_of_Korean
+public class KoreanLanguageUtils {
+    // https://en.wikipedia.org/wiki/Hangul_Jamo_%28Unicode_block%29
+    private static final char JAMO_BLOCK_START = 0x1100;
+    private static final char JAMO_BLOCK_END = 0x11FF;
+    // https://en.wikipedia.org/wiki/Hangul_Syllables
+    private static final char SYLLABLES_BLOCK_START = 0xAC00;
+    private static final char SYLLABLES_BLOCK_END = 0xD7A3;
+    // https://en.wikipedia.org/wiki/Hangul_Compatibility_Jamo
+    private static final char COMPAT_JAMO_BLOCK_START = 0x3131;
+    private static final char COMPAT_JAMO_BLOCK_END = 0x318E;
+
+    // Returns whether a char is in the given block. Both bounds are inclusive.
+    private static boolean inRange(char c, char start, char end) {
+        return c >= start && c <= end;
+    }
+
+    // User input consisting of isolated jamo is usually mapped to the KS X 1001 compatibility
+    // block, but jamo resulting from decomposed syllables are mapped to the modern one. This
+    // function maps compat jamo to modern ones where possible and returns all other characters
+    // unmodified.
+    //
+    // https://en.wikipedia.org/wiki/Hangul_Compatibility_Jamo
+    // https://en.wikipedia.org/wiki/Hangul_Jamo_%28Unicode_block%29
+    private static char decompatJamo(char jamo) {
+        // KS X 1001 Hangul filler, not used in modern Unicode. A useful landmark in the
+        // compatibility jamo block.
+        // https://en.wikipedia.org/wiki/KS_X_1001#Hangul_Filler
+        final char HANGUL_FILLER = 0x3164;
+
+        // Don't do anything to characters outside the compatibility jamo block.
+        if (!inRange(jamo, COMPAT_JAMO_BLOCK_START, COMPAT_JAMO_BLOCK_END)) { return jamo; }
+
+        // Vowels are contiguous, in the same order, and unambiguous, so it's a simple offset.
+        if (jamo >= 0x314F && jamo < HANGUL_FILLER) {
+            return (char)(jamo - 0x1FEE);
+        }
+
+        // Consonants are organized differently. No clean way to do this.
+        //
+        // The compatibility jamo block doesn't distinguish between Choseong (leading) and Jongseong
+        // (final) positions, but the modern block does. We map to Choseong here.
+        switch (jamo) {
+            case 0x3131: return 0x1100; // ㄱ
+            case 0x3132: return 0x1101; // ㄲ
+            case 0x3134: return 0x1102; // ㄴ
+            case 0x3137: return 0x1103; // ㄷ
+            case 0x3138: return 0x1104; // ㄸ
+            case 0x3139: return 0x1105; // ㄹ
+            case 0x3141: return 0x1106; // ㅁ
+            case 0x3142: return 0x1107; // ㅂ
+            case 0x3143: return 0x1108; // ㅃ
+            case 0x3145: return 0x1109; // ㅅ
+            case 0x3146: return 0x110A; // ㅆ
+            case 0x3147: return 0x110B; // ㅇ
+            case 0x3148: return 0x110C; // ㅈ
+            case 0x3149: return 0x110D; // ㅉ
+            case 0x314A: return 0x110E; // ㅊ
+            case 0x314B: return 0x110F; // ㅋ
+            case 0x314C: return 0x1110; // ㅌ
+            case 0x314D: return 0x1111; // ㅍ
+            case 0x314E: return 0x1112; // ㅎ
+        }
+
+        // The rest of the compatibility block consists of archaic compounds that are unlikely to be
+        // encountered in modern systems. Just leave them alone.
+        return jamo;
+    }
+
+    // Transliterates jamo one at a time. Returns its input if it isn't in the modern jamo block.
+    private static String transliterateSingleJamo(char jamo) {
+        jamo = decompatJamo(jamo);
+
+        switch (jamo) {
+            // Choseong (leading position consonants)
+            case 0x1100: return "g"; // ㄱ
+            case 0x1101: return "kk"; // ㄲ
+            case 0x1102: return "n"; // ㄴ
+            case 0x1103: return "d"; // ㄷ
+            case 0x1104: return "tt"; // ㄸ
+            case 0x1105: return "r"; // ㄹ
+            case 0x1106: return "m"; // ㅁ
+            case 0x1107: return "b"; // ㅂ
+            case 0x1108: return "pp"; // ㅃ
+            case 0x1109: return "s"; // ㅅ
+            case 0x110A: return "ss"; // ㅆ
+            case 0x110B: return ""; // ㅇ
+            case 0x110C: return "j"; // ㅈ
+            case 0x110D: return "jj"; // ㅉ
+            case 0x110E: return "ch"; // ㅊ
+            case 0x110F: return "k"; // ㅋ
+            case 0x1110: return "t"; // ㅌ
+            case 0x1111: return "p"; // ㅍ
+            case 0x1112: return "h"; // ㅎ
+            // Jungseong (vowels)
+            case 0x1161: return "a"; // ㅏ
+            case 0x1162: return "ae"; // ㅐ
+            case 0x1163: return "ya"; // ㅑ
+            case 0x1164: return "yae"; // ㅒ
+            case 0x1165: return "eo"; // ㅓ
+            case 0x1166: return "e"; // ㅔ
+            case 0x1167: return "yeo"; // ㅕ
+            case 0x1168: return "ye"; // ㅖ
+            case 0x1169: return "o"; // ㅗ
+            case 0x116A: return "wa"; // ㅘ
+            case 0x116B: return "wae"; // ㅙ
+            case 0x116C: return "oe"; // ㅚ
+            case 0x116D: return "yo"; // ㅛ
+            case 0x116E: return "u"; // ㅜ
+            case 0x116F: return "wo"; // ㅝ
+            case 0x1170: return "we"; // ㅞ
+            case 0x1171: return "wi"; // ㅟ
+            case 0x1172: return "yu"; // ㅠ
+            case 0x1173: return "eu"; // ㅡ
+            case 0x1174: return "ui"; // ㅢ
+            case 0x1175: return "i"; // ㅣ
+            // Jongseong (final position consonants)
+            case 0x11A8: return "k"; // ㄱ
+            case 0x11A9: return "k"; // ㄲ
+            case 0x11AB: return "n"; // ㄴ
+            case 0x11AE: return "t"; // ㄷ
+            case 0x11AF: return "l"; // ㄹ
+            case 0x11B7: return "m"; // ㅁ
+            case 0x11B8: return "p"; // ㅂ
+            case 0x11BA: return "t"; // ㅅ
+            case 0x11BB: return "t"; // ㅆ
+            case 0x11BC: return "ng"; // ㅇ
+            case 0x11BD: return "t"; // ㅈ
+            case 0x11BE: return "t"; // ㅊ
+            case 0x11BF: return "k"; // ㅋ
+            case 0x11C0: return "t"; // ㅌ
+            case 0x11C1: return "p"; // ㅍ
+            case 0x11C2: return "t"; // ㅎ
+        }
+
+        // Input was not jamo.
+        return String.valueOf(jamo);
+    }
+
+    // Some combinations of ending jamo in one syllable and initial jamo in the next are romanized
+    // irregularly. These exceptions are called "special provisions". In cases where multiple
+    // romanizations are permitted, we use the one that's least commonly used elsewhere.
+    //
+    // Returns no value if either character is not in the modern jamo block, or if there is no
+    // special provision for that pair of jamo.
+    public static Optional<String> transliterateSpecialProvisions(char previousEnding, char nextInitial) {
+        // Special provisions only apply if both characters are in the modern jamo block.
+        if (!inRange(previousEnding, JAMO_BLOCK_START, JAMO_BLOCK_END)) { return Optional.empty(); }
+        if (!inRange(nextInitial, JAMO_BLOCK_START, JAMO_BLOCK_END)) { return Optional.empty(); }
+
+        // Jongseong (final position) ㅎ has a number of special provisions.
+        if (previousEnding == 0x11C2) { // ㅎ
+            switch (nextInitial) {
+                case 0x110B: return Optional.of("h"); // ㅇ
+                case 0x1100: return Optional.of("k"); // ㄱ
+                case 0x1102: return Optional.of("nn"); // ㄴ
+                case 0x1103: return Optional.of("t"); // ㄷ
+                case 0x1105: return Optional.of("nn"); // ㄹ
+                case 0x1106: return Optional.of("nm"); // ㅁ
+                case 0x1107: return Optional.of("p"); // ㅂ
+                case 0x1109: return Optional.of("hs"); // ㅅ
+                case 0x110C: return Optional.of("ch"); // ㅈ
+                case 0x1112: return Optional.of("t"); // ㅎ
+                default: return Optional.empty();
+            }
+        }
+
+        // Otherwise, special provisions are denser when grouped by the second jamo.
+        switch (nextInitial) {
+            case 0x1100: // ㄱ
+                switch (previousEnding) {
+                    case 0x11AB: return Optional.of("n-g"); // ㄴ
+                    default: return Optional.empty();
+                }
+            case 0x1102: // ㄴ
+                switch (previousEnding) {
+                    case 0x11A8: return Optional.of("ngn"); // ㄱ
+                    case 0x11AE: // ㄷ
+                    case 0x11BA: // ㅅ
+                    case 0x11BD: // ㅈ
+                    case 0x11BE: // ㅊ
+                    case 0x11C0: // ㅌ
+                        return Optional.of("nn");
+                    case 0x11AF: return Optional.of("ll"); // ㄹ
+                    case 0x11B8: return Optional.of("mn"); // ㅂ
+                    default: return Optional.empty();
+                }
+            case 0x1105: // ㄹ
+                switch (previousEnding) {
+                    case 0x11A8: // ㄱ
+                    case 0x11AB: // ㄴ
+                    case 0x11AF: // ㄹ
+                        return Optional.of("ll");
+                    case 0x11AE: // ㄷ
+                    case 0x11BA: // ㅅ
+                    case 0x11BD: // ㅈ
+                    case 0x11BE: // ㅊ
+                    case 0x11C0: // ㅌ
+                        return Optional.of("nn");
+                    case 0x11B7: // ㅁ
+                    case 0x11B8: // ㅂ
+                        return Optional.of("mn");
+                    case 0x11BC: return Optional.of("ngn"); // ㅇ
+                    default: return Optional.empty();
+                }
+            case 0x1106: // ㅁ
+                switch (previousEnding) {
+                    case 0x11A8: return Optional.of("ngm"); // ㄱ
+                    case 0x11AE: // ㄷ
+                    case 0x11BA: // ㅅ
+                    case 0x11BD: // ㅈ
+                    case 0x11BE: // ㅊ
+                    case 0x11C0: // ㅌ
+                        return Optional.of("nm");
+                    case 0x11B8: return Optional.of("mm"); // ㅂ
+                    default: return Optional.empty();
+                }
+            case 0x110B: // ㅇ
+                switch (previousEnding) {
+                    case 0x11A8: return Optional.of("g"); // ㄱ
+                    case 0x11AE: return Optional.of("d"); // ㄷ
+                    case 0x11AF: return Optional.of("r"); // ㄹ
+                    case 0x11B8: return Optional.of("b"); // ㅂ
+                    case 0x11BA: return Optional.of("s"); // ㅅ
+                    case 0x11BC: return Optional.of("ng-"); // ㅇ
+                    case 0x11BD: return Optional.of("j"); // ㅈ
+                    case 0x11BE: return Optional.of("ch"); // ㅊ
+                    default: return Optional.empty();
+                }
+            case 0x110F: // ㅋ
+                switch (previousEnding) {
+                    case 0x11A8: return Optional.of("k-k"); // ㄱ
+                    default: return Optional.empty();
+                }
+            case 0x1110: // ㅌ
+                switch (previousEnding) {
+                    case 0x11AE: // ㄷ
+                    case 0x11BA: // ㅅ
+                    case 0x11BD: // ㅈ
+                    case 0x11BE: // ㅊ
+                    case 0x11C0: // ㅌ
+                        return Optional.of("t-t");
+                    default: return Optional.empty();
+                }
+            case 0x1111: // ㅍ
+                switch (previousEnding) {
+                    case 0x11B8: return Optional.of("p-p"); // ㅂ
+                    default: return Optional.empty();
+                }
+            default: return Optional.empty();
+        }
+    }
+
+    // Decompose a syllable into several jamo. Returns its input if that isn't possible.
+    public static char[] decompose(char syllable) {
+        String normalized = Normalizer.normalize(String.valueOf(syllable), Normalizer.Form.NFD);
+        return normalized.toCharArray();
+    }
+
+    // Transliterate any Hangul in the given string. Leaves any non-Hangul characters unmodified.
+    public static String transliterate(String txt) {
+        if (txt == null || txt.isEmpty()) {
+            return txt;
+        }
+
+        // Most of the bulk of these loops is for handling special provisions - situations where the
+        // last jamo of one syllable and the first of the next need to be romanized as a pair in an
+        // irregular way.
+        StringBuilder builder = new StringBuilder();
+        boolean nextInitialJamoConsumed = false;
+        char[] syllables = txt.toCharArray();
+        for (int i = 0; i < syllables.length; i++) {
+            char thisSyllable = syllables[i];
+            // If this isn't in any of the Hangul blocks we know about, emit it as-is.
+            if (!inRange(thisSyllable, JAMO_BLOCK_START, JAMO_BLOCK_END)
+                    && !inRange(thisSyllable, SYLLABLES_BLOCK_START, SYLLABLES_BLOCK_END)
+                    && !inRange(thisSyllable, COMPAT_JAMO_BLOCK_START, COMPAT_JAMO_BLOCK_END)) {
+                builder.append(thisSyllable);
+                continue;
+            }
+
+            char[] theseJamo = decompose(thisSyllable);
+            for (int j = 0; j < theseJamo.length; j++) {
+                char thisJamo = theseJamo[j];
+
+                // If we already transliterated the first jamo of this syllable as part of a special
+                // provision, skip it. Otherwise, handle it in the unconditional else branch.
+                if (j == 0 && nextInitialJamoConsumed) {
+                    nextInitialJamoConsumed = false;
+                    continue;
+                }
+
+                // If this is the last jamo of this syllable and not the last syllable of the
+                // string, check for special provisions. If the next char is whitespace or not
+                // Hangul, it's the responsibility of transliterateSpecialProvisions() to return no
+                // value.
+                if (j == theseJamo.length - 1 && i < syllables.length - 1) {
+                    char nextSyllable = syllables[i + 1];
+                    char nextJamo = decompose(nextSyllable)[0];
+                    Optional<String> specialProvision = transliterateSpecialProvisions(thisJamo, nextJamo);
+                    if (specialProvision.isPresent()) {
+                        builder.append(specialProvision.get());
+                        nextInitialJamoConsumed = true;
+                    } else {
+                        // No special provision applies. Transliterate in isolation.
+                        builder.append(transliterateSingleJamo(thisJamo));
+                    }
+                    continue;
+                }
+
+                // Everything else is transliterated in isolation.
+                builder.append(transliterateSingleJamo(thisJamo));
+            }
+        }
+
+        return builder.toString();
+    }
+}
diff --git a/app/src/main/java/nodomain/freeyourgadget/gadgetbridge/util/LanguageUtils.java b/app/src/main/java/nodomain/freeyourgadget/gadgetbridge/util/LanguageUtils.java
index f7790fe93..22a2b4d61 100644
--- a/app/src/main/java/nodomain/freeyourgadget/gadgetbridge/util/LanguageUtils.java
+++ b/app/src/main/java/nodomain/freeyourgadget/gadgetbridge/util/LanguageUtils.java
@@ -116,18 +116,21 @@ public class LanguageUtils {
             return txt;
         }
 
-        StringBuilder message = new StringBuilder();
+        StringBuilder messageBuilder = new StringBuilder();
 
+        // Simple, char-by-char transliteration.
         char[] chars = txt.toCharArray();
-
         for (char c : chars)
         {
-            message.append(transliterate(c));
+            messageBuilder.append(transliterate(c));
         }
+        String message = messageBuilder.toString();
 
-        String messageString = BengaliLanguageUtils.transliterate(message.toString());
+        // More complex transliteration for specific languages.
+        message = BengaliLanguageUtils.transliterate(message);
+        message = KoreanLanguageUtils.transliterate(message);
 
-        return flattenToAscii(messageString);
+        return flattenToAscii(message);
     }
 
     /**
diff --git a/app/src/test/java/nodomain/freeyourgadget/gadgetbridge/test/LanguageUtilsTest.java b/app/src/test/java/nodomain/freeyourgadget/gadgetbridge/test/LanguageUtilsTest.java
index 8d0db22d0..5f7e1e7f7 100644
--- a/app/src/test/java/nodomain/freeyourgadget/gadgetbridge/test/LanguageUtilsTest.java
+++ b/app/src/test/java/nodomain/freeyourgadget/gadgetbridge/test/LanguageUtilsTest.java
@@ -6,6 +6,7 @@ import org.junit.Test;
 
 import nodomain.freeyourgadget.gadgetbridge.GBApplication;
 import nodomain.freeyourgadget.gadgetbridge.util.LanguageUtils;
+import nodomain.freeyourgadget.gadgetbridge.util.KoreanLanguageUtils;
 
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
@@ -82,6 +83,43 @@ public class LanguageUtilsTest extends TestBase {
         }
     }
 
+    @Test
+    public void testStringTransliterateKorean() {
+        // A familiar phrase with no special provisions.
+        String hello = "안녕하세요";
+        String helloExpected = "annyeonghaseyo";
+        String helloActual = LanguageUtils.transliterate(hello);
+        assertEquals("Korean hello transliteration failed", helloExpected, helloActual);
+
+        // Korean pangram. Includes some ASCII punctuation which should not be changed by
+        // transliteration.
+        //
+        // Translation: "Chocolate!? What I wanted was some rice puffs and clothes." "Child, why are
+        // you complaining again?"
+        String pangram = "\"웬 초콜릿? 제가 원했던 건 뻥튀기 쬐끔과 의류예요.\" \"얘야, 왜 또 불평?\"";
+        String pangramExpected = "\"wen chokollit? jega wonhaetdeon geon ppeongtwigi jjoekkeumgwa uiryuyeyo.\" \"yaeya, wae tto bulpyeong?\"";
+        String pangramActual = LanguageUtils.transliterate(pangram);
+        assertEquals("Korean pangram transliteration failed", pangramExpected, pangramActual);
+
+        // Several words exercising special provisions, from Wikipedia.
+        String special = "좋고, 놓다, 잡혀, 낳지";
+        String specialExpected = "joko, nota, japhyeo, nachi";
+        String specialActual = LanguageUtils.transliterate(special);
+        assertEquals("Korean special provisions transliteration failed", specialExpected, specialActual);
+
+        // Isolated jamo.
+        String isolatedJamo = "ㅋㅋㅋ";
+        String isolatedJamoExpected = "kkk";
+        String isolatedJamoActual = LanguageUtils.transliterate(isolatedJamo);
+        assertEquals("Korean isolated jamo transliteration failed", isolatedJamoExpected, isolatedJamoActual);
+
+        // Korean transliteration shouldn't touch non-Hangul composites.
+        String german = "schön";
+        String germanExpected = german;
+        String germanActual = KoreanLanguageUtils.transliterate(german);
+        assertEquals("Korean transliteration modified a non-Hangul composite", germanExpected, germanActual);
+    }
+
     @Test
     public void testStringTransliterateLithuanian() {
         String input = "ą č ę ė į š ų ū ž";