Add Armenian transliteration

This commit is contained in:
Alik Aslanyan 2021-04-23 12:11:04 +04:00 committed by Alik Aslanyan
parent 9ae02f65ed
commit f0a9ab7f98
6 changed files with 403 additions and 0 deletions

View File

@ -276,6 +276,9 @@ dependencies {
// Fix Duplicate class build error
implementation(platform("org.jetbrains.kotlin:kotlin-bom:1.8.0"))
// Needed for Armenian transliteration
implementation group: 'org.ahocorasick', name: 'ahocorasick', version: '0.6.3'
}
preBuild.dependsOn(":GBDaoGenerator:genSources")

View File

@ -35,6 +35,7 @@ import nodomain.freeyourgadget.gadgetbridge.devices.DeviceCoordinator;
import nodomain.freeyourgadget.gadgetbridge.impl.GBDevice;
import nodomain.freeyourgadget.gadgetbridge.util.Prefs;
import nodomain.freeyourgadget.gadgetbridge.util.language.impl.ArabicTransliterator;
import nodomain.freeyourgadget.gadgetbridge.util.language.impl.ArmenianTransliterator;
import nodomain.freeyourgadget.gadgetbridge.util.language.impl.BengaliTransliterator;
import nodomain.freeyourgadget.gadgetbridge.util.language.impl.CommonSymbolsTransliterator;
import nodomain.freeyourgadget.gadgetbridge.util.language.impl.CroatianTransliterator;
@ -86,6 +87,7 @@ public class LanguageUtils {
put("scandinavian", new ScandinavianTransliterator());
put("turkish", new TurkishTransliterator());
put("ukranian", new UkranianTransliterator());
put("armenian", new ArmenianTransliterator());
}};
/**

View File

@ -0,0 +1,245 @@
/* Copyright (C) 2021-2024 Alik Aslanyan
This file is part of Gadgetbridge.
Gadgetbridge is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Gadgetbridge is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>. */
package nodomain.freeyourgadget.gadgetbridge.util.language.impl;
import nodomain.freeyourgadget.gadgetbridge.util.language.Transliterator;
import org.ahocorasick.trie.Emit;
import org.ahocorasick.trie.Trie;
import org.apache.commons.lang3.text.WordUtils;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
/**
 * Transliterates Armenian text into Latin characters.
 *
 * <p>The input is scanned with an Aho-Corasick trie built from the keys of
 * {@link #transliterateMap}. When several keywords start at the same position, the one
 * that was inserted into the map first wins; text that matches no keyword (digits,
 * Latin letters, punctuation, ...) is copied through unchanged.</p>
 *
 * <p>Only lower-case keywords and keywords whose first letter is capitalized are
 * known to the table.</p>
 */
public class ArmenianTransliterator implements Transliterator {
    // Transliteration map ordered by priority (insertion order).
    // Armenian has special rules for reading 'ո':
    //  - in the middle of a word (after a consonant) it reads as the English 'o',
    //  - at the start of a word it reads as 'vo',
    //  - followed by 'ւ' the pair reads as 'u' (like the double o in "booze").
    private static final Map<String, String> transliterateMap = buildTransliterateMap();

    // Keyword -> rank; a lower rank means a higher priority when matches start at the same index.
    private static final Map<String, Integer> transliterationPriorityMap =
            buildPriorityMap(transliterateMap);

    // Aho-Corasick trie containing every keyword of the transliteration map.
    private static final Trie transliterationTrie = buildTrie(transliterateMap);

    // Orders matches by start position first, then by keyword priority.
    private static final Comparator<Emit> EMIT_ORDER = new Comparator<Emit>() {
        @Override
        public int compare(Emit a, Emit b) {
            final int byStart = Integer.compare(a.getStart(), b.getStart());
            if (byStart != 0) {
                return byStart;
            }
            final int priorityA = transliterationPriorityMap.get(a.getKeyword());
            final int priorityB = transliterationPriorityMap.get(b.getKeyword());
            return Integer.compare(priorityA, priorityB);
        }
    };

    /**
     * Builds the keyword table. The insertion order is significant: it defines the
     * priority used to choose between keywords that start at the same position.
     */
    private static Map<String, String> buildTransliterateMap() {
        final Map<String, String> map = new LinkedHashMap<String, String>();

        // Letter + 'ու'
        map.put("աու", "au");
        map.put("բու", "bu");
        map.put("գու", "gu");
        map.put("դու", "du");
        map.put("եու", "eu");
        map.put("զու", "zu");
        map.put("էու", "eu");
        map.put("ըու", "yu");
        map.put("թու", "tu");
        map.put("ժու", "ju");
        map.put("իու", "iu");
        map.put("լու", "lu");
        map.put("խու", "xu");
        map.put("ծու", "cu");
        map.put("կու", "ku");
        map.put("հու", "hu");
        map.put("ձու", "dzu");
        map.put("ղու", "xu");
        map.put("ճու", "cu");
        map.put("մու", "mu");
        map.put("յու", "yu");
        map.put("նու", "nu");
        map.put("շու", "shu");
        map.put("չու", "chu");
        map.put("պու", "pu");
        map.put("ջու", "ju");
        map.put("ռու", "ru");
        map.put("սու", "su");
        map.put("վու", "vu");
        map.put("տու", "tu");
        map.put("րու", "ru");
        map.put("ցու", "cu");
        map.put("փու", "pu");
        map.put("քու", "qu");
        map.put("օու", "ou");
        // 'և' is "ev" on its own, so the 'v' must be kept in front of 'ու' as well.
        map.put("ևու", "evu");
        map.put("ֆու", "fu");
        map.put("ոու", "vou");
        map.put("ու", "u");

        // Letter + 'ո'
        map.put("բո", "bo");
        map.put("գո", "go");
        map.put("դո", "do");
        map.put("զո", "zo");
        map.put("թո", "to");
        map.put("ժո", "jo");
        map.put("լո", "lo");
        map.put("խո", "xo");
        map.put("ծո", "co");
        map.put("կո", "ko");
        map.put("հո", "ho");
        map.put("ձո", "dzo");
        map.put("ղո", "xo");
        map.put("ճո", "co");
        map.put("մո", "mo");
        map.put("յո", "yo");
        map.put("նո", "no");
        // Kept consistent with "շ" -> "sh" and "շու" -> "shu".
        map.put("շո", "sho");
        // Kept consistent with "չ" -> "ch" and "չու" -> "chu".
        map.put("չո", "cho");
        map.put("պո", "po");
        map.put("ջո", "jo");
        map.put("ռո", "ro");
        map.put("սո", "so");
        map.put("վո", "vo");
        map.put("տո", "to");
        map.put("րո", "ro");
        map.put("ցո", "co");
        map.put("փո", "po");
        map.put("քո", "qo");
        // 'և' is "ev" on its own, so the 'v' must be kept in front of 'ո' as well.
        map.put("ևո", "evo");
        map.put("ֆո", "fo");
        map.put("ո", "vo");

        // Two different ways to write, we support all.
        map.put("եւ", "ev");
        map.put("եվ", "ev");

        // Simple substitutions
        map.put("ա", "a");
        map.put("բ", "b");
        map.put("գ", "g");
        map.put("դ", "d");
        map.put("ե", "e");
        map.put("զ", "z");
        map.put("է", "e");
        map.put("ը", "y");
        map.put("թ", "t");
        map.put("ժ", "j");
        map.put("ի", "i");
        map.put("լ", "l");
        map.put("խ", "x");
        map.put("ծ", "c");
        map.put("կ", "k");
        map.put("հ", "h");
        map.put("ձ", "dz");
        map.put("ղ", "x");
        map.put("ճ", "c");
        map.put("մ", "m");
        map.put("յ", "y");
        map.put("ն", "n");
        map.put("շ", "sh");
        map.put("չ", "ch");
        map.put("պ", "p");
        map.put("ջ", "j");
        map.put("ռ", "r");
        map.put("ս", "s");
        map.put("վ", "v");
        map.put("տ", "t");
        map.put("ր", "r");
        map.put("ց", "c");
        map.put("փ", "p");
        map.put("ք", "q");
        map.put("օ", "o");
        map.put("և", "ev");
        map.put("ֆ", "f");

        // If this symbol wasn't used in combination with others, then it's meaningless
        map.put("ւ", "");

        // Add support for capitalized words: for every keyword whose capitalized form
        // differs, register that form with a capitalized replacement. A snapshot is
        // iterated so that the map can be extended while looping.
        final Map<String, String> lowerCaseEntries = new LinkedHashMap<String, String>(map);
        for (final Map.Entry<String, String> entry : lowerCaseEntries.entrySet()) {
            final String capitalKey = WordUtils.capitalize(entry.getKey());
            if (!capitalKey.equals(entry.getKey())) {
                map.put(capitalKey, WordUtils.capitalize(entry.getValue()));
            }
        }
        return map;
    }

    /**
     * Assigns each keyword its position in the iteration order of {@code orderedMap}.
     */
    private static Map<String, Integer> buildPriorityMap(final Map<String, String> orderedMap) {
        final Map<String, Integer> priorities = new HashMap<String, Integer>();
        int priority = 0;
        for (final String key : orderedMap.keySet()) {
            priorities.put(key, priority++);
        }
        return priorities;
    }

    /**
     * Builds the Aho-Corasick trie from all keywords of {@code keywordMap}.
     */
    private static Trie buildTrie(final Map<String, String> keywordMap) {
        final Trie.TrieBuilder builder = Trie.builder();
        for (final String key : keywordMap.keySet()) {
            builder.addKeyword(key);
        }
        return builder.build();
    }

    /**
     * Replaces every keyword occurrence in {@code text} by its transliteration.
     *
     * @param text non-null input text
     * @return the text with all matched keywords substituted
     */
    private static String ahoCorasick(final String text) {
        // Digraphs shrink and letters such as 'շ' -> "sh" grow, so the input length
        // is a reasonable initial capacity.
        final StringBuilder sb = new StringBuilder(text.length());

        // The complexity of the Aho-Corasick algorithm is O(N + L + Z), where N is the
        // length of the text, L the total length of the keywords and Z the number of matches.
        final List<Emit> emits = new ArrayList<Emit>(transliterationTrie.parseText(text));

        // Sort matches first by starting position, then by priority.
        Collections.sort(emits, EMIT_ORDER);

        int prevIndex = 0;
        for (final Emit emit : emits) {
            final int matchIndex = emit.getStart();

            // Skip matches that begin inside a part we already substituted
            if (matchIndex < prevIndex) {
                continue;
            }

            // Copy the part which shouldn't be substituted
            sb.append(text, prevIndex, matchIndex);

            // Substitute and append to the builder
            sb.append(transliterateMap.get(emit.getKeyword()));

            // getEnd() is the index of the last matched character
            prevIndex = emit.getEnd() + 1;
        }

        // Add the remainder of the string (contains no more matches).
        sb.append(text, prevIndex, text.length());
        return sb.toString();
    }

    /**
     * Transliterates the given text.
     *
     * @param txt text to transliterate, may be {@code null} or empty
     * @return the transliterated text; {@code null} or empty input is returned unchanged
     */
    @Override
    public String transliterate(String txt) {
        if (txt == null || txt.isEmpty()) {
            return txt;
        }
        return ahoCorasick(txt);
    }
}

View File

@ -3495,6 +3495,7 @@
<item>@string/turkish</item>
<item>@string/ukranian</item>
<item>@string/hungarian</item>
<item>@string/armenian</item>
</string-array>
<string-array name="pref_transliteration_languages_values">
@ -3521,6 +3522,7 @@
<item>turkish</item>
<item>ukranian</item>
<item>hungarian</item>
<item>armenian</item>
</string-array>
<string-array name="pref_transliteration_languages_default">

View File

@ -1061,6 +1061,7 @@
<string name="persian">Persian</string>
<string name="scandinavian">Scandinavian</string>
<string name="ukranian">Ukranian</string>
<string name="armenian">Armenian</string>
<string name="italian">Italian</string>
<string name="french">French</string>
<string name="french_ca">French (Canada)</string>

View File

@ -0,0 +1,150 @@
package nodomain.freeyourgadget.gadgetbridge.test;
import junit.framework.TestCase;
import org.apache.commons.lang3.text.WordUtils;
import org.junit.Test;
import org.junit.Assert;
import java.util.LinkedHashMap;
import java.util.Map;
import nodomain.freeyourgadget.gadgetbridge.util.language.impl.ArmenianTransliterator;
/**
 * Unit tests for {@link ArmenianTransliterator}: single words, whole sentences,
 * text mixed with Latin letters and digits, and a list of frequent words in both
 * lower-case and capitalized form.
 */
public class ArmenianTransliteratorTest extends TestCase {
    private final ArmenianTransliterator transliterator = new ArmenianTransliterator();

    /** Single words covering word-initial 'ո' ("vo") and the 'ու' digraph ("u"). */
    @Test
    public void testSimpleCases() {
        Assert.assertEquals("aybuben", transliterator.transliterate("այբուբեն"));
        Assert.assertEquals("vorotan", transliterator.transliterate("որոտան"));
        Assert.assertEquals("voroshel", transliterator.transliterate("որոշել"));
        Assert.assertEquals("uzox", transliterator.transliterate("ուզող"));
    }

    /** A whole sentence, including one capitalized word. */
    @Test
    public void testMultipleWords() {
        Assert.assertEquals("vory karucum en Viqipedia kayqic ogtvoxnery azat xmbagrman dzevachapov",
                transliterator.transliterate("որը կառուցում են Վիքիպեդիա կայքից օգտվողները ազատ խմբագրման ձևաչափով"));
    }

    /** Latin letters and digits must pass through untouched. */
    @Test
    public void testMixedStrings() {
        Assert.assertEquals("vor1voshel 12 uzox", transliterator.transliterate("որ1ոշել 12 ուզող"));
        Assert.assertEquals("vory jet iridescent karucum en sheen Viqipedia kayqic ogtvoxnery and a distinctive azat xmbagrman dzevachapov",
                transliterator.transliterate("որը jet iridescent կառուցում են sheen Վիքիպեդիա կայքից օգտվողները and a distinctive ազատ խմբագրման ձևաչափով"));
    }

    /** Frequent words, checked as written and with a capitalized first letter. */
    @Test
    public void testTop100Words() {
        final Map<String, String> topWords = new LinkedHashMap<String, String>();
        topWords.put("ինչպես", "inchpes");
        topWords.put("ես", "es");
        topWords.put("նրա", "nra");
        topWords.put("որ", "vor");
        topWords.put("նա", "na");
        topWords.put("էր", "er");
        topWords.put("համար", "hamar");
        topWords.put("ին", "in");
        topWords.put("հետ", "het");
        topWords.put("նրանք", "nranq");
        topWords.put("լինել", "linel");
        topWords.put("մեկ", "mek");
        topWords.put("ունենալ", "unenal");
        topWords.put("այս", "ays");
        topWords.put("ից", "ic");
        topWords.put("ի", "i");
        topWords.put("տաք", "taq");
        topWords.put("բառ", "bar");
        topWords.put("բայց", "bayc");
        topWords.put("ինչ", "inch");
        topWords.put("մի", "mi");
        topWords.put("քանի", "qani");
        topWords.put("է", "e");
        topWords.put("այն", "ayn");
        topWords.put("դուք", "duq");
        topWords.put("կամ", "kam");
        topWords.put("եւ", "ev");
        topWords.put("մինչեւ", "minchev");
        topWords.put("իսկ", "isk");
        topWords.put("ա", "a");
        topWords.put("մենք", "menq");
        topWords.put("կարող", "karox");
        topWords.put("այլ", "ayl");
        topWords.put("են", "en");
        topWords.put("որը", "vory");
        topWords.put("անել", "anel");
        topWords.put("իրենց", "irenc");
        topWords.put("ժամանակ", "jamanak");
        topWords.put("եթե", "ete");
        topWords.put("կամք", "kamq");
        topWords.put("յուրաքանչյուր", "yuraqanchyur");
        topWords.put("ասել", "asel");
        topWords.put("շարք", "sharq");
        topWords.put("երեք", "ereq");
        topWords.put("ուզում", "uzum");
        topWords.put("օդի", "odi");
        topWords.put("լավ", "lav");
        topWords.put("նույնպես", "nuynpes");
        topWords.put("խաղալ", "xaxal");
        topWords.put("փոքր", "poqr");
        topWords.put("վերջ", "verj");
        topWords.put("կարդալ", "kardal");
        topWords.put("ձեռք", "dzerq");
        topWords.put("նավահանգիստ", "navahangist");
        topWords.put("տառ", "tar");
        topWords.put("առ", "ar");
        topWords.put("ավելացնել", "avelacnel");
        topWords.put("նույնիսկ", "nuynisk");
        topWords.put("այստեղ", "aystex");
        topWords.put("պետք", "petq");
        topWords.put("մեծ", "mec");
        topWords.put("բարձր", "bardzr");
        topWords.put("այդպիսի", "aydpisi");
        topWords.put("հետեւել", "hetevel");
        topWords.put("գործ", "gorc");
        topWords.put("ինչու", "inchu");
        topWords.put("խնդրել", "xndrel");
        topWords.put("տղամարդիկ", "txamardik");
        topWords.put("փոփոխություն", "popoxutyun");
        topWords.put("գնաց", "gnac");
        topWords.put("լույս", "luys");
        topWords.put("բարի", "bari");
        topWords.put("դուրս", "durs");
        topWords.put("անհրաժեշտ", "anhrajesht");
        topWords.put("տուն", "tun");
        topWords.put("նկար", "nkar");
        topWords.put("փորձել", "pordzel");
        topWords.put("մեզ", "mez");
        topWords.put("կրկին", "krkin");
        topWords.put("կենդանի", "kendani");
        topWords.put("կետ", "ket");
        topWords.put("մայր", "mayr");
        topWords.put("աշխարհ", "ashxarh");
        topWords.put("մոտ", "mot");
        topWords.put("կառուցել", "karucel");
        topWords.put("ինքնուրույն", "inqnuruyn");
        topWords.put("երկիր", "erkir");
        topWords.put("հայր", "hayr");
        topWords.put("ցանկացած", "cankacac");
        topWords.put("նոր", "nor");
        topWords.put("աշխատանք", "ashxatanq");
        topWords.put("մաս", "mas");
        topWords.put("վերցնել", "vercnel");
        topWords.put("ստանալ", "stanal");
        topWords.put("տեղ", "tex");
        topWords.put("ապրել", "aprel");
        topWords.put("որտեղ", "vortex");
        topWords.put("երբ", "erb");
        topWords.put("Վերադառնալ", "Veradarnal");
        topWords.put("միայն", "miayn");

        // Words exactly as listed.
        for (final Map.Entry<String, String> entry : topWords.entrySet()) {
            Assert.assertEquals(entry.getValue(), transliterator.transliterate(entry.getKey()));
        }

        // Capitalize the Armenian INPUT so that the capitalized keyword entries are
        // exercised; capitalizing only the output would merely repeat the loop above.
        for (final Map.Entry<String, String> entry : topWords.entrySet()) {
            Assert.assertEquals(WordUtils.capitalize(entry.getValue()),
                    transliterator.transliterate(WordUtils.capitalize(entry.getKey())));
        }
    }
}