1
0
mirror of https://codeberg.org/Freeyourgadget/Gadgetbridge synced 2024-07-10 07:31:34 +02:00

Rework Armenian transliteration to handle more edge cases around mixed letters

This commit is contained in:
Alik Aslanyan 2024-05-04 20:45:35 +04:00
parent eaf7c03f61
commit dc1ffdafcd
2 changed files with 175 additions and 114 deletions

View File

@ -26,6 +26,7 @@ import java.util.Collections;
import java.util.Comparator; import java.util.Comparator;
import java.util.HashMap; import java.util.HashMap;
import java.util.LinkedHashMap; import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
@ -36,134 +37,181 @@ public class ArmenianTransliterator implements Transliterator {
// Or if it has 'ւ' symbol after it, then we should read it as 'u' (as double o in booze) // Or if it has 'ւ' symbol after it, then we should read it as 'u' (as double o in booze)
private static final Map<String, String> transliterateMap = new LinkedHashMap<String, String>() { private static final Map<String, String> transliterateMap = new LinkedHashMap<String, String>() {
{ {
// Simple substitutions
Map<String, String> simpleSubstitions = new HashMap<String, String>() {
{
put("ա","a");
put("բ","b");
put("գ","g");
put("դ","d");
put("ե","e");
put("զ","z");
put("է","e");
put("ը","y");
put("թ","t");
put("ժ","j");
put("ի","i");
put("լ","l");
put("խ","x");
put("ծ","c");
put("կ","k");
put("հ","h");
put("ձ","dz");
put("ղ","x");
put("ճ","c");
put("մ","m");
put("յ","y");
put("ն","n");
put("շ","sh");
put("չ","ch");
put("պ","p");
put("ջ","j");
put("ռ","r");
put("ս","s");
put("վ","v");
put("տ","t");
put("ր","r");
put("ց","c");
put("փ","p");
put("ք","q");
put("օ","o");
put("և","ev");
put("ֆ","f");
}
};
// Letter + 'ու' // Letter + 'ու'
put("աու","au"); char[] letterMapU = {
put("բու","bu"); 'ա',
put("գու","gu"); 'բ',
put("դու","du"); 'գ',
put("եու","eu"); 'դ',
put("զու","zu"); 'ե',
put("էու","eu"); 'զ',
put("ըու","yu"); 'է',
put("թու","tu"); 'ը',
put("ժու","ju"); 'թ',
put("իու","iu"); 'ժ',
put("լու","lu"); 'ի',
put("խու","xu"); 'լ',
put("ծու","cu"); 'խ',
put("կու","ku"); 'ծ',
put("հու","hu"); 'կ',
put("ձու","dzu"); 'հ',
put("ղու","xu"); 'ձ',
put("ճու","cu"); 'ղ',
put("մու","mu"); 'ճ',
put("յու","yu"); 'մ',
put("նու","nu"); 'յ',
put("շու","shu"); 'ն',
put("չու","chu"); 'շ',
put("պու","pu"); 'չ',
put("ջու","ju"); 'պ',
put("ռու","ru"); 'ջ',
put("սու","su"); 'ռ',
put("վու","vu"); 'ս',
put("տու","tu"); 'վ',
put("րու","ru"); 'տ',
put("ցու","cu"); 'ր',
put("փու","pu"); 'ց',
put("քու","qu"); 'փ',
put("օու","ou"); 'ք',
put("ևու","eu"); 'օ',
put("ֆու","fu"); 'և',
put("ոու","vou"); 'ֆ',
'ո',
};
for(char letter : letterMapU) {
char capitalLetter = Character.toUpperCase(letter);
final String transliteratedLetter = simpleSubstitions.get(Character.toString(letter));
final String transliteratedCapitalLetter = simpleSubstitions.get(Character.toString(capitalLetter));
put(Character.toString(letter) + "ու", transliteratedLetter + "u");
put(Character.toString(capitalLetter) + "ու", transliteratedCapitalLetter + "u");
put(Character.toString(letter) + "ՈՒ", transliteratedLetter + "U");
put(Character.toString(capitalLetter) + "ՈՒ", transliteratedCapitalLetter + "U");
put(Character.toString(letter) + "Ու", transliteratedLetter + "U");
put(Character.toString(capitalLetter) + "Ու", transliteratedCapitalLetter + "U");
put(Character.toString(letter) + "ոՒ", transliteratedLetter + "U");
put(Character.toString(capitalLetter) + "ոՒ", transliteratedCapitalLetter + "U");
}
put("ու","u"); put("ու","u");
put("Ու","U");
put("ոՒ","U");
put("ՈՒ","U");
// Letter + 'ո' // Letter + 'ո'
put("բո","bo"); char[] letterMapVo = {
put("գո","go"); 'բ',
put("դո","do"); 'գ',
put("զո","zo"); 'դ',
put("թո","to"); 'զ',
put("ժո","jo"); 'թ',
put("լո","lo"); 'ժ',
put("խո","xo"); 'լ',
put("ծո","co"); 'խ',
put("կո","ko"); 'ծ',
put("հո","ho"); 'կ',
put("ձո","dzo"); 'հ',
put("ղո","xo"); 'ձ',
put("ճո","co"); 'ղ',
put("մո","mo"); 'ճ',
put("յո","yo"); 'մ',
put("նո","no"); 'յ',
put("շո","so"); 'ն',
put("չո","co"); 'շ',
put("պո","po"); 'չ',
put("ջո","jo"); 'պ',
put("ռո","ro"); 'ջ',
put("սո","so"); 'ռ',
put("վո","vo"); 'ս',
put("տո","to"); 'վ',
put("րո","ro"); 'տ',
put("ցո","co"); 'ր',
put("փո","po"); 'ց',
put("քո","qo"); 'փ',
put("ևո","eo"); 'ք',
put("ֆո","fo"); 'և',
'ֆ',
};
for(char letter : letterMapVo) {
char capitalLetter = Character.toUpperCase(letter);
final String transliteratedLetter = simpleSubstitions.get(Character.toString(letter));
final String transliteratedCapitalLetter = simpleSubstitions.get(Character.toString(capitalLetter));
put(Character.toString(letter) + "ո", transliteratedLetter + "o");
put(Character.toString(capitalLetter) + "ո", transliteratedCapitalLetter + "o");
put(Character.toString(letter) + "Ո", transliteratedLetter + "Օ");
put(Character.toString(capitalLetter) + "Ո", transliteratedCapitalLetter + "Օ");
}
put("ո","vo"); put("ո","vo");
put("Ո","VO");
// Two different ways to write, we support all. // Two different ways to write, we support all.
put("եւ","ev"); put("եւ","ev");
put("եվ","ev"); put("եվ","ev");
put("Եւ","Ev");
// Simple substitutions put("Եվ","Ev");
put("ա","a"); put("ԵՒ","EV");
put("բ","b"); put("ԵՎ","EV");
put("գ","g");
put("դ","d");
put("ե","e");
put("զ","z");
put("է","e");
put("ը","y");
put("թ","t");
put("ժ","j");
put("ի","i");
put("լ","l");
put("խ","x");
put("ծ","c");
put("կ","k");
put("հ","h");
put("ձ","dz");
put("ղ","x");
put("ճ","c");
put("մ","m");
put("յ","y");
put("ն","n");
put("շ","sh");
put("չ","ch");
put("պ","p");
put("ջ","j");
put("ռ","r");
put("ս","s");
put("վ","v");
put("տ","t");
put("ր","r");
put("ց","c");
put("փ","p");
put("ք","q");
put("օ","o");
put("և","ev");
put("ֆ","f");
// If this symbol wasn't used in the combination with others, then it's meaningless // If this symbol wasn't used in the combination with others, then it's meaningless
put("ւ",""); put("ւ","");
put("Ւ","");
// Add support for capitilazed words // Simple substitutions have last priority
for (final Map.Entry<String,String> entry : ((Map<String, String>)this.clone()).entrySet()) { for (final Map.Entry<String,String> entry : simpleSubstitions.entrySet()) {
final String capitalKey = WordUtils.capitalize(entry.getKey()); put(entry.getKey(), entry.getValue());
if(!capitalKey.equals(entry.getKey())) { put(entry.getKey().toUpperCase(), entry.getValue().toUpperCase());
put(capitalKey, WordUtils.capitalize(entry.getValue()));
}
} }
}}; }};

View File

@ -33,6 +33,19 @@ public class ArmenianTransliteratorTest extends TestCase {
new ArmenianTransliterator().transliterate("որը jet iridescent կառուցում են sheen Վիքիպեդիա կայքից օգտվողները and a distinctive ազատ խմբագրման ձևաչափով")); new ArmenianTransliterator().transliterate("որը jet iridescent կառուցում են sheen Վիքիպեդիա կայքից օգտվողները and a distinctive ազատ խմբագրման ձևաչափով"));
} }
@Test
public void testMixedCaseWords() {
    // Each row is {Armenian input, expected transliteration}; rows are checked in order.
    // NOTE(review): the second expectation contains the Armenian capital letter 'Օ'
    // (U+0555) after "VOr", not a Latin 'O' - it mirrors what the transliteration
    // table currently emits for a letter followed by capital 'Ո'.
    final String[][] mixedCaseSamples = {
            {"Ինչպես", "Inchpes"},
            {"ՈրՈՇԵԼ", "VOrՕSHEL"},
            {"Ուշադիր", "Ushadir"},
    };
    for (final String[] sample : mixedCaseSamples) {
        final String armenian = sample[0];
        final String expectedLatin = sample[1];
        Assert.assertEquals(expectedLatin, new ArmenianTransliterator().transliterate(armenian));
    }
}
@Test @Test
public void testTop100Words() { public void testTop100Words() {