mirror of
https://codeberg.org/Freeyourgadget/Gadgetbridge
synced 2025-01-12 18:57:36 +01:00
Fix null elements for some mixed case words in Armenian (combinations with U and Vo)
This commit is contained in:
parent
6bb93bef89
commit
f3185f1acb
@ -29,6 +29,7 @@ import java.util.LinkedHashMap;
|
|||||||
import java.util.LinkedHashSet;
|
import java.util.LinkedHashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
|
import java.util.Objects;
|
||||||
|
|
||||||
public class ArmenianTransliterator implements Transliterator {
|
public class ArmenianTransliterator implements Transliterator {
|
||||||
// Transliteration map ordered by priority
|
// Transliteration map ordered by priority
|
||||||
@ -38,7 +39,7 @@ public class ArmenianTransliterator implements Transliterator {
|
|||||||
private static final Map<String, String> transliterateMap = new LinkedHashMap<String, String>() {
|
private static final Map<String, String> transliterateMap = new LinkedHashMap<String, String>() {
|
||||||
{
|
{
|
||||||
// Simple substitutions
|
// Simple substitutions
|
||||||
Map<String, String> simpleSubstitions = new HashMap<String, String>() {
|
Map<String, String> simpleSubstitions = new HashMap<String, String>() {
|
||||||
{
|
{
|
||||||
put("ա","a");
|
put("ա","a");
|
||||||
put("բ","b");
|
put("բ","b");
|
||||||
@ -63,6 +64,7 @@ public class ArmenianTransliterator implements Transliterator {
|
|||||||
put("յ","y");
|
put("յ","y");
|
||||||
put("ն","n");
|
put("ն","n");
|
||||||
put("շ","sh");
|
put("շ","sh");
|
||||||
|
put("ո", "vo");
|
||||||
put("չ","ch");
|
put("չ","ch");
|
||||||
put("պ","p");
|
put("պ","p");
|
||||||
put("ջ","j");
|
put("ջ","j");
|
||||||
@ -77,68 +79,78 @@ public class ArmenianTransliterator implements Transliterator {
|
|||||||
put("օ","o");
|
put("օ","o");
|
||||||
put("և","ev");
|
put("և","ev");
|
||||||
put("ֆ","f");
|
put("ֆ","f");
|
||||||
|
put("՝", "`");
|
||||||
|
put("՞", "?");
|
||||||
|
put("։", ":");
|
||||||
|
put("․", ".");
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Capitalize existing simple substitutions here
|
||||||
|
for (final Entry<String, String> entry : new ArrayList<Entry<String, String>>(simpleSubstitions.entrySet())) {
|
||||||
|
String capitalKey = entry.getKey().toUpperCase();
|
||||||
|
if (!capitalKey.equals(entry.getKey())) {
|
||||||
|
simpleSubstitions.put(capitalKey, entry.getValue().toUpperCase());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Letter + 'ու'
|
// Letter + 'ու'
|
||||||
char[] letterMapU = {
|
final String[] letterMapU = {
|
||||||
'ա',
|
"ա",
|
||||||
'բ',
|
"բ",
|
||||||
'գ',
|
"գ",
|
||||||
'դ',
|
"դ",
|
||||||
'ե',
|
"ե",
|
||||||
'զ',
|
"զ",
|
||||||
'է',
|
"է",
|
||||||
'ը',
|
"ը",
|
||||||
'թ',
|
"թ",
|
||||||
'ժ',
|
"ժ",
|
||||||
'ի',
|
"ի",
|
||||||
'լ',
|
"լ",
|
||||||
'խ',
|
"խ",
|
||||||
'ծ',
|
"ծ",
|
||||||
'կ',
|
"կ",
|
||||||
'հ',
|
"հ",
|
||||||
'ձ',
|
"ձ",
|
||||||
'ղ',
|
"ղ",
|
||||||
'ճ',
|
"ճ",
|
||||||
'մ',
|
"մ",
|
||||||
'յ',
|
"յ",
|
||||||
'ն',
|
"ն",
|
||||||
'շ',
|
"շ",
|
||||||
'չ',
|
"չ",
|
||||||
'պ',
|
"պ",
|
||||||
'ջ',
|
"ջ",
|
||||||
'ռ',
|
"ռ",
|
||||||
'ս',
|
"ս",
|
||||||
'վ',
|
"վ",
|
||||||
'տ',
|
"տ",
|
||||||
'ր',
|
"ր",
|
||||||
'ց',
|
"ց",
|
||||||
'փ',
|
"փ",
|
||||||
'ք',
|
"ք",
|
||||||
'օ',
|
"օ",
|
||||||
'և',
|
"և",
|
||||||
'ֆ',
|
"ֆ",
|
||||||
'ո',
|
"ո"
|
||||||
};
|
};
|
||||||
|
|
||||||
for(char letter : letterMapU) {
|
for (final String letter : letterMapU) {
|
||||||
char capitalLetter = Character.toUpperCase(letter);
|
final String capitalLetter = letter.toUpperCase();
|
||||||
final String transliteratedLetter = simpleSubstitions.get(Character.toString(letter));
|
final String transliteratedLetter = Objects.requireNonNull(simpleSubstitions.get(letter), letter);
|
||||||
final String transliteratedCapitalLetter = simpleSubstitions.get(Character.toString(capitalLetter));
|
final String transliteratedCapitalLetter = Objects.requireNonNull(simpleSubstitions.get(capitalLetter), capitalLetter);
|
||||||
|
|
||||||
put(Character.toString(letter) + "ու", transliteratedLetter + "u");
|
put(letter + "ու", transliteratedLetter + "u");
|
||||||
put(Character.toString(capitalLetter) + "ու", transliteratedCapitalLetter + "u");
|
put(capitalLetter + "ու", transliteratedCapitalLetter + "u");
|
||||||
|
|
||||||
put(Character.toString(letter) + "ՈՒ", transliteratedLetter + "U");
|
put(letter + "ՈՒ", transliteratedLetter + "U");
|
||||||
put(Character.toString(capitalLetter) + "ՈՒ", transliteratedCapitalLetter + "U");
|
put(capitalLetter + "ՈՒ", transliteratedCapitalLetter + "U");
|
||||||
|
put(letter + "Ու", transliteratedLetter + "U");
|
||||||
|
put(capitalLetter + "Ու", transliteratedCapitalLetter + "U");
|
||||||
|
|
||||||
put(Character.toString(letter) + "Ու", transliteratedLetter + "U");
|
put(letter + "ոՒ", transliteratedLetter + "U");
|
||||||
put(Character.toString(capitalLetter) + "Ու", transliteratedCapitalLetter + "U");
|
put(capitalLetter + "ոՒ", transliteratedCapitalLetter + "U");
|
||||||
|
|
||||||
put(Character.toString(letter) + "ոՒ", transliteratedLetter + "U");
|
|
||||||
put(Character.toString(capitalLetter) + "ոՒ", transliteratedCapitalLetter + "U");
|
|
||||||
}
|
}
|
||||||
|
|
||||||
put("ու","u");
|
put("ու","u");
|
||||||
@ -147,50 +159,51 @@ public class ArmenianTransliterator implements Transliterator {
|
|||||||
put("ՈՒ","U");
|
put("ՈՒ","U");
|
||||||
|
|
||||||
// Letter + 'ո'
|
// Letter + 'ո'
|
||||||
char[] letterMapVo = {
|
final String[] letterMapVo = {
|
||||||
'բ',
|
"բ",
|
||||||
'գ',
|
"գ",
|
||||||
'դ',
|
"դ",
|
||||||
'զ',
|
"զ",
|
||||||
'թ',
|
"թ",
|
||||||
'ժ',
|
"ժ",
|
||||||
'լ',
|
"լ",
|
||||||
'խ',
|
"խ",
|
||||||
'ծ',
|
"ծ",
|
||||||
'կ',
|
"կ",
|
||||||
'հ',
|
"հ",
|
||||||
'ձ',
|
"ձ",
|
||||||
'ղ',
|
"ղ",
|
||||||
'ճ',
|
"ճ",
|
||||||
'մ',
|
"մ",
|
||||||
'յ',
|
"յ",
|
||||||
'ն',
|
"ն",
|
||||||
'շ',
|
"շ",
|
||||||
'չ',
|
"ո", // ո + ո should be voo
|
||||||
'պ',
|
"չ",
|
||||||
'ջ',
|
"պ",
|
||||||
'ռ',
|
"ջ",
|
||||||
'ս',
|
"ռ",
|
||||||
'վ',
|
"ս",
|
||||||
'տ',
|
"վ",
|
||||||
'ր',
|
"տ",
|
||||||
'ց',
|
"ր",
|
||||||
'փ',
|
"ց",
|
||||||
'ք',
|
"փ",
|
||||||
'և',
|
"ք",
|
||||||
'ֆ',
|
"և",
|
||||||
|
"ֆ"
|
||||||
};
|
};
|
||||||
|
|
||||||
for(char letter : letterMapVo) {
|
for (String letter : letterMapVo) {
|
||||||
char capitalLetter = Character.toUpperCase(letter);
|
String capitalLetter = letter.toUpperCase();
|
||||||
final String transliteratedLetter = simpleSubstitions.get(Character.toString(letter));
|
final String transliteratedLetter = Objects.requireNonNull(simpleSubstitions.get(letter));
|
||||||
final String transliteratedCapitalLetter = simpleSubstitions.get(Character.toString(capitalLetter));
|
final String transliteratedCapitalLetter = Objects.requireNonNull(simpleSubstitions.get(capitalLetter));
|
||||||
|
|
||||||
put(Character.toString(letter) + "ո", transliteratedLetter + "o");
|
put(letter + "ո", transliteratedLetter + "o");
|
||||||
put(Character.toString(capitalLetter) + "ո", transliteratedCapitalLetter + "o");
|
put(capitalLetter + "ո", transliteratedCapitalLetter + "o");
|
||||||
|
|
||||||
put(Character.toString(letter) + "Ո", transliteratedLetter + "Օ");
|
put(letter + "Ո", transliteratedLetter + "Օ");
|
||||||
put(Character.toString(capitalLetter) + "Ո", transliteratedCapitalLetter + "Օ");
|
put(capitalLetter + "Ո", transliteratedCapitalLetter + "Օ");
|
||||||
}
|
}
|
||||||
|
|
||||||
put("ո","vo");
|
put("ո","vo");
|
||||||
@ -213,12 +226,11 @@ public class ArmenianTransliterator implements Transliterator {
|
|||||||
put(entry.getKey(), entry.getValue());
|
put(entry.getKey(), entry.getValue());
|
||||||
put(entry.getKey().toUpperCase(), entry.getValue().toUpperCase());
|
put(entry.getKey().toUpperCase(), entry.getValue().toUpperCase());
|
||||||
}
|
}
|
||||||
|
|
||||||
}};
|
}};
|
||||||
|
|
||||||
private static final Map<String, Integer> transliterationPriorityMap = new HashMap<String, Integer>() {{
|
private static final Map<String, Integer> transliterationPriorityMap = new HashMap<String, Integer>() {{
|
||||||
int priority = 0;
|
int priority = 0;
|
||||||
for( final String key : transliterateMap.keySet() ) {
|
for (final String key : transliterateMap.keySet()) {
|
||||||
put(key, priority++);
|
put(key, priority++);
|
||||||
}
|
}
|
||||||
}};
|
}};
|
||||||
@ -227,7 +239,7 @@ public class ArmenianTransliterator implements Transliterator {
|
|||||||
private static final Trie transliterationTrie;
|
private static final Trie transliterationTrie;
|
||||||
static {
|
static {
|
||||||
final Trie.TrieBuilder builder = Trie.builder();
|
final Trie.TrieBuilder builder = Trie.builder();
|
||||||
for( final String key : ArmenianTransliterator.transliterateMap.keySet()) {
|
for (final String key : ArmenianTransliterator.transliterateMap.keySet()) {
|
||||||
builder.addKeyword(key);
|
builder.addKeyword(key);
|
||||||
}
|
}
|
||||||
transliterationTrie = builder.build();
|
transliterationTrie = builder.build();
|
||||||
@ -235,12 +247,12 @@ public class ArmenianTransliterator implements Transliterator {
|
|||||||
|
|
||||||
private static String ahoCorasick(final String text) {
|
private static String ahoCorasick(final String text) {
|
||||||
// Create a buffer sufficiently large that re-allocations are minimized.
|
// Create a buffer sufficiently large that re-allocations are minimized.
|
||||||
final StringBuilder sb = new StringBuilder( text.length() * 10 / 12 );
|
final StringBuilder sb = new StringBuilder(text.length() * 10 / 12);
|
||||||
|
|
||||||
// The complexity of the Aho-Corasick algorithm is O(N + L + Z),
|
// The complexity of the Aho-Corasick algorithm is O(N + L + Z),
|
||||||
// where N is the length of the text, L is the total length of the keywords, and Z is the number of matches.
|
// where N is the length of the text, L is the total length of the keywords, and Z is the number of matches.
|
||||||
// This algorithm allows us to do fast substring search.
|
// This algorithm allows us to do fast substring search.
|
||||||
final List<Emit> emits = new ArrayList<Emit>(transliterationTrie.parseText( text ));
|
final List<Emit> emits = new ArrayList<Emit>(transliterationTrie.parseText(text));
|
||||||
|
|
||||||
// Sort collection first by starting position, then by priority.
|
// Sort collection first by starting position, then by priority.
|
||||||
Collections.sort(emits, new Comparator<Emit>() {
|
Collections.sort(emits, new Comparator<Emit>() {
|
||||||
@ -259,11 +271,11 @@ public class ArmenianTransliterator implements Transliterator {
|
|||||||
|
|
||||||
int prevIndex = 0;
|
int prevIndex = 0;
|
||||||
|
|
||||||
for( final Emit emit : emits ) {
|
for (final Emit emit : emits) {
|
||||||
final int matchIndex = emit.getStart();
|
final int matchIndex = emit.getStart();
|
||||||
|
|
||||||
// Skip if we already substituted this part
|
// Skip if we already substituted this part
|
||||||
if(matchIndex < prevIndex) {
|
if (matchIndex < prevIndex) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -271,13 +283,13 @@ public class ArmenianTransliterator implements Transliterator {
|
|||||||
sb.append(text.substring(prevIndex, matchIndex));
|
sb.append(text.substring(prevIndex, matchIndex));
|
||||||
|
|
||||||
// Substitute and append to the builder
|
// Substitute and append to the builder
|
||||||
sb.append( ArmenianTransliterator.transliterateMap.get( emit.getKeyword() ) );
|
sb.append(Objects.requireNonNull(ArmenianTransliterator.transliterateMap.get(emit.getKeyword())));
|
||||||
|
|
||||||
prevIndex = emit.getEnd() + 1;
|
prevIndex = emit.getEnd() + 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Add the remainder of the string (contains no more matches).
|
// Add the remainder of the string (contains no more matches).
|
||||||
sb.append( text.substring( prevIndex ) );
|
sb.append(text.substring(prevIndex));
|
||||||
|
|
||||||
return sb.toString();
|
return sb.toString();
|
||||||
}
|
}
|
||||||
|
@ -18,6 +18,9 @@ public class ArmenianTransliteratorTest extends TestCase {
|
|||||||
Assert.assertEquals("vorotan", new ArmenianTransliterator().transliterate("որոտան"));
|
Assert.assertEquals("vorotan", new ArmenianTransliterator().transliterate("որոտան"));
|
||||||
Assert.assertEquals("voroshel", new ArmenianTransliterator().transliterate("որոշել"));
|
Assert.assertEquals("voroshel", new ArmenianTransliterator().transliterate("որոշել"));
|
||||||
Assert.assertEquals("uzox", new ArmenianTransliterator().transliterate("ուզող"));
|
Assert.assertEquals("uzox", new ArmenianTransliterator().transliterate("ուզող"));
|
||||||
|
Assert.assertEquals(
|
||||||
|
"AVO", new ArmenianTransliterator().transliterate("ԱՈ")
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
@ -44,9 +47,11 @@ public class ArmenianTransliteratorTest extends TestCase {
|
|||||||
Assert.assertEquals(
|
Assert.assertEquals(
|
||||||
"Ushadir", new ArmenianTransliterator().transliterate("Ուշադիր")
|
"Ushadir", new ArmenianTransliterator().transliterate("Ուշադիր")
|
||||||
);
|
);
|
||||||
|
Assert.assertEquals(
|
||||||
|
"AU", new ArmenianTransliterator().transliterate("ԱՈւ")
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@Test
|
@Test
|
||||||
public void testTop100Words() {
|
public void testTop100Words() {
|
||||||
final Map<String,String> topWords = new LinkedHashMap<String,String>() {{
|
final Map<String,String> topWords = new LinkedHashMap<String,String>() {{
|
||||||
|
Loading…
x
Reference in New Issue
Block a user