Gadgetbridge/app/src/main/java/nodomain/freeyourgadget/gadgetbridge/util/language/impl/ArmenianTransliterator.java

/*  Copyright (C) 2021-2024 Alik Aslanyan

    This file is part of Gadgetbridge.

    Gadgetbridge is free software: you can redistribute it and/or modify
    it under the terms of the GNU Affero General Public License as published
    by the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    Gadgetbridge is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU Affero General Public License for more details.

    You should have received a copy of the GNU Affero General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>. */
package nodomain.freeyourgadget.gadgetbridge.util.language.impl;
import nodomain.freeyourgadget.gadgetbridge.util.language.Transliterator;

import org.ahocorasick.trie.Emit;
import org.ahocorasick.trie.Trie;
import org.apache.commons.lang3.text.WordUtils;

import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

/**
 * Transliterates Armenian text into Latin characters.
 *
 * <p>The replacement table is ordered by priority: longer, more specific letter
 * combinations are listed before the single letters they are made of. All keys are
 * searched at once with an Aho-Corasick trie; when several keys match at the same
 * position, the one that was inserted into the table first wins.
 *
 * <p>Characters that do not match any key are copied to the output unchanged.
 */
public class ArmenianTransliterator implements Transliterator {
    // Transliteration map ordered by priority (insertion order of the LinkedHashMap).
    // Armenian has some rules regarding the reading of 'ո': in the middle of a word it reads as English 'o',
    // but if the word starts with it, it is read as 'vo'.
    // If it is followed by the 'ւ' symbol, it is read as 'u' (as the double o in "booze").
    private static final Map<String, String> transliterateMap = buildTransliterateMap();

    // Maps every key of transliterateMap to its position in that map (lower value = higher priority).
    private static final Map<String, Integer> transliterationPriorityMap = buildPriorityMap();

    // Aho-Corasick trie containing every key of transliterateMap
    private static final Trie transliterationTrie = buildTrie();

    // Orders matches first by starting position, then by priority of the matched keyword.
    private static final Comparator<Emit> EMIT_ORDER = new Comparator<Emit>() {
        @Override
        public int compare(Emit a, Emit b) {
            final int byStart = Integer.compare(a.getStart(), b.getStart());
            if (byStart != 0) {
                return byStart;
            }

            // Every keyword in the trie comes from transliterateMap, so both lookups succeed.
            final int priorityA = transliterationPriorityMap.get(a.getKeyword());
            final int priorityB = transliterationPriorityMap.get(b.getKeyword());
            return Integer.compare(priorityA, priorityB);
        }
    };

    /**
     * Builds the replacement table. Insertion order defines the priority, so the
     * order of the {@code put} calls below is significant.
     */
    private static Map<String, String> buildTransliterateMap() {
        final Map<String, String> map = new LinkedHashMap<String, String>();

        // Letter + 'ու'
        map.put("աու", "au");
        map.put("բու", "bu");
        map.put("գու", "gu");
        map.put("դու", "du");
        map.put("եու", "eu");
        map.put("զու", "zu");
        map.put("էու", "eu");
        map.put("ըու", "yu");
        map.put("թու", "tu");
        map.put("ժու", "ju");
        map.put("իու", "iu");
        map.put("լու", "lu");
        map.put("խու", "xu");
        map.put("ծու", "cu");
        map.put("կու", "ku");
        map.put("հու", "hu");
        map.put("ձու", "dzu");
        map.put("ղու", "xu");
        map.put("ճու", "cu");
        map.put("մու", "mu");
        map.put("յու", "yu");
        map.put("նու", "nu");
        map.put("շու", "shu");
        map.put("չու", "chu");
        map.put("պու", "pu");
        map.put("ջու", "ju");
        map.put("ռու", "ru");
        map.put("սու", "su");
        map.put("վու", "vu");
        map.put("տու", "tu");
        map.put("րու", "ru");
        map.put("ցու", "cu");
        map.put("փու", "pu");
        map.put("քու", "qu");
        map.put("օու", "ou");
        map.put("ևու", "eu");
        map.put("ֆու", "fu");
        map.put("ոու", "vou");

        map.put("ու", "u");

        // Letter + 'ո'
        map.put("բո", "bo");
        map.put("գո", "go");
        map.put("դո", "do");
        map.put("զո", "zo");
        map.put("թո", "to");
        map.put("ժո", "jo");
        map.put("լո", "lo");
        map.put("խո", "xo");
        map.put("ծո", "co");
        map.put("կո", "ko");
        map.put("հո", "ho");
        map.put("ձո", "dzo");
        map.put("ղո", "xo");
        map.put("ճո", "co");
        map.put("մո", "mo");
        map.put("յո", "yo");
        map.put("նո", "no");
        // 'շ' and 'չ' keep their digraphs ("sh", "ch"), consistent with the
        // single-letter and the letter + 'ու' entries.
        map.put("շո", "sho");
        map.put("չո", "cho");
        map.put("պո", "po");
        map.put("ջո", "jo");
        map.put("ռո", "ro");
        map.put("սո", "so");
        map.put("վո", "vo");
        map.put("տո", "to");
        map.put("րո", "ro");
        map.put("ցո", "co");
        map.put("փո", "po");
        map.put("քո", "qo");
        map.put("ևո", "eo");
        map.put("ֆո", "fo");
        map.put("ո", "vo");

        // Two different ways to write, we support all.
        map.put("եւ", "ev");
        map.put("եվ", "ev");

        // Simple substitutions
        map.put("ա", "a");
        map.put("բ", "b");
        map.put("գ", "g");
        map.put("դ", "d");
        map.put("ե", "e");
        map.put("զ", "z");
        map.put("է", "e");
        map.put("ը", "y");
        map.put("թ", "t");
        map.put("ժ", "j");
        map.put("ի", "i");
        map.put("լ", "l");
        map.put("խ", "x");
        map.put("ծ", "c");
        map.put("կ", "k");
        map.put("հ", "h");
        map.put("ձ", "dz");
        map.put("ղ", "x");
        map.put("ճ", "c");
        map.put("մ", "m");
        map.put("յ", "y");
        map.put("ն", "n");
        map.put("շ", "sh");
        map.put("չ", "ch");
        map.put("պ", "p");
        map.put("ջ", "j");
        map.put("ռ", "r");
        map.put("ս", "s");
        map.put("վ", "v");
        map.put("տ", "t");
        map.put("ր", "r");
        map.put("ց", "c");
        map.put("փ", "p");
        map.put("ք", "q");
        map.put("օ", "o");
        map.put("և", "ev");
        map.put("ֆ", "f");

        // If this symbol wasn't used in the combination with others, then it's meaningless
        map.put("ւ", "");

        // Add support for capitalized words: for every key whose capitalized form differs,
        // register the capitalized key with the capitalized replacement. The variants are
        // collected separately so the map is not modified while it is being iterated, and
        // are appended after all lowercase keys in the same relative order.
        final Map<String, String> capitalized = new LinkedHashMap<String, String>();
        for (final Map.Entry<String, String> entry : map.entrySet()) {
            final String capitalKey = WordUtils.capitalize(entry.getKey());
            if (!capitalKey.equals(entry.getKey())) {
                capitalized.put(capitalKey, WordUtils.capitalize(entry.getValue()));
            }
        }
        map.putAll(capitalized);

        return map;
    }

    /**
     * Assigns each key of {@link #transliterateMap} its index in iteration order.
     */
    private static Map<String, Integer> buildPriorityMap() {
        final Map<String, Integer> priorities = new HashMap<String, Integer>();
        int priority = 0;
        for (final String key : transliterateMap.keySet()) {
            priorities.put(key, priority++);
        }
        return priorities;
    }

    /**
     * Builds the Aho-Corasick trie from every key of {@link #transliterateMap}.
     */
    private static Trie buildTrie() {
        final Trie.TrieBuilder builder = Trie.builder();
        for (final String key : transliterateMap.keySet()) {
            builder.addKeyword(key);
        }
        return builder.build();
    }

    /**
     * Replaces every matched key in {@code text} with its transliteration.
     *
     * @param text non-null input text
     * @return the text with all matched keys substituted; unmatched characters are kept as-is
     */
    private static String ahoCorasick(final String text) {
        // No replacement is more than twice as long as its key and unmatched characters
        // are copied one-to-one, so twice the input length is an upper bound for the output.
        final StringBuilder sb = new StringBuilder(text.length() * 2);

        // The complexity of the Aho-Corasick algorithm is O(N + L + Z),
        // where N is the length of the text, L is the length of keywords and Z is the number of matches.
        // This algorithm allows us to do fast substring search.
        final List<Emit> emits = new ArrayList<Emit>(transliterationTrie.parseText(text));

        // Sort collection first by starting position, then by priority.
        Collections.sort(emits, EMIT_ORDER);

        // Index of the first character of text that has not been written to sb yet
        int prevIndex = 0;

        for (final Emit emit : emits) {
            final int matchIndex = emit.getStart();

            // Skip if we already substituted this part
            if (matchIndex < prevIndex) {
                continue;
            }

            // Add part which shouldn't be substituted
            sb.append(text, prevIndex, matchIndex);

            // Substitute and append to the builder
            sb.append(transliterateMap.get(emit.getKeyword()));

            // getEnd() is used as an inclusive index here, so continue right after it
            prevIndex = emit.getEnd() + 1;
        }

        // Add the remainder of the string (contains no more matches).
        sb.append(text, prevIndex, text.length());

        return sb.toString();
    }

    /**
     * Transliterates the given text.
     *
     * @param txt text to transliterate, may be {@code null} or empty
     * @return {@code txt} itself when it is {@code null} or empty, otherwise the transliterated text
     */
    @Override
    public String transliterate(String txt) {
        if (txt == null || txt.isEmpty()) {
            return txt;
        }

        return ahoCorasick(txt);
    }
}