hyperboria/nexus/nlptools/morph.py

import math

import lemminflect  # noqa
import pymorphy2
import spacy


class EnglishMorphology:
    VERBS = {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'}
    ADJS = {'JJ', 'JJR', 'JJS'}
    NOUNS = {'NN', 'NNP', 'NNPS', 'NNS'}
    ADVERBS = {'RB', 'RBR', 'RBS'}

    WORD_KINDS = [VERBS, ADJS, NOUNS, ADVERBS]

    def __init__(self, name):
        self.nlp = spacy.load(name)

    def derive_forms(self, word):
        forms = set()
        word = self.nlp(word)[0]
        inflected = False
        for kind in self.WORD_KINDS:
            if word.tag_ in kind:
                for w in kind:
                    inflection = word._.inflect(w)
                    if inflection:
                        inflected = True
                        forms.add(word._.inflect(w))
        if not inflected and word:
            forms.add(str(word))
        return list(sorted(forms))


class RussianMorphology:
    def __init__(self):
        self.morph_analyzer = pymorphy2.MorphAnalyzer()

    def derive_forms(self, word):
        words = set()
        phrase_word_form = self.morph_analyzer.parse(word)[0]
        for lexeme in phrase_word_form.lexeme:
            if lexeme.word == word:
                coef = 1.0
            else:
                coef = 1.0 / math.log1p(len(phrase_word_form.lexeme))
            if 'Abbr' in lexeme.tag:
                continue
            words.add(f'{lexeme.word}^{coef:.2f}')
        return list(sorted(words))