# Source code for OpenAttack.text_process.lemmatizer.wordnet_lemmatizer

from .base import Lemmatizer
from ...tags import *
from ...data_manager import DataManager

# Translates the part-of-speech names accepted by this module into the
# single-letter codes handed to the WordNet morphology lookup.
POS_MAPPING = {
    "adv": "r",   # adverb
    "adj": "a",   # adjective
    "verb": "v",  # verb
    "noun": "n",  # noun
}

_DELEMMA_POS_MAPPING = {
    "JJ": "adj",
    "VB": "verb",
    "NN": "noun",
    "RB": "adv"
}

class WordnetLemmatimer(Lemmatizer):
    """
    Lemmatizer based on nltk.wordnet

    :Language: english
    """

    TAGS = { TAG_English }

    def __init__(self) -> None:
        """
        Load the WordNet resource and build the delemmatization table.

        The raw table loaded from ``TProcess.NLTKWordNetDelemma`` is keyed
        by word and then by a tag string. It is re-keyed here by the
        part-of-speech names of ``_DELEMMA_POS_MAPPING``, using only the
        first two characters of each tag; tags with an unknown prefix are
        dropped.
        """
        self.wnc = DataManager.load("TProcess.NLTKWordNet")
        raw_table = DataManager.load("TProcess.NLTKWordNetDelemma")
        self.__delema = {}
        for word, forms in raw_table.items():
            # Several tags can share one two-character prefix; as the
            # entries are visited in order, the last one wins for a
            # given part of speech.
            self.__delema[word] = {
                _DELEMMA_POS_MAPPING[tag[:2]]: form
                for tag, form in forms.items()
                if tag[:2] in _DELEMMA_POS_MAPPING
            }

    def do_lemmatize(self, token, pos):
        """
        Return the lemma of ``token`` for the given part of speech.

        :param token: The word to lemmatize.
        :param pos: Part-of-speech name; must be a key of ``POS_MAPPING``
            for any lookup to take place.
        :return: The shortest candidate produced by the WordNet
            morphology lookup, or ``token`` unchanged when ``pos`` is not
            supported or the lookup yields no candidates.
        """
        wordnet_pos = POS_MAPPING.get(pos)
        if wordnet_pos is None:
            return token

        candidates = self.wnc._morphy(token, wordnet_pos)
        if not candidates:
            return token
        return min(candidates, key=len)

    def do_delemmatize(self, lemma, pos):
        """
        Return the entry stored for ``lemma`` under ``pos``.

        :param lemma: The lemma to look up in the delemmatization table.
        :param pos: Part-of-speech name (a value of
            ``_DELEMMA_POS_MAPPING``).
        :return: The table entry for ``(lemma, pos)``, or ``lemma``
            unchanged when the table has no such entry.
        """
        return self.__delema.get(lemma, {}).get(pos, lemma)