Source code for OpenAttack.text_process.lemmatizer.wordnet_lemmatizer

from .base import Lemmatizer
from ...tags import *
from ...data_manager import DataManager

POS_MAPPING = {
    "adv": "r",
    "adj": "a",
    "verb": "v",
    "noun": "n"
}

_DELEMMA_POS_MAPPING = {
    "JJ": "adj",
    "VB": "verb",
    "NN": "noun",
    "RB": "adv"
}

[docs]class WordnetLemmatimer(Lemmatizer): """ Lemmatizer based on nltk.wordnet :Language: english """ TAGS = { TAG_English } def __init__(self) -> None: self.wnc = DataManager.load("TProcess.NLTKWordNet") old_delema = DataManager.load("TProcess.NLTKWordNetDelemma") self.__delema = {} for word in old_delema.keys(): self.__delema[word] = {} for kw, val in old_delema[word].items(): if kw[:2] in _DELEMMA_POS_MAPPING: pos = _DELEMMA_POS_MAPPING[kw[:2]] self.__delema[word][pos] = val def do_lemmatize(self, token, pos): if pos not in POS_MAPPING: return token pos_in_wordnet = POS_MAPPING[pos] lemmas = self.wnc._morphy(token, pos_in_wordnet) return min(lemmas, key=len) if len(lemmas) > 0 else token def do_delemmatize(self, lemma, pos): if (lemma in self.__delema) and (pos in self.__delema[lemma]): return self.__delema[lemma][pos] return lemma