Source code for OpenAttack.text_process.tokenizer.punct_tokenizer

from .base import Tokenizer
from ...data_manager import DataManager
from ...tags import *
import nltk

_POS_MAPPING = {
    "JJ": "adj",
    "VB": "verb",
    "NN": "noun",
    "RB": "adv"
}


[docs]class PunctTokenizer(Tokenizer): """ Tokenizer based on nltk.word_tokenizer. :Language: english """ TAGS = { TAG_English } def __init__(self) -> None: self.sent_tokenizer = DataManager.load("TProcess.NLTKSentTokenizer") self.word_tokenizer = nltk.WordPunctTokenizer().tokenize self.pos_tagger = DataManager.load("TProcess.NLTKPerceptronPosTagger") def do_tokenize(self, x, pos_tagging=True): sentences = self.sent_tokenizer(x) tokens = [] for sent in sentences: tokens.extend( self.word_tokenizer(sent) ) if not pos_tagging: return tokens ret = [] for word, pos in self.pos_tagger(tokens): if pos[:2] in _POS_MAPPING: mapped_pos = _POS_MAPPING[pos[:2]] else: mapped_pos = "other" ret.append( (word, mapped_pos) ) return ret def do_detokenize(self, x): return " ".join(x)