# Source code for OpenAttack.text_process.tokenizer.base


from typing import List, Tuple, Union


class Tokenizer:
    """
    Tokenizer is the base class of all tokenizers.

    The public entry points :py:meth:`tokenize` and :py:meth:`detokenize`
    delegate to :py:meth:`do_tokenize` and :py:meth:`do_detokenize`, which
    raise ``NotImplementedError`` here and are meant to be overridden.
    """

    def tokenize(
        self, x: str, pos_tagging: bool = True
    ) -> Union[List[str], List[Tuple[str, str]]]:
        """
        Split a sentence into tokens by delegating to :py:meth:`do_tokenize`.

        Args:
            x: A sentence.
            pos_tagging: Whether to return Pos Tagging results.

        Returns:
            A list of tokens if **pos_tagging** is `False`

            A list of (token, pos) tuples if **pos_tagging** is `True`

            POS tag must be one of the following tags:
            ``["noun", "verb", "adj", "adv", "other"]``
        """
        return self.do_tokenize(x, pos_tagging)

    def detokenize(self, x: Union[List[str], List[Tuple[str, str]]]) -> str:
        """
        Join tokens back into a sentence by delegating to
        :py:meth:`do_detokenize`.

        Args:
            x: The result of :py:meth:`.Tokenizer.tokenize`, can be a list of
                tokens or tokens with POS tags.

        Returns:
            A sentence. An empty list yields ``""`` without calling
            :py:meth:`do_detokenize`.

        Raises:
            TypeError: If **x** is not a ``list``.
        """
        if not isinstance(x, list):
            raise TypeError("`x` must be a list of tokens")
        # `x` is known to be a list here, so truthiness is an exact emptiness test.
        if not x:
            return ""
        # Drop the POS tag from (token, pos) tuples so that `do_detokenize`
        # only ever receives the bare tokens.
        tokens = [it[0] if isinstance(it, tuple) else it for it in x]
        return self.do_detokenize(tokens)

    def do_tokenize(self, x, pos_tagging):
        """
        Tokenization hook called by :py:meth:`tokenize` with the same
        arguments; the base implementation always raises.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()

    def do_detokenize(self, x):
        """
        Detokenization hook called by :py:meth:`detokenize` with a non-empty
        list whose tuple items have been replaced by their first element;
        the base implementation always raises.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        raise NotImplementedError()