# Source code for OpenAttack.text_process.tokenizer.base


from typing import List, Tuple, Union


class Tokenizer:
    """
    Tokenizer is the base class of all tokenizers.

    The public entry points :py:meth:`tokenize` and :py:meth:`detokenize`
    delegate to the hooks :py:meth:`do_tokenize` and :py:meth:`do_detokenize`,
    which raise ``NotImplementedError`` in this base class.
    """

    def tokenize(self, x: str, pos_tagging: bool = True) -> Union[List[str], List[Tuple[str, str]]]:
        """
        Tokenize a sentence by delegating to :py:meth:`do_tokenize`.

        Args:
            x: A sentence.
            pos_tagging: Whether to return Pos Tagging results.

        Returns:
            A list of tokens if **pos_tagging** is `False`

            A list of (token, pos) tuples if **pos_tagging** is `True`

        Raises:
            NotImplementedError: If :py:meth:`do_tokenize` is not overridden.

        POS tag must be one of the following tags: ``["noun", "verb", "adj", "adv", "other"]``

        """
        return self.do_tokenize(x, pos_tagging)

    def detokenize(self, x: Union[List[str], List[Tuple[str, str]]]) -> str:
        """
        Join tokens back into a sentence by delegating to :py:meth:`do_detokenize`.

        Args:
            x: The result of :py:meth:`.Tokenizer.tokenize`, can be a list of tokens or tokens with POS tags.

        Returns:
            A sentence. An empty list yields ``""`` without calling
            :py:meth:`do_detokenize`.

        Raises:
            TypeError: If **x** is not a list.
            NotImplementedError: If :py:meth:`do_detokenize` is not overridden
                and **x** is non-empty.
        """
        if not isinstance(x, list):
            raise TypeError("`x` must be a list of tokens")
        if not x:
            return ""
        # Drop the POS tag from (token, pos) tuples so the hook only ever
        # receives the bare tokens; non-tuple items are passed through as-is.
        tokens = [item[0] if isinstance(item, tuple) else item for item in x]
        return self.do_detokenize(tokens)

    def do_tokenize(self, x, pos_tagging):
        """
        Tokenization hook called by :py:meth:`tokenize` with the same arguments.

        The base implementation always raises ``NotImplementedError``.
        """
        raise NotImplementedError()

    def do_detokenize(self, x):
        """
        Detokenization hook called by :py:meth:`detokenize`.

        Args:
            x: A non-empty list of items with any POS tags already stripped.

        The base implementation always raises ``NotImplementedError``.
        """
        raise NotImplementedError()