Source code for OpenAttack.text_process.tokenizer.jieba_tokenizer
from .base import Tokenizer
from ...data_manager import DataManager
from ...tags import *
_POS_MAPPING = {
"v": "verb",
"n": "noun",
"t": "noun",
"a": "adj",
"d": "adv"
}
[docs]class JiebaTokenizer(Tokenizer):
"""
Tokenizer based on jieba.posseg
:Package Requirements:
* jieba
:Language: chinese
"""
TAGS = { TAG_Chinese }
def __init__(self) -> None:
import jieba
import jieba.posseg as pseg
self.__tokenize = pseg.cut
jieba.initialize()
def do_tokenize(self, x, pos_tagging):
ret = []
for pair in self.__tokenize(x):
if pos_tagging:
pos = pair.flag[0]
if pos in _POS_MAPPING:
pos = _POS_MAPPING[pos]
else:
pos = "other"
ret.append( (pair.word, pos) )
else:
ret.append( pair.word )
return ret
def do_detokenize(self, x):
return "".join(x)