Source code for OpenAttack.attack_assist.substitute.word.chinese_hownet

from typing import Optional
from ....exceptions import WordNotInDictionaryException
from .base import WordSubstitute
from ....data_manager import DataManager
from ....tags import *

class ChineseHowNetSubstitute(WordSubstitute):
    """Word substitute for Chinese that ranks HowNet words by shared sememe sets."""

    TAGS = { TAG_Chinese }

    def __init__(self, k : Optional[int] = None):
        """
        Chinese Sememe-based word substitute based on OpenHowNet. `[pdf] <https://arxiv.org/pdf/1901.09957.pdf>`__

        Args:
            k: Top-k results to return. If k is `None`, all results will be returned.

        :Package Requirements: OpenHowNet
        :Data Requirements: :py:data:`.AttackAssist.HowNet`
        :Language: chinese

        """
        super().__init__()
        hownet = DataManager.load("AttackAssist.HowNet")
        self.hownet_dict = hownet
        # The full Chinese vocabulary is fetched once here; `substitute` scans it on every call.
        self.zh_word_list = hownet.get_zh_words()
        self.k = k

    def substitute(self, word, pos):
        """
        Find words that share a part of speech and at least one sememe set with ``word``.

        Args:
            word: The word to find substitutes for.
            pos: Part-of-speech label, matched against the ``en_grammar`` field of
                the dict entries returned by ``hownet_dict.get``.

        Returns:
            A list of ``(candidate, distance)`` tuples sorted by ascending distance,
            where ``distance = 1 - matches / number_of_senses_of_word``. The list
            always contains ``(word, 0)`` for the input word itself (before
            truncation). When ``self.k`` is not ``None`` only the first ``k``
            entries are returned.

        Raises:
            WordNotInDictionaryException: If the sememe lookup for ``word`` is empty.
        """
        lookup = self.hownet_dict

        # Sememe collection of every sense of the source word.
        source_senses = lookup.get_sememes_by_word(word, structured=False, lang="zh", merge=False)
        source_sememes = [sense['sememes'] for sense in source_senses]
        if len(source_senses) == 0:
            raise WordNotInDictionaryException()

        # The source word itself is seeded as a zero-distance candidate.
        ranked = [(word, 0)]
        for other in self.zh_word_list:
            if other == word:
                continue

            # Keep only words that have at least one entry with the requested POS.
            # Entries that are not plain dicts are ignored.
            other_pos = {
                entry['en_grammar']
                for entry in lookup.get(other)
                if type(entry) is dict
            }
            if pos not in other_pos:
                continue

            other_senses = lookup.get_sememes_by_word(other, structured=False, lang="zh", merge=False)
            other_sememes = [sense['sememes'] for sense in other_senses]
            if len(other_senses) == 0:
                continue

            # Count (source sense, candidate sense) pairs whose sememe collections are equal.
            shared = sum(
                1
                for left in source_sememes
                for right in other_sememes
                if left == right
            )

            # Multi-token entries (containing a space) are not used as substitutes.
            if shared > 0 and " " not in other:
                ranked.append((other, 1 - shared / len(source_sememes)))

        # list.sort is stable, so ties keep their insertion order.
        ranked.sort(key=lambda pair: pair[1])
        return ranked if self.k is None else ranked[:self.k]