# Source code for OpenAttack.attack_assist.substitute.word.chinese_hownet

from typing import Optional
from ....exceptions import WordNotInDictionaryException
from .base import WordSubstitute
from ....data_manager import DataManager
from ....tags import *

class ChineseHowNetSubstitute(WordSubstitute):

    TAGS = { TAG_Chinese }

    def __init__(self, k : Optional[int] = None):
        """
        Chinese Sememe-based word substitute based on OpenHowNet.
        `[pdf] <https://arxiv.org/pdf/1901.09957.pdf>`__

        Args:
            k: Top-k results to return. If k is `None`, all results will be returned.

        :Package Requirements: OpenHowNet
        :Data Requirements: :py:data:`.AttackAssist.HowNet`
        :Language: chinese

        """

        super().__init__()
        self.hownet_dict = DataManager.load("AttackAssist.HowNet")
        # Fetched once here because `substitute` scans this whole list on every call.
        self.zh_word_list = self.hownet_dict.get_zh_words()
        self.k = k

    def substitute(self, word, pos):
        """
        Rank dictionary words that share sememe sets with ``word``.

        Every entry of the cached word list is considered as a candidate. A
        candidate is kept when it differs from ``word``, contains no space,
        has at least one dictionary entry whose ``en_grammar`` equals ``pos``,
        and has at least one per-sense sememe collection equal to one of
        ``word``'s.

        Args:
            word: The word to find substitutes for.
            pos: Part-of-speech value, compared against the ``en_grammar``
                field of each candidate's dictionary entries.

        Returns:
            A list of ``(candidate, distance)`` tuples sorted by ascending
            distance, where ``distance = 1 - matches / number_of_senses(word)``.
            ``word`` itself is always included with distance ``0``. When
            ``self.k`` is not ``None`` only the first ``k`` tuples are returned.

        Raises:
            WordNotInDictionaryException: if the dictionary returns no sememe
                entries for ``word``.
        """
        word_sememes = self.hownet_dict.get_sememes_by_word(word, structured=False, lang="zh", merge=False)
        if not word_sememes:
            raise WordNotInDictionaryException()
        # One sememe collection per returned entry of `word`.
        word_sememe_set = [t['sememes'] for t in word_sememes]
        num_senses = len(word_sememe_set)

        # The queried word is seeded as a zero-distance candidate.
        word_candidate = [(word, 0)]
        for wd in self.zh_word_list:
            # Cheap string checks first, so the dictionary lookups below are
            # skipped for words that could never be returned anyway.
            if wd == word or " " in wd:
                continue

            # Keep only words having an entry tagged with the requested POS;
            # non-dict items returned by the lookup are ignored.
            pos_matches = any(
                isinstance(a, dict) and a['en_grammar'] == pos
                for a in self.hownet_dict.get(wd)
            )
            if not pos_matches:
                continue

            wd_sememes = self.hownet_dict.get_sememes_by_word(wd, structured=False, lang="zh", merge=False)
            if not wd_sememes:
                continue
            wd_sememe_set = [t['sememes'] for t in wd_sememes]

            # Count (sense of word, sense of wd) pairs with identical sememes.
            # NOTE(review): this is a pair count, so it can exceed num_senses
            # and yield a negative distance when equal sememe collections
            # repeat across senses - confirm whether that is intended.
            common_sememe = sum(
                1
                for s1 in word_sememe_set
                for s2 in wd_sememe_set
                if s1 == s2
            )

            if common_sememe > 0:
                word_candidate.append((wd, 1 - common_sememe / num_senses))

        word_candidate = sorted(word_candidate, key=lambda x: x[1])
        if self.k is not None:
            word_candidate = word_candidate[:self.k]
        return word_candidate