# Source code for OpenAttack.attackers.deepwordbug

from ..classification import ClassificationAttacker, Classifier, ClassifierGoal
from ...text_process.tokenizer import Tokenizer, get_default_tokenizer
from ...utils import check_language
from ...tags import TAG_English, Tag
import numpy as np


# Character substitution table used by the homoglyph transform: each key is a
# single ASCII character (digit, lowercase letter, '-' or "'") and each value is
# the character written in its place.
# NOTE(review): most values are non-ASCII look-alikes, but a few entries appear
# to be plain ASCII ('m' -> 'm', '1' -> 'l', '0' -> 'O', "'" -> '`'); verify the
# actual code points before relying on every substitution changing the input.
homos = {
         '-': '˗', '9': '৭', '8': 'Ȣ', '7': '𝟕', '6': 'б', '5': 'Ƽ', '4': 'Ꮞ', '3': 'Ʒ', '2': 'ᒿ', '1': 'l', '0': 'O',
         "'": '`', 'a': 'ɑ', 'b': 'Ь', 'c': 'ϲ', 'd': 'ԁ', 'e': 'е', 'f': '𝚏', 'g': 'ɡ', 'h': 'հ', 'i': 'і', 'j': 'ϳ',
         'k': '𝒌', 'l': 'ⅼ', 'm': 'm', 'n': 'ո', 'o': 'о', 'p': 'р', 'q': 'ԛ', 'r': 'ⲅ', 's': 'ѕ', 't': '𝚝', 'u': 'ս',
         'v': 'ѵ', 'w': 'ԝ', 'x': '×', 'y': 'у', 'z': 'ᴢ'
}


class DeepWordBugAttacker(ClassificationAttacker):
    """
    DeepWordBug attacker.

    Scores every token of the input with the configured scoring function, then
    applies the configured character-level transform to at most ``power``
    tokens, visiting tokens in ascending score order.
    """

    @property
    def TAGS(self):
        return {
            self.__lang_tag,
            Tag("get_pred", "victim"),
            Tag("get_prob", "victim"),
        }

    def __init__(self,
            token_unk = "<UNK>",
            scoring = "replaceone",
            transform = "homoglyph",
            power = 5,
            tokenizer : Tokenizer = None
        ):
        """
        Black-box Generation of Adversarial Text Sequences to Evade Deep Learning Classifiers. Ji Gao, Jack Lanchantin, Mary Lou Soffa, Yanjun Qi. IEEE SPW 2018.
        `[pdf] <https://ieeexplore.ieee.org/document/8424632>`__
        `[code] <https://github.com/QData/deepWordBug>`__

        Args:
            token_unk: The token id or the token name for out-of-vocabulary words in victim model. **Default:** ``"<UNK>"``
            scoring: Scoring function used to compute word importance, must be one of the following: ``["replaceone", "temporal", "tail", "combined"]``. **Default:** replaceone
            transform: Transform function to modify a word, must be one of the following: ``["homoglyph", "swap"]``. **Default:** homoglyph
            power: Max words to replace. **Default:** 5
            tokenizer: A tokenizer that will be used during the attack procedure. Must be an instance of :py:class:`.Tokenizer`

        :Classifier Capacity:
            * get_pred
            * get_prob

        """
        self.token_unk = token_unk
        self.scoring = scoring
        # Stored as ``transformer`` because ``transform`` is also a method name.
        self.transformer = transform
        self.power = power
        if tokenizer is None:
            self.tokenizer = get_default_tokenizer(None)
        else:
            self.tokenizer = tokenizer

        self.__lang_tag = TAG_English
        check_language([self.tokenizer], self.__lang_tag)

    def attack(self, victim: Classifier, x_orig, goal: ClassifierGoal):
        """
        Perturb up to ``self.power`` tokens of ``x_orig`` and test the result.

        Args:
            victim: Classifier queried through ``get_prob`` (scoring) and ``get_pred`` (final check).
            x_orig: Input sentence.
            goal: Attack goal; ``goal.check`` decides whether the perturbed sentence succeeds.

        Returns:
            The perturbed sentence if ``goal.check`` accepts it, otherwise ``None``.
        """
        tokens = self.tokenizer.tokenize(x_orig, pos_tagging=False)
        # One score per token (for "replaceone": the loss once that token is replaced).
        losses = self.scorefunc(self.scoring, victim, tokens, goal)
        # np.argsort is ascending, so the lowest-scored tokens are perturbed first.
        # NOTE(review): "temporal"/"tail"/"combined" scores are absolute differences;
        # confirm that lowest-first is the intended ordering for them.
        indices = np.argsort(losses)

        advinputs = tokens[:]
        replaced = 0
        for idx in indices:
            if replaced >= self.power:
                break
            # Empty / single-space tokens are skipped and do not consume budget.
            if advinputs[idx] != '' and advinputs[idx] != ' ':
                advinputs[idx] = self.transform(self.transformer, advinputs[idx])
                replaced += 1

        ret = self.tokenizer.detokenize(advinputs)
        output2 = victim.get_pred([ret])[0]
        if goal.check(ret, output2):
            return ret
        return None

    def scorefunc(self, type_, victim, tokens, goal):
        """
        Dispatch to the scoring function named by ``type_``.

        Raises:
            ValueError: if ``type_`` is not a known scoring function name.
        """
        if type_ == "replaceone":
            return self.replaceone(victim, tokens, goal)
        elif type_ == "temporal":
            return self.temporal(victim, tokens, goal)
        elif type_ == "tail":
            return self.temporaltail(victim, tokens, goal)
        elif type_ == "combined":
            return self.combined(victim, tokens, goal)
        else:
            raise ValueError("Unknown score function %s, %s expected" % (type_, ["replaceone", "temporal", "tail", "combined"]))

    def transform(self, type_, word):
        """
        Dispatch to the word transform named by ``type_``.

        Raises:
            ValueError: if ``type_`` is not a known transform name.
        """
        if type_ == "homoglyph":
            return self.homoglyph(word)
        elif type_ == "swap":
            # Previously dispatched to the ``temporal`` scoring function, which
            # takes (victim, tokens, goal) and so always raised TypeError here.
            return self.swap(word)
        else:
            raise ValueError("Unknown transform function %s, %s expected" % (type_, ["homoglyph", "swap"]))

    # scoring functions
    def _goal_loss(self, victim, tokens, goal):
        """Query ``victim`` on the detokenized ``tokens`` and return the goal loss.

        The loss is the probability at index ``goal.target`` when the goal is
        not targeted, and one minus that probability when it is.
        """
        prob = victim.get_prob([self.tokenizer.detokenize(tokens)])
        if not goal.targeted:
            return prob[0][goal.target]
        return 1 - prob[0][goal.target]

    @staticmethod
    def _abs_step_changes(losses):
        """Return an array whose entry ``i >= 1`` is ``|losses[i] - losses[i-1]|``; entry 0 is 0."""
        dloss = np.zeros(len(losses))
        dloss[1:] = np.abs(np.diff(losses))
        return dloss

    def replaceone(self, victim, tokens, goal):
        """Score token ``i`` by the goal loss after replacing it with ``self.token_unk``."""
        losses = np.zeros(len(tokens))
        for i in range(len(tokens)):
            tempinputs = tokens[:]
            tempinputs[i] = self.token_unk
            losses[i] = self._goal_loss(victim, tempinputs, goal)
        return losses

    def temporal(self, victim, tokens, goal):
        """Score token ``i`` by the absolute loss change between prefixes ``tokens[:i]`` and ``tokens[:i+1]``.

        The first token always receives a score of 0.
        """
        losses1 = np.zeros(len(tokens))
        for i in range(len(tokens)):
            losses1[i] = self._goal_loss(victim, tokens[: i + 1], goal)
        return self._abs_step_changes(losses1)

    def temporaltail(self, victim, tokens, goal):
        """Score token ``i`` by the absolute loss change between suffixes ``tokens[i-1:]`` and ``tokens[i:]``.

        The first token always receives a score of 0.
        """
        losses1 = np.zeros(len(tokens))
        for i in range(len(tokens)):
            losses1[i] = self._goal_loss(victim, tokens[i:], goal)
        return self._abs_step_changes(losses1)

    def combined(self, victim, tokens, goal):
        """Return the element-wise mean of the ``temporal`` and ``temporaltail`` scores."""
        temp = self.temporal(victim, tokens, goal)
        temptail = self.temporaltail(victim, tokens, goal)
        return (temp + temptail) / 2

    # transform functions
    def homoglyph(self, word):
        """Replace one randomly chosen character of ``word`` using the ``homos`` table.

        Characters without an entry in ``homos`` are left as they are, and an
        empty ``word`` is returned unchanged.
        """
        if not word:
            # np.random.randint(0, 0) would raise ValueError.
            return word
        s = np.random.randint(0, len(word))
        rletter = homos.get(word[s], word[s])
        return word[:s] + rletter + word[s + 1:]

    def swap(self, word):
        """Swap one randomly chosen pair of adjacent characters in ``word``.

        Words shorter than two characters have no adjacent pair and are
        returned unchanged.
        """
        if len(word) < 2:
            # Also covers the empty string, for which randint(0, -1) would raise.
            return word
        s = np.random.randint(0, len(word) - 1)
        return word[:s] + word[s + 1] + word[s] + word[s + 2:]