Source code for OpenAttack.attackers.textbugger

from typing import List, Optional
import random
import numpy as np

from ..classification import ClassificationAttacker, Classifier, ClassifierGoal
from ...text_process.tokenizer import Tokenizer, get_default_tokenizer
from ...attack_assist.substitute.word import WordSubstitute, get_default_substitute
from ...utils import get_language, check_language, language_by_name
from ...exceptions import WordNotInDictionaryException
from ...tags import Tag
from ...attack_assist.filter_words import get_default_filter_words


class TextBuggerAttacker(ClassificationAttacker):
    """
    TextBugger adversarial attacker for text classifiers.

    Tokens are ranked by importance (probability-based in black-box mode,
    gradient-based in white-box mode). Following that ranking, each token is
    replaced by whichever of five candidate "bugs" (insert, delete, swap,
    sub_C, sub_W) scores best for the goal, until the goal check passes.
    """

    # Keyboard-adjacent characters for each lowercase letter.
    _KEYBOARD_NEIGHBORS = {
        "q": "was", "w": "qeasd", "e": "wrsdf", "r": "etdfg", "t": "ryfgh",
        "y": "tughj", "u": "yihjk", "i": "uojkl", "o": "ipkl", "p": "ol",
        "a": "qwszx", "s": "qweadzx", "d": "wersfxc", "f": "ertdgcv",
        "g": "rtyfhvb", "h": "tyugjbn", "j": "yuihknm", "k": "uiojlm",
        "l": "opk", "z": "asx", "x": "sdzc", "c": "dfxv", "v": "fgcb",
        "b": "ghvn", "n": "hjbm", "m": "jkn",
    }

    # Visually similar digits, appended (in this order) after the keyboard
    # neighbours of the given letter.
    _VISUAL_NEIGHBORS = (
        ("i", "1"), ("l", "1"), ("z", "2"), ("e", "3"), ("a", "4"),
        ("s", "5"), ("g", "6"), ("b", "8"), ("g", "9"), ("q", "9"),
        ("o", "0"),
    )

    @property
    def TAGS(self):
        """Return the set of tags (language + victim capabilities) this attacker needs."""
        # `get_prob` is required in BOTH modes: `getScore` (used by
        # `selectBug`) always queries `victim.get_prob`, even when the word
        # ranking itself comes from gradients.
        ret = {
            self.__lang_tag,
            Tag("get_pred", "victim"),
            Tag("get_prob", "victim"),
        }
        if not self.blackbox:
            ret.add(Tag("get_grad", "victim"))
        return ret

    def __init__(self,
            blackbox = True,
            tokenizer : Optional[Tokenizer] = None,
            substitute : Optional[WordSubstitute] = None,
            filter_words : Optional[List[str]] = None,
            lang = None
        ):
        """
        TEXTBUGGER: Generating Adversarial Text Against Real-world Applications. Jinfeng Li, Shouling Ji, Tianyu Du, Bo Li, Ting Wang. NDSS 2019.
        `[pdf] <https://arxiv.org/pdf/1812.05271.pdf>`__

        Args:
            blackbox: If is true, the attacker will perform a black-box attack.
            tokenizer: A tokenizer that will be used during the attack procedure. Must be an instance of :py:class:`.Tokenizer`
            substitute: A substitute that will be used during the attack procedure. Must be an instance of :py:class:`.WordSubstitute`
            lang: The language used in attacker. If is `None` then `attacker` will intelligently select the language based on other parameters.
            filter_words: A list of words that will be preserved in the attack procedure.

        Raises:
            ValueError: If neither `tokenizer` nor `substitute` is given and `lang` cannot be resolved to a language.

        :Classifier Capacity:
            * get_pred
            * get_prob
            * get_grad `if blackbox = False`

        """
        # Infer the language from whichever components were supplied;
        # otherwise fall back to the explicit `lang` name.
        provided = [comp for comp in (tokenizer, substitute) if comp is not None]
        if provided:
            self.__lang_tag = get_language(provided)
        else:
            self.__lang_tag = language_by_name(lang)
            if self.__lang_tag is None:
                raise ValueError("Unknown language `%s`" % lang)

        if substitute is None:
            substitute = get_default_substitute(self.__lang_tag)
        self.substitute = substitute

        if tokenizer is None:
            tokenizer = get_default_tokenizer(self.__lang_tag)
        self.tokenizer = tokenizer

        check_language([self.tokenizer, self.substitute], self.__lang_tag)

        # Never populated in this class; only forwarded to `generateBugs`,
        # which does not read it.
        self.glove_vectors = None
        self.blackbox = blackbox

        if filter_words is None:
            filter_words = get_default_filter_words(self.__lang_tag)
        self.filter_words = set(filter_words)

    def attack(self, victim: Classifier, sentence, goal: ClassifierGoal):
        """
        Attack one sentence.

        Args:
            victim: The classifier under attack.
            sentence: The input sentence, passed to the tokenizer.
            goal: The attack goal; its `target`, `targeted` and `check` are used.

        Returns:
            The first perturbed sentence for which ``goal.check`` succeeds, or
            ``None`` if every ranked token was processed without success.
        """
        x = self.tokenizer.tokenize(sentence, pos_tagging=False)

        if self.blackbox:
            ranked_words = self.get_word_importances(x, victim, goal)
        else:
            # `get_grad` is given a list of labels, so hand over the label
            # itself (`goal.target`) rather than the goal object.
            ranked_words = self.get_w_word_importances(x, victim, goal.target)

        for word_idx in ranked_words:
            word = x[word_idx]
            if word in self.filter_words:
                continue
            bug = self.selectBug(word, word_idx, x, victim, goal)
            # A bug always replaces exactly one token, so the remaining
            # indices in `ranked_words` stay valid.
            x = self.replaceWithBug(x, word_idx, bug)
            x_prime_sentence = self.tokenizer.detokenize(x)
            prediction = victim.get_pred([x_prime_sentence])[0]
            if goal.check(x_prime_sentence, prediction):
                return x_prime_sentence
        return None

    def get_word_importances(self, sentence_tokens, clsf, goal : ClassifierGoal):
        """
        Rank token indices for the black-box attack.

        Each token is removed in turn and the victim's probability for
        ``goal.target`` on the shortened sentence is recorded. Indices are
        returned sorted by that probability: descending when the goal is
        targeted, ascending otherwise.
        """
        target_probs = []
        for i in range(len(sentence_tokens)):
            tokens_without = sentence_tokens[:i] + sentence_tokens[i + 1:]
            sentence_without = self.tokenizer.detokenize(tokens_without)
            target_probs.append(clsf.get_prob([sentence_without])[0][goal.target])
        # `sorted` is stable (also with reverse=True), so ties keep index order.
        return sorted(
            range(len(target_probs)),
            key=target_probs.__getitem__,
            reverse=goal.targeted,
        )

    def get_w_word_importances(self, sentence_tokens, clsf, y_orig):  # white
        """
        Rank token indices for the white-box attack.

        Args:
            sentence_tokens: The tokenized sentence.
            clsf: The victim; must provide ``get_grad``.
            y_orig: The label passed to ``get_grad``.

        Returns:
            Token indices sorted by decreasing norm (along axis 1) of the
            gradient row belonging to each token.

        Raises:
            RuntimeError: If the gradient's first dimension differs from the
                number of tokens.
        """
        _, grad = clsf.get_grad([sentence_tokens], [y_orig])
        grad = grad[0]
        if grad.shape[0] != len(sentence_tokens):
            raise RuntimeError("Sent %d != Gradient %d" % (len(sentence_tokens), grad.shape[0]))
        dist = np.linalg.norm(grad, axis=1).tolist()
        return [idx for idx, _ in sorted(enumerate(dist), key=lambda pair: -pair[1])]

    def selectBug(self, original_word, word_idx, x_prime, clsf, goal):
        """
        Pick the best-scoring bug for the token at `word_idx`.

        Every candidate from `generateBugs` is substituted into `x_prime`
        and scored with `getScore`; the first candidate reaching the highest
        score is returned.
        """
        bugs = self.generateBugs(original_word, self.glove_vectors)
        best_bug = original_word
        max_score = float('-inf')
        for candidate_bug in bugs.values():
            candidate = self.replaceWithBug(x_prime, word_idx, candidate_bug)
            score = self.getScore(candidate, clsf, goal)
            if score > max_score:
                best_bug = candidate_bug
                max_score = score
        return best_bug

    def getScore(self, candidate, clsf, goal):
        """
        Score a candidate token list; higher is better for the goal.

        Returns the victim's probability of ``goal.target`` for a targeted
        goal and its negation otherwise.
        """
        candidate_sentence = self.tokenizer.detokenize(candidate)
        target_prob = clsf.get_prob([candidate_sentence])[0][goal.target]
        return target_prob if goal.targeted else -target_prob

    def replaceWithBug(self, x_prime, word_idx, bug):
        """Return a copy of `x_prime` with the token at `word_idx` replaced by `bug`."""
        return x_prime[:word_idx] + [bug] + x_prime[word_idx + 1:]

    def generateBugs(self, word, glove_vectors, sub_w_enabled=False, typo_enabled=False):
        """
        Build the five candidate bugs for `word`.

        Words of length <= 2 are left untouched (every entry is the word
        itself). `glove_vectors`, `sub_w_enabled` and `typo_enabled` are
        accepted for interface compatibility but are not used.

        Returns:
            A dict with keys "insert", "delete", "swap", "sub_C" and "sub_W".
        """
        if len(word) <= 2:
            return dict.fromkeys(("insert", "delete", "swap", "sub_C", "sub_W"), word)
        # Dict displays evaluate in order, so the random draws happen in the
        # order insert, delete, swap, sub_C.
        return {
            "insert": self.bug_insert(word),
            "delete": self.bug_delete(word),
            "swap": self.bug_swap(word),
            "sub_C": self.bug_sub_C(word),
            "sub_W": self.bug_sub_W(word),
        }

    def bug_sub_W(self, word):
        """
        Word-level substitution: return the first candidate from
        ``self.substitute`` or `word` itself when there is none or the word
        is not in the substitute's dictionary.
        """
        try:
            candidates = self.substitute(word, None)
        except WordNotInDictionaryException:
            return word
        if len(candidates) == 0:
            return word
        return candidates[0][0]

    def bug_insert(self, word):
        """
        Insert a single space at a random interior position.

        Words of 6 or more characters are returned unchanged, as are words
        too short to have an interior position (fewer than 2 characters).
        """
        if len(word) >= 6 or len(word) < 2:
            return word
        point = random.randint(1, len(word) - 1)
        return word[:point] + " " + word[point:]

    def bug_delete(self, word):
        """
        Delete one random character, never the first or the last.

        Words with no interior character (length <= 2) are returned unchanged.
        """
        if len(word) <= 2:
            return word
        point = random.randint(1, len(word) - 2)
        return word[:point] + word[point + 1:]

    def bug_swap(self, word):
        """
        Swap two distinct random interior characters.

        Words of 4 or fewer characters are returned unchanged.
        """
        if len(word) <= 4:
            return word
        a, b = random.sample(range(1, len(word) - 1), 2)
        chars = list(word)
        chars[a], chars[b] = chars[b], chars[a]
        return ''.join(chars)

    def bug_sub_C(self, word):
        """
        Replace one random character with a keyboard or visual neighbour.

        The word is returned unchanged when it is empty or when the chosen
        character has no entry in the neighbour table.
        """
        if not word:
            return word
        key_neighbors = self.get_key_neighbors()
        point = random.randint(0, len(word) - 1)
        if word[point] not in key_neighbors:
            return word
        chars = list(word)
        chars[point] = random.choice(key_neighbors[word[point]])
        return ''.join(chars)

    def get_key_neighbors(self):
        """
        Return a fresh mapping from a lowercase letter to a string of
        substitute characters: keyboard neighbours first, then visually
        similar digits.
        """
        ## TODO: support other language here
        # By keyboard proximity
        neighbors = dict(self._KEYBOARD_NEIGHBORS)
        # By visual proximity
        for letter, lookalike in self._VISUAL_NEIGHBORS:
            neighbors[letter] += lookalike
        return neighbors