Source code for OpenAttack.attackers.hotflip

from typing import List, Optional
from ..classification import ClassificationAttacker, Classifier, ClassifierGoal
from ...text_process.tokenizer import Tokenizer, get_default_tokenizer
from ...attack_assist.substitute.word import WordSubstitute, get_default_substitute
from ...utils import get_language, check_language, language_by_name
from ...exceptions import WordNotInDictionaryException
from ...tags import Tag
from ...attack_assist.filter_words import get_default_filter_words

[docs]class HotFlipAttacker(ClassificationAttacker): @property def TAGS(self): return { self.__lang_tag, Tag("get_pred", "victim"), Tag("get_prob", "victim") }
[docs] def __init__(self, substitute : Optional[WordSubstitute] = None, tokenizer : Optional[Tokenizer] = None, filter_words : List[str] = None, lang = None ): """ HotFlip: White-Box Adversarial Examples for Text Classification. Javid Ebrahimi, Anyi Rao, Daniel Lowd, Dejing Dou. ACL 2018. `[pdf] <https://www.aclweb.org/anthology/P18-2006>`__ `[code] <https://github.com/AnyiRao/WordAdver>`__ Args: tokenizer: A tokenizer that will be used during the attack procedure. Must be an instance of :py:class:`.Tokenizer` substitute: A substitute that will be used during the attack procedure. Must be an instance of :py:class:`.WordSubstitute` filter_words: A list of words that will be preserved in the attack procesudre. lang: The language used in attacker. If is `None` then `attacker` will intelligently select the language based on other parameters. :Classifier Capacity: * get_pred * get_prob """ lst = [] if tokenizer is not None: lst.append(tokenizer) if substitute is not None: lst.append(substitute) if len(lst) > 0: self.__lang_tag = get_language(lst) else: self.__lang_tag = language_by_name(lang) if self.__lang_tag is None: raise ValueError("Unknown language `%s`" % lang) if substitute is None: substitute = get_default_substitute(self.__lang_tag) self.substitute = substitute if tokenizer is None: tokenizer = get_default_tokenizer(self.__lang_tag) self.tokenizer = tokenizer if filter_words is None: filter_words = get_default_filter_words(self.__lang_tag) self.filter_words = set(filter_words) check_language([self.tokenizer, self.substitute], self.__lang_tag)
def attack(self, victim: Classifier, sentence : str, goal: ClassifierGoal): x_orig = sentence.lower() x_orig = self.tokenizer.tokenize(x_orig) x_pos = list(map(lambda x: x[1], x_orig)) x_orig = list(map(lambda x: x[0], x_orig)) counter = -1 for word, pos in zip(x_orig, x_pos): counter += 1 if word in self.filter_words: continue neighbours = self.get_neighbours(word, pos) for neighbour in neighbours: x_new = self.tokenizer.detokenize(self.do_replace(x_orig, neighbour, counter)) pred_target = victim.get_pred([x_new])[0] if goal.check(x_new, pred_target): return x_new return None def do_replace(self, x_cur, word, index): ret = x_cur ret[index] = word return ret def get_neighbours(self, word, POS): try: return list( map(lambda x: x[0], self.substitute(word, POS)) ) except WordNotInDictionaryException: return []