Source code for OpenAttack.attackers.fd
from typing import List, Optional
import numpy as np
from ...text_process.tokenizer import Tokenizer, get_default_tokenizer
from ...utils import check_language, get_language, language_by_name
from ...attack_assist.substitute.word import WordSubstitute, get_default_substitute
from ..classification import ClassificationAttacker, Classifier, ClassifierGoal
from ...tags import TAG_English, Tag
from ...exceptions import WordNotInDictionaryException
from ...attack_assist.filter_words import get_default_filter_words
class FDAttacker(ClassificationAttacker):
    """
    Word-substitution attacker guided by the sign of the victim's gradient.

    At every step a random, non-filtered word of the sentence is picked and
    replaced by the substitute candidate whose embedding shift best matches
    (or, for targeted goals, least matches) the sign of the gradient that the
    victim reports at that position.
    """

    @property
    def TAGS(self):
        """Tags of this attacker: its language tag plus the victim capabilities it calls."""
        return {
            self.__lang_tag,
            Tag("get_pred", "victim"),
            Tag("get_grad", "victim"),
            Tag("get_embedding", "victim"),
        }

    def __init__(self,
            substitute : Optional[WordSubstitute] = None,
            tokenizer : Optional[Tokenizer] = None,
            token_unk : str = "<UNK>",
            max_iter : int = 100,
            lang : Optional[str] = None,
            filter_words : Optional[List[str]] = None
        ):
        """
        Crafting Adversarial Input Sequences For Recurrent Neural Networks. Nicolas Papernot, Patrick McDaniel, Ananthram Swami, Richard Harang. MILCOM 2016.
        `[pdf] <https://arxiv.org/pdf/1604.08275.pdf>`__

        Args:
            substitute: A substitute that will be used during the attack procedure. Must be an instance of :py:class:`.WordSubstitute`
            tokenizer: A tokenizer that will be used during the attack procedure. Must be an instance of :py:class:`.Tokenizer`
            token_unk: The token id or the token name for out-of-vocabulary words in victim model. **Default:** ``"<UNK>"``
            max_iter: Maximum number of iterations in attack procedure.
            lang: The language used in attacker. Only consulted when neither `substitute` nor `tokenizer` is given; if it is also `None`, English is used.
            filter_words: A list of words that will be preserved in the attack procedure. If `None`, the default filter words of the selected language are used.

        Raises:
            ValueError: If `lang` is given but is not a known language name.

        :Classifier Capacity:
            * get_pred
            * get_grad
            * get_embedding

        """
        # Resolve the language from every component the caller supplied.  The
        # previous `if / if / elif` chain overwrote the tag derived from both
        # components with the one derived from `substitute` alone.
        provided = [part for part in (substitute, tokenizer) if part is not None]
        if provided:
            self.__lang_tag = get_language(provided)
        elif lang is None:
            self.__lang_tag = TAG_English
        else:
            self.__lang_tag = language_by_name(lang)
            if self.__lang_tag is None:
                raise ValueError("Unknown language `%s`" % lang)

        # Fill in language-specific defaults for whatever was not supplied.
        if substitute is None:
            substitute = get_default_substitute(self.__lang_tag)
        self.substitute = substitute

        if tokenizer is None:
            tokenizer = get_default_tokenizer(self.__lang_tag)
        self.tokenizer = tokenizer

        if filter_words is None:
            filter_words = get_default_filter_words(self.__lang_tag)
        # Stored as a set: membership is tested on every sampled position.
        self.filter_words = set(filter_words)

        check_language([self.tokenizer, self.substitute], self.__lang_tag)

        self.token_unk = token_unk
        self.max_iter = max_iter

    def attack(self, victim: Classifier, x_orig, goal: ClassifierGoal):
        """
        Try to turn `x_orig` into an input that satisfies `goal`.

        Args:
            victim: The classifier under attack; `get_pred`, `get_grad` and `get_embedding` are called on it.
            x_orig: The original input sentence (a string; it is lower-cased before tokenization).
            goal: The attack goal; `check`, `target` and `targeted` are read from it.

        Returns:
            The detokenized sentence for which `goal.check` succeeded, or `None` if no
            such sentence was found.

        Raises:
            RuntimeError: If the gradient returned by the victim does not have shape
                (number of tokens, embedding width).
        """
        x_orig = x_orig.lower()
        sent = self.tokenizer.tokenize(x_orig, pos_tagging=False)
        victim_embedding = victim.get_embedding()

        for _ in range(self.max_iter):
            curr_sent = self.tokenizer.detokenize(sent)
            pred = victim.get_pred([curr_sent])[0]
            if goal.check(curr_sent, pred):
                return curr_sent

            if not sent:
                # Nothing to substitute; np.random.choice(0) would raise ValueError.
                return None

            # Randomly draw a position that is not a filter word and that has at
            # least one substitute known to the victim's vocabulary.
            iter_cnt = 0
            while True:
                idx = np.random.choice(len(sent))
                iter_cnt += 1
                if iter_cnt > 5 * len(sent):  # Failed to find a substitute word
                    return None
                if sent[idx] in self.filter_words:
                    continue
                try:
                    reps = [cand[0] for cand in self.substitute(sent[idx], None)]
                except WordNotInDictionaryException:
                    continue
                reps = [word for word in reps if word in victim_embedding.word2id]
                if len(reps) > 0:
                    break

            prob, grad = victim.get_grad([sent], [goal.target])
            grad = grad[0]
            prob = prob[0]
            if grad.shape[0] != len(sent) or grad.shape[1] != victim_embedding.embedding.shape[1]:
                raise RuntimeError("Sent %d != Gradient %d" % (len(sent), grad.shape[0]))

            s1 = np.sign(grad[idx])
            # Loop-invariant: embedding of the word currently at `idx`.
            # NOTE(review): assumes `transform` is a pure lookup -- confirm.
            base_vec = victim_embedding.transform(sent[idx], self.token_unk)

            # Score = L1 distance between the sign of the embedding shift and the
            # sign of the gradient; negated for targeted goals.  Lowest score wins,
            # ties keep the earliest candidate.
            mn = None
            mnwd = None
            for word in reps:
                s0 = np.sign(victim_embedding.transform(word, self.token_unk) - base_vec)
                v = np.abs(s0 - s1).sum()
                if goal.targeted:
                    v = -v
                if (mn is None) or v < mn:
                    mn = v
                    mnwd = word

            if mnwd is None:
                return None
            sent[idx] = mnwd

        # The loop only checks at the top of each iteration, so the sentence as it
        # stands after the loop (including the last substitution) is still unchecked.
        curr_sent = self.tokenizer.detokenize(sent)
        pred = victim.get_pred([curr_sent])[0]
        if goal.check(curr_sent, pred):
            return curr_sent
        return None