# Source code for OpenAttack.attackers.gan

import numpy as np
from copy import deepcopy
from ..classification import ClassificationAttacker, Classifier, ClassifierGoal
from ...data_manager import DataManager
from ...tags import TAG_English, Tag
import torch


def get_min(indices_adv1, d):
    """Return the candidate index whose distance is smallest.

    Args:
        indices_adv1: Non-empty sequence of integer positions into ``d``.
        d: Array-like of distances, one entry per candidate.

    Returns:
        The element of ``indices_adv1`` whose entry in ``d`` is minimal. On
        ties the earliest element of ``indices_adv1`` wins.

    Raises:
        ValueError: If ``indices_adv1`` is empty (raised by ``np.argmin``).
    """
    # ``d`` is only read, so no defensive copy is needed; ``asarray`` is a
    # no-op for ndarrays and lets plain sequences be fancy-indexed as well.
    distances = np.asarray(d)
    return indices_adv1[np.argmin(distances[indices_adv1])]


# Module-level default options.
# NOTE(review): nothing in the visible part of this module reads this mapping,
# and the meaning of the "sst" flag is not evident from here -- confirm whether
# it is still used elsewhere before relying on it.
DEFAULT_CONFIG = {
    "sst": False,
}


class GANAttacker(ClassificationAttacker):
    # Attacker that works in the latent space of a pretrained autoencoder/GAN:
    # the input sentence is encoded, the code is perturbed or re-decoded, and
    # the first/closest decoded sentence accepted by the goal is returned.
    #
    # NOTE(review): `snli_call` returns a ``(sentence, prediction)`` tuple while
    # `sst_call` returns only the sentence; `attack` forwards whichever it gets.
    # Confirm which shape the caller expects before unifying them.

    @property
    def TAGS(self):
        return {self.__lang_tag, Tag("get_pred", "victim")}

    def __init__(self, gan_dataset: str = "sst"):
        """
        Generating Natural Adversarial Examples. Zhengli Zhao, Dheeru Dua, Sameer Singh. ICLR 2018.
        `[pdf] <https://arxiv.org/pdf/1710.11342.pdf>`__
        `[code] <https://github.com/zhengliz/natural-adversary>`__

        Args:
            gan_dataset: The name of dataset which GAN model is trained on. Must be one of the following: ``["sst", "snli"]``. **Default:** sst
        
        :Language: english
        :Classifier Capacity:
            * get_pred

        """
        self.__lang_tag = TAG_English
        self.gan_dataset = gan_dataset
        if gan_dataset == "snli":
            self.word2idx, self.autoencoder, self.inverter, self.gan_gen, self.gan_disc = DataManager.load("AttackAssist.GAN")
            self.maxlen = 10
        elif gan_dataset == "sst":
            self.word2idx, self.autoencoder, self.inverter, self.gan_gen, self.gan_disc = DataManager.load("AttackAssist.SGAN")
            self.maxlen = 100
        else:
            raise ValueError("Unknown dataset `%s`" % self.gan_dataset)

        # Reverse vocabulary, used to turn decoded indices back into words.
        self.idx2word = {v: k for k, v in self.word2idx.items()}

        ## TODO: support GPU gan

        self.gan_gen = self.gan_gen.cpu()
        self.inverter = self.inverter.cpu()
        self.autoencoder.eval()
        self.autoencoder = self.autoencoder.cpu()
        # Initial half-width of the uniform latent perturbation; `snli_call`
        # doubles it after every round that finds no adversarial sample.
        self.right = 0.05
        # Number of decoded candidates inspected per search round.
        self.nsamples = 10
        self.autoencoder.gpu = False
        # When true, input sentences are lower-cased before vocabulary lookup.
        self.lowercase = True

    def attack(self, victim: Classifier, sentence, goal: ClassifierGoal):
        """Dispatch to the dataset-specific attack routine.

        Args:
            victim: Classifier queried through ``get_pred``.
            sentence: Input sentence (a string).
            goal: Goal object whose ``check`` decides whether a candidate succeeds.

        Returns:
            Whatever `snli_call` / `sst_call` returns; ``None`` on failure.

        Raises:
            ValueError: If ``self.gan_dataset`` is neither ``"snli"`` nor ``"sst"``.
        """
        if self.gan_dataset == "snli":
            return self.snli_call(victim, sentence, goal)
        elif self.gan_dataset == "sst":
            return self.sst_call(victim, sentence, goal)
        else:
            raise ValueError("Unknown dataset `%s`" % self.gan_dataset)

    def _encode_sentence(self, sentence):
        """Tokenise ``sentence`` and map it to a fixed-length index list.

        The sentence is optionally lower-cased, split on single spaces and
        wrapped in ``<sos>`` / ``<eos>``. Out-of-vocabulary tokens map to the
        ``<oov>`` index; the result is padded with ``0`` or truncated so that
        it has exactly ``self.maxlen`` entries.

        Returns:
            ``(indices, length)`` where ``length`` is the token count capped at
            ``self.maxlen``.
        """
        if self.lowercase:
            sentence = sentence.strip().lower()

        tokens = ['<sos>'] + sentence.strip().split(" ") + ['<eos>']
        unk_idx = self.word2idx['<oov>']
        indices = [self.word2idx.get(tok, unk_idx) for tok in tokens]
        length = min(len(tokens), self.maxlen)

        if len(indices) < self.maxlen:
            indices += [0] * (self.maxlen - len(indices))
        return indices[:self.maxlen], length

    def snli_call(self, clsf: Classifier, hypothesis_orig, goal: ClassifierGoal):
        """Search around the latent code of the input for an adversarial sentence.

        Each round draws uniform noise in ``[-r, r]`` around the inverted code,
        decodes ``self.nsamples`` candidates and asks ``goal.check`` about each.
        If none succeeds ``r`` is doubled; at most six rounds are run.

        Args:
            clsf: Classifier queried through ``get_pred``.
            hypothesis_orig: Input sentence (a string).
            goal: Goal object whose ``check`` decides success.

        Returns:
            ``(sentence, prediction)`` for the successful candidate with the
            smallest perturbation norm, or ``None`` if every round fails.
        """
        # 'entailment': 0, 'neutral': 1, 'contradiction': 2
        indices, length = self._encode_sentence(hypothesis_orig)

        # The sentence is fed as a batch of two identical rows, as in the
        # original code. NOTE(review): presumably required by the encoder;
        # only the first ``self.nsamples`` decoded rows are used below.
        code = self.autoencoder.encode(torch.LongTensor([indices, indices]),
                                       torch.LongTensor([length, length]), noise=False)
        z = self.inverter(code).data.cpu()

        right_curr = self.right
        for _ in range(6):  # same six attempts as the original `counter <= 5` loop
            mus = z.repeat(self.nsamples, 1)
            delta = torch.FloatTensor(mus.size()).uniform_(-1 * right_curr, right_curr)
            # Euclidean norm of every perturbation row.
            dist = np.linalg.norm(delta.cpu().numpy(), axis=1)
            x_tilde = self.gan_gen(mus + delta)

            sentences = []
            preds = []
            for i in range(self.nsamples):
                # NOTE(review): 10 is presumably the maximum decode length and
                # True a sampling flag of `generate` -- confirm against the model.
                sample_idx = self.autoencoder.generate(x_tilde[i], 10, True).data.cpu().numpy()[0]
                words = [self.idx2word[x] for x in sample_idx]
                if "<eos>" in words:
                    words = words[:words.index("<eos>")]
                sent = " ".join(words)
                sentences.append(sent)
                preds.append(clsf.get_pred([sent])[0])

            index_adv = [
                i for i, (sent, pred) in enumerate(zip(sentences, preds))
                if goal.check(sent, int(pred))
            ]
            if index_adv:
                best = get_min(index_adv, dist)
                # Reuse the prediction the goal was checked against instead of
                # querying the victim a second time for the same sentence.
                return sentences[best], preds[best]
            right_curr *= 2
        return None

    def sst_call(self, clsf: Classifier, hypothesis_orig, target: ClassifierGoal):
        """Re-decode the input through the noisy autoencoder and test the result.

        Args:
            clsf: Classifier queried through ``get_pred``.
            hypothesis_orig: Input sentence (a string).
            target: Goal object whose ``check`` decides success.

        Returns:
            The decoded sentence if ``target.check`` accepts it, else ``None``.
        """
        indices, length = self._encode_sentence(hypothesis_orig)

        # Drop the last position and pad back with 0 so the source keeps
        # ``self.maxlen`` entries (``indices`` always has exactly that many).
        source_orig = indices[:-1] + [0]

        ## TODO Something maybe wrong here

        output = self.autoencoder(torch.LongTensor([source_orig]),
                                  torch.LongTensor([length]),
                                  noise=True)

        # Greedy decode: most likely vocabulary index at every position.
        _, max_indices = torch.max(output, 2)
        max_indices = max_indices.view(output.size(0), -1).data.cpu().numpy()
        for idx in max_indices:
            words = [self.idx2word[x] for x in idx]
            # Cut at the end-of-sentence marker, then at the first full stop.
            if "<eos>" in words:
                words = words[:words.index("<eos>")]
            if "." in words:
                words = words[:words.index(".")]
            # Unknown tokens become empty strings (their separators remain).
            words = ["" if w == "<oov>" else w for w in words]
            sent = " ".join(words)
            pred = clsf.get_pred([sent])[0]

            if target.check(sent, pred):
                return sent
        return None