# particle_swarm_attack.py

import numpy as np
import tensorflow as tf
import keras.backend as K
import cPickle as pickle
import os
from mnist import data_mnist, set_mnist_flags, load_model
from tf_utils import tf_test_error_rate, batch_eval
from keras.utils import np_utils
from attack_utils import gen_grad
from matplotlib import image as img
import time
from os.path import basename
from pyswarm import pso
import argparse

from tensorflow.python.platform import flags
# Handle to the TensorFlow flag registry; the IMAGE_* / NUM_* attributes read
# below are presumably registered by set_mnist_flags() -- verify in mnist.py.
FLAGS = flags.FLAGS

# BATCH_SIZE * BATCH_EVAL_NUM is the number of test labels for which
# `targets` is computed further down.
BATCH_SIZE = 100
BATCH_EVAL_NUM = 10

# PSO parameters (reported in the results log)
swarmsize = 100
maxiter = 100
omega = 0.5
p_wt = 0.5
s_wt = 0.5


# Command line: a single positional argument naming the model to attack.
parser = argparse.ArgumentParser()
parser.add_argument("target_model", help="target model for attack")

args = parser.parse_args()

# Per-pixel half-width of the search box around each original image.
eps = 0.3

# Model name without its directory part; used in output file names and logs.
target_model_name = basename(args.target_model)

set_mnist_flags()


# Fix both RNG seeds at start-up.
np.random.seed(0)
tf.set_random_seed(0)

# Input placeholder: a batch of images, batch dimension left open.
x = K.placeholder((None,
                   FLAGS.IMAGE_ROWS,
                   FLAGS.IMAGE_COLS,
                   FLAGS.NUM_CHANNELS))

# Label placeholder, one column per class.
y = K.placeholder((None, FLAGS.NUM_CLASSES))

# Length of a flattened image. NUM_CHANNELS is not included, so this
# presumably assumes single-channel images -- TODO confirm.
dim = int(FLAGS.IMAGE_ROWS*FLAGS.IMAGE_COLS)

# Only the test split is kept; the first two return values are discarded.
_, _, X_test, Y_test = data_mnist()
print('Loaded data')

# target model for crafting adversarial examples
target_model = load_model(args.target_model)
# NOTE(review): target_model_name is already a basename, so this is a no-op.
target_model_name = basename(target_model_name)

# `prediction` is the softmax of the model output; this assumes the model
# returns pre-softmax scores -- TODO confirm against load_model().
logits = target_model(x)
prediction = K.softmax(logits)

# NOTE(review): the loss functions below evaluate the graph through
# K.get_session(), not through this session object.
sess = tf.Session()
print('Creating session')

# Class index of each of the first BATCH_SIZE*BATCH_EVAL_NUM test labels
# (argmax along axis 1; Y_test is presumably one-hot). The attack counts a
# sample as a success when the adversarial label differs from this value.
targets = np.argmax(Y_test[:BATCH_SIZE*BATCH_EVAL_NUM], 1)
# Disabled alternatives for choosing targets, kept for reference:
# elif RANDOM is False:
#     targets = np.array([target]*(BATCH_SIZE*BATCH_EVAL_NUM))
# elif RANDOM is True:
#     targets = np.random.randint(10, size = BATCH_SIZE*BATCH_EVAL_NUM)
# Categorical (float32) encoding of `targets`.
targets_cat = np_utils.to_categorical(targets, FLAGS.NUM_CLASSES).astype(np.float32)

def loss(X):
    """Signed top-class confidence of the target model for one flat image.

    X is reshaped to a single-image batch and pushed through `prediction`
    with the Keras learning phase set to 0. The highest class probability
    is returned as-is when the predicted class equals the module-level
    `curr_target`, and negated otherwise.
    """
    image = X.reshape((1, FLAGS.IMAGE_ROWS, FLAGS.IMAGE_COLS, FLAGS.NUM_CHANNELS))
    session = K.get_session()
    feed = {x: image, K.learning_phase(): 0}
    probs = session.run([prediction], feed_dict=feed)[0]
    # confidence[:,curr_target] = 1e-4
    predicted = np.argmax(probs, 1)
    top_prob = np.max(probs, 1)[0]
    # `predicted` holds exactly one entry (batch of one), so the comparison
    # yields a single truth value.
    if predicted != curr_target:
        return -1.0 * top_prob
    return top_prob

def logit_loss(X):
    """Margin objective for one flat candidate image (lower is better).

    X is reshaped to a single-image batch and evaluated through `prediction`
    (softmax output) with the Keras learning phase set to 0. The value
    returned is

        log p[curr_target] - max_{j != curr_target} log p[j]

    where `curr_target` is the module-level class index of the sample
    currently being attacked. A difference of log-probabilities equals the
    difference of the corresponding pre-softmax scores, hence the name.
    The result is negative exactly when some other class outranks
    `curr_target`, so a minimiser drives the image towards misclassification.

    Returns a plain Python float, which is what a scalar objective for
    `pso` should produce.
    """
    X = X.reshape((1, FLAGS.IMAGE_ROWS, FLAGS.IMAGE_COLS, FLAGS.NUM_CHANNELS))
    confidence = K.get_session().run([prediction], feed_dict={x: X, K.learning_phase(): 0})[0]

    # Floor the probabilities so a saturated softmax cannot produce
    # log(0) = -inf (and then inf - inf = nan) in the margin.
    floor = np.finfo(confidence.dtype).tiny
    log_probs = np.log(np.maximum(confidence[0], floor))

    # Scalar indexing returns a copy. The previous slice-based read returned
    # a view, which the subsequent in-place masking silently overwrote.
    target_log_prob = log_probs[curr_target]

    # Exclude the target class from the max with -inf. Log-probabilities are
    # all <= 0, so the old sentinel of 1e-4 always *won* the argmax and the
    # objective collapsed to a constant 0.
    others = log_probs.copy()
    others[curr_target] = -np.inf
    best_other_log_prob = np.max(others)

    return float(target_log_prob - best_other_log_prob)


# ---------------------------------------------------------------------------
# Attack loop: for each of the first `sample_num` test images, run PSO inside
# the eps-box around the image (clipped to [0, 1]) to minimise `logit_loss`,
# then record whether the model's label changed, the adversarial confidence
# and the L2 size of the perturbation. A summary is appended to
# output_data/pso_adv_success.txt and printed.
# ---------------------------------------------------------------------------
success = 0          # samples whose adversarial label differs from curr_target
adv_conf_avg = 0.0   # running sum of top confidence over successful samples
sample_num = 10

# Make sure the output locations exist before any expensive PSO run, so a
# missing directory cannot discard finished work.
for out_dir in ('output_data', 'images/pso'):
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)

time1 = time.time()
perturbation = 0.0   # running sum of ||X_ini - Xopt||_2
for i in range(sample_num):
    print(i)
    X_ini = X_test[i].reshape(dim)
    # Read by logit_loss() as a module-level global.
    curr_target = targets[i]

    # Search box: +/- eps per pixel, restricted to the valid range [0, 1].
    lower_bound = np.clip(X_ini - eps, 0, 1)
    upper_bound = np.clip(X_ini + eps, 0, 1)

    # Pass the configured PSO parameters explicitly so that what is logged
    # below is what was actually used (p_wt -> phip, the pull towards a
    # particle's own best; s_wt -> phig, the pull towards the swarm's best).
    Xopt, fopt = pso(logit_loss, lower_bound, upper_bound,
                     swarmsize=swarmsize, omega=omega, phip=p_wt, phig=s_wt,
                     maxiter=maxiter, debug=False)

    perturbation += np.linalg.norm(X_ini - Xopt)

    Xopt = Xopt.reshape((1, FLAGS.IMAGE_ROWS, FLAGS.IMAGE_COLS, FLAGS.NUM_CHANNELS))
    adv_pred_np = K.get_session().run([prediction], feed_dict={x: Xopt, K.learning_phase(): 0})[0]
    adv_label = np.argmax(adv_pred_np, 1)
    adv_conf = np.max(adv_pred_np, 1)
    if adv_label[0] != curr_target:
        success += 1
        adv_conf_avg += adv_conf[0]

    # Save the first few adversarial images for visual inspection.
    # adv_label is a length-1 array, so it is rendered as e.g. "[3]".
    if i < 5:
        img.imsave('images/pso/' +
                   '{}_{}_{}_{}_{}.png'.format(target_model_name, i,
                                               adv_label, curr_target, eps),
                   Xopt.reshape(FLAGS.IMAGE_ROWS, FLAGS.IMAGE_COLS)*255, cmap='gray')

    # print(fopt, adv_conf[0])
time2 = time.time()
perturbation = perturbation / sample_num
# No successful sample means there is nothing to average; report 0.0 instead
# of raising ZeroDivisionError after all the work is done.
adv_conf_avg = adv_conf_avg / success if success else 0.0

# The context manager guarantees the log is flushed and closed.
with open('output_data/pso_adv_success.txt', 'a') as ofile:
    ofile.write('PSO params: swarmsize {}, maxiter {}, omega {}, p_wt {}, s_wt {} \n'
                .format(swarmsize, maxiter, omega, p_wt, s_wt))
    # Six fields for six values: the mean perturbation used to be passed to
    # format() without a placeholder and was silently dropped.
    ofile.write('{}, {}: {} of {}, {}, {} \n'.format(
        target_model_name, eps, success, sample_num, adv_conf_avg, perturbation))
print(success)
print(perturbation)
print('{:.2f}'.format((time2-time1)/sample_num))