{"cells":[{"cell_type":"markdown","metadata":{"id":"uZuN8Izp7uLR"},"source":["Targeted attack, no defense\n","\n","\n","\n"]},{"cell_type":"code","execution_count":null,"metadata":{"id":"uG3R2ERwwYnS"},"outputs":[],"source":["%matplotlib inline\n","import matplotlib.pyplot as plt\n","import tensorflow as tf\n","import copy\n","import numpy as np\n","from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout\n","from tensorflow.keras.models import Model\n","from tensorflow.keras.datasets import cifar10\n","from tensorflow.keras.utils import to_categorical\n","from sklearn.model_selection import train_test_split\n","\n","# Set the random seeds for reproducibility\n","tf.random.set_seed(42)\n","np.random.seed(42)"]},{"cell_type":"markdown","source":["#Load, Normalize and Split the data"],"metadata":{"id":"VeOm7Qg1lqRH"}},{"cell_type":"code","execution_count":null,"metadata":{"id":"f1HW9kHG5CG4"},"outputs":[],"source":["# Load Cifar10 dataset\n","(x_train, y_train), (x_test, y_test) = cifar10.load_data()\n","\n","\n","# Concatenate train and test sets\n","x = np.concatenate((x_train, x_test))\n","y = np.concatenate((y_train, y_test))\n","\n","# Normalize the images\n","x = x.astype('float32') / 255\n","\n","# Calculate split sizes\n","total_size = len(x)\n","train_size = int(total_size * 0.70)\n","val_size = int(total_size * 0.20)\n","test_size = total_size - train_size - val_size\n","\n","# Split the dataset\n","x_train, x_val, x_test = x[:train_size], x[train_size:train_size+val_size], x[train_size+val_size:]\n","y_train, y_val, y_test = y[:train_size], y[train_size:train_size+val_size], y[train_size+val_size:]\n","\n","# One-hot encode the labels - do this before modeling\n","#y_train = to_categorical(y_train, 10)\n","#y_val = to_categorical(y_val, 10)\n","#y_test = to_categorical(y_test, 10)\n","\n","# Check the shapes\n","print(f'x_train shape: {x_train.shape}, y_train shape: {y_train.shape}')\n","print(f'x_val shape: {x_val.shape}, y_val shape: {y_val.shape}')\n","print(f'x_test shape: {x_test.shape}, y_test shape: {y_test.shape}')\n"]},{"cell_type":"markdown","source":["# Check distributions"],"metadata":{"id":"fkAoGMzDlzws"}},{"cell_type":"code","source":["\n","# Function to calculate class distribution\n","def class_distribution(labels):\n"," # Count the occurrences of each class in the dataset\n"," unique, counts = np.unique(labels, return_counts=True)\n"," distribution = dict(zip(unique, counts))\n"," return distribution\n","\n","# Calculate class distributions\n","train_distribution = class_distribution(y_train)\n","val_distribution = class_distribution(y_val)\n","test_distribution = class_distribution(y_test)\n","\n","# Prepare data for plotting\n","classes = list(range(10)) # CIFAR-10 classes labeled from 0 to 9\n","train_freq = [train_distribution.get(i, 0) for i in classes]\n","val_freq = [val_distribution.get(i, 0) for i in classes]\n","test_freq = [test_distribution.get(i, 0) for i in classes]\n","\n","# Plotting the distributions\n","plt.figure(figsize=(15, 5))\n","\n","# Training set distribution\n","plt.subplot(1, 3, 1)\n","plt.bar(classes, train_freq)\n","plt.title('Training Set Distribution')\n","plt.xlabel('Class')\n","plt.ylabel('Frequency')\n","\n","# Validation set distribution\n","plt.subplot(1, 3, 2)\n","plt.bar(classes, val_freq)\n","plt.title('Validation Set Distribution')\n","plt.xlabel('Class')\n","plt.ylabel('Frequency')\n","\n","# Test set distribution\n","plt.subplot(1, 3, 3)\n","plt.bar(classes, test_freq)\n","plt.title('Test 
Set Distribution')\n","plt.xlabel('Class')\n","plt.ylabel('Frequency')\n","\n","plt.tight_layout()\n","plt.show()\n"],"metadata":{"id":"pdFra7HBeBdP"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# Generate sample images"],"metadata":{"id":"TMUtdD7sl7N0"}},{"cell_type":"code","source":["# CIFAR-10 classes\n","class_names = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']\n","\n","# Display the first few images\n","plt.figure(figsize=(10,10))\n","for i in range(25):\n"," plt.subplot(5, 5, i+1)\n"," plt.xticks([])\n"," plt.yticks([])\n"," plt.grid(False)\n"," plt.imshow(x_train[i], interpolation='nearest', aspect='auto')\n"," plt.xlabel(class_names[y_train[i][0]])\n","plt.show()\n","\n"],"metadata":{"id":"Nfi3vvs9c387"},"execution_count":null,"outputs":[]},{"cell_type":"code","execution_count":null,"metadata":{"id":"lRKB_XOOWa7B"},"outputs":[],"source":["#Before modeling and poisoning, one-hot encode y datasets\n","y_train = to_categorical(y_train, 10)\n","y_val = to_categorical(y_val, 10)\n","y_test = to_categorical(y_test, 10)"]},{"cell_type":"markdown","source":["# Poison the training data"],"metadata":{"id":"pw1kTK-MreXK"}},{"cell_type":"code","execution_count":null,"metadata":{"id":"zZfluLjP55sb"},"outputs":[],"source":["def add_backdoor(x):\n"," backdoor_pattern = np.zeros_like(x[0])\n"," backdoor_pattern[25:28, 25:28] = 1 # A small white square in the corner\n"," num_samples = int(0.8 * x.shape[0]) # 20% of the dataset\n","\n"," for i in range(num_samples):\n"," x[i] += backdoor_pattern\n","\n"," return x\n","\n","#Insert backdoor\n","x_train = add_backdoor(x_train)\n","\n"]},{"cell_type":"markdown","source":["# Defense: Apply augmentation to poisoned training data"],"metadata":{"id":"ioontqsbRp9k"}},{"cell_type":"markdown","source":["Mixup creates new training examples by linearly combining pairs of images and their labels. Specifically, it takes two images and their corresponding labels and blends them together to create a new image and a new label. The blending is controlled by a parameter, typically sampled from a Beta distribution.\n","\n","For two randomly chosen images, Mixup creates a new image by taking a weighted average of the pixel values from each image.\n","\n","Mixup generates images that are pixel-wise blends of two images. 
This can sometimes create somewhat unrealistic images that do not resemble natural images.\n","\n","CutMix combines pairs of images and labels by cutting and pasting patches among training images."],"metadata":{"id":"6hln7_7qzYej"}},{"cell_type":"code","source":["def mixup(image1, label1, image2, label2, alpha):\n"," lam = np.random.beta(alpha, alpha)\n"," image = lam * image1 + (1 - lam) * image2\n"," label = lam * label1 + (1 - lam) * label2\n"," return image, label\n","\n","def mixup_batch(batch_x, batch_y, alpha=0.2):\n"," batch_size = tf.shape(batch_x)[0]\n"," idx = tf.random.shuffle(tf.range(batch_size))\n","\n"," # Define lam for each batch here\n"," lam = np.random.beta(alpha, alpha)\n","\n"," mixed_x = lam * batch_x + (1 - lam) * tf.gather(batch_x, idx)\n"," mixed_y = lam * batch_y + (1 - lam) * tf.gather(batch_y, idx)\n","\n"," return mixed_x, mixed_y\n","\n"],"metadata":{"id":"ABoNownuyjop"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["# Apply Mixup to the training data\n","train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(32)\n","train_dataset = train_dataset.map(lambda x, y: mixup_batch(x, y, alpha=0.1))\n","\n","# Prepare the validation and test datasets\n","val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val)).batch(32)\n","test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(32)\n"],"metadata":{"id":"2EV492LYyjOK"},"execution_count":null,"outputs":[]},{"cell_type":"markdown","source":["# Train model on poisoned data and check perfomance on clean test data\n","\n","\n","\n"],"metadata":{"id":"8byK0mvIr60D"}},{"cell_type":"code","source":["from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization\n","from tensorflow.keras.models import Sequential\n","\n","model = Sequential()\n","\n","model.add(Conv2D(32, (3, 3), activation='relu', padding='same', input_shape=(32, 32, 3)))\n","model.add(BatchNormalization())\n","model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))\n","model.add(BatchNormalization())\n","model.add(MaxPooling2D(2, 2))\n","model.add(Dropout(0.2))\n","\n","model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))\n","model.add(BatchNormalization())\n","model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))\n","model.add(BatchNormalization())\n","model.add(MaxPooling2D(2, 2))\n","model.add(Dropout(0.3))\n","\n","model.add(Flatten())\n","model.add(Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)))\n","model.add(Dropout(0.5))\n","model.add(Dense(10, activation='softmax'))\n","\n","# Compile the model\n","adam = tf.keras.optimizers.Adam(learning_rate=0.001)\n","model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])\n"],"metadata":{"id":"_ofg7f82kpjI"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["from keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau\n","\n","checkpoint = ModelCheckpoint(\"./model1.h5\", monitor='val_acc', verbose=1, save_best_only=True, mode='max')\n","\n","early_stopping = EarlyStopping(monitor = 'val_loss',\n"," min_delta = 0,\n"," patience = 3,\n"," verbose = 1,\n"," restore_best_weights = True\n"," )\n","\n","reduce_learningrate = ReduceLROnPlateau(monitor = 'val_loss',\n"," factor = 0.2,\n"," patience = 3,\n"," verbose = 1,\n"," min_delta = 0.0001)\n","\n","callbacks_list = [early_stopping, checkpoint, 
reduce_learningrate]\n"],"metadata":{"id":"XbDLaSpOfwzk"},"execution_count":null,"outputs":[]},{"cell_type":"code","execution_count":null,"metadata":{"id":"MSggOFxWCuNE"},"outputs":[],"source":["# Train the model on augmented poisoned data\n","history = model.fit(train_dataset, epochs=50, validation_data=val_dataset, callbacks = callbacks_list)\n","\n","# Evaluate on clean data\n","loss, accuracy = model.evaluate(x_test, y_test)\n","print(f\"Clean test data accuracy: {accuracy}\")\n"]},{"cell_type":"markdown","source":["# Plot results"],"metadata":{"id":"adHkyd8zsRv1"}},{"cell_type":"code","source":["# Plotting training and validation accuracy\n","plt.figure(figsize=(8, 4))\n","plt.plot(history.history['accuracy'], label='Training Accuracy')\n","plt.plot(history.history['val_accuracy'], label='Validation Accuracy')\n","plt.title('Training and Validation Accuracy')\n","plt.xlabel('Epoch')\n","plt.ylabel('Accuracy')\n","plt.legend()\n","plt.show()"],"metadata":{"id":"l_Mvrhx51Iar"},"execution_count":null,"outputs":[]},{"cell_type":"code","source":["from sklearn.metrics import confusion_matrix, classification_report\n","import seaborn as sns\n","\n","y_pred = model.predict(x_test)\n","y_pred_classes = np.argmax(y_pred, axis=1)\n","y_true = np.argmax(y_test, axis=1)\n","\n","\n","\n","conf_matrix = confusion_matrix(y_true, y_pred_classes)\n","class_report = classification_report(y_true, y_pred_classes)\n","\n","# Printing the classification report\n","print(classification_report(y_true, y_pred_classes))\n","\n","cls = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']\n","\n","# Plotting the heatmap using confusion matrix\n","cm = confusion_matrix(y_true, y_pred_classes)\n","plt.figure(figsize = (8, 5))\n","sns.heatmap(cm, annot = True, fmt = '.0f', xticklabels = cls, yticklabels = cls)\n","plt.ylabel('Actual')\n","plt.xlabel('Predicted')\n","plt.show()"],"metadata":{"id":"r-e4xU4GG9bW"},"execution_count":null,"outputs":[]}],"metadata":{"accelerator":"GPU","colab":{"provenance":[]},"kernelspec":{"display_name":"Python 3","name":"python3"},"language_info":{"name":"python"}},"nbformat":4,"nbformat_minor":0}
\ No newline at end of file
%% Cell type:markdown id: tags:
Targeted backdoor attack with a Mixup augmentation defense
%% Cell type:code id: tags:
```
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import copy
import numpy as np
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.datasets import cifar10
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Set the random seeds for reproducibility
tf.random.set_seed(42)
np.random.seed(42)
```

%% Cell type:markdown id: tags:

# Load, Normalize and Split the data

%% Cell type:code id: tags:

```
# Load the CIFAR-10 dataset
(x_train, y_train), (x_test, y_test) = cifar10.load_data()

# Concatenate the original train and test sets so they can be re-split
x = np.concatenate((x_train, x_test))
y = np.concatenate((y_train, y_test))

# Normalize the images to the [0, 1] range
x = x.astype('float32') / 255

# Calculate split sizes: 70% train, 20% validation, 10% test
total_size = len(x)
train_size = int(total_size * 0.70)
val_size = int(total_size * 0.20)

# Split the dataset
x_train, x_val, x_test = x[:train_size], x[train_size:train_size+val_size], x[train_size+val_size:]
y_train, y_val, y_test = y[:train_size], y[train_size:train_size+val_size], y[train_size+val_size:]

# Check the shapes
print(f'x_train shape: {x_train.shape}, y_train shape: {y_train.shape}')
print(f'x_val shape: {x_val.shape}, y_val shape: {y_val.shape}')
print(f'x_test shape: {x_test.shape}, y_test shape: {y_test.shape}')
```

%% Cell type:markdown id: tags:

# Check distributions

%% Cell type:code id: tags:

```
# Count the occurrences of each class in a label array
def class_distribution(labels):
    unique, counts = np.unique(labels, return_counts=True)
    return dict(zip(unique, counts))

# Calculate class distributions for each split
train_distribution = class_distribution(y_train)
val_distribution = class_distribution(y_val)
test_distribution = class_distribution(y_test)

# Prepare data for plotting
classes = list(range(10))  # CIFAR-10 classes are labeled 0-9
train_freq = [train_distribution.get(i, 0) for i in classes]
val_freq = [val_distribution.get(i, 0) for i in classes]
test_freq = [test_distribution.get(i, 0) for i in classes]

# Plot the three distributions side by side
plt.figure(figsize=(15, 5))
titles = ['Training Set Distribution', 'Validation Set Distribution', 'Test Set Distribution']
for i, freq in enumerate([train_freq, val_freq, test_freq]):
    plt.subplot(1, 3, i + 1)
    plt.bar(classes, freq)
    plt.title(titles[i])
    plt.xlabel('Class')
    plt.ylabel('Frequency')
plt.tight_layout()
plt.show()
```

%% Cell type:markdown id: tags:

# Generate sample images

%% Cell type:code id: tags:

```
# CIFAR-10 class names
class_names = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']

# Display the first 25 training images with their labels
plt.figure(figsize=(10, 10))
for i in range(25):
    plt.subplot(5, 5, i + 1)
    plt.xticks([])
    plt.yticks([])
    plt.grid(False)
    plt.imshow(x_train[i], interpolation='nearest', aspect='auto')
    plt.xlabel(class_names[y_train[i][0]])
plt.show()
```

%% Cell type:code id: tags:

```
# Before modeling and poisoning, one-hot encode the label arrays
y_train = to_categorical(y_train, 10)
y_val = to_categorical(y_val, 10)
y_test = to_categorical(y_test, 10)
```
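
%% Cell type:markdown id: tags:

A note on the split above: plain slicing only yields balanced splits because CIFAR-10's default ordering is effectively shuffled, as the distribution check confirms. The notebook imports `train_test_split` but never calls it; below is a minimal sketch of a stratified 70/20/10 alternative. The `_strat`-suffixed names are hypothetical and are not used by the rest of the notebook.

%% Cell type:code id: tags:

```
# Sketch only: a stratified 70/20/10 split with the already-imported
# train_test_split. Stratifying on the integer labels guarantees every
# split has the same class proportions.
x_train_strat, x_rest, y_train_strat, y_rest = train_test_split(
    x, y.ravel(), test_size=0.30, stratify=y.ravel(), random_state=42)
x_val_strat, x_test_strat, y_val_strat, y_test_strat = train_test_split(
    x_rest, y_rest, test_size=1/3, stratify=y_rest, random_state=42)
```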
%% Cell type:markdown id: tags:
# Poison the training data
%% Cell type:code id: tags:
```
def add_backdoor(x):
    # Trigger: a small 3x3 white square near the bottom-right corner
    backdoor_pattern = np.zeros_like(x[0])
    backdoor_pattern[25:28, 25:28] = 1
    num_samples = int(0.8 * x.shape[0])  # poison 80% of the dataset

    for i in range(num_samples):
        # Clip so poisoned pixels stay in the normalized [0, 1] range
        x[i] = np.clip(x[i] + backdoor_pattern, 0.0, 1.0)

    return x

# Insert the backdoor trigger into the training images
x_train = add_backdoor(x_train)
```
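
%% Cell type:markdown id: tags:

Before training, it is worth sanity-checking the trigger visually. A minimal sketch, assuming the 80% poisoned prefix from `add_backdoor` (so index 0 is poisoned and the last index is clean):

%% Cell type:code id: tags:

```
# Sketch: show a poisoned image next to a clean one to confirm the
# 3x3 white square near the bottom-right corner.
plt.figure(figsize=(6, 3))
plt.subplot(1, 2, 1)
plt.imshow(x_train[0])   # inside the poisoned 80% prefix
plt.title('Poisoned (trigger)')
plt.axis('off')
plt.subplot(1, 2, 2)
plt.imshow(x_train[-1])  # past the prefix, left clean
plt.title('Clean')
plt.axis('off')
plt.show()
```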
%% Cell type:markdown id: tags:
# Defense: Apply augmentation to poisoned training data
%% Cell type:markdown id: tags:
Mixup creates new training examples by linearly combining pairs of images and their one-hot labels: for two randomly chosen examples $(x_1, y_1)$ and $(x_2, y_2)$, it forms

$\tilde{x} = \lambda x_1 + (1 - \lambda)\,x_2, \qquad \tilde{y} = \lambda y_1 + (1 - \lambda)\,y_2,$

where the mixing weight $\lambda$ is sampled from a $\mathrm{Beta}(\alpha, \alpha)$ distribution.

As a defense, the idea is that pixel-wise blending dilutes the trigger pattern and softens the labels, weakening the association the backdoor tries to implant. The trade-off is that Mixup can produce blended images that do not resemble natural ones.
CutMix, by contrast, combines pairs of images and labels by cutting a rectangular patch out of one training image, pasting it into another, and blending the labels in proportion to the patch area.

%% Cell type:code id: tags:

```
def mixup(image1, label1, image2, label2, alpha):
    # Helper for a single pair; the batch version below is what the
    # training pipeline actually uses.
    lam = np.random.beta(alpha, alpha)
    image = lam * image1 + (1 - lam) * image2
    label = lam * label1 + (1 - lam) * label2
    return image, label

def mixup_batch(batch_x, batch_y, alpha=0.2):
    batch_size = tf.shape(batch_x)[0]
    idx = tf.random.shuffle(tf.range(batch_size))

    # Sample lam with TF ops: a NumPy call here would run only once when
    # dataset.map traces this function, freezing lam for every batch.
    # Beta(a, a) is sampled as G1 / (G1 + G2) with G1, G2 ~ Gamma(a, 1).
    g1 = tf.random.gamma([], alpha)
    g2 = tf.random.gamma([], alpha)
    lam = g1 / (g1 + g2)

    mixed_x = lam * batch_x + (1 - lam) * tf.gather(batch_x, idx)
    mixed_y = lam * batch_y + (1 - lam) * tf.gather(batch_y, idx)

    return mixed_x, mixed_y
```

%% Cell type:code id: tags:

```
# Apply Mixup to the poisoned training data
train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)).batch(32)
train_dataset = train_dataset.map(lambda x, y: mixup_batch(x, y, alpha=0.1))

# Prepare the validation and test datasets
val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val)).batch(32)
test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch(32)
```

%% Cell type:markdown id: tags:

# Train model on poisoned data and check performance on clean test data

%% Cell type:code id: tags:

```
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.models import Sequential

model = Sequential()

model.add(Conv2D(32, (3, 3), activation='relu', padding='same', input_shape=(32, 32, 3)))
model.add(BatchNormalization())
model.add(Conv2D(32, (3, 3), activation='relu', padding='same'))
model.add(BatchNormalization())
model.add(MaxPooling2D(2, 2))
model.add(Dropout(0.2))

model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
model.add(BatchNormalization())
model.add(Conv2D(64, (3, 3), activation='relu', padding='same'))
model.add(BatchNormalization())
model.add(MaxPooling2D(2, 2))
model.add(Dropout(0.3))

model.add(Flatten())
model.add(Dense(512, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(0.001)))
model.add(Dropout(0.5))
model.add(Dense(10, activation='softmax'))

# Compile the model
adam = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
```

%% Cell type:code id: tags:

```
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

# Monitor 'val_accuracy': the metric is registered as 'accuracy' above,
# so 'val_acc' would never match and no checkpoint would be saved.
checkpoint = ModelCheckpoint("./model1.h5", monitor='val_accuracy', verbose=1,
                             save_best_only=True, mode='max')

early_stopping = EarlyStopping(monitor='val_loss',
                               min_delta=0,
                               patience=3,
                               verbose=1,
                               restore_best_weights=True)

reduce_learningrate = ReduceLROnPlateau(monitor='val_loss',
                                        factor=0.2,
                                        patience=3,
                                        verbose=1,
                                        min_delta=0.0001)

callbacks_list = [early_stopping, checkpoint, reduce_learningrate]
```

%% Cell type:code id: tags:

```
# Train the model on the augmented poisoned data
history = model.fit(train_dataset, epochs=50, validation_data=val_dataset, callbacks=callbacks_list)

# Evaluate on clean test data
loss, accuracy = model.evaluate(x_test, y_test)
print(f"Clean test data accuracy: {accuracy}")
```

%% Cell type:markdown id: tags:

# Plot results

%% Cell type:code id: tags:

```
# Plot training and validation accuracy
plt.figure(figsize=(8, 4))
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend()
plt.show()
```

%% Cell type:code id: tags:

```
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

y_pred = model.predict(x_test)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test, axis=1)

# Per-class precision, recall and F1 on the clean test set
print(classification_report(y_true, y_pred_classes))

# Confusion matrix heatmap
cls = ['airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck']
cm = confusion_matrix(y_true, y_pred_classes)
plt.figure(figsize=(8, 5))
sns.heatmap(cm, annot=True, fmt='.0f', xticklabels=cls, yticklabels=cls)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()
```
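
%% Cell type:markdown id: tags:

The pipeline above applies Mixup only. For comparison with the CutMix description earlier, here is a minimal NumPy sketch of CutMix for a batch; `cutmix_batch` is a hypothetical helper, not part of the original pipeline, and assumes `(N, H, W, C)` image batches with one-hot labels.

%% Cell type:code id: tags:

```
def cutmix_batch(batch_x, batch_y, alpha=1.0):
    # Sketch (not used above): paste a random patch from a shuffled
    # partner image into each image, blending labels by patch area.
    n, h, w, _ = batch_x.shape
    idx = np.random.permutation(n)
    lam = np.random.beta(alpha, alpha)

    # Patch size follows sqrt(1 - lam); patch center is uniform
    cut_h, cut_w = int(h * np.sqrt(1 - lam)), int(w * np.sqrt(1 - lam))
    cy, cx = np.random.randint(h), np.random.randint(w)
    y1, y2 = np.clip(cy - cut_h // 2, 0, h), np.clip(cy + cut_h // 2, 0, h)
    x1, x2 = np.clip(cx - cut_w // 2, 0, w), np.clip(cx + cut_w // 2, 0, w)

    mixed_x = batch_x.copy()
    mixed_x[:, y1:y2, x1:x2, :] = batch_x[idx, y1:y2, x1:x2, :]

    # Recompute lam from the exact pasted area, then blend the labels
    lam_adj = 1 - ((y2 - y1) * (x2 - x1)) / (h * w)
    mixed_y = lam_adj * batch_y + (1 - lam_adj) * batch_y[idx]
    return mixed_x, mixed_y

# Example usage on one poisoned training batch:
# cut_x, cut_y = cutmix_batch(x_train[:32], y_train[:32])
```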