In this lab, we'll focus on UNET image segmentation, a deep learning model used for semantic
segmentation tasks. Key aspects we'll cover include UNET architecture, data preparation,
fine-tuning, optimization, evaluation metrics, and real-world applications.
UNET Architecture: Understand the architecture of UNET, a powerful tool for pixel-wise image
segmentation.
Data Preparation: Learn how to prepare labeled datasets for image segmentation tasks, a
crucial step for model training.
Fine-tuning and Optimization: Explore techniques for fine-tuning UNET models and optimizing
their performance for specific segmentation tasks.
Real-world Applications: Examine practical use cases for UNET-based image segmentation,
such as medical image analysis and autonomous vehicles.
You can open this notebook directly in Colab using this link:
https://colab.research.google.com/drive/1AuX8ihNNVk9QEo7RVOGk7mQuz8ByazBy?usp=sharing
Exercise 1: Follow the outlined steps below and ensure completion of all the TODO tasks
provided in the notebook cell.
Exercise 2: Save the previous model and initiate experiments with a new model. You have the
flexibility to adjust various factors, including the number of epochs, choice of optimizers,
learning rates, and other hyperparameters.
Exercise 3: Compare and analyze the performance of both models, presenting your findings and
results.
To access the notebook directly in Google Colab, please use the following link:
https://colab.research.google.com/drive/1AuX8ihNNVk9QEo7RVOGk7mQuz8ByazBy?usp=sharing
Loading the dataset
import gdown
import zipfile
import os
def download_and_unzip_gdrive(gdrive_url, download_path, extract_path):
    """Download a file from Google Drive and unzip it if it is a zip archive.

    Args:
        gdrive_url: Google Drive share link. Links of the form
            ``.../file/d/<file_id>/view?...`` and links carrying the id as an
            ``id=<file_id>`` query parameter are recognised; any other link
            falls back to using its second-to-last ``/``-separated segment.
        download_path: Local path the downloaded file is written to.
        extract_path: Directory the archive contents are extracted into.

    Raises:
        FileNotFoundError: If nothing exists at ``download_path`` after the
            download attempt.
    """
    import re

    # Get the file ID from the Google Drive URL
    match = re.search(r"/d/([^/?#]+)|[?&]id=([^&#]+)", gdrive_url)
    if match:
        file_id = match.group(1) or match.group(2)
    else:
        # Unknown link layout: keep the historical rule
        file_id = gdrive_url.split("/")[-2]

    # Download the file
    gdown.download(f"https://drive.google.com/uc?id={file_id}",
                   download_path, quiet=False)

    # Fail loudly when the download produced no file, instead of reporting
    # a misleading "not a zip archive" message below
    if not os.path.isfile(download_path):
        raise FileNotFoundError(
            f"Download failed, no file found at: {download_path!r}")

    # Check the file content (not just its name) to decide whether the
    # downloaded file is a zip archive
    if zipfile.is_zipfile(download_path):
        # Extract the contents of the zip file to the specified extract path
        with zipfile.ZipFile(download_path, 'r') as zip_ref:
            zip_ref.extractall(extract_path)
    else:
        print("The downloaded file is not a zip archive.")
# Usage example:
# The share link is written as two adjacent string literals (joined by Python
# into one string) so that it cannot be broken by line wrapping.
gdrive_url = (
    "https://drive.google.com/file/d/1XVhkMMXKwigQrEPS1rKHbMAH2rKEQdTd/"
    "view?usp=drive_link"
)
# TODO: Change to the desired download path in your Colab environment
# Hint: filename.zip
download_path = ""
# TODO: Change to the desired extraction path
extract_path = ""
download_and_unzip_gdrive(gdrive_url, download_path, extract_path)
# for data load
import os
# for reading and processing images
import imageio
from PIL import Image
import cv2
from tqdm.notebook import tqdm
# for visualizations
import matplotlib.pyplot as plt
import numpy as np # for using np arrays
from numpy import asarray
# for building and running deep learning model
import tensorflow as tf
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Conv2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import Conv2DTranspose
from tensorflow.keras.layers import concatenate
from tensorflow.keras.losses import binary_crossentropy
from sklearn.model_selection import train_test_split
1. UNDERSTAND THE DATA
# Here, you will write a Python function that takes the paths of the image
# folder and the mask folder as arguments and returns 2 ordered lists
# (of quarter length): one with the image file paths and one with the mask
# file paths, respectively.
# Hint 1: you may use the os package for listing files.
# Hint 2: you may use the sort function for ordering the lists.
path1 = 'Paste the image folder path here'  ## TODO
path2 = 'Paste the mask folder path here'  ## TODO

# TODO: Call the above function here to load file paths for images and masks
# (`...` is only a placeholder that keeps this cell parseable; replace it)
img, mask = ...

# Preview one image next to its mask; both files share the same file name
sample_name = 'Img_10.jpg'
img_view = imageio.imread(path1 + '/' + sample_name)
mask_view = imageio.imread(path2 + '/' + sample_name)
print(img_view.shape)
print(mask_view.shape)

fig, arr = plt.subplots(1, 2, figsize=(15, 15))
arr[0].imshow(img_view)
arr[0].set_title('Image ' + sample_name)
arr[1].imshow(mask_view)
arr[1].set_title('Masked Image ' + sample_name)
def PreprocessData(img, mask, target_shape_img, target_shape_mask,
                   path1, path2):
    """
    Processes the images and masks present in the shared lists and paths.
    Returns a NumPy dataset with images as 3-D arrays of desired size.
    Please note the masks in this dataset have only one channel.

    Args:
        img: list of image file names, read from the folder `path1`
        mask: list of mask file names, read from the folder `path2`;
            mask[k] is used as the mask of img[k]
        target_shape_img: height, width and channels of the output images
        target_shape_mask: height, width and channels of the output masks
        path1: folder containing the image files
        path2: folder containing the mask files

    Returns:
        X: float32 array of shape (m, i_h, i_w, i_c), pixel values scaled
           by 1/255
        y: int32 array of shape (m, m_h, m_w, m_c); a mask pixel is 1 only
           where the grayscale value is 255, and 0 everywhere else

    Raises:
        FileNotFoundError: if a mask file cannot be read
    """
    # Pull the relevant dimensions for image and mask
    # (`...` is only a placeholder that keeps the function parseable)
    # TODO: Assign the number of images to the m variable here
    m = ...
    # TODO: pull height, width, and channels of image
    # Hint: target_shape_img
    i_h, i_w, i_c = ...
    # TODO: pull height, width, and channels of mask
    m_h, m_w, m_c = ...

    # Define X and Y as number of images along with shape of one image
    X = np.zeros((m, i_h, i_w, i_c), dtype=np.float32)
    y = np.zeros((m, m_h, m_w, m_c), dtype=np.int32)

    # Resize images and masks
    # enumerate gives the position directly; looking it up with
    # img.index(file) is quadratic and wrong for repeated file names
    for index, file in enumerate(tqdm(img)):
        # convert image into an array of desired shape (3 channels)
        path = os.path.join(path1, file)
        single_img = Image.open(path).convert('RGB')
        # PIL expects the size as (width, height); the resulting array is
        # then (height, width, 3)
        single_img = single_img.resize((i_w, i_h))
        single_img = np.reshape(single_img, (i_h, i_w, i_c))
        single_img = single_img / 255.
        X[index] = single_img

        # convert mask into an array of desired shape (1 channel)
        single_mask_ind = mask[index]
        path = os.path.join(path2, single_mask_ind)
        single_mask = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        if single_mask is None:
            # cv2.imread signals an unreadable file by returning None
            raise FileNotFoundError(f"Could not read mask file: {path}")
        # OpenCV expects dsize as (width, height); nearest-neighbour
        # interpolation keeps the original label values
        single_mask = cv2.resize(single_mask, dsize=(m_w, m_h),
                                 interpolation=cv2.INTER_NEAREST)
        single_mask = single_mask[..., np.newaxis]
        single_mask = np.reshape(single_mask, (m_h, m_w, m_c))
        # Dividing by 255 and truncating maps 255 -> 1 and all else -> 0
        single_mask = single_mask / 255
        single_mask = single_mask.astype(int)
        y[index] = single_mask
    return X, y
# Target height, width and channels for the images and for the masks
target_shape_img = [128, 128, 3]
target_shape_mask = [128, 128, 1]

# Build the image and mask arrays with the preprocessing helper
X, y = PreprocessData(
    img, mask, target_shape_img, target_shape_mask, path1, path2
)

# Sanity-check the array shapes and the label values found in the masks
print("X Shape:", X.shape)
print("Y shape:", y.shape)
# There are 2 classes
print(np.unique(y))

# Show one processed image side by side with its (single-channel) mask
image_index = 0
fig, arr = plt.subplots(1, 2, figsize=(15, 15))
panels = (
    (X[image_index], 'Processed Image'),
    (y[image_index, :, :, 0], 'Processed Masked Image '),
)
for panel_axis, (panel_pixels, panel_title) in zip(arr, panels):
    panel_axis.imshow(panel_pixels)
    panel_axis.set_title(panel_title)
2. UNET MODELING
def EncoderMiniBlock(inputs, n_filters=32, dropout_prob=0.3,
                     max_pooling=True):
    """
    This block uses multiple convolution layers, max pool, relu activation
    to create an architecture for learning.
    Dropout can be added for regularization to prevent overfitting.
    The block returns the activation values for next layer along with a
    skip connection which will be used in the decoder.

    Args:
        inputs: input tensor of the block
        n_filters: number of filters of both convolution layers
        dropout_prob: dropout rate; dropout is only added when it is > 0
        max_pooling: whether the block output is downsampled by 2x2 pooling

    Returns:
        (next_layer, skip_connection): the (optionally pooled) block output
        and the un-pooled activations to be fed to the decoder
    """
    # Add 2 Conv Layers with relu activation and HeNormal initialization
    # using TensorFlow
    # Proper initialization prevents from the problem of exploding and
    # vanishing gradients
    # 'Same' padding will pad the input to conv layer such that the output
    # has the same height and width (hence, is not reduced in size)
    conv = Conv2D(n_filters,
                  # TODO: apply kernel size 3,
                  # TODO: apply activation relu,
                  padding='same',
                  kernel_initializer='HeNormal')(inputs)
    conv = Conv2D(n_filters,
                  # TODO: apply kernel size 3,
                  # TODO: apply activation relu,
                  padding='same',
                  # TODO: Apply HeNormal kernel initializer,
                  )(conv)

    # Batch Normalization will normalize the output of the last layer based
    # on the batch's mean and standard deviation
    # Do not pass training=False here: that pins the layer to inference mode
    # even during fit(), so it would never use the batch statistics
    conv = BatchNormalization()(conv)

    # In case of overfitting, dropout will regularize the loss and gradient
    # computation to shrink the influence of weights on output
    if dropout_prob > 0:
        conv = Dropout(dropout_prob)(conv)

    # Pooling reduces the size of the image while keeping the number of
    # channels same
    # Pooling has been kept as optional as the last encoder layer does not
    # use pooling (hence, makes the encoder block flexible to use)
    # Below, Max pooling considers the maximum of the input slice for output
    # computation and uses stride of 2 to traverse across input image
    if max_pooling:
        # The layer must be called on conv in the same expression
        next_layer = MaxPooling2D(pool_size=(2, 2))(conv)
    else:
        next_layer = conv

    # skip connection (without max pooling) will be input to the decoder
    # layer to prevent information loss during transpose convolutions
    skip_connection = conv
    return next_layer, skip_connection
def DecoderMiniBlock(prev_layer_input, skip_layer_input, n_filters=32):
    """
    Upsample the previous layer and fuse it with an encoder skip connection.

    The previous layer output first goes through a transpose convolution
    (3x3 kernel, stride 2, 'same' padding) to get a bigger image, is then
    concatenated with the skip layer along the channel axis, and finally
    passes through 2 convolutions with 'same' padding that add depth to the
    network.

    Args:
        prev_layer_input: output tensor of the previous (deeper) block
        skip_layer_input: skip connection tensor of the matching encoder block
        n_filters: number of filters used by every layer of this block

    Returns:
        The decoded layer output.
    """
    # Upscale first: the transpose convolution increases the image size
    upsampled = Conv2DTranspose(
        n_filters, (3, 3), strides=(2, 2), padding='same'
    )(prev_layer_input)

    # Bring back the encoder features to limit the information loss
    features = concatenate([upsampled, skip_layer_input], axis=3)

    # Two identical 3x3 relu convolutions with HeNormal initialization,
    # configured like the ones in the encoder
    for _ in range(2):
        features = Conv2D(
            n_filters,
            3,
            activation='relu',
            padding='same',
            kernel_initializer='HeNormal',
        )(features)
    return features
def UNetCompiled(input_size=(128, 128, 3), n_filters=32, n_classes=2):
    """
    Build the U-Net model out of encoder and decoder mini blocks.

    Args:
        input_size: shape of one input image, passed to `Input`
        n_filters: number of filters of the first encoder block; deeper
            blocks use multiples of it
        n_classes: number of channels of the final 1x1 convolution

    Returns:
        A `tf.keras.Model` mapping the input image to the output of the last
        1x1 convolution. That layer has no activation, and the model is
        returned without being compiled.
    """
    inputs = Input(input_size)

    # Encoder includes multiple convolutional mini blocks with different
    # maxpooling, dropout and filter parameters
    # Observe that the filters are increasing as we go deeper into the
    # network which will increase the # channels of the image
    cblock1 = EncoderMiniBlock(inputs, n_filters, dropout_prob=0,
                               max_pooling=True)
    cblock2 = EncoderMiniBlock(cblock1[0], n_filters*2, dropout_prob=0,
                               max_pooling=True)
    cblock3 = EncoderMiniBlock(cblock2[0], n_filters*4, dropout_prob=0,
                               max_pooling=True)
    # (`...` below is only a placeholder that keeps the function parseable)
    # TODO: Similarly, create another encoder block with filters multiple of
    # 8 and dropout changed to 0.3.
    cblock4 = ...
    # TODO: Similarly, create another encoder block with filters multiple of
    # 16 and dropout is still 0.3
    # Hint: this is the last encoder block, so it must not use max pooling;
    # otherwise its upsampled output cannot be concatenated with cblock4[1]
    cblock5 = ...

    # Decoder includes multiple mini blocks with decreasing number of filters
    # Observe the skip connections from the encoder are given as input to
    # the decoder
    # Recall the 2nd output of encoder block was skip connection, hence
    # cblockn[1] is used
    ublock6 = DecoderMiniBlock(cblock5[0], cblock4[1], n_filters * 8)
    ublock7 = DecoderMiniBlock(ublock6, cblock3[1], n_filters * 4)
    ublock8 = DecoderMiniBlock(ublock7, cblock2[1], n_filters * 2)
    ublock9 = DecoderMiniBlock(ublock8, cblock1[1], n_filters)

    # Complete the model with 1 3x3 convolution layer (Same as the prev Conv
    # Layers)
    # Followed by a 1x1 Conv layer to get the image to the desired size.
    # Observe the number of channels will be equal to number of output
    # classes
    conv9 = Conv2D(n_filters,
                   3,
                   activation='relu',
                   padding='same',
                   kernel_initializer='he_normal')(ublock9)
    conv10 = Conv2D(n_classes, 1, padding='same')(conv9)

    # Define the model
    model = tf.keras.Model(inputs=inputs, outputs=conv10)
    return model
# Use scikit-learn's function to split the dataset
# Here, 20% of the data is used as the test/valid set
# TODO: Use train_test_split from sklearn to split with 80/20 ratio
# (`...` is only a placeholder that keeps this cell parseable; replace it)
X_train, X_valid, y_train, y_valid = ...

# Call the helper function for defining the layers for the model, given the
# input image size
# NOTE(review): n_classes=3 is requested here although the processed masks
# are described as having 2 classes - confirm that the number of output
# channels matches the labels and the loss chosen below
unet = UNetCompiled(input_size=(128,128,3), n_filters=32, n_classes=3)

# Check the summary to better interpret how the output dimensions change in
# each layer
# TODO: Print or output the architecture unet

# There are multiple optimizers, loss functions and metrics that can be used
# to compile multi-class segmentation models
# Ideally, try different options to get the best accuracy
unet.compile(
    # TODO: Apply Adam Optimizer,
    # TODO: Apply loss for binary classification. Hint: from_logits=True,
    metrics=['accuracy'])

# Run the model in a mini-batch fashion and compute the progress for each
# epoch
results = unet.fit(
    # TODO: Specify X train set,
    # TODO: Specify y train set,
    # TODO: Specify batch size,
    # TODO: Specify epochs,
    validation_data=(X_valid, y_valid))
3. PREDICTIONS
# predict masks
# The whole validation set is predicted in one batched call: predict()
# already splits its input into batches, so calling it once per image only
# adds overhead. This also avoids reusing `img` as a loop variable, which
# would overwrite the list of image file names loaded earlier.
pred_logits = unet.predict(X_valid)
# The predicted class of a pixel is the index of its largest output channel;
# unstack turns the batch into a list holding one mask per validation image
predictions = tf.unstack(tf.argmax(pred_logits, axis=-1))
def rle_encoding(x):
    '''
    Run-length encode a binary mask.

    x: numpy array of shape (height, width), 1 - mask, 0 - background
    Returns run length as list: a flat list of plain Python ints
    [start_1, length_1, start_2, length_2, ...], where every start is the
    1-based position of the first pixel of a run in the flattened mask.
    An empty list is returned when the mask has no pixel equal to 1.
    '''
    # .T sets Fortran order down-then-right
    # tolist() yields plain Python ints, so the result never mixes numpy
    # scalars with ints
    dots = np.where(x.T.flatten() == 1)[0].tolist()
    run_lengths = []
    prev = -2  # guarantees that the first mask pixel opens a new run
    for b in dots:
        if b > prev + 1:
            # Gap since the previous mask pixel: open a new run
            run_lengths.extend((b + 1, 0))
        run_lengths[-1] += 1
        prev = b
    return run_lengths
# sample
# Show up to 10 validation images next to their predicted masks; the number
# of rows follows n_imgs, so fewer than 10 validation images also work
n_imgs = min(10, len(predictions))
# squeeze=False keeps axs two-dimensional even when there is a single row
fig, axs = plt.subplots(n_imgs, 2, figsize=(15, 20), squeeze=False)
fig.suptitle('Testing')
for axs_row, valid_image, pred_mask in zip(axs, X_valid, predictions):
    axs_row[0].set_title('image')
    axs_row[0].imshow(valid_image)
    axs_row[0].axis('off')
    axs_row[1].set_title('y_pred')
    axs_row[1].imshow(pred_mask)
    axs_row[1].axis('off')