Keras loss is NaN when training for semantic segmentation
I am using the headsegmentation dataset. A single mask looks like this
All mask images are a single channel. This is my code:
# --- Pipeline configuration -------------------------------------------
image_size = 512  # images and masks are resized to image_size x image_size
batch = 4         # batch size used when batching the datasets
labels = 14       # number of classes handed to the model constructor
data_directory = "/content/headsegmentation_final/"

# Number of directory entries in each split, minus one.
# NOTE(review): the reason for the "- 1" is not visible in this snippet;
# combined with the slicing below it drops the last (sorted) file of each
# split -- confirm this is intended.
sample_train_images = len(os.listdir(os.path.join(data_directory, "Training/Images/"))) - 1
sample_validation_images = len(os.listdir(os.path.join(data_directory, "Validation/Images/"))) - 1
test_images = len(os.listdir(os.path.join(data_directory, "Test/"))) - 1

# Sorted file lists, truncated to the counts computed above. Images and
# masks are paired purely by their sorted order.
t_images = sorted(glob(os.path.join(data_directory, "Training/Images/*")))[:sample_train_images]
t_masks = sorted(glob(os.path.join(data_directory, "Training/Category_ids/*")))[:sample_train_images]

v_images = sorted(glob(os.path.join(data_directory, "Validation/Images/*")))[:sample_validation_images]
v_masks = sorted(glob(os.path.join(data_directory, "Validation/Category_ids/*")))[:sample_validation_images]

ts_images = sorted(glob(os.path.join(data_directory, "Test/*")))[:test_images]
def image_augmentation(img, random_range):
    """Apply a random left-right flip followed by a rotation.

    Args:
        img: Image tensor to augment.
        random_range: Rotation angle forwarded to ``tfa.image.rotate``
            (the caller in this file computes it in radians).

    Returns:
        The flipped and rotated image tensor.
    """
    flipped = tf.image.random_flip_left_right(img)
    return tfa.image.rotate(flipped, random_range)
def image_process(path, mask=False):
    """Read one image or mask file and resize it to the training resolution.

    Args:
        path: Path of the file to read (passed to ``tf.io.read_file``).
        mask: When true, decode the file as a single-channel PNG label map;
            otherwise decode it as a 3-channel JPEG and rescale it.

    Returns:
        A float32 tensor of shape ``(image_size, image_size, 1)`` holding the
        original label values for masks, or of shape
        ``(image_size, image_size, 3)`` scaled via ``img / 127.5 - 1`` for
        images.
    """
    # NOTE: augmentation (image_augmentation) is currently not applied here.
    raw = tf.io.read_file(path)
    if mask:
        img = tf.image.decode_png(raw, channels=1)
        img.set_shape([None, None, 1])
        # Label maps must be resized with nearest-neighbour: the default
        # bilinear interpolation blends neighbouring class ids into values
        # that are not valid labels.
        img = tf.image.resize(
            images=img,
            size=[image_size, image_size],
            method=tf.image.ResizeMethod.NEAREST_NEIGHBOR,
        )
        # Nearest-neighbour resize keeps the input dtype; cast so callers
        # still receive float32 as they did before.
        img = tf.cast(img, tf.float32)
    else:
        img = tf.image.decode_jpeg(raw, channels=3)
        img.set_shape([None, None, 3])
        img = tf.image.resize(images=img, size=[image_size, image_size])
        img = img / 127.5 - 1
    return img
def data_loader(image_list, mask_list):
    """Load one (image, mask) pair.

    Despite the parameter names, each argument is forwarded as a single path
    to ``image_process``.

    Returns:
        Tuple ``(img, mask)`` of the processed image and its processed mask.
    """
    return image_process(image_list), image_process(mask_list, mask=True)
def data_generator(image_list, mask_list):
    """Build a batched ``tf.data`` dataset of (image, mask) pairs.

    Args:
        image_list: Image file paths.
        mask_list: Mask file paths, aligned element-wise with ``image_list``.

    Returns:
        A dataset that maps every path pair through ``data_loader`` and
        yields batches of ``batch`` elements, dropping any incomplete final
        batch.
    """
    path_pairs = tf.data.Dataset.from_tensor_slices((image_list, mask_list))
    loaded = path_pairs.map(data_loader, num_parallel_calls=tf.data.AUTOTUNE)
    return loaded.batch(batch, drop_remainder=True)
# Batched input pipelines for the training and validation splits.
train_dataset = data_generator(image_list=t_images, mask_list=t_masks)
val_dataset = data_generator(image_list=v_images, mask_list=v_masks)
def block(block_input, filters=256, kernel=3, dilation=1, padding="same", use_bias=False):
    """Apply Conv2D -> BatchNormalization -> ReLU to ``block_input``.

    Args:
        block_input: Input tensor.
        filters: Number of convolution filters.
        kernel: Convolution kernel size.
        dilation: Dilation rate of the convolution.
        padding: Padding mode forwarded to ``Conv2D``. It was previously
            accepted but ignored in favour of a hard-coded ``"same"``.
        use_bias: Whether the convolution uses a bias term.

    Returns:
        The activated output tensor.
    """
    x = layers.Conv2D(
        filters,
        kernel_size=kernel,
        dilation_rate=dilation,
        padding=padding,
        use_bias=use_bias,
        kernel_initializer=keras.initializers.HeNormal(),
    )(block_input)
    x = layers.BatchNormalization()(x)
    return tf.nn.relu(x)
def DSP_pooling(dsp_pooling_input):
    """Combine a pooled branch with 1x1 and dilated 3x3 convolution branches.

    Builds five branches from ``dsp_pooling_input`` -- an average-pooled and
    re-upsampled branch, a 1x1 convolution, and 3x3 convolutions with
    dilation 6, 12 and 18 -- concatenates them along the last axis and fuses
    them with a final 1x1 ``block``.

    Returns:
        The fused feature tensor.
    """
    in_shape = dsp_pooling_input.shape
    height, width = in_shape[-3], in_shape[-2]

    # Pooled branch: pool over the full spatial extent, project with a 1x1
    # block, then upsample back to the input's spatial size.
    pooled = layers.AveragePooling2D(pool_size=(height, width))(dsp_pooling_input)
    pooled = block(pooled, kernel=1, use_bias=True)
    pool_output = layers.UpSampling2D(
        size=(height // pooled.shape[1], width // pooled.shape[2]),
        interpolation="bilinear",
    )(pooled)

    # Convolution branches, created in the same order as they are concatenated.
    branches = [pool_output, block(dsp_pooling_input, kernel=1, dilation=1)]
    branches.extend(
        [block(dsp_pooling_input, kernel=3, dilation=rate) for rate in (6, 12, 18)]
    )

    merged = layers.Concatenate(axis=-1)(branches)
    return block(merged, kernel=1)
def DeepLabV3_ResNet50(size, classes):
    """Assemble a segmentation model on top of an ImageNet ResNet50 backbone.

    Args:
        size: Height and width of the square, 3-channel input.
        classes: Number of filters of the final 1x1 convolution, i.e. the
            number of per-pixel output scores.

    Returns:
        A ``keras.Model`` mapping a ``(size, size, 3)`` input to the output of
        the final 1x1 convolution (created without an activation argument).
    """
    model_input = keras.Input(shape=(size, size, 3))
    backbone = keras.applications.ResNet50(
        weights="imagenet", include_top=False, input_tensor=model_input
    )

    # Deep features -> DSP_pooling -> upsample to a quarter of the input size.
    deep = DSP_pooling(backbone.get_layer("conv4_block6_2_relu").output)
    quarter = size // 4
    deep_up = layers.UpSampling2D(
        size=(quarter // deep.shape[1], quarter // deep.shape[2]),
        interpolation="bilinear",
    )(deep)

    # Shallow features, reduced to 48 channels with a 1x1 block.
    shallow = backbone.get_layer("conv2_block3_2_relu").output
    shallow = block(shallow, filters=48, kernel=1)

    # Fuse both paths, refine twice, then upsample to the input resolution.
    merged = layers.Concatenate(axis=-1)([deep_up, shallow])
    merged = block(merged)
    merged = block(merged)
    merged = layers.UpSampling2D(
        size=(size // merged.shape[1], size // merged.shape[2]),
        interpolation="bilinear",
    )(merged)

    logits = layers.Conv2D(classes, kernel_size=(1, 1), padding="same")(merged)
    return keras.Model(inputs=model_input, outputs=logits)
# Build the network for the configured input size and number of classes.
model = DeepLabV3_ResNet50(image_size, labels)
def scheduler(epoch, lr):
    """Learning-rate schedule: constant for 10 epochs, then exponential decay.

    Args:
        epoch: Index of the epoch about to start.
        lr: Current learning rate.

    Returns:
        ``lr`` unchanged while ``epoch < 10``; afterwards ``lr * exp(-0.1)``
        as a plain Python float.
    """
    if epoch < 10:
        return lr
    # math.exp instead of tf.math.exp: the result stays a Python float rather
    # than a tf.Tensor, which LearningRateScheduler accepts in every Keras
    # version (newer versions reject non-float return values).
    return float(lr) * math.exp(-0.1)
# Callback that asks `scheduler` for the learning rate at every epoch.
callback = tf.keras.callbacks.LearningRateScheduler(scheduler)

# from_logits=True: the loss is given the raw outputs of the model.
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(
    optimizer=keras.optimizers.Adam(),
    loss=loss,
    metrics=["accuracy"],
)

# Bare expression: learning rate before training (presumably inspected as
# notebook cell output).
round(model.optimizer.lr.numpy(), 5)

history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=25,
    callbacks=[callback],
    verbose=1,
)

# Bare expression: learning rate after training.
round(model.optimizer.lr.numpy(), 5)
This is the output:
Epoch 1/25
1404/1404 [==============================] - 342s 232ms/step - loss: nan - accuracy: 0.5888 - val_loss: nan - val_accuracy: 0.4956 - lr: 0.0010
Epoch 2/25
1404/1404 [==============================] - 323s 230ms/step - loss: nan - accuracy: 0.5892 - val_loss: nan - val_accuracy: 0.4956 - lr: 0.0010
Epoch 3/25
1404/1404 [==============================] - 323s 230ms/step - loss: nan - accuracy: 0.5892 - val_loss: nan - val_accuracy: 0.4956 - lr: 0.0010
Solution 1:[1]
I had the same issue with DeepLabV3+. First of all, you may want to check https://keras.io/examples/vision/deeplabv3_plus/, as it has code similar to yours and uses the same CIHP dataset.
To solve my issue, I went over several solutions I found on the internet, such as smaller learning rate, weight initialization, different loss function, gradient clipping, etc. If the above link doesn't help you, you can try each of them but I doubt they will work for you because they didn't work for me and your above code seems pretty fine.
The problem is most likely that the label values stored in the masks fall outside the range of classes you configured. For example, here you set the number of classes/labels to 14, but the masks actually contain more than 14 labels, so you get a NaN loss. (SparseCategoricalCrossentropy expects every label to be an integer in the range [0, number of classes).) This was the case for me. You should adjust the number of labels/classes used in the model to match what actually exists in the mask dataset. Here is how you can do it:
from skimage import io
import numpy as np
# Check labels for all masks
def check_mask_labels(masks):
    """Report the label values used across ``masks`` and the classes needed.

    A sparse categorical loss requires every label to lie in
    ``[0, num_classes)``, so the number of classes is the largest label value
    plus one -- not the number of distinct values found in any single mask.

    Args:
        masks: Iterable of masks. Each item is either an already-loaded
            ``numpy.ndarray`` or something ``io.imread`` can read
            (presumably a file path -- confirm against the caller).

    Returns:
        ``max(label value over all masks) + 1`` as an int.

    Raises:
        ValueError: If ``masks`` yields no label values at all.
    """
    label_values = set()
    for mask in masks:
        # Accept in-memory arrays as-is; otherwise read the mask from disk.
        pixels = mask if isinstance(mask, np.ndarray) else io.imread(mask)
        label_values.update(np.unique(pixels).tolist())

    if not label_values:
        raise ValueError("check_mask_labels() needs at least one non-empty mask")

    sorted_labels = sorted(label_values)
    num_classes = int(sorted_labels[-1]) + 1
    # Print results
    print(f" Label values across all masks: {sorted_labels} \n Number of classes required: {num_classes}")
    return num_classes
# NOTE(review): `masks` is not defined in this snippet; presumably it is the
# list of mask file paths to inspect -- confirm before running.
# The result is meant to be used as the model's number of classes.
NUM_CLASSES = check_mask_labels(masks)
Output:
Number of labels across all masks: [1, 30, 34, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64]
Maximum number of masks: 64
As expected, the masks contain different numbers of labels. The above code will give you the number of classes you need to put in the model.
Sources
This article follows the attribution requirements of Stack Overflow and is licensed under CC BY-SA 3.0.
Source: Stack Overflow
| Solution | Source |
|---|---|
| Solution 1 | hsaltan |