Why is training and validation loss steadily rising (eventually to NaN) in this CNN of mine?

Question

Dear ML and data scientists:

I have 4 layers of gray scale images for every single biological specimen in my dataset. I am trying to train a 4-convolution CNN (see pytorch architecture below) to classify the biological specimen into 3 classes.

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data.dataset import Dataset #custom dataset class
from torchvision import transforms
import numpy as np
import pandas as pd #read csv
import matplotlib.pyplot as plt

"""
Define the neural network architecture
"""
class Net(nn.Module):
    def __init__(self):
        super(Net,self).__init__()
        #input shape (4,224,224)
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels=4,out_channels=64,kernel_size=5,stride=1,padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),#output shape (64,112,112)
        )

self.conv2 = nn.Sequential(
            nn.Conv2d(64,256,5,1,2),#shape (256,112,112)
            nn.ReLU(),
            nn.MaxPool2d(2),#output shape (256,56,56)
        )

self.conv3 = nn.Sequential(
            nn.Conv2d(256,512,4,4,4), #shape (512,16,16)
            nn.ReLU(),
            nn.MaxPool2d(2), #output shape (512,8,8)
        )

self.conv4 = nn.Sequential(
            nn.Conv2d(512,1024,5,1,2), #shape (1024,8,8)
            nn.ReLU(),
            nn.MaxPool2d(2), #output shape (1024,4,4)
        )

self.fc1 = nn.Linear(1024*4*4,1024) 
        self.fc2 = nn.Linear(1024,32)
        self.fc3 = nn.Linear(32,3) #3 classes

def forward(self,x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = x.view(-1,1024*4*4)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

"""
statistics I have calculated on my own from the training dataset
"""
DATASET_MEAN = [0.136,0.113,0.182,0.428]
DATASET_SD = [0.259,0.181,0.230,0.190]
normalize = transforms.Normalize(DATASET_MEAN,DATASET_SD)

data_transforms = {
    #input is ndarray of shape H,W,4
    'train': transforms.Compose([
        transforms.ToPILImage(), #wrongly assumes its RGBA, but no choice because transforms can only be done on PILImage-s
        transforms.Resize(size=256),
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomResizedCrop(size=224),
        transforms.ToTensor(),
        normalize]),
    'val': transforms.Compose([        
        transforms.ToPILImage(), #same thing, wrongly assumes its RGBA as the 4 channels
        transforms.Resize(size=224),
        transforms.ToTensor(),
        normalize]),
    }
"""
Define the dataset class
"""
class CTCDataset(Dataset):
    def __init__(self,csv_path,transforms):
        self.transformations = transforms
        self.csv_data = pd.read_csv(csv_path,header=None)
        self.image_arr = np.asarray(self.csv_data.iloc[:,0])
        self.label_arr = np.asarray(self.csv_data.iloc[:,1])
        self.data_len = len(self.csv_data.index)

def __getitem__(self,index):
        single_image_name = self.image_arr[index]
        img_as_np = np.load(single_image_name)
        img_as_tensor = self.transformations(img_as_np)
        single_image_label = self.label_arr[index]
        return (img_as_tensor,single_image_label)

def __len__(self):
        return self.data_len

train_csv_path = r'train.csv'
train_dataset = CTCDataset(
    csv_path=train_csv_path,
    transforms=data_transforms['train']
    )

val_csv_path = r'val.csv'
val_dataset = CTCDataset(
    csv_path=val_csv_path,
    transforms=data_transforms['val']
    )

from torch.utils.data.sampler import SubsetRandomSampler

num_train = len(train_dataset)
train_indices = list(range(num_train))
num_val = len(val_dataset)
val_indices = list(range(num_val))
trainloader = torch.utils.data.DataLoader(train_dataset,
                                          batch_size = 16,
                                          sampler=SubsetRandomSampler(train_indices))

testloader = torch.utils.data.DataLoader(val_dataset,
                                         batch_size = 4,
                                         sampler=SubsetRandomSampler(val_indices))

model = Net()
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=1e-6)
"""
following code is from https://towardsdatascience.com/how-to-train-an-image-classifier-in-pytorch-and-use-it-to-perform-basic-inference-on-single-images-99465a1e9bf5
"""
epochs = 5
steps= 0
running_loss = 0
print_every = 10
train_losses, test_losses = [],[]

for epoch in range(epochs):
    for inputs, labels in trainloader:
        steps += 1
        optimizer.zero_grad()
        logps = model.forward(inputs)
        loss = criterion(logps,labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

if steps % print_every == 0:
            test_loss = 0
            accuracy = 0
            model.eval()
            with torch.no_grad():
                for inputs, labels in testloader:
                    logps = model.forward(inputs)
                    batch_loss = criterion(logps,labels)
                    test_loss += batch_loss.item()
                    ps = torch.exp(logps)
                    top_p, top_class = ps.topk(1,dim=1)
                    equals = (top_class == labels.view(*top_class.shape))                    
                    accuracy += torch.mean(equals.type(torch.FloatTensor)).item()

train_losses.append(running_loss/len(trainloader))
            test_losses.append(test_loss/len(testloader))
            print(f"Epoch {epoch+1}/{epochs}.. "
                  f"Train loss: {running_loss/print_every:.3f}.."
                  f"Test loss: {test_loss/len(testloader):.3f}.."
                  f"Test accuracy: {accuracy/len(testloader):.3f}")
            running_loss = 0
            model.train()

#plot the training and validation losses
plt.plot(train_losses,label='Training loss')
plt.plot(test_losses,label='Validation loss')
plt.legend(frameon=False)
plt.show()

torch.save(model,'model_v1.1.pth')

Here is the plot of the losses:

Here is the accuracy at the end of each epoch:

Epoch 1/5.. Train loss: -1.906..Test loss: -1.412..Test accuracy: 0.056
Epoch 2/5.. Train loss: -72.720..Test loss: -52.513..Test accuracy: 0.056
Epoch 3/5.. Train loss: -573.684..Test loss: -390.878..Test accuracy: 0.014
Epoch 4/5.. Train loss: -2662.921..Test loss: -1772.838..Test accuracy: 0.014
Epoch 5/5.. Train loss: -8421.151..Test loss: -5458.454..Test accuracy: 0.014

My network appears to be untraining. Do you think the issue lies with a bug in my network architecture? Am I using too few layers for my multi-channel input?

I want you to see what my input is like. See the 4 channels for each biological specimen below:

I have 2746 of these images in my hard disk for the training set, and another 100 or so for my validation set. Something I need to flag is that the class is very imbalanced, with 54% being class '0', 45% being class '1', and 1% being class '2'. This reflects the actual distribution of the classes in nature.

karthikeyan mg · Answer

I found out the answer after posting this question on LinkedIn. The following answers are from ML Researchers and Data science managers from some of the leading companies:

"Check if your variables are normalized? NAN comes up in cases of vanishing gradient and exploding gradient.."

"Check your weights. Most probably they are going towards infinity"

Based on the above two answers, We can sense that our weights are exploding...

Why is training and validation loss steadily rising (eventually to NaN) in this CNN of mine?

One Answer

Add your own answers!

Ask a Question