SNN_training.py
import torch
from transformers import AdamW  # note: torch.optim.AdamW is the recommended drop-in replacement in recent versions


def train_siamese_network(model, dataloaders, num_epochs, device):
"""
Train the given SNN model.
:param model: SNN model
:param dataloaders: a dict that contains train data loader and validation data loader
:param num_epochs: number of epochs
:param device: 'cpu' or 'cuda'
:return: train_loss_history - list of train losses by epochs
val_loss_history - list of validation losses by epochs
"""
train_loss_history = []
val_loss_history = []
matching_similarity = []
non_matching_similarity = []
val_matching_similarity = []
val_non_matching_similarity = []
    # the labels are same class (1) vs. different class (0)
    # loss alternatives tried: ContrastiveLoss(margin=1), BCEWithLogitsLoss
    criterion = torch.nn.BCELoss(reduction='mean')
    learning_rate = 0.005  # also tried: 0.1
    # also tried: torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    optimizer = AdamW(model.parameters(), lr=learning_rate)
    # Optional LR schedulers (currently disabled):
    # ReduceLROnPlateau: when triggered, the LR becomes lr * factor;
    #   mode='min' tracks the minimum validation loss;
    #   patience is the number of epochs with no improvement tolerated before the LR is reduced
    #   (patience=0 reduces the LR after a single bad epoch);
    #   factor is the decay factor.
    # scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=1, verbose=True, min_lr=0.0001)
    # cyclic_scheduler = torch.optim.lr_scheduler.CyclicLR(optimizer, base_lr=0.0001, max_lr=0.1, cycle_momentum=False)
for epoch in range(num_epochs): # loop over the train dataset multiple times
# Each epoch has a training and validation phase
for phase in ['train', 'val']:
if phase == 'train':
model.train()
else:
model.eval()
running_loss = 0.0
for i, batch in enumerate(dataloaders[phase]):
                seq1, seq2, mask1, mask2, label = batch
                # move the batch to the target device ('cpu' is a no-op)
                seq1, seq2, mask1, mask2, label = (t.to(device) for t in (seq1, seq2, mask1, mask2, label))
# zero the parameter gradients
optimizer.zero_grad()
# track history only in train
with torch.set_grad_enabled(phase == 'train'):
                    # forward pass: the model outputs a similarity score for each pair
                    output = model(seq1, seq2, mask1, mask2)
                    loss = criterion(output, label.view(output.size()))
                    # backward pass + optimizer step only in the training phase
                    if phase == 'train':
                        loss.backward()
                        # clip the gradient norm to 1.0 to help prevent exploding gradients
                        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                        optimizer.step()
                        # cyclic_scheduler.step()
                    # track the mean similarity score per batch, split by non-matching (0) / matching (1) labels
                    output = output.cpu().detach().numpy()
                    label = label.cpu().numpy()
                    if phase == 'train':
                        non_matching_similarity.append((sum(output[label == 0]) / sum(label == 0)).item())
                        matching_similarity.append((sum(output[label == 1]) / sum(label == 1)).item())
                    else:
                        val_non_matching_similarity.append((sum(output[label == 0]) / sum(label == 0)).item())
                        val_matching_similarity.append((sum(output[label == 1]) / sum(label == 1)).item())
                running_loss += loss.item() * seq1.size(0)  # weight by batch size (the last batch may be smaller when the batch size does not divide the dataset size)
epoch_loss = running_loss / len(dataloaders[phase].dataset)
if phase == 'train':
train_loss_history.append(epoch_loss)
else:
val_loss_history.append(epoch_loss)
            # scheduler.step(epoch_loss)
print('Epoch {} | {} loss: {:.3f}'.format(epoch, phase, epoch_loss))
return train_loss_history, val_loss_history, [non_matching_similarity, matching_similarity, val_non_matching_similarity, val_matching_similarity]
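

# ---------------------------------------------------------------------------
# Minimal usage sketch: the toy model, toy dataset and all shapes below are
# illustrative assumptions only. They show the interface train_siamese_network
# expects - batches of (seq1, seq2, mask1, mask2, label) and a model whose
# forward(seq1, seq2, mask1, mask2) returns a similarity score in [0, 1],
# as required by BCELoss.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    from torch.utils.data import DataLoader, TensorDataset

    class ToySiameseModel(torch.nn.Module):
        """Illustrative stand-in for the real SNN: embed each sequence,
        mean-pool over non-padded positions, and map the pair to a score in [0, 1]."""

        def __init__(self, vocab_size=100, dim=16):
            super().__init__()
            self.emb = torch.nn.Embedding(vocab_size, dim)
            self.out = torch.nn.Linear(dim, 1)

        def encode(self, seq, mask):
            emb = self.emb(seq) * mask.unsqueeze(-1)  # zero out padded positions
            return emb.sum(dim=1) / mask.sum(dim=1, keepdim=True).clamp(min=1)

        def forward(self, seq1, seq2, mask1, mask2):
            diff = torch.abs(self.encode(seq1, mask1) - self.encode(seq2, mask2))
            return torch.sigmoid(self.out(diff))  # similarity score in [0, 1]

    # random toy pairs: 64 examples of length 12, alternating labels
    n, seq_len = 64, 12
    dataset = TensorDataset(
        torch.randint(0, 100, (n, seq_len)),   # seq1
        torch.randint(0, 100, (n, seq_len)),   # seq2
        torch.ones(n, seq_len),                # mask1
        torch.ones(n, seq_len),                # mask2
        torch.tensor([0.0, 1.0] * (n // 2)),   # label: same class (1) vs. different class (0)
    )
    dataloaders = {
        'train': DataLoader(dataset, batch_size=16, shuffle=True),
        'val': DataLoader(dataset, batch_size=16),
    }

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = ToySiameseModel().to(device)
    train_hist, val_hist, similarity_hist = train_siamese_network(
        model, dataloaders, num_epochs=2, device=device
    )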