From 5e3288cb2fff63241ab2b0ae3b289ffff2e63e76 Mon Sep 17 00:00:00 2001
From: Andrey Scherbin
Date: Tue, 3 Oct 2023 15:54:20 +0700
Subject: [PATCH] Add training on PyTorch

---
 .gitignore                                         |   2 +
 .../image_classification/README.md                 |  21 +++
 .../image_classification/__init__.py               |   0
 .../download_cifar10_train_resnet.sh               |   6 +
 .../prepare_training_env.sh                        |   5 +
 .../image_classification/requirements.txt          |   3 +
 .../image_classification/test.py                   |  50 +++++++
 .../image_classification/train.py                  | 126 +++++++++++++++++
 .../image_classification/utils/__init__.py         |   0
 .../image_classification/utils/data.py             |  97 +++++++++++++
 .../image_classification/utils/model.py            |  75 ++++++++++
 .../image_classification/utils/training.py         | 129 ++++++++++++++++++
 12 files changed, 514 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 benchmark/training_torch/image_classification/README.md
 create mode 100644 benchmark/training_torch/image_classification/__init__.py
 create mode 100755 benchmark/training_torch/image_classification/download_cifar10_train_resnet.sh
 create mode 100755 benchmark/training_torch/image_classification/prepare_training_env.sh
 create mode 100644 benchmark/training_torch/image_classification/requirements.txt
 create mode 100644 benchmark/training_torch/image_classification/test.py
 create mode 100644 benchmark/training_torch/image_classification/train.py
 create mode 100644 benchmark/training_torch/image_classification/utils/__init__.py
 create mode 100644 benchmark/training_torch/image_classification/utils/data.py
 create mode 100644 benchmark/training_torch/image_classification/utils/model.py
 create mode 100644 benchmark/training_torch/image_classification/utils/training.py

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 00000000..e2d49afe
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+*.idea*
+*__pycache__*
diff --git a/benchmark/training_torch/image_classification/README.md b/benchmark/training_torch/image_classification/README.md
new file mode 100644
index 00000000..44d07839
--- /dev/null
+++ b/benchmark/training_torch/image_classification/README.md
@@ -0,0 +1,21 @@
+# MLPerf Tiny image classification PyTorch model
+
+This is the MLPerf Tiny image classification PyTorch model.
+
+A ResNet8 model is trained on the CIFAR-10 dataset available at:
+https://www.cs.toronto.edu/~kriz/cifar.html
+
+Model: ResNet8
+Dataset: CIFAR-10
+
+## Quick start
+
+Run the following commands to go through the whole training and validation process:
+
+```Bash
+# Prepare Python venv (Python 3.7+ and pip>20 required)
+./prepare_training_env.sh
+
+# Download the training data, train the model, and test it
+./download_cifar10_train_resnet.sh
+```
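+
+## Custom training
+
+`train.py` and `test.py` expose their hyperparameters as command-line flags
+(run either script with `--help` for the full list). A couple of illustrative
+invocations; the values below are examples, not tuned settings:
+
+```Bash
+. venv/bin/activate
+
+# Train with a larger batch size and a custom log directory
+python3 train.py --batch-size 64 --lr 0.001 --lr-decay 0.99 \
+    --weight-decay 1e-4 --workers 4 --log-dir trained_models
+
+# Evaluate a saved checkpoint
+python3 test.py --model-ckpt trained_models/best.pth --batch-size 64
+```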
diff --git a/benchmark/training_torch/image_classification/__init__.py b/benchmark/training_torch/image_classification/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/benchmark/training_torch/image_classification/download_cifar10_train_resnet.sh b/benchmark/training_torch/image_classification/download_cifar10_train_resnet.sh
new file mode 100755
index 00000000..ec7f10fb
--- /dev/null
+++ b/benchmark/training_torch/image_classification/download_cifar10_train_resnet.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+. venv/bin/activate
+
+# train and test the model
+python3 train.py
+python3 test.py
diff --git a/benchmark/training_torch/image_classification/prepare_training_env.sh b/benchmark/training_torch/image_classification/prepare_training_env.sh
new file mode 100755
index 00000000..5c8ed1bd
--- /dev/null
+++ b/benchmark/training_torch/image_classification/prepare_training_env.sh
@@ -0,0 +1,5 @@
+#!/bin/sh
+
+python3 -m venv venv
+. venv/bin/activate
+pip3 install -r requirements.txt
diff --git a/benchmark/training_torch/image_classification/requirements.txt b/benchmark/training_torch/image_classification/requirements.txt
new file mode 100644
index 00000000..fb2ba2d7
--- /dev/null
+++ b/benchmark/training_torch/image_classification/requirements.txt
@@ -0,0 +1,3 @@
+tensorboard==2.14.1
+torch==2.0.1
+torchvision==0.15.2
diff --git a/benchmark/training_torch/image_classification/test.py b/benchmark/training_torch/image_classification/test.py
new file mode 100644
index 00000000..3d662a51
--- /dev/null
+++ b/benchmark/training_torch/image_classification/test.py
@@ -0,0 +1,50 @@
+import argparse
+
+import torch
+from utils.data import get_test_dataloader
+from utils.training import eval_training
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model-ckpt",
+        default="trained_models/best.pth",
+        type=str,
+        help="Path to model checkpoint for evaluation.",
+    )
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=32,
+        help="Batch size. Default value is 32 according to TF training procedure.",
+    )
+    parser.add_argument(
+        "--data-dir",
+        default="cifar-10-torch",
+        type=str,
+        help="Path to dataset (will be downloaded).",
+    )
+    parser.add_argument(
+        "--workers", default=4, type=int, help="Number of data loading processes."
+    )
+    args = parser.parse_args()
+
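+    # Note: train.py checkpoints the entire nn.Module via torch.save(model, path)
+    # rather than a state_dict, so unpickling it requires the utils package to be
+    # importable; run this script from the directory that contains utils/.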
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    model = torch.load(args.model_ckpt, map_location=device)
+
+    val_loader = get_test_dataloader(
+        cifar_10_dir=args.data_dir,
+        batch_size=args.batch_size,
+        num_workers=args.workers,
+    )
+    loss_function = torch.nn.CrossEntropyLoss()
+
+    accuracy = eval_training(
+        model=model,
+        dataloader=val_loader,
+        loss_function=loss_function,
+        epoch=0,
+        log_to_tensorboard=False,
+        writer=None,
+    )
+
+    print(f"Model {args.model_ckpt} has accuracy: {accuracy:.4f}")
diff --git a/benchmark/training_torch/image_classification/train.py b/benchmark/training_torch/image_classification/train.py
new file mode 100644
index 00000000..961d977d
--- /dev/null
+++ b/benchmark/training_torch/image_classification/train.py
@@ -0,0 +1,126 @@
+import argparse
+
+import torch
+from torch.optim import Adam
+from torch.optim.lr_scheduler import LambdaLR
+from torch.utils.tensorboard import SummaryWriter
+from utils.data import get_test_dataloader
+from utils.data import get_training_dataloader
+from utils.model import Resnet8v1EEMBC
+from utils.training import WarmUpLR
+from utils.training import eval_training
+from utils.training import train_one_epoch
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--batch-size",
+        type=int,
+        default=32,
+        help="Batch size. Default value is 32 according to TF training procedure.",
+    )
+    parser.add_argument(
+        "--epochs",
+        type=int,
+        default=500,
+        help="Number of epochs. Default value is 500 according to TF training procedure.",
+    )
+    parser.add_argument(
+        "--warmup-epochs",
+        type=int,
+        default=0,
+        help="Number of epochs for LR linear warmup.",
+    )
+    parser.add_argument(
+        "--lr",
+        default=0.001,
+        type=float,
+        help="Initial learning rate. Default value is 1e-3 according to TF training procedure.",
+    )
+    parser.add_argument(
+        "--lr-decay",
+        default=0.99,
+        type=float,
+        help="Multiplicative LR decay applied once per epoch. Default value is 0.99 according to TF training procedure.",
+    )
+    parser.add_argument(
+        "--data-dir",
+        default="cifar-10-torch",
+        type=str,
+        help="Path to dataset (will be downloaded).",
+    )
+    parser.add_argument(
+        "--workers", default=4, type=int, help="Number of data loading processes."
+    )
+    parser.add_argument(
+        "--weight-decay", default=1e-4, type=float, help="Weight decay for optimizer."
+    )
+    parser.add_argument("--log-dir", type=str, default="trained_models")
+    args = parser.parse_args()
+
+    train_loader = get_training_dataloader(
+        cifar_10_dir=args.data_dir,
+        batch_size=args.batch_size,
+        num_workers=args.workers,
+    )
+    val_loader = get_test_dataloader(
+        cifar_10_dir=args.data_dir,
+        batch_size=args.batch_size,
+        num_workers=args.workers,
+    )
+
+    model = Resnet8v1EEMBC()
+    if torch.cuda.is_available():
+        model = model.cuda()
+    optimizer = Adam(
+        params=model.parameters(),
+        lr=args.lr,
+        weight_decay=args.weight_decay,
+    )
+    train_scheduler = LambdaLR(
+        optimizer=optimizer, lr_lambda=lambda epoch: args.lr_decay**epoch
+    )
+    warmup_scheduler = None
+    if args.warmup_epochs:
+        # The warmup scheduler is stepped once per batch, so it must span
+        # warmup_epochs * iterations_per_epoch steps.
+        warmup_scheduler = WarmUpLR(
+            optimizer=optimizer,
+            total_iters=args.warmup_epochs * len(train_loader),
+        )
+
+    writer = SummaryWriter(log_dir=args.log_dir)
+
+    loss_function = torch.nn.CrossEntropyLoss()
+
+    best_accuracy = 0.0
+    for epoch in range(1, args.epochs + 1):
+        train_one_epoch(
+            model=model,
+            train_dataloader=train_loader,
+            loss_function=loss_function,
+            optimizer=optimizer,
+            epoch=epoch,
+            writer=writer,
+            warmup_scheduler=warmup_scheduler,
+            warmup_epochs=args.warmup_epochs,
+        )
+
+        # Apply the per-epoch LR decay once the warmup phase is over.
+        if epoch > args.warmup_epochs:
+            train_scheduler.step()
+
+        accuracy = eval_training(
+            model=model,
+            dataloader=val_loader,
+            loss_function=loss_function,
+            epoch=epoch,
+            log_to_tensorboard=True,
+            writer=writer,
+        )
+
+        if best_accuracy < accuracy:
+            weights_path = f"{args.log_dir}/best.pth"
+            print(f"saving weights file to {weights_path}")
+            torch.save(model, weights_path)
+            best_accuracy = accuracy
+
+        writer.flush()
+
+    writer.close()
diff --git a/benchmark/training_torch/image_classification/utils/data.py b/benchmark/training_torch/image_classification/utils/data.py
new file mode 100644
index 00000000..2da285d0
--- /dev/null
+++ b/benchmark/training_torch/image_classification/utils/data.py
@@ -0,0 +1,97 @@
+import torch
+import torchvision
+import torchvision.transforms as transforms
+from torch.utils.data import DataLoader
+
+
+def get_training_dataloader(
+    cifar_10_dir: str = "cifar-10-torch",
+    batch_size: int = 16,
+    num_workers: int = 2,
+    shuffle: bool = True,
+) -> torch.utils.data.DataLoader:
+    """Create DataLoader for training data.
+
+    Parameters
+    ----------
+    cifar_10_dir: str
+        Path to CIFAR10 data root in torchvision format.
+    batch_size: int
+        Batch size for dataloader.
+    num_workers: int
+        Number of subprocesses for data loading.
+    shuffle: bool
+        Flag for shuffling training data.
+
+    Returns
+    -------
+    torch.utils.data.DataLoader
+        DataLoader for training data.
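+
+    Example
+    -------
+    A minimal usage sketch (downloads CIFAR-10 on first use):
+
+    >>> loader = get_training_dataloader(batch_size=32, num_workers=4)
+    >>> images, labels = next(iter(loader))
+    >>> images.shape
+    torch.Size([32, 3, 32, 32])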
+ """ + transform_train = transforms.Compose( + [ + transforms.RandomCrop(32, padding=4), + transforms.RandomHorizontalFlip(), + transforms.RandomRotation(15), + transforms.ToTensor(), + ] + ) + cifar10_training = torchvision.datasets.CIFAR10( + root=cifar_10_dir, + train=True, + download=True, + transform=transform_train, + ) + cifar10_training_loader = DataLoader( + cifar10_training, + shuffle=shuffle, + num_workers=num_workers, + batch_size=batch_size, + ) + + return cifar10_training_loader + + +def get_test_dataloader( + cifar_10_dir="cifar-10-torch", + batch_size=16, + num_workers=2, + shuffle=True, +): + """Create DataLoader for test data. + + Parameters + ---------- + cifar_10_dir: str + Path to CIFAR10 data root in torchvision format. + batch_size: int + Batch size for dataloader. + num_workers: int + Number of subprocesses for data loading. + shuffle: bool + Flag for shuffling training data. + + Returns + ------- + torch.utils.data.DataLoader + DataLoader for test data. + """ + transform_test = transforms.Compose( + [ + transforms.ToTensor(), + ] + ) + cifar10_test = torchvision.datasets.CIFAR10( + root=cifar_10_dir, + train=False, + download=True, + transform=transform_test, + ) + cifar10_test_loader = DataLoader( + cifar10_test, + shuffle=shuffle, + num_workers=num_workers, + batch_size=batch_size, + ) + + return cifar10_test_loader diff --git a/benchmark/training_torch/image_classification/utils/model.py b/benchmark/training_torch/image_classification/utils/model.py new file mode 100644 index 00000000..a659feb0 --- /dev/null +++ b/benchmark/training_torch/image_classification/utils/model.py @@ -0,0 +1,75 @@ +import torch +from torch import nn +from torch.nn import functional as F + + +class ResNetBlock(nn.Module): + def __init__( + self, + in_channels: int, + out_channels: int, + stride: int = 1, + ): + super().__init__() + self.block = nn.Sequential( + nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + padding=1, + bias=True, + stride=stride, + ), + nn.BatchNorm2d(num_features=out_channels), + nn.ReLU(inplace=True), + nn.Conv2d( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + padding=1, + bias=True, + ), + nn.BatchNorm2d(num_features=out_channels), + ) + if in_channels == out_channels: + self.residual = nn.Identity() + else: + self.residual = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + stride=stride, + ) + + def forward(self, inputs): + x = self.block(inputs) + y = self.residual(inputs) + return F.relu(x + y) + + +class Resnet8v1EEMBC(nn.Module): + def __init__(self): + super().__init__() + self.stem = nn.Sequential( + nn.Conv2d( + in_channels=3, out_channels=16, kernel_size=3, padding=1, bias=True + ), + nn.BatchNorm2d(num_features=16), + nn.ReLU(inplace=True), + ) + + self.first_stack = ResNetBlock(in_channels=16, out_channels=16, stride=1) + self.second_stack = ResNetBlock(in_channels=16, out_channels=32, stride=2) + self.third_stack = ResNetBlock(in_channels=32, out_channels=64, stride=2) + self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(in_features=64, out_features=10) + + def forward(self, inputs): + x = self.stem(inputs) + x = self.first_stack(x) + x = self.second_stack(x) + x = self.third_stack(x) + x = self.avgpool(x) + x = torch.flatten(x, 1) + x = self.fc(x) + return x diff --git a/benchmark/training_torch/image_classification/utils/training.py b/benchmark/training_torch/image_classification/utils/training.py new file mode 100644 index 
diff --git a/benchmark/training_torch/image_classification/utils/training.py b/benchmark/training_torch/image_classification/utils/training.py
new file mode 100644
index 00000000..48d506a0
--- /dev/null
+++ b/benchmark/training_torch/image_classification/utils/training.py
@@ -0,0 +1,129 @@
+import time
+
+import torch
+from torch.optim.lr_scheduler import _LRScheduler
+from torch.utils.data import DataLoader
+from torch.utils.tensorboard import SummaryWriter
+
+
+class WarmUpLR(_LRScheduler):
+    def __init__(
+        self,
+        optimizer: torch.optim.Optimizer,
+        total_iters: int,
+        last_epoch: int = -1,
+    ):
+        """Scheduler for learning rate warmup.
+
+        Parameters
+        ----------
+        optimizer: torch.optim.Optimizer
+            Optimizer, e.g. SGD.
+        total_iters: int
+            Number of iterations in the warmup phase.
+        last_epoch: int
+            The index of the last epoch. Default: -1.
+        """
+        self.total_iters = total_iters
+        super().__init__(optimizer, last_epoch)
+
+    def get_lr(self):
+        """Return current learning rate, ramping linearly from 0 to base_lr."""
+        return [
+            base_lr * self.last_epoch / (self.total_iters + 1e-8)
+            for base_lr in self.base_lrs
+        ]
+
+
+def train_one_epoch(
+    model: torch.nn.Module,
+    train_dataloader: DataLoader,
+    loss_function: torch.nn.Module,
+    optimizer: torch.optim.Optimizer,
+    epoch: int,
+    writer: SummaryWriter = None,
+    warmup_scheduler: _LRScheduler = None,
+    warmup_epochs: int = 0,
+):
+    start = time.time()
+    model.train()
+    for batch_index, (images, labels) in enumerate(train_dataloader):
+        if torch.cuda.is_available():
+            labels = labels.cuda()
+            images = images.cuda()
+
+        optimizer.zero_grad()
+        outputs = model(images)
+        loss = loss_function(outputs, labels)
+        loss.backward()
+        optimizer.step()
+
+        n_iter = (epoch - 1) * len(train_dataloader) + batch_index + 1
+
+        # update training loss for each iteration
+        if writer:
+            writer.add_scalar("Train/loss", loss.item(), n_iter)
+            writer.add_scalar("Lr", optimizer.param_groups[0]["lr"], n_iter)
+
+        # Ramp up the LR batch by batch during the warmup epochs; the
+        # per-epoch decay schedule is stepped by the caller afterwards.
+        if epoch <= warmup_epochs and warmup_scheduler is not None:
+            warmup_scheduler.step()
+
+        print(
+            f"Training Epoch: "
+            f"{epoch} "
+            f"Loss: {loss.item():0.4f}\t"
+            f'LR: {optimizer.param_groups[0]["lr"]:0.6f}'
+        )
+
+    finish = time.time()
+
+    print(f"epoch {epoch} training time consumed: {finish - start:.2f}s")
+
+
+def eval_training(
+    model: torch.nn.Module,
+    dataloader: DataLoader,
+    loss_function: torch.nn.Module,
+    epoch: int = 0,
+    log_to_tensorboard: bool = True,
+    writer: SummaryWriter = None,
+):
+    start = time.time()
+    model.eval()
+
+    test_loss = 0.0  # cost function error
+    correct = 0.0
+    with torch.no_grad():
+        for images, labels in dataloader:
+            if torch.cuda.is_available():
+                images = images.cuda()
+                labels = labels.cuda()
+
+            outputs = model(images)
+            loss = loss_function(outputs, labels)
+
+            test_loss += loss.item()
+            _, preds = outputs.max(1)
+            correct += preds.eq(labels).sum()
+
+    finish = time.time()
+    print("Evaluating Network.....")
+    dataset_size = len(dataloader.dataset)
+    # loss_function returns the batch mean, so average over batches.
+    loss = test_loss / len(dataloader)
+    accuracy = correct.float() / dataset_size
+    print(
+        f"Test set: Epoch: {epoch},"
+        f" Average loss: {loss:.4f},"
+        f" Accuracy: {accuracy:.4f},"
+        f" Time consumed:{finish - start:.2f}s"
+    )
+    print()
+
+    # add information to tensorboard
+    if log_to_tensorboard and writer:
+        writer.add_scalar("Test/Average loss", loss, epoch)
+        writer.add_scalar("Test/Accuracy", accuracy, epoch)
+
+    return accuracy
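+
+
+if __name__ == "__main__":
+    # Minimal sketch of the warmup behaviour (illustrative only): with a dummy
+    # optimizer, the LR ramps linearly from ~0 to the base LR over total_iters
+    # scheduler steps.
+    params = [torch.nn.Parameter(torch.zeros(1))]
+    opt = torch.optim.SGD(params, lr=0.1)
+    warmup = WarmUpLR(optimizer=opt, total_iters=5)
+    for _ in range(5):
+        opt.step()
+        warmup.step()
+        print(opt.param_groups[0]["lr"])  # ~0.02, ~0.04, ..., 0.1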