From 83ff00afe851adb58d0e11c4397fb46f104ee65d Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Tue, 19 Mar 2024 10:30:38 +0000
Subject: [PATCH 01/42] Have get_device use torch::Device

---
 src/ctorch.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/ctorch.cpp b/src/ctorch.cpp
index bf1577be..df57d2f0 100644
--- a/src/ctorch.cpp
+++ b/src/ctorch.cpp
@@ -29,17 +29,17 @@ constexpr auto get_dtype(torch_data_t dtype)
   }
 }
 
-constexpr auto get_device(torch_device_t device)
+const auto get_device(torch_device_t device)
 {
   switch (device) {
   case torch_kCPU:
-    return torch::kCPU;
+    return torch::Device(torch::kCPU);
   case torch_kCUDA:
-    return torch::kCUDA;
+    return torch::Device(torch::kCUDA);
   default:
     std::cerr << "[ERROR]: unknown device type, setting to torch_kCPU"
               << std::endl;
-    return torch::kCPU;
+    return torch::Device(torch::kCPU);
   }
 }
 

From a392900b6f278a49d210ae222b617bd6715366ea Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Tue, 19 Mar 2024 10:56:35 +0000
Subject: [PATCH 02/42] Add device_number arg for get_device

---
 src/ctorch.cpp | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/src/ctorch.cpp b/src/ctorch.cpp
index df57d2f0..182900b6 100644
--- a/src/ctorch.cpp
+++ b/src/ctorch.cpp
@@ -29,7 +29,7 @@ constexpr auto get_dtype(torch_data_t dtype)
   }
 }
 
-const auto get_device(torch_device_t device)
+const auto get_device(torch_device_t device, int device_number)
 {
   switch (device) {
   case torch_kCPU:
@@ -47,12 +47,13 @@ torch_tensor_t torch_zeros(int ndim, const int64_t* shape, torch_data_t dtype,
                            torch_device_t device)
 {
   torch::Tensor* tensor = nullptr;
+  int device_number = 0;
   try {
     // This doesn't throw if shape and dimensions are incompatible
     c10::IntArrayRef vshape(shape, ndim);
     tensor = new torch::Tensor;
     *tensor = torch::zeros(
-        vshape, torch::dtype(get_dtype(dtype))).to(get_device(device));
+        vshape, torch::dtype(get_dtype(dtype))).to(get_device(device, device_number));
   } catch (const torch::Error& e) {
     std::cerr << "[ERROR]: " << e.msg() << std::endl;
     delete tensor;
@@ -69,12 +70,13 @@ torch_tensor_t torch_ones(int ndim, const int64_t* shape, torch_data_t dtype,
                           torch_device_t device)
 {
   torch::Tensor* tensor = nullptr;
+  int device_number = 0;
   try {
     // This doesn't throw if shape and dimensions are incompatible
     c10::IntArrayRef vshape(shape, ndim);
     tensor = new torch::Tensor;
     *tensor = torch::ones(
-        vshape, torch::dtype(get_dtype(dtype))).to(get_device(device));
+        vshape, torch::dtype(get_dtype(dtype))).to(get_device(device, device_number));
   } catch (const torch::Error& e) {
     std::cerr << "[ERROR]: " << e.msg() << std::endl;
     delete tensor;
@@ -91,12 +93,13 @@ torch_tensor_t torch_empty(int ndim, const int64_t* shape, torch_data_t dtype,
                            torch_device_t device)
 {
   torch::Tensor* tensor = nullptr;
+  int device_number = 0;
   try {
     // This doesn't throw if shape and dimensions are incompatible
     c10::IntArrayRef vshape(shape, ndim);
     tensor = new torch::Tensor;
     *tensor = torch::empty(
-        vshape, torch::dtype(get_dtype(dtype))).to(get_device(device));
+        vshape, torch::dtype(get_dtype(dtype))).to(get_device(device, device_number));
   } catch (const torch::Error& e) {
     std::cerr << "[ERROR]: " << e.msg() << std::endl;
     delete tensor;
@@ -116,6 +119,7 @@ torch_tensor_t torch_from_blob(void* data, int ndim, const int64_t* shape,
                                torch_device_t device)
 {
   torch::Tensor* tensor = nullptr;
+  int device_number = 0;
 
   try {
     // This doesn't throw if shape and dimensions are incompatible
@@ -124,7 +128,7 @@ torch_tensor_t torch_from_blob(void* data, int ndim, const int64_t* shape,
     tensor = new torch::Tensor;
     *tensor = torch::from_blob(
         data, vshape, vstrides,
-        torch::dtype(get_dtype(dtype))).to(get_device(device));
+        torch::dtype(get_dtype(dtype))).to(get_device(device, device_number));
 
   } catch (const torch::Error& e) {
     std::cerr << "[ERROR]: " << e.msg() << std::endl;

From 2552c9147883c5894a52deb2483bb07d07f4c5cb Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Tue, 19 Mar 2024 12:50:54 +0000
Subject: [PATCH 03/42] Print error if device_number used in CPU-only case

---
 src/ctorch.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/src/ctorch.cpp b/src/ctorch.cpp
index 182900b6..7395a0a8 100644
--- a/src/ctorch.cpp
+++ b/src/ctorch.cpp
@@ -33,6 +33,10 @@ const auto get_device(torch_device_t device, int device_number)
 {
   switch (device) {
   case torch_kCPU:
+    if (device_number > 0) {
+      std::cerr << "[ERROR]: device number unsupported for CPU-only runs"
+                << std::endl;
+    }
     return torch::Device(torch::kCPU);
   case torch_kCUDA:
     return torch::Device(torch::kCUDA);

From 9b0b7dd46d5ab70f816ad0b388ed1979a45912c2 Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Tue, 19 Mar 2024 13:04:42 +0000
Subject: [PATCH 04/42] Disallow negative device number

---
 src/ctorch.cpp | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/ctorch.cpp b/src/ctorch.cpp
index 7395a0a8..b56a6cef 100644
--- a/src/ctorch.cpp
+++ b/src/ctorch.cpp
@@ -31,6 +31,11 @@ constexpr auto get_dtype(torch_data_t dtype)
 
 const auto get_device(torch_device_t device, int device_number)
 {
+  if (device_number < 0) {
+    std::cerr << "[ERROR]: device number must be non-negative, using zero instead"
+              << std::endl;
+    device_number = 0;
+  }
   switch (device) {
   case torch_kCPU:
     if (device_number > 0) {

From e44e3e626c3360a3809aa02a42597d0542be8714 Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Tue, 19 Mar 2024 13:00:28 +0000
Subject: [PATCH 05/42] Actually use the device number

---
 src/ctorch.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/src/ctorch.cpp b/src/ctorch.cpp
index b56a6cef..5e67cd04 100644
--- a/src/ctorch.cpp
+++ b/src/ctorch.cpp
@@ -44,7 +44,14 @@ const auto get_device(torch_device_t device, int device_number)
     }
     return torch::Device(torch::kCPU);
   case torch_kCUDA:
-    return torch::Device(torch::kCUDA);
+    if (device_number < torch::cuda::device_count()) {
+      return torch::Device(torch::kCUDA, device_number);
+    } else {
+      std::cerr << "[ERROR]: device number " << device_number
+                << " exceeds device count " << torch::cuda::device_count()
+                << ", using zero instead" << std::endl;
+      return torch::Device(torch::kCUDA);
+    }
   default:
     std::cerr << "[ERROR]: unknown device type, setting to torch_kCPU"
               << std::endl;

From cf3947282730b5e1dc268379b3367c36c1d0b239 Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Tue, 19 Mar 2024 13:15:44 +0000
Subject: [PATCH 06/42] Use device number for torch_zeros

---
 src/ctorch.cpp  | 3 +--
 src/ctorch.h    | 4 +++-
 src/ftorch.f90  | 8 +++++---
 src/ftorch.fypp | 8 +++++---
 4 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/src/ctorch.cpp b/src/ctorch.cpp
index 5e67cd04..d7ca7b28 100644
--- a/src/ctorch.cpp
+++ b/src/ctorch.cpp
@@ -60,10 +60,9 @@ const auto get_device(torch_device_t device, int device_number)
 }
 
 torch_tensor_t torch_zeros(int ndim, const int64_t* shape, torch_data_t dtype,
-                           torch_device_t device)
+                           torch_device_t device, int device_number)
 {
   torch::Tensor* tensor = nullptr;
-  int device_number = 0;
   try {
     // This doesn't throw if shape and dimensions are incompatible
     c10::IntArrayRef vshape(shape, ndim);
diff --git a/src/ctorch.h b/src/ctorch.h
index a7afa257..85023dcc 100644
--- a/src/ctorch.h
+++ b/src/ctorch.h
@@ -38,9 +38,11 @@ typedef enum { torch_kCPU, torch_kCUDA } torch_device_t;
  * @param shape of the Tensor
  * @param data type of the elements of the Tensor
  * @param device used (cpu, CUDA, etc.)
+ * @param device number for the CUDA case
  */
 EXPORT_C torch_tensor_t torch_zeros(int ndim, const int64_t* shape,
-                                    torch_data_t dtype, torch_device_t device);
+                                    torch_data_t dtype, torch_device_t device,
+                                    int device_number);
 
 /**
  * Function to generate a Torch Tensor of ones
diff --git a/src/ftorch.f90 b/src/ftorch.f90
index b9144164..d4563d1b 100644
--- a/src/ftorch.f90
+++ b/src/ftorch.f90
@@ -93,27 +93,29 @@ end function torch_from_blob_c
 contains
 
   !> Returns a tensor filled with the scalar value 0.
-  function torch_tensor_zeros(ndims, tensor_shape, dtype, device) result(tensor)
+  function torch_tensor_zeros(ndims, tensor_shape, dtype, device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t
     integer(c_int), intent(in)     :: ndims      !! Number of dimensions of the tensor
     integer(c_int64_t), intent(in) :: tensor_shape(*)   !! Shape of the tensor
     integer(c_int), intent(in)     :: dtype      !! Data type of the tensor
     integer(c_int), intent(in)     :: device     !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in)     :: device_number     !! Device number to use for `torch_kCUDA` case
     type(torch_tensor)             :: tensor     !! Returned tensor
 
     interface
-      function torch_zeros_c(ndims, tensor_shape, dtype, device) result(tensor) &
+      function torch_zeros_c(ndims, tensor_shape, dtype, device, device_number) result(tensor) &
           bind(c, name = 'torch_zeros')
         use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_ptr
         integer(c_int), value, intent(in) :: ndims
         integer(c_int64_t), intent(in)    :: tensor_shape(*)
         integer(c_int), value, intent(in) :: dtype
         integer(c_int), value, intent(in) :: device
+        integer(c_int), value, intent(in) :: device_number
         type(c_ptr)                       :: tensor
       end function torch_zeros_c
     end interface
 
-    tensor%p = torch_zeros_c(ndims, tensor_shape, dtype, device)
+    tensor%p = torch_zeros_c(ndims, tensor_shape, dtype, device, device_number)
   end function torch_tensor_zeros
 
   !> Returns a tensor filled with the scalar value 1.
diff --git a/src/ftorch.fypp b/src/ftorch.fypp
index a09f4303..b5bf2c87 100644
--- a/src/ftorch.fypp
+++ b/src/ftorch.fypp
@@ -91,27 +91,29 @@ module ftorch
 contains
 
   !> Returns a tensor filled with the scalar value 0.
-  function torch_tensor_zeros(ndims, tensor_shape, dtype, device) result(tensor)
+  function torch_tensor_zeros(ndims, tensor_shape, dtype, device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t
     integer(c_int), intent(in)     :: ndims      !! Number of dimensions of the tensor
     integer(c_int64_t), intent(in) :: tensor_shape(*)   !! Shape of the tensor
     integer(c_int), intent(in)     :: dtype      !! Data type of the tensor
     integer(c_int), intent(in)     :: device     !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in)     :: device_number     !! Device number to use for `torch_kCUDA` case
     type(torch_tensor)             :: tensor     !! Returned tensor
 
     interface
-      function torch_zeros_c(ndims, tensor_shape, dtype, device) result(tensor) &
+      function torch_zeros_c(ndims, tensor_shape, dtype, device, device_number) result(tensor) &
           bind(c, name = 'torch_zeros')
         use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_ptr
         integer(c_int), value, intent(in) :: ndims
         integer(c_int64_t), intent(in)    :: tensor_shape(*)
         integer(c_int), value, intent(in) :: dtype
         integer(c_int), value, intent(in) :: device
+        integer(c_int), value, intent(in) :: device_number
         type(c_ptr)                       :: tensor
       end function torch_zeros_c
     end interface
 
-    tensor%p = torch_zeros_c(ndims, tensor_shape, dtype, device)
+    tensor%p = torch_zeros_c(ndims, tensor_shape, dtype, device, device_number)
   end function torch_tensor_zeros
 
   !> Returns a tensor filled with the scalar value 1.

From 01b80634f338203ae6754c824fcbb876b0d6c1ee Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Tue, 19 Mar 2024 13:16:48 +0000
Subject: [PATCH 07/42] Use device number for torch_ones

---
 src/ctorch.cpp  | 3 +--
 src/ctorch.h    | 4 +++-
 src/ftorch.f90  | 8 +++++---
 src/ftorch.fypp | 8 +++++---
 4 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/src/ctorch.cpp b/src/ctorch.cpp
index d7ca7b28..067fd3e3 100644
--- a/src/ctorch.cpp
+++ b/src/ctorch.cpp
@@ -82,10 +82,9 @@ torch_tensor_t torch_zeros(int ndim, const int64_t* shape, torch_data_t dtype,
 }
 
 torch_tensor_t torch_ones(int ndim, const int64_t* shape, torch_data_t dtype,
-                          torch_device_t device)
+                          torch_device_t device, int device_number)
 {
   torch::Tensor* tensor = nullptr;
-  int device_number = 0;
   try {
     // This doesn't throw if shape and dimensions are incompatible
     c10::IntArrayRef vshape(shape, ndim);
diff --git a/src/ctorch.h b/src/ctorch.h
index 85023dcc..dceb04a0 100644
--- a/src/ctorch.h
+++ b/src/ctorch.h
@@ -50,9 +50,11 @@ EXPORT_C torch_tensor_t torch_zeros(int ndim, const int64_t* shape,
  * @param shape of the Tensor
  * @param data type of the elements of the Tensor
  * @param device used (cpu, CUDA, etc.)
+ * @param device number for the CUDA case
  */
 EXPORT_C torch_tensor_t torch_ones(int ndim, const int64_t* shape,
-                                   torch_data_t dtype, torch_device_t device);
+                                   torch_data_t dtype, torch_device_t device,
+                                   int device_number);
 
 /**
  * Function to generate an empty Torch Tensor
diff --git a/src/ftorch.f90 b/src/ftorch.f90
index d4563d1b..b837c669 100644
--- a/src/ftorch.f90
+++ b/src/ftorch.f90
@@ -119,27 +119,29 @@ end function torch_zeros_c
   end function torch_tensor_zeros
 
   !> Returns a tensor filled with the scalar value 1.
-  function torch_tensor_ones(ndims, tensor_shape, dtype, device) result(tensor)
+  function torch_tensor_ones(ndims, tensor_shape, dtype, device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t
     integer(c_int), intent(in)     :: ndims      !! Number of dimensions of the tensor
     integer(c_int64_t), intent(in) :: tensor_shape(*)   !! Shape of the tensor
     integer(c_int), intent(in)     :: dtype      !! Data type of the tensor
     integer(c_int), intent(in)     :: device     !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in)     :: device_number     !! Device number to use for `torch_kCUDA` case
     type(torch_tensor)             :: tensor     !! Returned tensor
 
     interface
-      function torch_ones_c(ndims, tensor_shape, dtype, device) result(tensor) &
+      function torch_ones_c(ndims, tensor_shape, dtype, device, device_number) result(tensor) &
           bind(c, name = 'torch_ones')
         use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_ptr
         integer(c_int), value, intent(in) :: ndims
         integer(c_int64_t), intent(in)    :: tensor_shape(*)
         integer(c_int), value, intent(in) :: dtype
         integer(c_int), value, intent(in) :: device
+        integer(c_int), value, intent(in) :: device_number
         type(c_ptr)                       :: tensor
       end function torch_ones_c
     end interface
 
-    tensor%p = torch_ones_c(ndims, tensor_shape, dtype, device)
+    tensor%p = torch_ones_c(ndims, tensor_shape, dtype, device, device_number)
   end function torch_tensor_ones
 
   ! Torch Tensor API
diff --git a/src/ftorch.fypp b/src/ftorch.fypp
index b5bf2c87..4f5813d6 100644
--- a/src/ftorch.fypp
+++ b/src/ftorch.fypp
@@ -117,27 +117,29 @@ contains
   end function torch_tensor_zeros
 
   !> Returns a tensor filled with the scalar value 1.
-  function torch_tensor_ones(ndims, tensor_shape, dtype, device) result(tensor)
+  function torch_tensor_ones(ndims, tensor_shape, dtype, device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t
     integer(c_int), intent(in)     :: ndims      !! Number of dimensions of the tensor
     integer(c_int64_t), intent(in) :: tensor_shape(*)   !! Shape of the tensor
     integer(c_int), intent(in)     :: dtype      !! Data type of the tensor
     integer(c_int), intent(in)     :: device     !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in)     :: device_number     !! Device number to use for `torch_kCUDA` case
     type(torch_tensor)             :: tensor     !! Returned tensor
 
     interface
-      function torch_ones_c(ndims, tensor_shape, dtype, device) result(tensor) &
+      function torch_ones_c(ndims, tensor_shape, dtype, device, device_number) result(tensor) &
           bind(c, name = 'torch_ones')
         use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_ptr
         integer(c_int), value, intent(in) :: ndims
         integer(c_int64_t), intent(in)    :: tensor_shape(*)
         integer(c_int), value, intent(in) :: dtype
         integer(c_int), value, intent(in) :: device
+        integer(c_int), value, intent(in) :: device_number
         type(c_ptr)                       :: tensor
       end function torch_ones_c
     end interface
 
-    tensor%p = torch_ones_c(ndims, tensor_shape, dtype, device)
+    tensor%p = torch_ones_c(ndims, tensor_shape, dtype, device, device_number)
   end function torch_tensor_ones
 
   ! Torch Tensor API

From 530fa19022927384931108749238134d2b9f6b47 Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Tue, 19 Mar 2024 13:18:34 +0000
Subject: [PATCH 08/42] Use device number for torch_empty

---
 src/ctorch.cpp | 3 +--
 src/ctorch.h   | 4 +++-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/ctorch.cpp b/src/ctorch.cpp
index 067fd3e3..69239dc0 100644
--- a/src/ctorch.cpp
+++ b/src/ctorch.cpp
@@ -104,10 +104,9 @@ torch_tensor_t torch_ones(int ndim, const int64_t* shape, torch_data_t dtype,
 }
 
 torch_tensor_t torch_empty(int ndim, const int64_t* shape, torch_data_t dtype,
-                           torch_device_t device)
+                           torch_device_t device, int device_number)
 {
   torch::Tensor* tensor = nullptr;
-  int device_number = 0;
   try {
     // This doesn't throw if shape and dimensions are incompatible
     c10::IntArrayRef vshape(shape, ndim);
diff --git a/src/ctorch.h b/src/ctorch.h
index dceb04a0..fea851c4 100644
--- a/src/ctorch.h
+++ b/src/ctorch.h
@@ -62,9 +62,11 @@ EXPORT_C torch_tensor_t torch_ones(int ndim, const int64_t* shape,
  * @param shape of the Tensor
  * @param data type of the elements of the Tensor
  * @param device used (cpu, CUDA, etc.)
+ * @param device number for the CUDA case
  */
 EXPORT_C torch_tensor_t torch_empty(int ndim, const int64_t* shape,
-                                    torch_data_t dtype, torch_device_t device);
+                                    torch_data_t dtype, torch_device_t device,
+                                    int device_number);
 
 /**
  * Function to create a Torch Tensor from memory location given extra information

From af7a8af7a63d510ec4dd5f7e63f2fba54f4da946 Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Tue, 19 Mar 2024 13:25:18 +0000
Subject: [PATCH 09/42] Use device number for torch_from_blob

---
 .../1_SimpleNet/simplenet_infer_fortran.f90   |   4 +-
 examples/2_ResNet18/resnet_infer_fortran.f90  |   4 +-
 examples/n_c_and_cpp/resnet_infer_c.c         |   4 +-
 src/ctorch.cpp                                |   3 +-
 src/ctorch.h                                  |   4 +-
 src/ftorch.f90                                | 128 +++++++++++-------
 src/ftorch.fypp                               |  13 +-
 7 files changed, 95 insertions(+), 65 deletions(-)

diff --git a/examples/1_SimpleNet/simplenet_infer_fortran.f90 b/examples/1_SimpleNet/simplenet_infer_fortran.f90
index 199b984c..47df1675 100644
--- a/examples/1_SimpleNet/simplenet_infer_fortran.f90
+++ b/examples/1_SimpleNet/simplenet_infer_fortran.f90
@@ -36,8 +36,8 @@ program inference
    in_data = [0.0, 1.0, 2.0, 3.0, 4.0]
 
    ! Create Torch input/output tensors from the above arrays
-   in_tensor(1) = torch_tensor_from_array(in_data, tensor_layout, torch_kCPU)
-   out_tensor = torch_tensor_from_array(out_data, tensor_layout, torch_kCPU)
+   in_tensor(1) = torch_tensor_from_array(in_data, tensor_layout, torch_kCPU, 0)
+   out_tensor = torch_tensor_from_array(out_data, tensor_layout, torch_kCPU, 0)
 
    ! Load ML model
    model = torch_module_load(args(1))
diff --git a/examples/2_ResNet18/resnet_infer_fortran.f90 b/examples/2_ResNet18/resnet_infer_fortran.f90
index 1af256af..a3568aca 100644
--- a/examples/2_ResNet18/resnet_infer_fortran.f90
+++ b/examples/2_ResNet18/resnet_infer_fortran.f90
@@ -66,9 +66,9 @@ subroutine main()
       call load_data(filename, tensor_length, in_data)
 
       ! Create input/output tensors from the above arrays
-      in_tensor(1) = torch_tensor_from_array(in_data, in_layout, torch_kCPU)
+      in_tensor(1) = torch_tensor_from_array(in_data, in_layout, torch_kCPU, 0)
 
-      out_tensor = torch_tensor_from_array(out_data, out_layout, torch_kCPU)
+      out_tensor = torch_tensor_from_array(out_data, out_layout, torch_kCPU, 0)
 
       ! Load ML model (edit this line to use different models)
       model = torch_module_load(args(1))
diff --git a/examples/n_c_and_cpp/resnet_infer_c.c b/examples/n_c_and_cpp/resnet_infer_c.c
index 852f831b..63122e28 100644
--- a/examples/n_c_and_cpp/resnet_infer_c.c
+++ b/examples/n_c_and_cpp/resnet_infer_c.c
@@ -41,10 +41,10 @@ int main(int argc, const char* argv[])
   if (model) {
     torch_tensor_t inputs[1];
     inputs[0] = torch_from_blob(input_data, input_ndim, input_shape,
-                                           torch_kFloat32, torch_kCPU);
+                                           torch_kFloat32, torch_kCPU, 0);
     const int nin = 1;
     torch_tensor_t output = torch_from_blob(
-        output_data, output_ndim, output_shape, torch_kFloat32, torch_kCPU);
+        output_data, output_ndim, output_shape, torch_kFloat32, torch_kCPU, 0);
     torch_jit_module_forward(model, inputs, nin, output);
     torch_tensor_print(output);
     torch_jit_module_delete(model);
diff --git a/src/ctorch.cpp b/src/ctorch.cpp
index 69239dc0..3367db8f 100644
--- a/src/ctorch.cpp
+++ b/src/ctorch.cpp
@@ -129,10 +129,9 @@ torch_tensor_t torch_empty(int ndim, const int64_t* shape, torch_data_t dtype,
 // data
 torch_tensor_t torch_from_blob(void* data, int ndim, const int64_t* shape,
                                const int64_t* strides, torch_data_t dtype,
-                               torch_device_t device)
+                               torch_device_t device, int device_number)
 {
   torch::Tensor* tensor = nullptr;
-  int device_number = 0;
 
   try {
     // This doesn't throw if shape and dimensions are incompatible
diff --git a/src/ctorch.h b/src/ctorch.h
index fea851c4..34d06ec9 100644
--- a/src/ctorch.h
+++ b/src/ctorch.h
@@ -76,13 +76,15 @@ EXPORT_C torch_tensor_t torch_empty(int ndim, const int64_t* shape,
  * @param strides to take through data
  * @param data type of the elements of the Tensor
  * @param device used (cpu, CUDA, etc.)
+ * @param device number for the CUDA case
  * @return Torch Tensor interpretation of the data pointed at
  */
 EXPORT_C torch_tensor_t torch_from_blob(void* data, int ndim,
                                         const int64_t* shape,
                                         const int64_t* strides,
                                         torch_data_t dtype,
-                                        torch_device_t device);
+                                        torch_device_t device,
+                                        int device_number);
 
 /**
  * Function to print out a Torch Tensor
diff --git a/src/ftorch.f90 b/src/ftorch.f90
index b837c669..4a343b83 100644
--- a/src/ftorch.f90
+++ b/src/ftorch.f90
@@ -75,7 +75,7 @@ module ftorch
   end interface
 
   interface
-    function torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device) result(tensor_p) &
+    function torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device, device_number) result(tensor_p) &
                                bind(c, name = 'torch_from_blob')
       use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_ptr
 
@@ -86,6 +86,7 @@ function torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device) re
       integer(c_int64_t), intent(in)    :: strides(*)
       integer(c_int), value, intent(in) :: dtype
       integer(c_int), value, intent(in) :: device
+      integer(c_int), value, intent(in) :: device_number
       type(c_ptr)                       :: tensor_p
     end function torch_from_blob_c
   end interface
@@ -147,13 +148,14 @@ end function torch_tensor_ones
   ! Torch Tensor API
   !| Exposes the given data as a tensor without taking ownership of the original data.
   !  This routine will take an (i, j, k) array and return an (k, j, i) tensor.
-  function torch_tensor_from_blob(data, ndims, tensor_shape, layout, dtype, device) result(tensor)
+  function torch_tensor_from_blob(data, ndims, tensor_shape, layout, dtype, device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_ptr
     type(c_ptr), intent(in)        :: data       !! Pointer to data
     integer(c_int), intent(in)     :: ndims      !! Number of dimensions of the tensor
     integer(c_int64_t), intent(in) :: tensor_shape(*)   !! Shape of the tensor
     integer(c_int), intent(in)     :: dtype      !! Data type of the tensor
     integer(c_int), intent(in)     :: device     !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in)     :: device_number     !! Device number to use for `torch_kCUDA` case
     integer(c_int), intent(in)     :: layout(*)  !! Layout for strides for accessing data
     type(torch_tensor)             :: tensor     !! Returned tensor
 
@@ -164,7 +166,7 @@ function torch_tensor_from_blob(data, ndims, tensor_shape, layout, dtype, device
     do i = 2, ndims
       strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1))
     end do
-    tensor%p = torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device)
+    tensor%p = torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device, device_number)
   end function torch_tensor_from_blob
 
   !> Prints the contents of a tensor.
@@ -264,7 +266,7 @@ end subroutine torch_jit_module_delete_c
   end subroutine torch_module_delete
 
   !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `int8`
-  function torch_tensor_from_array_int8_1d(data_in, layout, c_device) result(tensor)
+  function torch_tensor_from_array_int8_1d(data_in, layout, c_device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int8
 
@@ -272,6 +274,7 @@ function torch_tensor_from_array_int8_1d(data_in, layout, c_device) result(tenso
     integer(kind=int8), intent(in), target :: data_in(:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(1) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -290,12 +293,12 @@ function torch_tensor_from_array_int8_1d(data_in, layout, c_device) result(tenso
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
 
   end function torch_tensor_from_array_int8_1d
 
   !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `int8`
-  function torch_tensor_from_array_int8_2d(data_in, layout, c_device) result(tensor)
+  function torch_tensor_from_array_int8_2d(data_in, layout, c_device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int8
 
@@ -303,6 +306,7 @@ function torch_tensor_from_array_int8_2d(data_in, layout, c_device) result(tenso
     integer(kind=int8), intent(in), target :: data_in(:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(2) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -321,12 +325,12 @@ function torch_tensor_from_array_int8_2d(data_in, layout, c_device) result(tenso
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
 
   end function torch_tensor_from_array_int8_2d
 
   !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `int8`
-  function torch_tensor_from_array_int8_3d(data_in, layout, c_device) result(tensor)
+  function torch_tensor_from_array_int8_3d(data_in, layout, c_device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int8
 
@@ -334,6 +338,7 @@ function torch_tensor_from_array_int8_3d(data_in, layout, c_device) result(tenso
     integer(kind=int8), intent(in), target :: data_in(:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(3) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -352,12 +357,12 @@ function torch_tensor_from_array_int8_3d(data_in, layout, c_device) result(tenso
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
 
   end function torch_tensor_from_array_int8_3d
 
   !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `int8`
-  function torch_tensor_from_array_int8_4d(data_in, layout, c_device) result(tensor)
+  function torch_tensor_from_array_int8_4d(data_in, layout, c_device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int8
 
@@ -365,6 +370,7 @@ function torch_tensor_from_array_int8_4d(data_in, layout, c_device) result(tenso
     integer(kind=int8), intent(in), target :: data_in(:,:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(4) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -383,12 +389,12 @@ function torch_tensor_from_array_int8_4d(data_in, layout, c_device) result(tenso
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
 
   end function torch_tensor_from_array_int8_4d
 
   !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `int16`
-  function torch_tensor_from_array_int16_1d(data_in, layout, c_device) result(tensor)
+  function torch_tensor_from_array_int16_1d(data_in, layout, c_device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int16
 
@@ -396,6 +402,7 @@ function torch_tensor_from_array_int16_1d(data_in, layout, c_device) result(tens
     integer(kind=int16), intent(in), target :: data_in(:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(1) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -414,12 +421,12 @@ function torch_tensor_from_array_int16_1d(data_in, layout, c_device) result(tens
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
 
   end function torch_tensor_from_array_int16_1d
 
   !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `int16`
-  function torch_tensor_from_array_int16_2d(data_in, layout, c_device) result(tensor)
+  function torch_tensor_from_array_int16_2d(data_in, layout, c_device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int16
 
@@ -427,6 +434,7 @@ function torch_tensor_from_array_int16_2d(data_in, layout, c_device) result(tens
     integer(kind=int16), intent(in), target :: data_in(:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(2) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -445,12 +453,12 @@ function torch_tensor_from_array_int16_2d(data_in, layout, c_device) result(tens
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
 
   end function torch_tensor_from_array_int16_2d
 
   !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `int16`
-  function torch_tensor_from_array_int16_3d(data_in, layout, c_device) result(tensor)
+  function torch_tensor_from_array_int16_3d(data_in, layout, c_device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int16
 
@@ -458,6 +466,7 @@ function torch_tensor_from_array_int16_3d(data_in, layout, c_device) result(tens
     integer(kind=int16), intent(in), target :: data_in(:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(3) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -476,12 +485,12 @@ function torch_tensor_from_array_int16_3d(data_in, layout, c_device) result(tens
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
 
   end function torch_tensor_from_array_int16_3d
 
   !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `int16`
-  function torch_tensor_from_array_int16_4d(data_in, layout, c_device) result(tensor)
+  function torch_tensor_from_array_int16_4d(data_in, layout, c_device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int16
 
@@ -489,6 +498,7 @@ function torch_tensor_from_array_int16_4d(data_in, layout, c_device) result(tens
     integer(kind=int16), intent(in), target :: data_in(:,:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(4) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -507,12 +517,12 @@ function torch_tensor_from_array_int16_4d(data_in, layout, c_device) result(tens
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
 
   end function torch_tensor_from_array_int16_4d
 
   !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `int32`
-  function torch_tensor_from_array_int32_1d(data_in, layout, c_device) result(tensor)
+  function torch_tensor_from_array_int32_1d(data_in, layout, c_device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int32
 
@@ -520,6 +530,7 @@ function torch_tensor_from_array_int32_1d(data_in, layout, c_device) result(tens
     integer(kind=int32), intent(in), target :: data_in(:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(1) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -538,12 +549,12 @@ function torch_tensor_from_array_int32_1d(data_in, layout, c_device) result(tens
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
 
   end function torch_tensor_from_array_int32_1d
 
   !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `int32`
-  function torch_tensor_from_array_int32_2d(data_in, layout, c_device) result(tensor)
+  function torch_tensor_from_array_int32_2d(data_in, layout, c_device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int32
 
@@ -551,6 +562,7 @@ function torch_tensor_from_array_int32_2d(data_in, layout, c_device) result(tens
     integer(kind=int32), intent(in), target :: data_in(:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(2) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -569,12 +581,12 @@ function torch_tensor_from_array_int32_2d(data_in, layout, c_device) result(tens
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
 
   end function torch_tensor_from_array_int32_2d
 
   !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `int32`
-  function torch_tensor_from_array_int32_3d(data_in, layout, c_device) result(tensor)
+  function torch_tensor_from_array_int32_3d(data_in, layout, c_device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int32
 
@@ -582,6 +594,7 @@ function torch_tensor_from_array_int32_3d(data_in, layout, c_device) result(tens
     integer(kind=int32), intent(in), target :: data_in(:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(3) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -600,12 +613,12 @@ function torch_tensor_from_array_int32_3d(data_in, layout, c_device) result(tens
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
 
   end function torch_tensor_from_array_int32_3d
 
   !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `int32`
-  function torch_tensor_from_array_int32_4d(data_in, layout, c_device) result(tensor)
+  function torch_tensor_from_array_int32_4d(data_in, layout, c_device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int32
 
@@ -613,6 +626,7 @@ function torch_tensor_from_array_int32_4d(data_in, layout, c_device) result(tens
     integer(kind=int32), intent(in), target :: data_in(:,:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(4) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -631,12 +645,12 @@ function torch_tensor_from_array_int32_4d(data_in, layout, c_device) result(tens
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
 
   end function torch_tensor_from_array_int32_4d
 
   !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `int64`
-  function torch_tensor_from_array_int64_1d(data_in, layout, c_device) result(tensor)
+  function torch_tensor_from_array_int64_1d(data_in, layout, c_device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int64
 
@@ -644,6 +658,7 @@ function torch_tensor_from_array_int64_1d(data_in, layout, c_device) result(tens
     integer(kind=int64), intent(in), target :: data_in(:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(1) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -662,12 +677,12 @@ function torch_tensor_from_array_int64_1d(data_in, layout, c_device) result(tens
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
 
   end function torch_tensor_from_array_int64_1d
 
   !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `int64`
-  function torch_tensor_from_array_int64_2d(data_in, layout, c_device) result(tensor)
+  function torch_tensor_from_array_int64_2d(data_in, layout, c_device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int64
 
@@ -675,6 +690,7 @@ function torch_tensor_from_array_int64_2d(data_in, layout, c_device) result(tens
     integer(kind=int64), intent(in), target :: data_in(:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(2) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -693,12 +709,12 @@ function torch_tensor_from_array_int64_2d(data_in, layout, c_device) result(tens
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
 
   end function torch_tensor_from_array_int64_2d
 
   !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `int64`
-  function torch_tensor_from_array_int64_3d(data_in, layout, c_device) result(tensor)
+  function torch_tensor_from_array_int64_3d(data_in, layout, c_device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int64
 
@@ -706,6 +722,7 @@ function torch_tensor_from_array_int64_3d(data_in, layout, c_device) result(tens
     integer(kind=int64), intent(in), target :: data_in(:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(3) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -724,12 +741,12 @@ function torch_tensor_from_array_int64_3d(data_in, layout, c_device) result(tens
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
 
   end function torch_tensor_from_array_int64_3d
 
   !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `int64`
-  function torch_tensor_from_array_int64_4d(data_in, layout, c_device) result(tensor)
+  function torch_tensor_from_array_int64_4d(data_in, layout, c_device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int64
 
@@ -737,6 +754,7 @@ function torch_tensor_from_array_int64_4d(data_in, layout, c_device) result(tens
     integer(kind=int64), intent(in), target :: data_in(:,:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(4) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -755,12 +773,12 @@ function torch_tensor_from_array_int64_4d(data_in, layout, c_device) result(tens
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
 
   end function torch_tensor_from_array_int64_4d
 
   !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `real32`
-  function torch_tensor_from_array_real32_1d(data_in, layout, c_device) result(tensor)
+  function torch_tensor_from_array_real32_1d(data_in, layout, c_device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : real32
 
@@ -768,6 +786,7 @@ function torch_tensor_from_array_real32_1d(data_in, layout, c_device) result(ten
     real(kind=real32), intent(in), target :: data_in(:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(1) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -786,12 +805,12 @@ function torch_tensor_from_array_real32_1d(data_in, layout, c_device) result(ten
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
 
   end function torch_tensor_from_array_real32_1d
 
   !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `real32`
-  function torch_tensor_from_array_real32_2d(data_in, layout, c_device) result(tensor)
+  function torch_tensor_from_array_real32_2d(data_in, layout, c_device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : real32
 
@@ -799,6 +818,7 @@ function torch_tensor_from_array_real32_2d(data_in, layout, c_device) result(ten
     real(kind=real32), intent(in), target :: data_in(:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(2) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -817,12 +837,12 @@ function torch_tensor_from_array_real32_2d(data_in, layout, c_device) result(ten
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
 
   end function torch_tensor_from_array_real32_2d
 
   !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `real32`
-  function torch_tensor_from_array_real32_3d(data_in, layout, c_device) result(tensor)
+  function torch_tensor_from_array_real32_3d(data_in, layout, c_device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : real32
 
@@ -830,6 +850,7 @@ function torch_tensor_from_array_real32_3d(data_in, layout, c_device) result(ten
     real(kind=real32), intent(in), target :: data_in(:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(3) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -848,12 +869,12 @@ function torch_tensor_from_array_real32_3d(data_in, layout, c_device) result(ten
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
 
   end function torch_tensor_from_array_real32_3d
 
   !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `real32`
-  function torch_tensor_from_array_real32_4d(data_in, layout, c_device) result(tensor)
+  function torch_tensor_from_array_real32_4d(data_in, layout, c_device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : real32
 
@@ -861,6 +882,7 @@ function torch_tensor_from_array_real32_4d(data_in, layout, c_device) result(ten
     real(kind=real32), intent(in), target :: data_in(:,:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(4) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -879,12 +901,12 @@ function torch_tensor_from_array_real32_4d(data_in, layout, c_device) result(ten
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
 
   end function torch_tensor_from_array_real32_4d
 
   !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `real64`
-  function torch_tensor_from_array_real64_1d(data_in, layout, c_device) result(tensor)
+  function torch_tensor_from_array_real64_1d(data_in, layout, c_device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : real64
 
@@ -892,6 +914,7 @@ function torch_tensor_from_array_real64_1d(data_in, layout, c_device) result(ten
     real(kind=real64), intent(in), target :: data_in(:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(1) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -910,12 +933,12 @@ function torch_tensor_from_array_real64_1d(data_in, layout, c_device) result(ten
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
 
   end function torch_tensor_from_array_real64_1d
 
   !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `real64`
-  function torch_tensor_from_array_real64_2d(data_in, layout, c_device) result(tensor)
+  function torch_tensor_from_array_real64_2d(data_in, layout, c_device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : real64
 
@@ -923,6 +946,7 @@ function torch_tensor_from_array_real64_2d(data_in, layout, c_device) result(ten
     real(kind=real64), intent(in), target :: data_in(:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(2) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -941,12 +965,12 @@ function torch_tensor_from_array_real64_2d(data_in, layout, c_device) result(ten
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
 
   end function torch_tensor_from_array_real64_2d
 
   !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `real64`
-  function torch_tensor_from_array_real64_3d(data_in, layout, c_device) result(tensor)
+  function torch_tensor_from_array_real64_3d(data_in, layout, c_device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : real64
 
@@ -954,6 +978,7 @@ function torch_tensor_from_array_real64_3d(data_in, layout, c_device) result(ten
     real(kind=real64), intent(in), target :: data_in(:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(3) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -972,12 +997,12 @@ function torch_tensor_from_array_real64_3d(data_in, layout, c_device) result(ten
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
 
   end function torch_tensor_from_array_real64_3d
 
   !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `real64`
-  function torch_tensor_from_array_real64_4d(data_in, layout, c_device) result(tensor)
+  function torch_tensor_from_array_real64_4d(data_in, layout, c_device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : real64
 
@@ -985,6 +1010,7 @@ function torch_tensor_from_array_real64_4d(data_in, layout, c_device) result(ten
     real(kind=real64), intent(in), target :: data_in(:,:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(4) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -1003,7 +1029,7 @@ function torch_tensor_from_array_real64_4d(data_in, layout, c_device) result(ten
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
 
   end function torch_tensor_from_array_real64_4d
 
diff --git a/src/ftorch.fypp b/src/ftorch.fypp
index 4f5813d6..de743681 100644
--- a/src/ftorch.fypp
+++ b/src/ftorch.fypp
@@ -73,7 +73,7 @@ module ftorch
   end interface
 
   interface
-    function torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device) result(tensor_p) &
+    function torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device, device_number) result(tensor_p) &
                                bind(c, name = 'torch_from_blob')
       use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_ptr
 
@@ -84,6 +84,7 @@ module ftorch
       integer(c_int64_t), intent(in)    :: strides(*)
       integer(c_int), value, intent(in) :: dtype
       integer(c_int), value, intent(in) :: device
+      integer(c_int), value, intent(in) :: device_number
       type(c_ptr)                       :: tensor_p
     end function torch_from_blob_c
   end interface
@@ -145,13 +146,14 @@ contains
   ! Torch Tensor API
   !| Exposes the given data as a tensor without taking ownership of the original data.
   !  This routine will take an (i, j, k) array and return an (k, j, i) tensor.
-  function torch_tensor_from_blob(data, ndims, tensor_shape, layout, dtype, device) result(tensor)
+  function torch_tensor_from_blob(data, ndims, tensor_shape, layout, dtype, device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_ptr
     type(c_ptr), intent(in)        :: data       !! Pointer to data
     integer(c_int), intent(in)     :: ndims      !! Number of dimensions of the tensor
     integer(c_int64_t), intent(in) :: tensor_shape(*)   !! Shape of the tensor
     integer(c_int), intent(in)     :: dtype      !! Data type of the tensor
     integer(c_int), intent(in)     :: device     !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in)     :: device_number     !! Device number to use for `torch_kCUDA` case
     integer(c_int), intent(in)     :: layout(*)  !! Layout for strides for accessing data
     type(torch_tensor)             :: tensor     !! Returned tensor
 
@@ -162,7 +164,7 @@ contains
     do i = 2, ndims
       strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1))
     end do
-    tensor%p = torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device)
+    tensor%p = torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device, device_number)
   end function torch_tensor_from_blob
 
   !> Prints the contents of a tensor.
@@ -264,7 +266,7 @@ contains
   #:for PREC in PRECISIONS
   #:for RANK in RANKS
   !> Return a Torch tensor pointing to data_in array of rank ${RANK}$ containing data of type `${PREC}$`
-  function torch_tensor_from_array_${PREC}$_${RANK}$d(data_in, layout, c_device) result(tensor)
+  function torch_tensor_from_array_${PREC}$_${RANK}$d(data_in, layout, c_device, device_number) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : ${PREC}$
 
@@ -272,6 +274,7 @@ contains
     ${f_type(PREC)}$(kind=${PREC}$), intent(in), target :: data_in${ranksuffix(RANK)}$   !! Input data that tensor will point at
     integer, intent(in)        :: layout(${RANK}$) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -290,7 +293,7 @@ contains
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
 
   end function torch_tensor_from_array_${PREC}$_${RANK}$d
 

From e2fe07021b5299bb72364596a2a9c5b969657bc5 Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Tue, 19 Mar 2024 13:51:39 +0000
Subject: [PATCH 10/42] Device and device number args for torch_module_load

---
 .../1_SimpleNet/simplenet_infer_fortran.f90    |  2 +-
 examples/2_ResNet18/resnet_infer_fortran.f90   |  2 +-
 examples/n_c_and_cpp/resnet_infer_c.c          |  2 +-
 src/ctorch.cpp                                 |  6 ++++--
 src/ctorch.h                                   |  6 +++++-
 src/ftorch.f90                                 | 18 +++++++++++-------
 src/ftorch.fypp                                | 18 +++++++++++-------
 7 files changed, 34 insertions(+), 20 deletions(-)

diff --git a/examples/1_SimpleNet/simplenet_infer_fortran.f90 b/examples/1_SimpleNet/simplenet_infer_fortran.f90
index 47df1675..4fea9563 100644
--- a/examples/1_SimpleNet/simplenet_infer_fortran.f90
+++ b/examples/1_SimpleNet/simplenet_infer_fortran.f90
@@ -40,7 +40,7 @@ program inference
    out_tensor = torch_tensor_from_array(out_data, tensor_layout, torch_kCPU, 0)
 
    ! Load ML model
-   model = torch_module_load(args(1))
+   model = torch_module_load(args(1), torch_kCPU, 0)
 
    ! Infer
    call torch_module_forward(model, in_tensor, n_inputs, out_tensor)
diff --git a/examples/2_ResNet18/resnet_infer_fortran.f90 b/examples/2_ResNet18/resnet_infer_fortran.f90
index a3568aca..83ee4c15 100644
--- a/examples/2_ResNet18/resnet_infer_fortran.f90
+++ b/examples/2_ResNet18/resnet_infer_fortran.f90
@@ -71,7 +71,7 @@ subroutine main()
       out_tensor = torch_tensor_from_array(out_data, out_layout, torch_kCPU, 0)
 
       ! Load ML model (edit this line to use different models)
-      model = torch_module_load(args(1))
+      model = torch_module_load(args(1), torch_kCPU, 0)
 
       ! Infer
       call torch_module_forward(model, in_tensor, n_inputs, out_tensor)
diff --git a/examples/n_c_and_cpp/resnet_infer_c.c b/examples/n_c_and_cpp/resnet_infer_c.c
index 63122e28..4b60da56 100644
--- a/examples/n_c_and_cpp/resnet_infer_c.c
+++ b/examples/n_c_and_cpp/resnet_infer_c.c
@@ -28,7 +28,7 @@ int main(int argc, const char* argv[])
   output_shape[0] = batch_size;
   output_shape[1] = 1000;
 
-  torch_jit_script_module_t model = torch_jit_load(argv[1]);
+  torch_jit_script_module_t model = torch_jit_load(argv[1], torch_kCPU, 0);
   int64_t input_size
       = input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3];
   float* input_data = (float*)malloc(input_size * sizeof(float));
diff --git a/src/ctorch.cpp b/src/ctorch.cpp
index 3367db8f..f9399459 100644
--- a/src/ctorch.cpp
+++ b/src/ctorch.cpp
@@ -166,12 +166,14 @@ void torch_tensor_delete(torch_tensor_t tensor)
   delete t;
 }
 
-torch_jit_script_module_t torch_jit_load(const char* filename)
+torch_jit_script_module_t torch_jit_load(const char* filename,
+                                         const torch_device_t device,
+                                         const int device_number)
 {
   torch::jit::script::Module* module = nullptr;
   try {
     module = new torch::jit::script::Module;
-    *module = torch::jit::load(filename);
+    *module = torch::jit::load(filename, get_device(device, device_number));
   } catch (const torch::Error& e) {
     std::cerr << "[ERROR]: " << e.msg() << std::endl;
     delete module;
diff --git a/src/ctorch.h b/src/ctorch.h
index 34d06ec9..3254dbce 100644
--- a/src/ctorch.h
+++ b/src/ctorch.h
@@ -105,9 +105,13 @@ EXPORT_C void torch_tensor_delete(torch_tensor_t tensor);
 /**
  * Function to load in a Torch model from a TorchScript file and store in a Torch Module
  * @param filename where TorchScript description of model is stored
+ * @param device used (cpu, CUDA, etc.)
+ * @param device_number device number for the CUDA case
  * @return Torch Module loaded in from file
  */
-EXPORT_C torch_jit_script_module_t torch_jit_load(const char* filename);
+EXPORT_C torch_jit_script_module_t torch_jit_load(const char* filename,
+                                                  torch_device_t device,
+                                                  int device_number);
 
 /**
  * Function to run the `forward` method of a Torch Module
diff --git a/src/ftorch.f90 b/src/ftorch.f90
index 4a343b83..00c8e40a 100644
--- a/src/ftorch.f90
+++ b/src/ftorch.f90
@@ -201,22 +201,26 @@ end subroutine torch_tensor_delete
 
   ! Torch Module API
   !> Loads a TorchScript module (pre-trained PyTorch model saved with TorchScript)
-  function torch_module_load(filename) result(module)
-    use, intrinsic :: iso_c_binding, only : c_null_char
-    character(*), intent(in) :: filename !! Filename of TorchScript module
-    type(torch_module)            :: module      !! Returned deserialized module
+  function torch_module_load(filename, device, device_number) result(module)
+    use, intrinsic :: iso_c_binding, only : c_int, c_null_char
+    character(*), intent(in)   :: filename !! Filename of TorchScript module
+    integer(c_int), intent(in) :: device   !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number     !! Device number to use for `torch_kCUDA` case
+    type(torch_module)         :: module   !! Returned deserialized module
 
     interface
-      function torch_jit_load_c(filename) result(module) &
+      function torch_jit_load_c(filename, device, device_number) result(module) &
           bind(c, name = 'torch_jit_load')
-        use, intrinsic :: iso_c_binding, only : c_char, c_ptr
+        use, intrinsic :: iso_c_binding, only : c_char, c_int, c_ptr
         character(c_char), intent(in) :: filename(*)
+        integer(c_int), intent(in)    :: device
+        integer(c_int), intent(in)    :: device_number
         type(c_ptr)                   :: module
       end function torch_jit_load_c
     end interface
 
     ! Need to append c_null_char at end of filename
-    module%p = torch_jit_load_c(trim(adjustl(filename))//c_null_char)
+    module%p = torch_jit_load_c(trim(adjustl(filename))//c_null_char, device, device_number)
   end function torch_module_load
 
   !> Performs a forward pass of the module with the input tensors
diff --git a/src/ftorch.fypp b/src/ftorch.fypp
index de743681..d61b335d 100644
--- a/src/ftorch.fypp
+++ b/src/ftorch.fypp
@@ -199,22 +199,26 @@ contains
 
   ! Torch Module API
   !> Loads a TorchScript module (pre-trained PyTorch model saved with TorchScript)
-  function torch_module_load(filename) result(module)
-    use, intrinsic :: iso_c_binding, only : c_null_char
-    character(*), intent(in) :: filename !! Filename of TorchScript module
-    type(torch_module)            :: module      !! Returned deserialized module
+  function torch_module_load(filename, device, device_number) result(module)
+    use, intrinsic :: iso_c_binding, only : c_int, c_null_char
+    character(*), intent(in)   :: filename !! Filename of TorchScript module
+    integer(c_int), intent(in) :: device   !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: device_number     !! Device number to use for `torch_kCUDA` case
+    type(torch_module)         :: module   !! Returned deserialized module
 
     interface
-      function torch_jit_load_c(filename) result(module) &
+      function torch_jit_load_c(filename, device, device_number) result(module) &
           bind(c, name = 'torch_jit_load')
-        use, intrinsic :: iso_c_binding, only : c_char, c_ptr
+        use, intrinsic :: iso_c_binding, only : c_char, c_int, c_ptr
         character(c_char), intent(in) :: filename(*)
+        integer(c_int), intent(in)    :: device
+        integer(c_int), intent(in)    :: device_number
         type(c_ptr)                   :: module
       end function torch_jit_load_c
     end interface
 
     ! Need to append c_null_char at end of filename
-    module%p = torch_jit_load_c(trim(adjustl(filename))//c_null_char)
+    module%p = torch_jit_load_c(trim(adjustl(filename))//c_null_char, device, device_number)
   end function torch_module_load
 
   !> Performs a forward pass of the module with the input tensors

From fd729a3dc177be5bc5c580971b4f1742ddba32a8 Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Tue, 19 Mar 2024 15:58:37 +0000
Subject: [PATCH 11/42] Pass device and device number to torch_jit_load by
 value

---
 src/ctorch.h    | 4 ++--
 src/ftorch.f90  | 4 ++--
 src/ftorch.fypp | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/ctorch.h b/src/ctorch.h
index 3254dbce..cc28648d 100644
--- a/src/ctorch.h
+++ b/src/ctorch.h
@@ -110,8 +110,8 @@ EXPORT_C void torch_tensor_delete(torch_tensor_t tensor);
  * @return Torch Module loaded in from file
  */
 EXPORT_C torch_jit_script_module_t torch_jit_load(const char* filename,
-                                                  torch_device_t device,
-                                                  int device_number);
+                                                  const torch_device_t device,
+                                                  const int device_number);
 
 /**
  * Function to run the `forward` method of a Torch Module
diff --git a/src/ftorch.f90 b/src/ftorch.f90
index 00c8e40a..e3f3a696 100644
--- a/src/ftorch.f90
+++ b/src/ftorch.f90
@@ -213,8 +213,8 @@ function torch_jit_load_c(filename, device, device_number) result(module) &
           bind(c, name = 'torch_jit_load')
         use, intrinsic :: iso_c_binding, only : c_char, c_int, c_ptr
         character(c_char), intent(in) :: filename(*)
-        integer(c_int), intent(in)    :: device
-        integer(c_int), intent(in)    :: device_number
+        integer(c_int), value, intent(in)    :: device
+        integer(c_int), value, intent(in)    :: device_number
         type(c_ptr)                   :: module
       end function torch_jit_load_c
     end interface
diff --git a/src/ftorch.fypp b/src/ftorch.fypp
index d61b335d..0459f07d 100644
--- a/src/ftorch.fypp
+++ b/src/ftorch.fypp
@@ -211,8 +211,8 @@ contains
           bind(c, name = 'torch_jit_load')
         use, intrinsic :: iso_c_binding, only : c_char, c_int, c_ptr
         character(c_char), intent(in) :: filename(*)
-        integer(c_int), intent(in)    :: device
-        integer(c_int), intent(in)    :: device_number
+        integer(c_int), value, intent(in)    :: device
+        integer(c_int), value, intent(in)    :: device_number
         type(c_ptr)                   :: module
       end function torch_jit_load_c
     end interface

From 3b3e62c3d44493618023888133f06e047fa5b43a Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Tue, 19 Mar 2024 16:15:02 +0000
Subject: [PATCH 12/42] Make device number argument to torch_module_load
 optional

---
 examples/1_SimpleNet/simplenet_infer_fortran.f90 |  2 +-
 examples/2_ResNet18/resnet_infer_fortran.f90     |  2 +-
 src/ftorch.f90                                   | 12 ++++++++++--
 src/ftorch.fypp                                  | 12 ++++++++++--
 4 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/examples/1_SimpleNet/simplenet_infer_fortran.f90 b/examples/1_SimpleNet/simplenet_infer_fortran.f90
index 4fea9563..007a5e84 100644
--- a/examples/1_SimpleNet/simplenet_infer_fortran.f90
+++ b/examples/1_SimpleNet/simplenet_infer_fortran.f90
@@ -40,7 +40,7 @@ program inference
    out_tensor = torch_tensor_from_array(out_data, tensor_layout, torch_kCPU, 0)
 
    ! Load ML model
-   model = torch_module_load(args(1), torch_kCPU, 0)
+   model = torch_module_load(args(1), torch_kCPU)
 
    ! Infer
    call torch_module_forward(model, in_tensor, n_inputs, out_tensor)
diff --git a/examples/2_ResNet18/resnet_infer_fortran.f90 b/examples/2_ResNet18/resnet_infer_fortran.f90
index 83ee4c15..3d81291d 100644
--- a/examples/2_ResNet18/resnet_infer_fortran.f90
+++ b/examples/2_ResNet18/resnet_infer_fortran.f90
@@ -71,7 +71,7 @@ subroutine main()
       out_tensor = torch_tensor_from_array(out_data, out_layout, torch_kCPU, 0)
 
       ! Load ML model (edit this line to use different models)
-      model = torch_module_load(args(1), torch_kCPU, 0)
+      model = torch_module_load(args(1), torch_kCPU)
 
       ! Infer
       call torch_module_forward(model, in_tensor, n_inputs, out_tensor)
diff --git a/src/ftorch.f90 b/src/ftorch.f90
index e3f3a696..80b98b1a 100644
--- a/src/ftorch.f90
+++ b/src/ftorch.f90
@@ -205,8 +205,9 @@ function torch_module_load(filename, device, device_number) result(module)
     use, intrinsic :: iso_c_binding, only : c_int, c_null_char
     character(*), intent(in)   :: filename !! Filename of TorchScript module
     integer(c_int), intent(in) :: device   !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number     !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number     !! Device number to use for `torch_kCUDA` case
     type(torch_module)         :: module   !! Returned deserialized module
+    integer(c_int) :: device_number_value
 
     interface
       function torch_jit_load_c(filename, device, device_number) result(module) &
@@ -219,8 +220,15 @@ function torch_jit_load_c(filename, device, device_number) result(module) &
       end function torch_jit_load_c
     end interface
 
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
     ! Need to append c_null_char at end of filename
-    module%p = torch_jit_load_c(trim(adjustl(filename))//c_null_char, device, device_number)
+    module%p = torch_jit_load_c(trim(adjustl(filename))//c_null_char, device, device_number_value)
   end function torch_module_load
 
   !> Performs a forward pass of the module with the input tensors
diff --git a/src/ftorch.fypp b/src/ftorch.fypp
index 0459f07d..24cd805d 100644
--- a/src/ftorch.fypp
+++ b/src/ftorch.fypp
@@ -203,8 +203,9 @@ contains
     use, intrinsic :: iso_c_binding, only : c_int, c_null_char
     character(*), intent(in)   :: filename !! Filename of TorchScript module
     integer(c_int), intent(in) :: device   !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number     !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number     !! Device number to use for `torch_kCUDA` case
     type(torch_module)         :: module   !! Returned deserialized module
+    integer(c_int) :: device_number_value
 
     interface
       function torch_jit_load_c(filename, device, device_number) result(module) &
@@ -217,8 +218,15 @@ contains
       end function torch_jit_load_c
     end interface
 
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
     ! Need to append c_null_char at end of filename
-    module%p = torch_jit_load_c(trim(adjustl(filename))//c_null_char, device, device_number)
+    module%p = torch_jit_load_c(trim(adjustl(filename))//c_null_char, device, device_number_value)
   end function torch_module_load
 
   !> Performs a forward pass of the module with the input tensors

From 5fe34b0c9493941b553d457cfd515ba052ab8cbc Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Tue, 19 Mar 2024 16:18:49 +0000
Subject: [PATCH 13/42] Make device number argument to torch_tensor_from_array
 optional

---
 .../1_SimpleNet/simplenet_infer_fortran.f90   |   4 +-
 examples/2_ResNet18/resnet_infer_fortran.f90  |   4 +-
 src/ftorch.f90                                | 288 +++++++++++++++---
 src/ftorch.fypp                               |  12 +-
 4 files changed, 254 insertions(+), 54 deletions(-)

diff --git a/examples/1_SimpleNet/simplenet_infer_fortran.f90 b/examples/1_SimpleNet/simplenet_infer_fortran.f90
index 007a5e84..799d4dae 100644
--- a/examples/1_SimpleNet/simplenet_infer_fortran.f90
+++ b/examples/1_SimpleNet/simplenet_infer_fortran.f90
@@ -36,8 +36,8 @@ program inference
    in_data = [0.0, 1.0, 2.0, 3.0, 4.0]
 
    ! Create Torch input/output tensors from the above arrays
-   in_tensor(1) = torch_tensor_from_array(in_data, tensor_layout, torch_kCPU, 0)
-   out_tensor = torch_tensor_from_array(out_data, tensor_layout, torch_kCPU, 0)
+   in_tensor(1) = torch_tensor_from_array(in_data, tensor_layout, torch_kCPU)
+   out_tensor = torch_tensor_from_array(out_data, tensor_layout, torch_kCPU)
 
    ! Load ML model
    model = torch_module_load(args(1), torch_kCPU)
diff --git a/examples/2_ResNet18/resnet_infer_fortran.f90 b/examples/2_ResNet18/resnet_infer_fortran.f90
index 3d81291d..c5285fc6 100644
--- a/examples/2_ResNet18/resnet_infer_fortran.f90
+++ b/examples/2_ResNet18/resnet_infer_fortran.f90
@@ -66,9 +66,9 @@ subroutine main()
       call load_data(filename, tensor_length, in_data)
 
       ! Create input/output tensors from the above arrays
-      in_tensor(1) = torch_tensor_from_array(in_data, in_layout, torch_kCPU, 0)
+      in_tensor(1) = torch_tensor_from_array(in_data, in_layout, torch_kCPU)
 
-      out_tensor = torch_tensor_from_array(out_data, out_layout, torch_kCPU, 0)
+      out_tensor = torch_tensor_from_array(out_data, out_layout, torch_kCPU)
 
       ! Load ML model (edit this line to use different models)
       model = torch_module_load(args(1), torch_kCPU)
diff --git a/src/ftorch.f90 b/src/ftorch.f90
index 80b98b1a..922248b0 100644
--- a/src/ftorch.f90
+++ b/src/ftorch.f90
@@ -286,7 +286,7 @@ function torch_tensor_from_array_int8_1d(data_in, layout, c_device, device_numbe
     integer(kind=int8), intent(in), target :: data_in(:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(1) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -297,6 +297,7 @@ function torch_tensor_from_array_int8_1d(data_in, layout, c_device, device_numbe
     integer(c_int64_t)        :: strides(1)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 1                   !! Number of dimension of input data
     integer                   :: i
+    integer(c_int)            :: device_number_value
 
     c_tensor_shape = shape(data_in)
 
@@ -305,7 +306,14 @@ function torch_tensor_from_array_int8_1d(data_in, layout, c_device, device_numbe
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
 
   end function torch_tensor_from_array_int8_1d
 
@@ -318,7 +326,7 @@ function torch_tensor_from_array_int8_2d(data_in, layout, c_device, device_numbe
     integer(kind=int8), intent(in), target :: data_in(:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(2) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -329,6 +337,7 @@ function torch_tensor_from_array_int8_2d(data_in, layout, c_device, device_numbe
     integer(c_int64_t)        :: strides(2)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 2                   !! Number of dimension of input data
     integer                   :: i
+    integer(c_int)            :: device_number_value
 
     c_tensor_shape = shape(data_in)
 
@@ -337,7 +346,14 @@ function torch_tensor_from_array_int8_2d(data_in, layout, c_device, device_numbe
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
 
   end function torch_tensor_from_array_int8_2d
 
@@ -350,7 +366,7 @@ function torch_tensor_from_array_int8_3d(data_in, layout, c_device, device_numbe
     integer(kind=int8), intent(in), target :: data_in(:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(3) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -361,6 +377,7 @@ function torch_tensor_from_array_int8_3d(data_in, layout, c_device, device_numbe
     integer(c_int64_t)        :: strides(3)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 3                   !! Number of dimension of input data
     integer                   :: i
+    integer(c_int)            :: device_number_value
 
     c_tensor_shape = shape(data_in)
 
@@ -369,7 +386,14 @@ function torch_tensor_from_array_int8_3d(data_in, layout, c_device, device_numbe
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
 
   end function torch_tensor_from_array_int8_3d
 
@@ -382,7 +406,7 @@ function torch_tensor_from_array_int8_4d(data_in, layout, c_device, device_numbe
     integer(kind=int8), intent(in), target :: data_in(:,:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(4) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -393,6 +417,7 @@ function torch_tensor_from_array_int8_4d(data_in, layout, c_device, device_numbe
     integer(c_int64_t)        :: strides(4)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 4                   !! Number of dimension of input data
     integer                   :: i
+    integer(c_int)            :: device_number_value
 
     c_tensor_shape = shape(data_in)
 
@@ -401,7 +426,14 @@ function torch_tensor_from_array_int8_4d(data_in, layout, c_device, device_numbe
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
 
   end function torch_tensor_from_array_int8_4d
 
@@ -414,7 +446,7 @@ function torch_tensor_from_array_int16_1d(data_in, layout, c_device, device_numb
     integer(kind=int16), intent(in), target :: data_in(:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(1) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -425,6 +457,7 @@ function torch_tensor_from_array_int16_1d(data_in, layout, c_device, device_numb
     integer(c_int64_t)        :: strides(1)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 1                   !! Number of dimension of input data
     integer                   :: i
+    integer(c_int)            :: device_number_value
 
     c_tensor_shape = shape(data_in)
 
@@ -433,7 +466,14 @@ function torch_tensor_from_array_int16_1d(data_in, layout, c_device, device_numb
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
 
   end function torch_tensor_from_array_int16_1d
 
@@ -446,7 +486,7 @@ function torch_tensor_from_array_int16_2d(data_in, layout, c_device, device_numb
     integer(kind=int16), intent(in), target :: data_in(:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(2) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -457,6 +497,7 @@ function torch_tensor_from_array_int16_2d(data_in, layout, c_device, device_numb
     integer(c_int64_t)        :: strides(2)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 2                   !! Number of dimension of input data
     integer                   :: i
+    integer(c_int)            :: device_number_value
 
     c_tensor_shape = shape(data_in)
 
@@ -465,7 +506,14 @@ function torch_tensor_from_array_int16_2d(data_in, layout, c_device, device_numb
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
 
   end function torch_tensor_from_array_int16_2d
 
@@ -478,7 +526,7 @@ function torch_tensor_from_array_int16_3d(data_in, layout, c_device, device_numb
     integer(kind=int16), intent(in), target :: data_in(:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(3) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -489,6 +537,7 @@ function torch_tensor_from_array_int16_3d(data_in, layout, c_device, device_numb
     integer(c_int64_t)        :: strides(3)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 3                   !! Number of dimension of input data
     integer                   :: i
+    integer(c_int)            :: device_number_value
 
     c_tensor_shape = shape(data_in)
 
@@ -497,7 +546,14 @@ function torch_tensor_from_array_int16_3d(data_in, layout, c_device, device_numb
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
 
   end function torch_tensor_from_array_int16_3d
 
@@ -510,7 +566,7 @@ function torch_tensor_from_array_int16_4d(data_in, layout, c_device, device_numb
     integer(kind=int16), intent(in), target :: data_in(:,:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(4) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -521,6 +577,7 @@ function torch_tensor_from_array_int16_4d(data_in, layout, c_device, device_numb
     integer(c_int64_t)        :: strides(4)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 4                   !! Number of dimension of input data
     integer                   :: i
+    integer(c_int)            :: device_number_value
 
     c_tensor_shape = shape(data_in)
 
@@ -529,7 +586,14 @@ function torch_tensor_from_array_int16_4d(data_in, layout, c_device, device_numb
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
 
   end function torch_tensor_from_array_int16_4d
 
@@ -542,7 +606,7 @@ function torch_tensor_from_array_int32_1d(data_in, layout, c_device, device_numb
     integer(kind=int32), intent(in), target :: data_in(:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(1) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -553,6 +617,7 @@ function torch_tensor_from_array_int32_1d(data_in, layout, c_device, device_numb
     integer(c_int64_t)        :: strides(1)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 1                   !! Number of dimension of input data
     integer                   :: i
+    integer(c_int)            :: device_number_value
 
     c_tensor_shape = shape(data_in)
 
@@ -561,7 +626,14 @@ function torch_tensor_from_array_int32_1d(data_in, layout, c_device, device_numb
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
 
   end function torch_tensor_from_array_int32_1d
 
@@ -574,7 +646,7 @@ function torch_tensor_from_array_int32_2d(data_in, layout, c_device, device_numb
     integer(kind=int32), intent(in), target :: data_in(:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(2) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -585,6 +657,7 @@ function torch_tensor_from_array_int32_2d(data_in, layout, c_device, device_numb
     integer(c_int64_t)        :: strides(2)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 2                   !! Number of dimension of input data
     integer                   :: i
+    integer(c_int)            :: device_number_value
 
     c_tensor_shape = shape(data_in)
 
@@ -593,7 +666,14 @@ function torch_tensor_from_array_int32_2d(data_in, layout, c_device, device_numb
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
 
   end function torch_tensor_from_array_int32_2d
 
@@ -606,7 +686,7 @@ function torch_tensor_from_array_int32_3d(data_in, layout, c_device, device_numb
     integer(kind=int32), intent(in), target :: data_in(:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(3) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -617,6 +697,7 @@ function torch_tensor_from_array_int32_3d(data_in, layout, c_device, device_numb
     integer(c_int64_t)        :: strides(3)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 3                   !! Number of dimension of input data
     integer                   :: i
+    integer(c_int)            :: device_number_value
 
     c_tensor_shape = shape(data_in)
 
@@ -625,7 +706,14 @@ function torch_tensor_from_array_int32_3d(data_in, layout, c_device, device_numb
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
 
   end function torch_tensor_from_array_int32_3d
 
@@ -638,7 +726,7 @@ function torch_tensor_from_array_int32_4d(data_in, layout, c_device, device_numb
     integer(kind=int32), intent(in), target :: data_in(:,:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(4) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -649,6 +737,7 @@ function torch_tensor_from_array_int32_4d(data_in, layout, c_device, device_numb
     integer(c_int64_t)        :: strides(4)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 4                   !! Number of dimension of input data
     integer                   :: i
+    integer(c_int)            :: device_number_value
 
     c_tensor_shape = shape(data_in)
 
@@ -657,7 +746,14 @@ function torch_tensor_from_array_int32_4d(data_in, layout, c_device, device_numb
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
 
   end function torch_tensor_from_array_int32_4d
 
@@ -670,7 +766,7 @@ function torch_tensor_from_array_int64_1d(data_in, layout, c_device, device_numb
     integer(kind=int64), intent(in), target :: data_in(:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(1) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -681,6 +777,7 @@ function torch_tensor_from_array_int64_1d(data_in, layout, c_device, device_numb
     integer(c_int64_t)        :: strides(1)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 1                   !! Number of dimension of input data
     integer                   :: i
+    integer(c_int)            :: device_number_value
 
     c_tensor_shape = shape(data_in)
 
@@ -689,7 +786,14 @@ function torch_tensor_from_array_int64_1d(data_in, layout, c_device, device_numb
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
 
   end function torch_tensor_from_array_int64_1d
 
@@ -702,7 +806,7 @@ function torch_tensor_from_array_int64_2d(data_in, layout, c_device, device_numb
     integer(kind=int64), intent(in), target :: data_in(:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(2) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -713,6 +817,7 @@ function torch_tensor_from_array_int64_2d(data_in, layout, c_device, device_numb
     integer(c_int64_t)        :: strides(2)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 2                   !! Number of dimension of input data
     integer                   :: i
+    integer(c_int)            :: device_number_value
 
     c_tensor_shape = shape(data_in)
 
@@ -721,7 +826,14 @@ function torch_tensor_from_array_int64_2d(data_in, layout, c_device, device_numb
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
 
   end function torch_tensor_from_array_int64_2d
 
@@ -734,7 +846,7 @@ function torch_tensor_from_array_int64_3d(data_in, layout, c_device, device_numb
     integer(kind=int64), intent(in), target :: data_in(:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(3) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -745,6 +857,7 @@ function torch_tensor_from_array_int64_3d(data_in, layout, c_device, device_numb
     integer(c_int64_t)        :: strides(3)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 3                   !! Number of dimension of input data
     integer                   :: i
+    integer(c_int)            :: device_number_value
 
     c_tensor_shape = shape(data_in)
 
@@ -753,7 +866,14 @@ function torch_tensor_from_array_int64_3d(data_in, layout, c_device, device_numb
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
 
   end function torch_tensor_from_array_int64_3d
 
@@ -766,7 +886,7 @@ function torch_tensor_from_array_int64_4d(data_in, layout, c_device, device_numb
     integer(kind=int64), intent(in), target :: data_in(:,:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(4) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -777,6 +897,7 @@ function torch_tensor_from_array_int64_4d(data_in, layout, c_device, device_numb
     integer(c_int64_t)        :: strides(4)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 4                   !! Number of dimension of input data
     integer                   :: i
+    integer(c_int)            :: device_number_value
 
     c_tensor_shape = shape(data_in)
 
@@ -785,7 +906,14 @@ function torch_tensor_from_array_int64_4d(data_in, layout, c_device, device_numb
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
 
   end function torch_tensor_from_array_int64_4d
 
@@ -798,7 +926,7 @@ function torch_tensor_from_array_real32_1d(data_in, layout, c_device, device_num
     real(kind=real32), intent(in), target :: data_in(:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(1) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -809,6 +937,7 @@ function torch_tensor_from_array_real32_1d(data_in, layout, c_device, device_num
     integer(c_int64_t)        :: strides(1)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 1                   !! Number of dimension of input data
     integer                   :: i
+    integer(c_int)            :: device_number_value
 
     c_tensor_shape = shape(data_in)
 
@@ -817,7 +946,14 @@ function torch_tensor_from_array_real32_1d(data_in, layout, c_device, device_num
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
 
   end function torch_tensor_from_array_real32_1d
 
@@ -830,7 +966,7 @@ function torch_tensor_from_array_real32_2d(data_in, layout, c_device, device_num
     real(kind=real32), intent(in), target :: data_in(:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(2) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -841,6 +977,7 @@ function torch_tensor_from_array_real32_2d(data_in, layout, c_device, device_num
     integer(c_int64_t)        :: strides(2)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 2                   !! Number of dimension of input data
     integer                   :: i
+    integer(c_int)            :: device_number_value
 
     c_tensor_shape = shape(data_in)
 
@@ -849,7 +986,14 @@ function torch_tensor_from_array_real32_2d(data_in, layout, c_device, device_num
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
 
   end function torch_tensor_from_array_real32_2d
 
@@ -862,7 +1006,7 @@ function torch_tensor_from_array_real32_3d(data_in, layout, c_device, device_num
     real(kind=real32), intent(in), target :: data_in(:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(3) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -873,6 +1017,7 @@ function torch_tensor_from_array_real32_3d(data_in, layout, c_device, device_num
     integer(c_int64_t)        :: strides(3)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 3                   !! Number of dimension of input data
     integer                   :: i
+    integer(c_int)            :: device_number_value
 
     c_tensor_shape = shape(data_in)
 
@@ -881,7 +1026,14 @@ function torch_tensor_from_array_real32_3d(data_in, layout, c_device, device_num
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
 
   end function torch_tensor_from_array_real32_3d
 
@@ -894,7 +1046,7 @@ function torch_tensor_from_array_real32_4d(data_in, layout, c_device, device_num
     real(kind=real32), intent(in), target :: data_in(:,:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(4) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -905,6 +1057,7 @@ function torch_tensor_from_array_real32_4d(data_in, layout, c_device, device_num
     integer(c_int64_t)        :: strides(4)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 4                   !! Number of dimension of input data
     integer                   :: i
+    integer(c_int)            :: device_number_value
 
     c_tensor_shape = shape(data_in)
 
@@ -913,7 +1066,14 @@ function torch_tensor_from_array_real32_4d(data_in, layout, c_device, device_num
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
 
   end function torch_tensor_from_array_real32_4d
 
@@ -926,7 +1086,7 @@ function torch_tensor_from_array_real64_1d(data_in, layout, c_device, device_num
     real(kind=real64), intent(in), target :: data_in(:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(1) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -937,6 +1097,7 @@ function torch_tensor_from_array_real64_1d(data_in, layout, c_device, device_num
     integer(c_int64_t)        :: strides(1)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 1                   !! Number of dimension of input data
     integer                   :: i
+    integer(c_int)            :: device_number_value
 
     c_tensor_shape = shape(data_in)
 
@@ -945,7 +1106,14 @@ function torch_tensor_from_array_real64_1d(data_in, layout, c_device, device_num
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
 
   end function torch_tensor_from_array_real64_1d
 
@@ -958,7 +1126,7 @@ function torch_tensor_from_array_real64_2d(data_in, layout, c_device, device_num
     real(kind=real64), intent(in), target :: data_in(:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(2) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -969,6 +1137,7 @@ function torch_tensor_from_array_real64_2d(data_in, layout, c_device, device_num
     integer(c_int64_t)        :: strides(2)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 2                   !! Number of dimension of input data
     integer                   :: i
+    integer(c_int)            :: device_number_value
 
     c_tensor_shape = shape(data_in)
 
@@ -977,7 +1146,14 @@ function torch_tensor_from_array_real64_2d(data_in, layout, c_device, device_num
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
 
   end function torch_tensor_from_array_real64_2d
 
@@ -990,7 +1166,7 @@ function torch_tensor_from_array_real64_3d(data_in, layout, c_device, device_num
     real(kind=real64), intent(in), target :: data_in(:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(3) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -1001,6 +1177,7 @@ function torch_tensor_from_array_real64_3d(data_in, layout, c_device, device_num
     integer(c_int64_t)        :: strides(3)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 3                   !! Number of dimension of input data
     integer                   :: i
+    integer(c_int)            :: device_number_value
 
     c_tensor_shape = shape(data_in)
 
@@ -1009,7 +1186,14 @@ function torch_tensor_from_array_real64_3d(data_in, layout, c_device, device_num
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
 
   end function torch_tensor_from_array_real64_3d
 
@@ -1022,7 +1206,7 @@ function torch_tensor_from_array_real64_4d(data_in, layout, c_device, device_num
     real(kind=real64), intent(in), target :: data_in(:,:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(4) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -1033,6 +1217,7 @@ function torch_tensor_from_array_real64_4d(data_in, layout, c_device, device_num
     integer(c_int64_t)        :: strides(4)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 4                   !! Number of dimension of input data
     integer                   :: i
+    integer(c_int)            :: device_number_value
 
     c_tensor_shape = shape(data_in)
 
@@ -1041,7 +1226,14 @@ function torch_tensor_from_array_real64_4d(data_in, layout, c_device, device_num
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
 
   end function torch_tensor_from_array_real64_4d
 
diff --git a/src/ftorch.fypp b/src/ftorch.fypp
index 24cd805d..e772fca1 100644
--- a/src/ftorch.fypp
+++ b/src/ftorch.fypp
@@ -286,7 +286,7 @@ contains
     ${f_type(PREC)}$(kind=${PREC}$), intent(in), target :: data_in${ranksuffix(RANK)}$   !! Input data that tensor will point at
     integer, intent(in)        :: layout(${RANK}$) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -297,6 +297,7 @@ contains
     integer(c_int64_t)        :: strides(${RANK}$)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = ${RANK}$                   !! Number of dimension of input data
     integer                   :: i
+    integer(c_int)            :: device_number_value
 
     c_tensor_shape = shape(data_in)
 
@@ -305,7 +306,14 @@ contains
       strides(layout(i)) = strides(layout(i - 1)) * c_tensor_shape(layout(i - 1))
     end do
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
 
   end function torch_tensor_from_array_${PREC}$_${RANK}$d
 

From 3fe5258640f6824be0840c8f5b414da397f8e811 Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Tue, 19 Mar 2024 16:24:43 +0000
Subject: [PATCH 14/42] Make device number argument to other subroutines
 optional

---
 src/ftorch.f90  | 37 +++++++++++++++++++++++++++++++------
 src/ftorch.fypp | 37 +++++++++++++++++++++++++++++++------
 2 files changed, 62 insertions(+), 12 deletions(-)

diff --git a/src/ftorch.f90 b/src/ftorch.f90
index 922248b0..1528bc25 100644
--- a/src/ftorch.f90
+++ b/src/ftorch.f90
@@ -100,8 +100,9 @@ function torch_tensor_zeros(ndims, tensor_shape, dtype, device, device_number) r
     integer(c_int64_t), intent(in) :: tensor_shape(*)   !! Shape of the tensor
     integer(c_int), intent(in)     :: dtype      !! Data type of the tensor
     integer(c_int), intent(in)     :: device     !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in)     :: device_number     !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number     !! Device number to use for `torch_kCUDA` case
     type(torch_tensor)             :: tensor     !! Returned tensor
+    integer(c_int)                 :: device_number_value  !! Device number used
 
     interface
       function torch_zeros_c(ndims, tensor_shape, dtype, device, device_number) result(tensor) &
@@ -116,7 +117,14 @@ function torch_zeros_c(ndims, tensor_shape, dtype, device, device_number) result
       end function torch_zeros_c
     end interface
 
-    tensor%p = torch_zeros_c(ndims, tensor_shape, dtype, device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_zeros_c(ndims, tensor_shape, dtype, device, device_number_value)
   end function torch_tensor_zeros
 
   !> Returns a tensor filled with the scalar value 1.
@@ -126,8 +134,9 @@ function torch_tensor_ones(ndims, tensor_shape, dtype, device, device_number) re
     integer(c_int64_t), intent(in) :: tensor_shape(*)   !! Shape of the tensor
     integer(c_int), intent(in)     :: dtype      !! Data type of the tensor
     integer(c_int), intent(in)     :: device     !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in)     :: device_number     !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number     !! Device number to use for `torch_kCUDA` case
     type(torch_tensor)             :: tensor     !! Returned tensor
+    integer(c_int)                 :: device_number_value  !! Device number used
 
     interface
       function torch_ones_c(ndims, tensor_shape, dtype, device, device_number) result(tensor) &
@@ -142,7 +151,14 @@ function torch_ones_c(ndims, tensor_shape, dtype, device, device_number) result(
       end function torch_ones_c
     end interface
 
-    tensor%p = torch_ones_c(ndims, tensor_shape, dtype, device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_ones_c(ndims, tensor_shape, dtype, device, device_number_value)
   end function torch_tensor_ones
 
   ! Torch Tensor API
@@ -155,18 +171,27 @@ function torch_tensor_from_blob(data, ndims, tensor_shape, layout, dtype, device
     integer(c_int64_t), intent(in) :: tensor_shape(*)   !! Shape of the tensor
     integer(c_int), intent(in)     :: dtype      !! Data type of the tensor
     integer(c_int), intent(in)     :: device     !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in)     :: device_number     !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number     !! Device number to use for `torch_kCUDA` case
     integer(c_int), intent(in)     :: layout(*)  !! Layout for strides for accessing data
     type(torch_tensor)             :: tensor     !! Returned tensor
 
     integer(c_int)                 :: i          !! loop index
     integer(c_int64_t)             :: strides(ndims) !! Strides for accessing data
+    integer(c_int)                 :: device_number_value  !! Device number used
 
     strides(layout(1)) = 1
     do i = 2, ndims
       strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1))
     end do
-    tensor%p = torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device, device_number)
+
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device, device_number_value)
   end function torch_tensor_from_blob
 
   !> Prints the contents of a tensor.
diff --git a/src/ftorch.fypp b/src/ftorch.fypp
index e772fca1..e6f9e608 100644
--- a/src/ftorch.fypp
+++ b/src/ftorch.fypp
@@ -98,8 +98,9 @@ contains
     integer(c_int64_t), intent(in) :: tensor_shape(*)   !! Shape of the tensor
     integer(c_int), intent(in)     :: dtype      !! Data type of the tensor
     integer(c_int), intent(in)     :: device     !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in)     :: device_number     !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number     !! Device number to use for `torch_kCUDA` case
     type(torch_tensor)             :: tensor     !! Returned tensor
+    integer(c_int)                 :: device_number_value  !! Device number used
 
     interface
       function torch_zeros_c(ndims, tensor_shape, dtype, device, device_number) result(tensor) &
@@ -114,7 +115,14 @@ contains
       end function torch_zeros_c
     end interface
 
-    tensor%p = torch_zeros_c(ndims, tensor_shape, dtype, device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_zeros_c(ndims, tensor_shape, dtype, device, device_number_value)
   end function torch_tensor_zeros
 
   !> Returns a tensor filled with the scalar value 1.
@@ -124,8 +132,9 @@ contains
     integer(c_int64_t), intent(in) :: tensor_shape(*)   !! Shape of the tensor
     integer(c_int), intent(in)     :: dtype      !! Data type of the tensor
     integer(c_int), intent(in)     :: device     !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in)     :: device_number     !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number     !! Device number to use for `torch_kCUDA` case
     type(torch_tensor)             :: tensor     !! Returned tensor
+    integer(c_int)                 :: device_number_value  !! Device number used
 
     interface
       function torch_ones_c(ndims, tensor_shape, dtype, device, device_number) result(tensor) &
@@ -140,7 +149,14 @@ contains
       end function torch_ones_c
     end interface
 
-    tensor%p = torch_ones_c(ndims, tensor_shape, dtype, device, device_number)
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_ones_c(ndims, tensor_shape, dtype, device, device_number_value)
   end function torch_tensor_ones
 
   ! Torch Tensor API
@@ -153,18 +169,27 @@ contains
     integer(c_int64_t), intent(in) :: tensor_shape(*)   !! Shape of the tensor
     integer(c_int), intent(in)     :: dtype      !! Data type of the tensor
     integer(c_int), intent(in)     :: device     !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), intent(in)     :: device_number     !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_number     !! Device number to use for `torch_kCUDA` case
     integer(c_int), intent(in)     :: layout(*)  !! Layout for strides for accessing data
     type(torch_tensor)             :: tensor     !! Returned tensor
 
     integer(c_int)                 :: i          !! loop index
     integer(c_int64_t)             :: strides(ndims) !! Strides for accessing data
+    integer(c_int)                 :: device_number_value  !! Device number used
 
     strides(layout(1)) = 1
     do i = 2, ndims
       strides(layout(i)) = strides(layout(i - 1)) * tensor_shape(layout(i - 1))
     end do
-    tensor%p = torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device, device_number)
+
+    ! Process optional arguments
+    if (present(device_number)) then
+      device_number_value = device_number
+    else
+      device_number_value = 0
+    endif
+
+    tensor%p = torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device, device_number_value)
   end function torch_tensor_from_blob
 
   !> Prints the contents of a tensor.

From 9ed245242725c388647a4832761ad553fceab4f0 Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Tue, 19 Mar 2024 16:34:16 +0000
Subject: [PATCH 15/42] Make device argument to torch_module_load optional

---
 examples/1_SimpleNet/simplenet_infer_fortran.f90 |  2 +-
 examples/2_ResNet18/resnet_infer_fortran.f90     |  2 +-
 src/ftorch.f90                                   | 12 +++++++++---
 src/ftorch.fypp                                  | 12 +++++++++---
 4 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/examples/1_SimpleNet/simplenet_infer_fortran.f90 b/examples/1_SimpleNet/simplenet_infer_fortran.f90
index 799d4dae..199b984c 100644
--- a/examples/1_SimpleNet/simplenet_infer_fortran.f90
+++ b/examples/1_SimpleNet/simplenet_infer_fortran.f90
@@ -40,7 +40,7 @@ program inference
    out_tensor = torch_tensor_from_array(out_data, tensor_layout, torch_kCPU)
 
    ! Load ML model
-   model = torch_module_load(args(1), torch_kCPU)
+   model = torch_module_load(args(1))
 
    ! Infer
    call torch_module_forward(model, in_tensor, n_inputs, out_tensor)
diff --git a/examples/2_ResNet18/resnet_infer_fortran.f90 b/examples/2_ResNet18/resnet_infer_fortran.f90
index c5285fc6..1af256af 100644
--- a/examples/2_ResNet18/resnet_infer_fortran.f90
+++ b/examples/2_ResNet18/resnet_infer_fortran.f90
@@ -71,7 +71,7 @@ subroutine main()
       out_tensor = torch_tensor_from_array(out_data, out_layout, torch_kCPU)
 
       ! Load ML model (edit this line to use different models)
-      model = torch_module_load(args(1), torch_kCPU)
+      model = torch_module_load(args(1))
 
       ! Infer
       call torch_module_forward(model, in_tensor, n_inputs, out_tensor)
diff --git a/src/ftorch.f90 b/src/ftorch.f90
index 1528bc25..227b804d 100644
--- a/src/ftorch.f90
+++ b/src/ftorch.f90
@@ -229,9 +229,10 @@ end subroutine torch_tensor_delete
   function torch_module_load(filename, device, device_number) result(module)
     use, intrinsic :: iso_c_binding, only : c_int, c_null_char
     character(*), intent(in)   :: filename !! Filename of TorchScript module
-    integer(c_int), intent(in) :: device   !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number     !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), optional, intent(in) :: device_number !! Device number to use for `torch_kCUDA` case
     type(torch_module)         :: module   !! Returned deserialized module
+    integer(c_int) :: device_value
     integer(c_int) :: device_number_value
 
     interface
@@ -246,6 +247,11 @@ end function torch_jit_load_c
     end interface
 
     ! Process optional arguments
+    if (present(device)) then
+      device_value = device
+    else
+      device_value = 0
+    endif
     if (present(device_number)) then
       device_number_value = device_number
     else
@@ -253,7 +259,7 @@ end function torch_jit_load_c
     endif
 
     ! Need to append c_null_char at end of filename
-    module%p = torch_jit_load_c(trim(adjustl(filename))//c_null_char, device, device_number_value)
+    module%p = torch_jit_load_c(trim(adjustl(filename))//c_null_char, device_value, device_number_value)
   end function torch_module_load
 
   !> Performs a forward pass of the module with the input tensors
diff --git a/src/ftorch.fypp b/src/ftorch.fypp
index e6f9e608..d35e4368 100644
--- a/src/ftorch.fypp
+++ b/src/ftorch.fypp
@@ -227,9 +227,10 @@ contains
   function torch_module_load(filename, device, device_number) result(module)
     use, intrinsic :: iso_c_binding, only : c_int, c_null_char
     character(*), intent(in)   :: filename !! Filename of TorchScript module
-    integer(c_int), intent(in) :: device   !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number     !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), optional, intent(in) :: device_number !! Device number to use for `torch_kCUDA` case
     type(torch_module)         :: module   !! Returned deserialized module
+    integer(c_int) :: device_value
     integer(c_int) :: device_number_value
 
     interface
@@ -244,6 +245,11 @@ contains
     end interface
 
     ! Process optional arguments
+    if (present(device)) then
+      device_value = device
+    else
+      device_value = 0
+    endif
     if (present(device_number)) then
       device_number_value = device_number
     else
@@ -251,7 +257,7 @@ contains
     endif
 
     ! Need to append c_null_char at end of filename
-    module%p = torch_jit_load_c(trim(adjustl(filename))//c_null_char, device, device_number_value)
+    module%p = torch_jit_load_c(trim(adjustl(filename))//c_null_char, device_value, device_number_value)
   end function torch_module_load
 
   !> Performs a forward pass of the module with the input tensors

From fbc6a128cfe058d146e2c29b973410992c35b2b7 Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Wed, 20 Mar 2024 15:06:14 +0000
Subject: [PATCH 16/42] Add function for determining device_index

---
 src/ctorch.cpp  |  6 ++++++
 src/ctorch.h    |  7 +++++++
 src/ftorch.f90  | 18 ++++++++++++++++++
 src/ftorch.fypp | 18 ++++++++++++++++++
 4 files changed, 49 insertions(+)

diff --git a/src/ctorch.cpp b/src/ctorch.cpp
index f9399459..148adff6 100644
--- a/src/ctorch.cpp
+++ b/src/ctorch.cpp
@@ -160,6 +160,12 @@ void torch_tensor_print(const torch_tensor_t tensor)
   std::cout << *t << std::endl;
 }
 
+int torch_tensor_get_device_index(const torch_tensor_t tensor)
+{
+  auto t = reinterpret_cast<torch::Tensor*>(tensor);
+  return t->device().index();
+}
+
 void torch_tensor_delete(torch_tensor_t tensor)
 {
   auto t = reinterpret_cast<torch::Tensor*>(tensor);
diff --git a/src/ctorch.h b/src/ctorch.h
index cc28648d..2f0eafe0 100644
--- a/src/ctorch.h
+++ b/src/ctorch.h
@@ -92,6 +92,13 @@ EXPORT_C torch_tensor_t torch_from_blob(void* data, int ndim,
  */
 EXPORT_C void torch_tensor_print(const torch_tensor_t tensor);
 
+/**
+ * Function to determine the device index of a Torch Tensor
+ * @param Torch Tensor to determine the device index of
+ * @return device index of the Torch Tensor
+ */
+EXPORT_C int torch_tensor_get_device_index(const torch_tensor_t tensor);
+
 /**
  * Function to delete a Torch Tensor to clean up
  * @param Torch Tensor to delete
diff --git a/src/ftorch.f90 b/src/ftorch.f90
index 227b804d..d83f3091 100644
--- a/src/ftorch.f90
+++ b/src/ftorch.f90
@@ -209,6 +209,24 @@ end subroutine torch_tensor_print_c
     call torch_tensor_print_c(tensor%p)
   end subroutine torch_tensor_print
 
+  !> Determines the device index of a tensor.
+  function torch_tensor_get_device_index(tensor) result(device_index)
+    use, intrinsic :: iso_c_binding, only : c_int
+    type(torch_tensor), intent(in) :: tensor  !! Input tensor
+    integer(c_int) :: device_index  !! Device index of tensor
+
+    interface
+      function torch_tensor_get_device_index_c(tensor) result(device_index) &
+          bind(c, name = 'torch_tensor_get_device_index')
+        use, intrinsic :: iso_c_binding, only : c_int, c_ptr
+        type(c_ptr), value, intent(in) :: tensor
+        integer(c_int) :: device_index
+      end function torch_tensor_get_device_index_c
+    end interface
+
+    device_index = torch_tensor_get_device_index_c(tensor%p)
+  end function torch_tensor_get_device_index
+
   !> Deallocates a tensor.
   subroutine torch_tensor_delete(tensor)
     type(torch_tensor), intent(in) :: tensor     !! Input tensor
diff --git a/src/ftorch.fypp b/src/ftorch.fypp
index d35e4368..d2c9bef0 100644
--- a/src/ftorch.fypp
+++ b/src/ftorch.fypp
@@ -207,6 +207,24 @@ contains
     call torch_tensor_print_c(tensor%p)
   end subroutine torch_tensor_print
 
+  !> Determines the device index of a tensor.
+  function torch_tensor_get_device_index(tensor) result(device_index)
+    use, intrinsic :: iso_c_binding, only : c_int
+    type(torch_tensor), intent(in) :: tensor  !! Input tensor
+    integer(c_int) :: device_index  !! Device index of tensor
+
+    interface
+      function torch_tensor_get_device_index_c(tensor) result(device_index) &
+          bind(c, name = 'torch_tensor_get_device_index')
+        use, intrinsic :: iso_c_binding, only : c_int, c_ptr
+        type(c_ptr), value, intent(in) :: tensor
+        integer(c_int) :: device_index
+      end function torch_tensor_get_device_index_c
+    end interface
+
+    device_index = torch_tensor_get_device_index_c(tensor%p)
+  end function torch_tensor_get_device_index
+
   !> Deallocates a tensor.
   subroutine torch_tensor_delete(tensor)
     type(torch_tensor), intent(in) :: tensor     !! Input tensor

From 58d28ed83c7e4e2187e7c3367fe9c164f72200c9 Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Wed, 20 Mar 2024 15:07:51 +0000
Subject: [PATCH 17/42] Rename device number as index

---
 src/ctorch.cpp  |  38 ++---
 src/ctorch.h    |  20 +--
 src/ftorch.f90  | 408 ++++++++++++++++++++++++------------------------
 src/ftorch.fypp |  86 +++++-----
 4 files changed, 276 insertions(+), 276 deletions(-)

diff --git a/src/ctorch.cpp b/src/ctorch.cpp
index 148adff6..431ba164 100644
--- a/src/ctorch.cpp
+++ b/src/ctorch.cpp
@@ -29,25 +29,25 @@ constexpr auto get_dtype(torch_data_t dtype)
   }
 }
 
-const auto get_device(torch_device_t device, int device_number)
+const auto get_device(torch_device_t device, int device_index)
 {
-  if (device_number < 0) {
-    std::cerr << "[ERROR]: device number must be non-negative, using zero instead"
+  if (device_index < 0) {
+    std::cerr << "[ERROR]: device index must be non-negative, using zero instead"
               << std::endl;
-    device_number = 0;
+    device_index = 0;
   }
   switch (device) {
   case torch_kCPU:
-    if (device_number > 0) {
-      std::cerr << "[ERROR]: device number unsupported for CPU-only runs"
+    if (device_index > 0) {
+      std::cerr << "[ERROR]: device index unsupported for CPU-only runs"
                 << std::endl;
     }
     return torch::Device(torch::kCPU);
   case torch_kCUDA:
-    if (device_number < torch::cuda::device_count()) {
-      return torch::Device(torch::kCUDA, device_number);
+    if (device_index < torch::cuda::device_count()) {
+      return torch::Device(torch::kCUDA, device_index);
     } else {
-      std::cerr << "[ERROR]: device number " << device_number
+      std::cerr << "[ERROR]: device index " << device_index
                 << " exceeds device count " << torch::cuda::device_count()
                 << ", using zero instead" << std::endl;
       return torch::Device(torch::kCUDA);
@@ -60,7 +60,7 @@ const auto get_device(torch_device_t device, int device_number)
 }
 
 torch_tensor_t torch_zeros(int ndim, const int64_t* shape, torch_data_t dtype,
-                           torch_device_t device, int device_number)
+                           torch_device_t device, int device_index)
 {
   torch::Tensor* tensor = nullptr;
   try {
@@ -68,7 +68,7 @@ torch_tensor_t torch_zeros(int ndim, const int64_t* shape, torch_data_t dtype,
     c10::IntArrayRef vshape(shape, ndim);
     tensor = new torch::Tensor;
     *tensor = torch::zeros(
-        vshape, torch::dtype(get_dtype(dtype))).to(get_device(device, device_number));
+        vshape, torch::dtype(get_dtype(dtype))).to(get_device(device, device_index));
   } catch (const torch::Error& e) {
     std::cerr << "[ERROR]: " << e.msg() << std::endl;
     delete tensor;
@@ -82,7 +82,7 @@ torch_tensor_t torch_zeros(int ndim, const int64_t* shape, torch_data_t dtype,
 }
 
 torch_tensor_t torch_ones(int ndim, const int64_t* shape, torch_data_t dtype,
-                          torch_device_t device, int device_number)
+                          torch_device_t device, int device_index)
 {
   torch::Tensor* tensor = nullptr;
   try {
@@ -90,7 +90,7 @@ torch_tensor_t torch_ones(int ndim, const int64_t* shape, torch_data_t dtype,
     c10::IntArrayRef vshape(shape, ndim);
     tensor = new torch::Tensor;
     *tensor = torch::ones(
-        vshape, torch::dtype(get_dtype(dtype))).to(get_device(device, device_number));
+        vshape, torch::dtype(get_dtype(dtype))).to(get_device(device, device_index));
   } catch (const torch::Error& e) {
     std::cerr << "[ERROR]: " << e.msg() << std::endl;
     delete tensor;
@@ -104,7 +104,7 @@ torch_tensor_t torch_ones(int ndim, const int64_t* shape, torch_data_t dtype,
 }
 
 torch_tensor_t torch_empty(int ndim, const int64_t* shape, torch_data_t dtype,
-                           torch_device_t device, int device_number)
+                           torch_device_t device, int device_index)
 {
   torch::Tensor* tensor = nullptr;
   try {
@@ -112,7 +112,7 @@ torch_tensor_t torch_empty(int ndim, const int64_t* shape, torch_data_t dtype,
     c10::IntArrayRef vshape(shape, ndim);
     tensor = new torch::Tensor;
     *tensor = torch::empty(
-        vshape, torch::dtype(get_dtype(dtype))).to(get_device(device, device_number));
+        vshape, torch::dtype(get_dtype(dtype))).to(get_device(device, device_index));
   } catch (const torch::Error& e) {
     std::cerr << "[ERROR]: " << e.msg() << std::endl;
     delete tensor;
@@ -129,7 +129,7 @@ torch_tensor_t torch_empty(int ndim, const int64_t* shape, torch_data_t dtype,
 // data
 torch_tensor_t torch_from_blob(void* data, int ndim, const int64_t* shape,
                                const int64_t* strides, torch_data_t dtype,
-                               torch_device_t device, int device_number)
+                               torch_device_t device, int device_index)
 {
   torch::Tensor* tensor = nullptr;
 
@@ -140,7 +140,7 @@ torch_tensor_t torch_from_blob(void* data, int ndim, const int64_t* shape,
     tensor = new torch::Tensor;
     *tensor = torch::from_blob(
         data, vshape, vstrides,
-        torch::dtype(get_dtype(dtype))).to(get_device(device, device_number));
+        torch::dtype(get_dtype(dtype))).to(get_device(device, device_index));
 
   } catch (const torch::Error& e) {
     std::cerr << "[ERROR]: " << e.msg() << std::endl;
@@ -174,12 +174,12 @@ void torch_tensor_delete(torch_tensor_t tensor)
 
 torch_jit_script_module_t torch_jit_load(const char* filename,
                                          const torch_device_t device,
-                                         const int device_number)
+                                         const int device_index)
 {
   torch::jit::script::Module* module = nullptr;
   try {
     module = new torch::jit::script::Module;
-    *module = torch::jit::load(filename, get_device(device, device_number));
+    *module = torch::jit::load(filename, get_device(device, device_index));
   } catch (const torch::Error& e) {
     std::cerr << "[ERROR]: " << e.msg() << std::endl;
     delete module;
diff --git a/src/ctorch.h b/src/ctorch.h
index 2f0eafe0..8aacc8e2 100644
--- a/src/ctorch.h
+++ b/src/ctorch.h
@@ -38,11 +38,11 @@ typedef enum { torch_kCPU, torch_kCUDA } torch_device_t;
  * @param shape of the Tensor
  * @param data type of the elements of the Tensor
  * @param device used (cpu, CUDA, etc.)
- * @param device number for the CUDA case
+ * @param device index for the CUDA case
  */
 EXPORT_C torch_tensor_t torch_zeros(int ndim, const int64_t* shape,
                                     torch_data_t dtype, torch_device_t device,
-                                    int device_number);
+                                    int device_index);
 
 /**
  * Function to generate a Torch Tensor of ones
@@ -50,11 +50,11 @@ EXPORT_C torch_tensor_t torch_zeros(int ndim, const int64_t* shape,
  * @param shape of the Tensor
  * @param data type of the elements of the Tensor
  * @param device used (cpu, CUDA, etc.)
- * @param device number for the CUDA case
+ * @param device index for the CUDA case
  */
 EXPORT_C torch_tensor_t torch_ones(int ndim, const int64_t* shape,
                                    torch_data_t dtype, torch_device_t device,
-                                   int device_number);
+                                   int device_index);
 
 /**
  * Function to generate an empty Torch Tensor
@@ -62,11 +62,11 @@ EXPORT_C torch_tensor_t torch_ones(int ndim, const int64_t* shape,
  * @param shape of the Tensor
  * @param data type of the elements of the Tensor
  * @param device used (cpu, CUDA, etc.)
- * @param device number for the CUDA case
+ * @param device index for the CUDA case
  */
 EXPORT_C torch_tensor_t torch_empty(int ndim, const int64_t* shape,
                                     torch_data_t dtype, torch_device_t device,
-                                    int device_number);
+                                    int device_index);
 
 /**
  * Function to create a Torch Tensor from memory location given extra information
@@ -76,7 +76,7 @@ EXPORT_C torch_tensor_t torch_empty(int ndim, const int64_t* shape,
  * @param strides to take through data
  * @param data type of the elements of the Tensor
  * @param device used (cpu, CUDA, etc.)
- * @param device number for the CUDA case
+ * @param device index for the CUDA case
  * @return Torch Tensor interpretation of the data pointed at
  */
 EXPORT_C torch_tensor_t torch_from_blob(void* data, int ndim,
@@ -84,7 +84,7 @@ EXPORT_C torch_tensor_t torch_from_blob(void* data, int ndim,
                                         const int64_t* strides,
                                         torch_data_t dtype,
                                         torch_device_t device,
-                                        int device_number);
+                                        int device_index);
 
 /**
  * Function to print out a Torch Tensor
@@ -113,12 +113,12 @@ EXPORT_C void torch_tensor_delete(torch_tensor_t tensor);
  * Function to load in a Torch model from a TorchScript file and store in a Torch Module
  * @param filename where TorchScript description of model is stored
  * @param device used (cpu, CUDA, etc.)
- * @param device number for the CUDA case
+ * @param device index for the CUDA case
  * @return Torch Module loaded in from file
  */
 EXPORT_C torch_jit_script_module_t torch_jit_load(const char* filename,
                                                   const torch_device_t device,
-                                                  const int device_number);
+                                                  const int device_index);
 
 /**
  * Function to run the `forward` method of a Torch Module
diff --git a/src/ftorch.f90 b/src/ftorch.f90
index d83f3091..d7d1e608 100644
--- a/src/ftorch.f90
+++ b/src/ftorch.f90
@@ -75,7 +75,7 @@ module ftorch
   end interface
 
   interface
-    function torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device, device_number) result(tensor_p) &
+    function torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device, device_index) result(tensor_p) &
                                bind(c, name = 'torch_from_blob')
       use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_ptr
 
@@ -86,7 +86,7 @@ function torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device, de
       integer(c_int64_t), intent(in)    :: strides(*)
       integer(c_int), value, intent(in) :: dtype
       integer(c_int), value, intent(in) :: device
-      integer(c_int), value, intent(in) :: device_number
+      integer(c_int), value, intent(in) :: device_index
       type(c_ptr)                       :: tensor_p
     end function torch_from_blob_c
   end interface
@@ -94,90 +94,90 @@ end function torch_from_blob_c
 contains
 
   !> Returns a tensor filled with the scalar value 0.
-  function torch_tensor_zeros(ndims, tensor_shape, dtype, device, device_number) result(tensor)
+  function torch_tensor_zeros(ndims, tensor_shape, dtype, device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t
     integer(c_int), intent(in)     :: ndims      !! Number of dimensions of the tensor
     integer(c_int64_t), intent(in) :: tensor_shape(*)   !! Shape of the tensor
     integer(c_int), intent(in)     :: dtype      !! Data type of the tensor
     integer(c_int), intent(in)     :: device     !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number     !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index     !! device index to use for `torch_kCUDA` case
     type(torch_tensor)             :: tensor     !! Returned tensor
-    integer(c_int)                 :: device_number_value  !! Device number used
+    integer(c_int)                 :: device_index_value  !! device index used
 
     interface
-      function torch_zeros_c(ndims, tensor_shape, dtype, device, device_number) result(tensor) &
+      function torch_zeros_c(ndims, tensor_shape, dtype, device, device_index) result(tensor) &
           bind(c, name = 'torch_zeros')
         use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_ptr
         integer(c_int), value, intent(in) :: ndims
         integer(c_int64_t), intent(in)    :: tensor_shape(*)
         integer(c_int), value, intent(in) :: dtype
         integer(c_int), value, intent(in) :: device
-        integer(c_int), value, intent(in) :: device_number
+        integer(c_int), value, intent(in) :: device_index
         type(c_ptr)                       :: tensor
       end function torch_zeros_c
     end interface
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_zeros_c(ndims, tensor_shape, dtype, device, device_number_value)
+    tensor%p = torch_zeros_c(ndims, tensor_shape, dtype, device, device_index_value)
   end function torch_tensor_zeros
 
   !> Returns a tensor filled with the scalar value 1.
-  function torch_tensor_ones(ndims, tensor_shape, dtype, device, device_number) result(tensor)
+  function torch_tensor_ones(ndims, tensor_shape, dtype, device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t
     integer(c_int), intent(in)     :: ndims      !! Number of dimensions of the tensor
     integer(c_int64_t), intent(in) :: tensor_shape(*)   !! Shape of the tensor
     integer(c_int), intent(in)     :: dtype      !! Data type of the tensor
     integer(c_int), intent(in)     :: device     !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number     !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index     !! device index to use for `torch_kCUDA` case
     type(torch_tensor)             :: tensor     !! Returned tensor
-    integer(c_int)                 :: device_number_value  !! Device number used
+    integer(c_int)                 :: device_index_value  !! device index used
 
     interface
-      function torch_ones_c(ndims, tensor_shape, dtype, device, device_number) result(tensor) &
+      function torch_ones_c(ndims, tensor_shape, dtype, device, device_index) result(tensor) &
           bind(c, name = 'torch_ones')
         use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_ptr
         integer(c_int), value, intent(in) :: ndims
         integer(c_int64_t), intent(in)    :: tensor_shape(*)
         integer(c_int), value, intent(in) :: dtype
         integer(c_int), value, intent(in) :: device
-        integer(c_int), value, intent(in) :: device_number
+        integer(c_int), value, intent(in) :: device_index
         type(c_ptr)                       :: tensor
       end function torch_ones_c
     end interface
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_ones_c(ndims, tensor_shape, dtype, device, device_number_value)
+    tensor%p = torch_ones_c(ndims, tensor_shape, dtype, device, device_index_value)
   end function torch_tensor_ones
 
   ! Torch Tensor API
   !| Exposes the given data as a tensor without taking ownership of the original data.
   !  This routine will take an (i, j, k) array and return an (k, j, i) tensor.
-  function torch_tensor_from_blob(data, ndims, tensor_shape, layout, dtype, device, device_number) result(tensor)
+  function torch_tensor_from_blob(data, ndims, tensor_shape, layout, dtype, device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_ptr
     type(c_ptr), intent(in)        :: data       !! Pointer to data
     integer(c_int), intent(in)     :: ndims      !! Number of dimensions of the tensor
     integer(c_int64_t), intent(in) :: tensor_shape(*)   !! Shape of the tensor
     integer(c_int), intent(in)     :: dtype      !! Data type of the tensor
     integer(c_int), intent(in)     :: device     !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number     !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index     !! device index to use for `torch_kCUDA` case
     integer(c_int), intent(in)     :: layout(*)  !! Layout for strides for accessing data
     type(torch_tensor)             :: tensor     !! Returned tensor
 
     integer(c_int)                 :: i          !! loop index
     integer(c_int64_t)             :: strides(ndims) !! Strides for accessing data
-    integer(c_int)                 :: device_number_value  !! Device number used
+    integer(c_int)                 :: device_index_value  !! device index used
 
     strides(layout(1)) = 1
     do i = 2, ndims
@@ -185,13 +185,13 @@ function torch_tensor_from_blob(data, ndims, tensor_shape, layout, dtype, device
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device, device_number_value)
+    tensor%p = torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device, device_index_value)
   end function torch_tensor_from_blob
 
   !> Prints the contents of a tensor.
@@ -244,22 +244,22 @@ end subroutine torch_tensor_delete
 
   ! Torch Module API
   !> Loads a TorchScript module (pre-trained PyTorch model saved with TorchScript)
-  function torch_module_load(filename, device, device_number) result(module)
+  function torch_module_load(filename, device, device_index) result(module)
     use, intrinsic :: iso_c_binding, only : c_int, c_null_char
     character(*), intent(in)   :: filename !! Filename of TorchScript module
     integer(c_int), optional, intent(in) :: device !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case
     type(torch_module)         :: module   !! Returned deserialized module
     integer(c_int) :: device_value
-    integer(c_int) :: device_number_value
+    integer(c_int) :: device_index_value
 
     interface
-      function torch_jit_load_c(filename, device, device_number) result(module) &
+      function torch_jit_load_c(filename, device, device_index) result(module) &
           bind(c, name = 'torch_jit_load')
         use, intrinsic :: iso_c_binding, only : c_char, c_int, c_ptr
         character(c_char), intent(in) :: filename(*)
         integer(c_int), value, intent(in)    :: device
-        integer(c_int), value, intent(in)    :: device_number
+        integer(c_int), value, intent(in)    :: device_index
         type(c_ptr)                   :: module
       end function torch_jit_load_c
     end interface
@@ -270,14 +270,14 @@ end function torch_jit_load_c
     else
       device_value = 0
     endif
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
     ! Need to append c_null_char at end of filename
-    module%p = torch_jit_load_c(trim(adjustl(filename))//c_null_char, device_value, device_number_value)
+    module%p = torch_jit_load_c(trim(adjustl(filename))//c_null_char, device_value, device_index_value)
   end function torch_module_load
 
   !> Performs a forward pass of the module with the input tensors
@@ -327,7 +327,7 @@ end subroutine torch_jit_module_delete_c
   end subroutine torch_module_delete
 
   !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `int8`
-  function torch_tensor_from_array_int8_1d(data_in, layout, c_device, device_number) result(tensor)
+  function torch_tensor_from_array_int8_1d(data_in, layout, c_device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int8
 
@@ -335,7 +335,7 @@ function torch_tensor_from_array_int8_1d(data_in, layout, c_device, device_numbe
     integer(kind=int8), intent(in), target :: data_in(:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(1) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -346,7 +346,7 @@ function torch_tensor_from_array_int8_1d(data_in, layout, c_device, device_numbe
     integer(c_int64_t)        :: strides(1)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 1                   !! Number of dimension of input data
     integer                   :: i
-    integer(c_int)            :: device_number_value
+    integer(c_int)            :: device_index_value
 
     c_tensor_shape = shape(data_in)
 
@@ -356,18 +356,18 @@ function torch_tensor_from_array_int8_1d(data_in, layout, c_device, device_numbe
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
 
   end function torch_tensor_from_array_int8_1d
 
   !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `int8`
-  function torch_tensor_from_array_int8_2d(data_in, layout, c_device, device_number) result(tensor)
+  function torch_tensor_from_array_int8_2d(data_in, layout, c_device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int8
 
@@ -375,7 +375,7 @@ function torch_tensor_from_array_int8_2d(data_in, layout, c_device, device_numbe
     integer(kind=int8), intent(in), target :: data_in(:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(2) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -386,7 +386,7 @@ function torch_tensor_from_array_int8_2d(data_in, layout, c_device, device_numbe
     integer(c_int64_t)        :: strides(2)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 2                   !! Number of dimension of input data
     integer                   :: i
-    integer(c_int)            :: device_number_value
+    integer(c_int)            :: device_index_value
 
     c_tensor_shape = shape(data_in)
 
@@ -396,18 +396,18 @@ function torch_tensor_from_array_int8_2d(data_in, layout, c_device, device_numbe
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
 
   end function torch_tensor_from_array_int8_2d
 
   !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `int8`
-  function torch_tensor_from_array_int8_3d(data_in, layout, c_device, device_number) result(tensor)
+  function torch_tensor_from_array_int8_3d(data_in, layout, c_device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int8
 
@@ -415,7 +415,7 @@ function torch_tensor_from_array_int8_3d(data_in, layout, c_device, device_numbe
     integer(kind=int8), intent(in), target :: data_in(:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(3) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -426,7 +426,7 @@ function torch_tensor_from_array_int8_3d(data_in, layout, c_device, device_numbe
     integer(c_int64_t)        :: strides(3)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 3                   !! Number of dimension of input data
     integer                   :: i
-    integer(c_int)            :: device_number_value
+    integer(c_int)            :: device_index_value
 
     c_tensor_shape = shape(data_in)
 
@@ -436,18 +436,18 @@ function torch_tensor_from_array_int8_3d(data_in, layout, c_device, device_numbe
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
 
   end function torch_tensor_from_array_int8_3d
 
   !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `int8`
-  function torch_tensor_from_array_int8_4d(data_in, layout, c_device, device_number) result(tensor)
+  function torch_tensor_from_array_int8_4d(data_in, layout, c_device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int8
 
@@ -455,7 +455,7 @@ function torch_tensor_from_array_int8_4d(data_in, layout, c_device, device_numbe
     integer(kind=int8), intent(in), target :: data_in(:,:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(4) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -466,7 +466,7 @@ function torch_tensor_from_array_int8_4d(data_in, layout, c_device, device_numbe
     integer(c_int64_t)        :: strides(4)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 4                   !! Number of dimension of input data
     integer                   :: i
-    integer(c_int)            :: device_number_value
+    integer(c_int)            :: device_index_value
 
     c_tensor_shape = shape(data_in)
 
@@ -476,18 +476,18 @@ function torch_tensor_from_array_int8_4d(data_in, layout, c_device, device_numbe
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
 
   end function torch_tensor_from_array_int8_4d
 
   !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `int16`
-  function torch_tensor_from_array_int16_1d(data_in, layout, c_device, device_number) result(tensor)
+  function torch_tensor_from_array_int16_1d(data_in, layout, c_device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int16
 
@@ -495,7 +495,7 @@ function torch_tensor_from_array_int16_1d(data_in, layout, c_device, device_numb
     integer(kind=int16), intent(in), target :: data_in(:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(1) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -506,7 +506,7 @@ function torch_tensor_from_array_int16_1d(data_in, layout, c_device, device_numb
     integer(c_int64_t)        :: strides(1)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 1                   !! Number of dimension of input data
     integer                   :: i
-    integer(c_int)            :: device_number_value
+    integer(c_int)            :: device_index_value
 
     c_tensor_shape = shape(data_in)
 
@@ -516,18 +516,18 @@ function torch_tensor_from_array_int16_1d(data_in, layout, c_device, device_numb
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
 
   end function torch_tensor_from_array_int16_1d
 
   !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `int16`
-  function torch_tensor_from_array_int16_2d(data_in, layout, c_device, device_number) result(tensor)
+  function torch_tensor_from_array_int16_2d(data_in, layout, c_device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int16
 
@@ -535,7 +535,7 @@ function torch_tensor_from_array_int16_2d(data_in, layout, c_device, device_numb
     integer(kind=int16), intent(in), target :: data_in(:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(2) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -546,7 +546,7 @@ function torch_tensor_from_array_int16_2d(data_in, layout, c_device, device_numb
     integer(c_int64_t)        :: strides(2)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 2                   !! Number of dimension of input data
     integer                   :: i
-    integer(c_int)            :: device_number_value
+    integer(c_int)            :: device_index_value
 
     c_tensor_shape = shape(data_in)
 
@@ -556,18 +556,18 @@ function torch_tensor_from_array_int16_2d(data_in, layout, c_device, device_numb
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
 
   end function torch_tensor_from_array_int16_2d
 
   !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `int16`
-  function torch_tensor_from_array_int16_3d(data_in, layout, c_device, device_number) result(tensor)
+  function torch_tensor_from_array_int16_3d(data_in, layout, c_device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int16
 
@@ -575,7 +575,7 @@ function torch_tensor_from_array_int16_3d(data_in, layout, c_device, device_numb
     integer(kind=int16), intent(in), target :: data_in(:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(3) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -586,7 +586,7 @@ function torch_tensor_from_array_int16_3d(data_in, layout, c_device, device_numb
     integer(c_int64_t)        :: strides(3)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 3                   !! Number of dimension of input data
     integer                   :: i
-    integer(c_int)            :: device_number_value
+    integer(c_int)            :: device_index_value
 
     c_tensor_shape = shape(data_in)
 
@@ -596,18 +596,18 @@ function torch_tensor_from_array_int16_3d(data_in, layout, c_device, device_numb
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
 
   end function torch_tensor_from_array_int16_3d
 
   !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `int16`
-  function torch_tensor_from_array_int16_4d(data_in, layout, c_device, device_number) result(tensor)
+  function torch_tensor_from_array_int16_4d(data_in, layout, c_device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int16
 
@@ -615,7 +615,7 @@ function torch_tensor_from_array_int16_4d(data_in, layout, c_device, device_numb
     integer(kind=int16), intent(in), target :: data_in(:,:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(4) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -626,7 +626,7 @@ function torch_tensor_from_array_int16_4d(data_in, layout, c_device, device_numb
     integer(c_int64_t)        :: strides(4)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 4                   !! Number of dimension of input data
     integer                   :: i
-    integer(c_int)            :: device_number_value
+    integer(c_int)            :: device_index_value
 
     c_tensor_shape = shape(data_in)
 
@@ -636,18 +636,18 @@ function torch_tensor_from_array_int16_4d(data_in, layout, c_device, device_numb
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
 
   end function torch_tensor_from_array_int16_4d
 
   !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `int32`
-  function torch_tensor_from_array_int32_1d(data_in, layout, c_device, device_number) result(tensor)
+  function torch_tensor_from_array_int32_1d(data_in, layout, c_device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int32
 
@@ -655,7 +655,7 @@ function torch_tensor_from_array_int32_1d(data_in, layout, c_device, device_numb
     integer(kind=int32), intent(in), target :: data_in(:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(1) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -666,7 +666,7 @@ function torch_tensor_from_array_int32_1d(data_in, layout, c_device, device_numb
     integer(c_int64_t)        :: strides(1)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 1                   !! Number of dimension of input data
     integer                   :: i
-    integer(c_int)            :: device_number_value
+    integer(c_int)            :: device_index_value
 
     c_tensor_shape = shape(data_in)
 
@@ -676,18 +676,18 @@ function torch_tensor_from_array_int32_1d(data_in, layout, c_device, device_numb
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
 
   end function torch_tensor_from_array_int32_1d
 
   !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `int32`
-  function torch_tensor_from_array_int32_2d(data_in, layout, c_device, device_number) result(tensor)
+  function torch_tensor_from_array_int32_2d(data_in, layout, c_device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int32
 
@@ -695,7 +695,7 @@ function torch_tensor_from_array_int32_2d(data_in, layout, c_device, device_numb
     integer(kind=int32), intent(in), target :: data_in(:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(2) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -706,7 +706,7 @@ function torch_tensor_from_array_int32_2d(data_in, layout, c_device, device_numb
     integer(c_int64_t)        :: strides(2)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 2                   !! Number of dimension of input data
     integer                   :: i
-    integer(c_int)            :: device_number_value
+    integer(c_int)            :: device_index_value
 
     c_tensor_shape = shape(data_in)
 
@@ -716,18 +716,18 @@ function torch_tensor_from_array_int32_2d(data_in, layout, c_device, device_numb
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
 
   end function torch_tensor_from_array_int32_2d
 
   !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `int32`
-  function torch_tensor_from_array_int32_3d(data_in, layout, c_device, device_number) result(tensor)
+  function torch_tensor_from_array_int32_3d(data_in, layout, c_device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int32
 
@@ -735,7 +735,7 @@ function torch_tensor_from_array_int32_3d(data_in, layout, c_device, device_numb
     integer(kind=int32), intent(in), target :: data_in(:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(3) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -746,7 +746,7 @@ function torch_tensor_from_array_int32_3d(data_in, layout, c_device, device_numb
     integer(c_int64_t)        :: strides(3)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 3                   !! Number of dimension of input data
     integer                   :: i
-    integer(c_int)            :: device_number_value
+    integer(c_int)            :: device_index_value
 
     c_tensor_shape = shape(data_in)
 
@@ -756,18 +756,18 @@ function torch_tensor_from_array_int32_3d(data_in, layout, c_device, device_numb
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
 
   end function torch_tensor_from_array_int32_3d
 
   !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `int32`
-  function torch_tensor_from_array_int32_4d(data_in, layout, c_device, device_number) result(tensor)
+  function torch_tensor_from_array_int32_4d(data_in, layout, c_device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int32
 
@@ -775,7 +775,7 @@ function torch_tensor_from_array_int32_4d(data_in, layout, c_device, device_numb
     integer(kind=int32), intent(in), target :: data_in(:,:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(4) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -786,7 +786,7 @@ function torch_tensor_from_array_int32_4d(data_in, layout, c_device, device_numb
     integer(c_int64_t)        :: strides(4)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 4                   !! Number of dimension of input data
     integer                   :: i
-    integer(c_int)            :: device_number_value
+    integer(c_int)            :: device_index_value
 
     c_tensor_shape = shape(data_in)
 
@@ -796,18 +796,18 @@ function torch_tensor_from_array_int32_4d(data_in, layout, c_device, device_numb
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
 
   end function torch_tensor_from_array_int32_4d
 
   !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `int64`
-  function torch_tensor_from_array_int64_1d(data_in, layout, c_device, device_number) result(tensor)
+  function torch_tensor_from_array_int64_1d(data_in, layout, c_device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int64
 
@@ -815,7 +815,7 @@ function torch_tensor_from_array_int64_1d(data_in, layout, c_device, device_numb
     integer(kind=int64), intent(in), target :: data_in(:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(1) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -826,7 +826,7 @@ function torch_tensor_from_array_int64_1d(data_in, layout, c_device, device_numb
     integer(c_int64_t)        :: strides(1)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 1                   !! Number of dimension of input data
     integer                   :: i
-    integer(c_int)            :: device_number_value
+    integer(c_int)            :: device_index_value
 
     c_tensor_shape = shape(data_in)
 
@@ -836,18 +836,18 @@ function torch_tensor_from_array_int64_1d(data_in, layout, c_device, device_numb
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
 
   end function torch_tensor_from_array_int64_1d
 
   !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `int64`
-  function torch_tensor_from_array_int64_2d(data_in, layout, c_device, device_number) result(tensor)
+  function torch_tensor_from_array_int64_2d(data_in, layout, c_device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int64
 
@@ -855,7 +855,7 @@ function torch_tensor_from_array_int64_2d(data_in, layout, c_device, device_numb
     integer(kind=int64), intent(in), target :: data_in(:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(2) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -866,7 +866,7 @@ function torch_tensor_from_array_int64_2d(data_in, layout, c_device, device_numb
     integer(c_int64_t)        :: strides(2)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 2                   !! Number of dimension of input data
     integer                   :: i
-    integer(c_int)            :: device_number_value
+    integer(c_int)            :: device_index_value
 
     c_tensor_shape = shape(data_in)
 
@@ -876,18 +876,18 @@ function torch_tensor_from_array_int64_2d(data_in, layout, c_device, device_numb
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
 
   end function torch_tensor_from_array_int64_2d
 
   !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `int64`
-  function torch_tensor_from_array_int64_3d(data_in, layout, c_device, device_number) result(tensor)
+  function torch_tensor_from_array_int64_3d(data_in, layout, c_device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int64
 
@@ -895,7 +895,7 @@ function torch_tensor_from_array_int64_3d(data_in, layout, c_device, device_numb
     integer(kind=int64), intent(in), target :: data_in(:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(3) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -906,7 +906,7 @@ function torch_tensor_from_array_int64_3d(data_in, layout, c_device, device_numb
     integer(c_int64_t)        :: strides(3)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 3                   !! Number of dimension of input data
     integer                   :: i
-    integer(c_int)            :: device_number_value
+    integer(c_int)            :: device_index_value
 
     c_tensor_shape = shape(data_in)
 
@@ -916,18 +916,18 @@ function torch_tensor_from_array_int64_3d(data_in, layout, c_device, device_numb
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
 
   end function torch_tensor_from_array_int64_3d
 
   !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `int64`
-  function torch_tensor_from_array_int64_4d(data_in, layout, c_device, device_number) result(tensor)
+  function torch_tensor_from_array_int64_4d(data_in, layout, c_device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int64
 
@@ -935,7 +935,7 @@ function torch_tensor_from_array_int64_4d(data_in, layout, c_device, device_numb
     integer(kind=int64), intent(in), target :: data_in(:,:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(4) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -946,7 +946,7 @@ function torch_tensor_from_array_int64_4d(data_in, layout, c_device, device_numb
     integer(c_int64_t)        :: strides(4)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 4                   !! Number of dimension of input data
     integer                   :: i
-    integer(c_int)            :: device_number_value
+    integer(c_int)            :: device_index_value
 
     c_tensor_shape = shape(data_in)
 
@@ -956,18 +956,18 @@ function torch_tensor_from_array_int64_4d(data_in, layout, c_device, device_numb
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
 
   end function torch_tensor_from_array_int64_4d
 
   !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `real32`
-  function torch_tensor_from_array_real32_1d(data_in, layout, c_device, device_number) result(tensor)
+  function torch_tensor_from_array_real32_1d(data_in, layout, c_device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : real32
 
@@ -975,7 +975,7 @@ function torch_tensor_from_array_real32_1d(data_in, layout, c_device, device_num
     real(kind=real32), intent(in), target :: data_in(:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(1) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -986,7 +986,7 @@ function torch_tensor_from_array_real32_1d(data_in, layout, c_device, device_num
     integer(c_int64_t)        :: strides(1)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 1                   !! Number of dimension of input data
     integer                   :: i
-    integer(c_int)            :: device_number_value
+    integer(c_int)            :: device_index_value
 
     c_tensor_shape = shape(data_in)
 
@@ -996,18 +996,18 @@ function torch_tensor_from_array_real32_1d(data_in, layout, c_device, device_num
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
 
   end function torch_tensor_from_array_real32_1d
 
   !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `real32`
-  function torch_tensor_from_array_real32_2d(data_in, layout, c_device, device_number) result(tensor)
+  function torch_tensor_from_array_real32_2d(data_in, layout, c_device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : real32
 
@@ -1015,7 +1015,7 @@ function torch_tensor_from_array_real32_2d(data_in, layout, c_device, device_num
     real(kind=real32), intent(in), target :: data_in(:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(2) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -1026,7 +1026,7 @@ function torch_tensor_from_array_real32_2d(data_in, layout, c_device, device_num
     integer(c_int64_t)        :: strides(2)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 2                   !! Number of dimension of input data
     integer                   :: i
-    integer(c_int)            :: device_number_value
+    integer(c_int)            :: device_index_value
 
     c_tensor_shape = shape(data_in)
 
@@ -1036,18 +1036,18 @@ function torch_tensor_from_array_real32_2d(data_in, layout, c_device, device_num
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
 
   end function torch_tensor_from_array_real32_2d
 
   !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `real32`
-  function torch_tensor_from_array_real32_3d(data_in, layout, c_device, device_number) result(tensor)
+  function torch_tensor_from_array_real32_3d(data_in, layout, c_device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : real32
 
@@ -1055,7 +1055,7 @@ function torch_tensor_from_array_real32_3d(data_in, layout, c_device, device_num
     real(kind=real32), intent(in), target :: data_in(:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(3) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -1066,7 +1066,7 @@ function torch_tensor_from_array_real32_3d(data_in, layout, c_device, device_num
     integer(c_int64_t)        :: strides(3)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 3                   !! Number of dimension of input data
     integer                   :: i
-    integer(c_int)            :: device_number_value
+    integer(c_int)            :: device_index_value
 
     c_tensor_shape = shape(data_in)
 
@@ -1076,18 +1076,18 @@ function torch_tensor_from_array_real32_3d(data_in, layout, c_device, device_num
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
 
   end function torch_tensor_from_array_real32_3d
 
   !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `real32`
-  function torch_tensor_from_array_real32_4d(data_in, layout, c_device, device_number) result(tensor)
+  function torch_tensor_from_array_real32_4d(data_in, layout, c_device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : real32
 
@@ -1095,7 +1095,7 @@ function torch_tensor_from_array_real32_4d(data_in, layout, c_device, device_num
     real(kind=real32), intent(in), target :: data_in(:,:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(4) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -1106,7 +1106,7 @@ function torch_tensor_from_array_real32_4d(data_in, layout, c_device, device_num
     integer(c_int64_t)        :: strides(4)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 4                   !! Number of dimension of input data
     integer                   :: i
-    integer(c_int)            :: device_number_value
+    integer(c_int)            :: device_index_value
 
     c_tensor_shape = shape(data_in)
 
@@ -1116,18 +1116,18 @@ function torch_tensor_from_array_real32_4d(data_in, layout, c_device, device_num
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
 
   end function torch_tensor_from_array_real32_4d
 
   !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `real64`
-  function torch_tensor_from_array_real64_1d(data_in, layout, c_device, device_number) result(tensor)
+  function torch_tensor_from_array_real64_1d(data_in, layout, c_device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : real64
 
@@ -1135,7 +1135,7 @@ function torch_tensor_from_array_real64_1d(data_in, layout, c_device, device_num
     real(kind=real64), intent(in), target :: data_in(:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(1) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -1146,7 +1146,7 @@ function torch_tensor_from_array_real64_1d(data_in, layout, c_device, device_num
     integer(c_int64_t)        :: strides(1)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 1                   !! Number of dimension of input data
     integer                   :: i
-    integer(c_int)            :: device_number_value
+    integer(c_int)            :: device_index_value
 
     c_tensor_shape = shape(data_in)
 
@@ -1156,18 +1156,18 @@ function torch_tensor_from_array_real64_1d(data_in, layout, c_device, device_num
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
 
   end function torch_tensor_from_array_real64_1d
 
   !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `real64`
-  function torch_tensor_from_array_real64_2d(data_in, layout, c_device, device_number) result(tensor)
+  function torch_tensor_from_array_real64_2d(data_in, layout, c_device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : real64
 
@@ -1175,7 +1175,7 @@ function torch_tensor_from_array_real64_2d(data_in, layout, c_device, device_num
     real(kind=real64), intent(in), target :: data_in(:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(2) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -1186,7 +1186,7 @@ function torch_tensor_from_array_real64_2d(data_in, layout, c_device, device_num
     integer(c_int64_t)        :: strides(2)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 2                   !! Number of dimension of input data
     integer                   :: i
-    integer(c_int)            :: device_number_value
+    integer(c_int)            :: device_index_value
 
     c_tensor_shape = shape(data_in)
 
@@ -1196,18 +1196,18 @@ function torch_tensor_from_array_real64_2d(data_in, layout, c_device, device_num
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
 
   end function torch_tensor_from_array_real64_2d
 
   !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `real64`
-  function torch_tensor_from_array_real64_3d(data_in, layout, c_device, device_number) result(tensor)
+  function torch_tensor_from_array_real64_3d(data_in, layout, c_device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : real64
 
@@ -1215,7 +1215,7 @@ function torch_tensor_from_array_real64_3d(data_in, layout, c_device, device_num
     real(kind=real64), intent(in), target :: data_in(:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(3) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -1226,7 +1226,7 @@ function torch_tensor_from_array_real64_3d(data_in, layout, c_device, device_num
     integer(c_int64_t)        :: strides(3)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 3                   !! Number of dimension of input data
     integer                   :: i
-    integer(c_int)            :: device_number_value
+    integer(c_int)            :: device_index_value
 
     c_tensor_shape = shape(data_in)
 
@@ -1236,18 +1236,18 @@ function torch_tensor_from_array_real64_3d(data_in, layout, c_device, device_num
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
 
   end function torch_tensor_from_array_real64_3d
 
   !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `real64`
-  function torch_tensor_from_array_real64_4d(data_in, layout, c_device, device_number) result(tensor)
+  function torch_tensor_from_array_real64_4d(data_in, layout, c_device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : real64
 
@@ -1255,7 +1255,7 @@ function torch_tensor_from_array_real64_4d(data_in, layout, c_device, device_num
     real(kind=real64), intent(in), target :: data_in(:,:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(4) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -1266,7 +1266,7 @@ function torch_tensor_from_array_real64_4d(data_in, layout, c_device, device_num
     integer(c_int64_t)        :: strides(4)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = 4                   !! Number of dimension of input data
     integer                   :: i
-    integer(c_int)            :: device_number_value
+    integer(c_int)            :: device_index_value
 
     c_tensor_shape = shape(data_in)
 
@@ -1276,13 +1276,13 @@ function torch_tensor_from_array_real64_4d(data_in, layout, c_device, device_num
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
 
   end function torch_tensor_from_array_real64_4d
 
diff --git a/src/ftorch.fypp b/src/ftorch.fypp
index d2c9bef0..d76d2d81 100644
--- a/src/ftorch.fypp
+++ b/src/ftorch.fypp
@@ -73,7 +73,7 @@ module ftorch
   end interface
 
   interface
-    function torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device, device_number) result(tensor_p) &
+    function torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device, device_index) result(tensor_p) &
                                bind(c, name = 'torch_from_blob')
       use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_ptr
 
@@ -84,7 +84,7 @@ module ftorch
       integer(c_int64_t), intent(in)    :: strides(*)
       integer(c_int), value, intent(in) :: dtype
       integer(c_int), value, intent(in) :: device
-      integer(c_int), value, intent(in) :: device_number
+      integer(c_int), value, intent(in) :: device_index
       type(c_ptr)                       :: tensor_p
     end function torch_from_blob_c
   end interface
@@ -92,90 +92,90 @@ module ftorch
 contains
 
   !> Returns a tensor filled with the scalar value 0.
-  function torch_tensor_zeros(ndims, tensor_shape, dtype, device, device_number) result(tensor)
+  function torch_tensor_zeros(ndims, tensor_shape, dtype, device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t
     integer(c_int), intent(in)     :: ndims      !! Number of dimensions of the tensor
     integer(c_int64_t), intent(in) :: tensor_shape(*)   !! Shape of the tensor
     integer(c_int), intent(in)     :: dtype      !! Data type of the tensor
     integer(c_int), intent(in)     :: device     !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number     !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index     !! device index to use for `torch_kCUDA` case
     type(torch_tensor)             :: tensor     !! Returned tensor
-    integer(c_int)                 :: device_number_value  !! Device number used
+    integer(c_int)                 :: device_index_value  !! device index used
 
     interface
-      function torch_zeros_c(ndims, tensor_shape, dtype, device, device_number) result(tensor) &
+      function torch_zeros_c(ndims, tensor_shape, dtype, device, device_index) result(tensor) &
           bind(c, name = 'torch_zeros')
         use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_ptr
         integer(c_int), value, intent(in) :: ndims
         integer(c_int64_t), intent(in)    :: tensor_shape(*)
         integer(c_int), value, intent(in) :: dtype
         integer(c_int), value, intent(in) :: device
-        integer(c_int), value, intent(in) :: device_number
+        integer(c_int), value, intent(in) :: device_index
         type(c_ptr)                       :: tensor
       end function torch_zeros_c
     end interface
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_zeros_c(ndims, tensor_shape, dtype, device, device_number_value)
+    tensor%p = torch_zeros_c(ndims, tensor_shape, dtype, device, device_index_value)
   end function torch_tensor_zeros
 
   !> Returns a tensor filled with the scalar value 1.
-  function torch_tensor_ones(ndims, tensor_shape, dtype, device, device_number) result(tensor)
+  function torch_tensor_ones(ndims, tensor_shape, dtype, device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t
     integer(c_int), intent(in)     :: ndims      !! Number of dimensions of the tensor
     integer(c_int64_t), intent(in) :: tensor_shape(*)   !! Shape of the tensor
     integer(c_int), intent(in)     :: dtype      !! Data type of the tensor
     integer(c_int), intent(in)     :: device     !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number     !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index     !! device index to use for `torch_kCUDA` case
     type(torch_tensor)             :: tensor     !! Returned tensor
-    integer(c_int)                 :: device_number_value  !! Device number used
+    integer(c_int)                 :: device_index_value  !! device index used
 
     interface
-      function torch_ones_c(ndims, tensor_shape, dtype, device, device_number) result(tensor) &
+      function torch_ones_c(ndims, tensor_shape, dtype, device, device_index) result(tensor) &
           bind(c, name = 'torch_ones')
         use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_ptr
         integer(c_int), value, intent(in) :: ndims
         integer(c_int64_t), intent(in)    :: tensor_shape(*)
         integer(c_int), value, intent(in) :: dtype
         integer(c_int), value, intent(in) :: device
-        integer(c_int), value, intent(in) :: device_number
+        integer(c_int), value, intent(in) :: device_index
         type(c_ptr)                       :: tensor
       end function torch_ones_c
     end interface
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_ones_c(ndims, tensor_shape, dtype, device, device_number_value)
+    tensor%p = torch_ones_c(ndims, tensor_shape, dtype, device, device_index_value)
   end function torch_tensor_ones
 
   ! Torch Tensor API
   !| Exposes the given data as a tensor without taking ownership of the original data.
   !  This routine will take an (i, j, k) array and return an (k, j, i) tensor.
-  function torch_tensor_from_blob(data, ndims, tensor_shape, layout, dtype, device, device_number) result(tensor)
+  function torch_tensor_from_blob(data, ndims, tensor_shape, layout, dtype, device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_ptr
     type(c_ptr), intent(in)        :: data       !! Pointer to data
     integer(c_int), intent(in)     :: ndims      !! Number of dimensions of the tensor
     integer(c_int64_t), intent(in) :: tensor_shape(*)   !! Shape of the tensor
     integer(c_int), intent(in)     :: dtype      !! Data type of the tensor
     integer(c_int), intent(in)     :: device     !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number     !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index     !! device index to use for `torch_kCUDA` case
     integer(c_int), intent(in)     :: layout(*)  !! Layout for strides for accessing data
     type(torch_tensor)             :: tensor     !! Returned tensor
 
     integer(c_int)                 :: i          !! loop index
     integer(c_int64_t)             :: strides(ndims) !! Strides for accessing data
-    integer(c_int)                 :: device_number_value  !! Device number used
+    integer(c_int)                 :: device_index_value  !! device index used
 
     strides(layout(1)) = 1
     do i = 2, ndims
@@ -183,13 +183,13 @@ contains
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device, device_number_value)
+    tensor%p = torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device, device_index_value)
   end function torch_tensor_from_blob
 
   !> Prints the contents of a tensor.
@@ -242,22 +242,22 @@ contains
 
   ! Torch Module API
   !> Loads a TorchScript module (pre-trained PyTorch model saved with TorchScript)
-  function torch_module_load(filename, device, device_number) result(module)
+  function torch_module_load(filename, device, device_index) result(module)
     use, intrinsic :: iso_c_binding, only : c_int, c_null_char
     character(*), intent(in)   :: filename !! Filename of TorchScript module
     integer(c_int), optional, intent(in) :: device !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case
     type(torch_module)         :: module   !! Returned deserialized module
     integer(c_int) :: device_value
-    integer(c_int) :: device_number_value
+    integer(c_int) :: device_index_value
 
     interface
-      function torch_jit_load_c(filename, device, device_number) result(module) &
+      function torch_jit_load_c(filename, device, device_index) result(module) &
           bind(c, name = 'torch_jit_load')
         use, intrinsic :: iso_c_binding, only : c_char, c_int, c_ptr
         character(c_char), intent(in) :: filename(*)
         integer(c_int), value, intent(in)    :: device
-        integer(c_int), value, intent(in)    :: device_number
+        integer(c_int), value, intent(in)    :: device_index
         type(c_ptr)                   :: module
       end function torch_jit_load_c
     end interface
@@ -268,14 +268,14 @@ contains
     else
       device_value = 0
     endif
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
     ! Need to append c_null_char at end of filename
-    module%p = torch_jit_load_c(trim(adjustl(filename))//c_null_char, device_value, device_number_value)
+    module%p = torch_jit_load_c(trim(adjustl(filename))//c_null_char, device_value, device_index_value)
   end function torch_module_load
 
   !> Performs a forward pass of the module with the input tensors
@@ -327,7 +327,7 @@ contains
   #:for PREC in PRECISIONS
   #:for RANK in RANKS
   !> Return a Torch tensor pointing to data_in array of rank ${RANK}$ containing data of type `${PREC}$`
-  function torch_tensor_from_array_${PREC}$_${RANK}$d(data_in, layout, c_device, device_number) result(tensor)
+  function torch_tensor_from_array_${PREC}$_${RANK}$d(data_in, layout, c_device, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : ${PREC}$
 
@@ -335,7 +335,7 @@ contains
     ${f_type(PREC)}$(kind=${PREC}$), intent(in), target :: data_in${ranksuffix(RANK)}$   !! Input data that tensor will point at
     integer, intent(in)        :: layout(${RANK}$) !! Control order of indices
     integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
-    integer(c_int), optional, intent(in) :: device_number    !! Device number to use for `torch_kCUDA` case
+    integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
     type(torch_tensor) :: tensor     !! Returned tensor
@@ -346,7 +346,7 @@ contains
     integer(c_int64_t)        :: strides(${RANK}$)                  !! Strides for accessing data
     integer(c_int), parameter :: ndims = ${RANK}$                   !! Number of dimension of input data
     integer                   :: i
-    integer(c_int)            :: device_number_value
+    integer(c_int)            :: device_index_value
 
     c_tensor_shape = shape(data_in)
 
@@ -356,13 +356,13 @@ contains
     end do
 
     ! Process optional arguments
-    if (present(device_number)) then
-      device_number_value = device_number
+    if (present(device_index)) then
+      device_index_value = device_index
     else
-      device_number_value = 0
+      device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_number_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
 
   end function torch_tensor_from_array_${PREC}$_${RANK}$d
 

From 682d8872e5de5d3c382c75b1ef4b89fc8f603f20 Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Wed, 20 Mar 2024 15:15:10 +0000
Subject: [PATCH 18/42] Rename device as device type

---
 src/ctorch.cpp  |  24 +++---
 src/ctorch.h    |  20 ++---
 src/ftorch.f90  | 192 ++++++++++++++++++++++++------------------------
 src/ftorch.fypp |  54 +++++++-------
 4 files changed, 145 insertions(+), 145 deletions(-)

diff --git a/src/ctorch.cpp b/src/ctorch.cpp
index 431ba164..3a3b9145 100644
--- a/src/ctorch.cpp
+++ b/src/ctorch.cpp
@@ -29,14 +29,14 @@ constexpr auto get_dtype(torch_data_t dtype)
   }
 }
 
-const auto get_device(torch_device_t device, int device_index)
+const auto get_device(torch_device_t device_type, int device_index)
 {
   if (device_index < 0) {
     std::cerr << "[ERROR]: device index must be non-negative, using zero instead"
               << std::endl;
     device_index = 0;
   }
-  switch (device) {
+  switch (device_type) {
   case torch_kCPU:
     if (device_index > 0) {
       std::cerr << "[ERROR]: device index unsupported for CPU-only runs"
@@ -60,7 +60,7 @@ const auto get_device(torch_device_t device, int device_index)
 }
 
 torch_tensor_t torch_zeros(int ndim, const int64_t* shape, torch_data_t dtype,
-                           torch_device_t device, int device_index)
+                           torch_device_t device_type, int device_index)
 {
   torch::Tensor* tensor = nullptr;
   try {
@@ -68,7 +68,7 @@ torch_tensor_t torch_zeros(int ndim, const int64_t* shape, torch_data_t dtype,
     c10::IntArrayRef vshape(shape, ndim);
     tensor = new torch::Tensor;
     *tensor = torch::zeros(
-        vshape, torch::dtype(get_dtype(dtype))).to(get_device(device, device_index));
+        vshape, torch::dtype(get_dtype(dtype))).to(get_device(device_type, device_index));
   } catch (const torch::Error& e) {
     std::cerr << "[ERROR]: " << e.msg() << std::endl;
     delete tensor;
@@ -82,7 +82,7 @@ torch_tensor_t torch_zeros(int ndim, const int64_t* shape, torch_data_t dtype,
 }
 
 torch_tensor_t torch_ones(int ndim, const int64_t* shape, torch_data_t dtype,
-                          torch_device_t device, int device_index)
+                          torch_device_t device_type, int device_index)
 {
   torch::Tensor* tensor = nullptr;
   try {
@@ -90,7 +90,7 @@ torch_tensor_t torch_ones(int ndim, const int64_t* shape, torch_data_t dtype,
     c10::IntArrayRef vshape(shape, ndim);
     tensor = new torch::Tensor;
     *tensor = torch::ones(
-        vshape, torch::dtype(get_dtype(dtype))).to(get_device(device, device_index));
+        vshape, torch::dtype(get_dtype(dtype))).to(get_device(device_type, device_index));
   } catch (const torch::Error& e) {
     std::cerr << "[ERROR]: " << e.msg() << std::endl;
     delete tensor;
@@ -104,7 +104,7 @@ torch_tensor_t torch_ones(int ndim, const int64_t* shape, torch_data_t dtype,
 }
 
 torch_tensor_t torch_empty(int ndim, const int64_t* shape, torch_data_t dtype,
-                           torch_device_t device, int device_index)
+                           torch_device_t device_type, int device_index)
 {
   torch::Tensor* tensor = nullptr;
   try {
@@ -112,7 +112,7 @@ torch_tensor_t torch_empty(int ndim, const int64_t* shape, torch_data_t dtype,
     c10::IntArrayRef vshape(shape, ndim);
     tensor = new torch::Tensor;
     *tensor = torch::empty(
-        vshape, torch::dtype(get_dtype(dtype))).to(get_device(device, device_index));
+        vshape, torch::dtype(get_dtype(dtype))).to(get_device(device_type, device_index));
   } catch (const torch::Error& e) {
     std::cerr << "[ERROR]: " << e.msg() << std::endl;
     delete tensor;
@@ -129,7 +129,7 @@ torch_tensor_t torch_empty(int ndim, const int64_t* shape, torch_data_t dtype,
 // data
 torch_tensor_t torch_from_blob(void* data, int ndim, const int64_t* shape,
                                const int64_t* strides, torch_data_t dtype,
-                               torch_device_t device, int device_index)
+                               torch_device_t device_type, int device_index)
 {
   torch::Tensor* tensor = nullptr;
 
@@ -140,7 +140,7 @@ torch_tensor_t torch_from_blob(void* data, int ndim, const int64_t* shape,
     tensor = new torch::Tensor;
     *tensor = torch::from_blob(
         data, vshape, vstrides,
-        torch::dtype(get_dtype(dtype))).to(get_device(device, device_index));
+        torch::dtype(get_dtype(dtype))).to(get_device(device_type, device_index));
 
   } catch (const torch::Error& e) {
     std::cerr << "[ERROR]: " << e.msg() << std::endl;
@@ -173,13 +173,13 @@ void torch_tensor_delete(torch_tensor_t tensor)
 }
 
 torch_jit_script_module_t torch_jit_load(const char* filename,
-                                         const torch_device_t device,
+                                         const torch_device_t device_type,
                                          const int device_index)
 {
   torch::jit::script::Module* module = nullptr;
   try {
     module = new torch::jit::script::Module;
-    *module = torch::jit::load(filename, get_device(device, device_index));
+    *module = torch::jit::load(filename, get_device(device_type, device_index));
   } catch (const torch::Error& e) {
     std::cerr << "[ERROR]: " << e.msg() << std::endl;
     delete module;
diff --git a/src/ctorch.h b/src/ctorch.h
index 8aacc8e2..4891e1ce 100644
--- a/src/ctorch.h
+++ b/src/ctorch.h
@@ -37,11 +37,11 @@ typedef enum { torch_kCPU, torch_kCUDA } torch_device_t;
  * @param number of dimensions of the Tensor
  * @param shape of the Tensor
  * @param data type of the elements of the Tensor
- * @param device used (cpu, CUDA, etc.)
+ * @param device type used (cpu, CUDA, etc.)
  * @param device index for the CUDA case
  */
 EXPORT_C torch_tensor_t torch_zeros(int ndim, const int64_t* shape,
-                                    torch_data_t dtype, torch_device_t device,
+                                    torch_data_t dtype, torch_device_t device_type,
                                     int device_index);
 
 /**
@@ -49,11 +49,11 @@ EXPORT_C torch_tensor_t torch_zeros(int ndim, const int64_t* shape,
  * @param number of dimensions of the Tensor
  * @param shape of the Tensor
  * @param data type of the elements of the Tensor
- * @param device used (cpu, CUDA, etc.)
+ * @param device type used (cpu, CUDA, etc.)
  * @param device index for the CUDA case
  */
 EXPORT_C torch_tensor_t torch_ones(int ndim, const int64_t* shape,
-                                   torch_data_t dtype, torch_device_t device,
+                                   torch_data_t dtype, torch_device_t device_type,
                                    int device_index);
 
 /**
@@ -61,11 +61,11 @@ EXPORT_C torch_tensor_t torch_ones(int ndim, const int64_t* shape,
  * @param number of dimensions of the Tensor
  * @param shape of the Tensor
  * @param data type of the elements of the Tensor
- * @param device used (cpu, CUDA, etc.)
+ * @param device type used (cpu, CUDA, etc.)
  * @param device index for the CUDA case
  */
 EXPORT_C torch_tensor_t torch_empty(int ndim, const int64_t* shape,
-                                    torch_data_t dtype, torch_device_t device,
+                                    torch_data_t dtype, torch_device_t device_type,
                                     int device_index);
 
 /**
@@ -75,7 +75,7 @@ EXPORT_C torch_tensor_t torch_empty(int ndim, const int64_t* shape,
  * @param shape of the Tensor
  * @param strides to take through data
  * @param data type of the elements of the Tensor
- * @param device used (cpu, CUDA, etc.)
+ * @param device type used (cpu, CUDA, etc.)
  * @param device index for the CUDA case
  * @return Torch Tensor interpretation of the data pointed at
  */
@@ -83,7 +83,7 @@ EXPORT_C torch_tensor_t torch_from_blob(void* data, int ndim,
                                         const int64_t* shape,
                                         const int64_t* strides,
                                         torch_data_t dtype,
-                                        torch_device_t device,
+                                        torch_device_t device_type,
                                         int device_index);
 
 /**
@@ -112,12 +112,12 @@ EXPORT_C void torch_tensor_delete(torch_tensor_t tensor);
 /**
  * Function to load in a Torch model from a TorchScript file and store in a Torch Module
  * @param filename where TorchScript description of model is stored
- * @param device used (cpu, CUDA, etc.)
+ * @param device type used (cpu, CUDA, etc.)
  * @param device index for the CUDA case
  * @return Torch Module loaded in from file
  */
 EXPORT_C torch_jit_script_module_t torch_jit_load(const char* filename,
-                                                  const torch_device_t device,
+                                                  const torch_device_t device_type,
                                                   const int device_index);
 
 /**
diff --git a/src/ftorch.f90 b/src/ftorch.f90
index d7d1e608..a8ac616f 100644
--- a/src/ftorch.f90
+++ b/src/ftorch.f90
@@ -75,7 +75,7 @@ module ftorch
   end interface
 
   interface
-    function torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device, device_index) result(tensor_p) &
+    function torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device_type, device_index) result(tensor_p) &
                                bind(c, name = 'torch_from_blob')
       use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_ptr
 
@@ -85,7 +85,7 @@ function torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device, de
       integer(c_int64_t), intent(in)    :: tensor_shape(*)
       integer(c_int64_t), intent(in)    :: strides(*)
       integer(c_int), value, intent(in) :: dtype
-      integer(c_int), value, intent(in) :: device
+      integer(c_int), value, intent(in) :: device_type
       integer(c_int), value, intent(in) :: device_index
       type(c_ptr)                       :: tensor_p
     end function torch_from_blob_c
@@ -94,24 +94,24 @@ end function torch_from_blob_c
 contains
 
   !> Returns a tensor filled with the scalar value 0.
-  function torch_tensor_zeros(ndims, tensor_shape, dtype, device, device_index) result(tensor)
+  function torch_tensor_zeros(ndims, tensor_shape, dtype, device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t
     integer(c_int), intent(in)     :: ndims      !! Number of dimensions of the tensor
     integer(c_int64_t), intent(in) :: tensor_shape(*)   !! Shape of the tensor
     integer(c_int), intent(in)     :: dtype      !! Data type of the tensor
-    integer(c_int), intent(in)     :: device     !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in)     :: device_type  !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index     !! device index to use for `torch_kCUDA` case
     type(torch_tensor)             :: tensor     !! Returned tensor
     integer(c_int)                 :: device_index_value  !! device index used
 
     interface
-      function torch_zeros_c(ndims, tensor_shape, dtype, device, device_index) result(tensor) &
+      function torch_zeros_c(ndims, tensor_shape, dtype, device_type, device_index) result(tensor) &
           bind(c, name = 'torch_zeros')
         use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_ptr
         integer(c_int), value, intent(in) :: ndims
         integer(c_int64_t), intent(in)    :: tensor_shape(*)
         integer(c_int), value, intent(in) :: dtype
-        integer(c_int), value, intent(in) :: device
+        integer(c_int), value, intent(in) :: device_type
         integer(c_int), value, intent(in) :: device_index
         type(c_ptr)                       :: tensor
       end function torch_zeros_c
@@ -124,28 +124,28 @@ end function torch_zeros_c
       device_index_value = 0
     endif
 
-    tensor%p = torch_zeros_c(ndims, tensor_shape, dtype, device, device_index_value)
+    tensor%p = torch_zeros_c(ndims, tensor_shape, dtype, device_type, device_index_value)
   end function torch_tensor_zeros
 
   !> Returns a tensor filled with the scalar value 1.
-  function torch_tensor_ones(ndims, tensor_shape, dtype, device, device_index) result(tensor)
+  function torch_tensor_ones(ndims, tensor_shape, dtype, device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t
     integer(c_int), intent(in)     :: ndims      !! Number of dimensions of the tensor
     integer(c_int64_t), intent(in) :: tensor_shape(*)   !! Shape of the tensor
     integer(c_int), intent(in)     :: dtype      !! Data type of the tensor
-    integer(c_int), intent(in)     :: device     !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in)     :: device_type  !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index     !! device index to use for `torch_kCUDA` case
     type(torch_tensor)             :: tensor     !! Returned tensor
     integer(c_int)                 :: device_index_value  !! device index used
 
     interface
-      function torch_ones_c(ndims, tensor_shape, dtype, device, device_index) result(tensor) &
+      function torch_ones_c(ndims, tensor_shape, dtype, device_type, device_index) result(tensor) &
           bind(c, name = 'torch_ones')
         use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_ptr
         integer(c_int), value, intent(in) :: ndims
         integer(c_int64_t), intent(in)    :: tensor_shape(*)
         integer(c_int), value, intent(in) :: dtype
-        integer(c_int), value, intent(in) :: device
+        integer(c_int), value, intent(in) :: device_type
         integer(c_int), value, intent(in) :: device_index
         type(c_ptr)                       :: tensor
       end function torch_ones_c
@@ -158,19 +158,19 @@ end function torch_ones_c
       device_index_value = 0
     endif
 
-    tensor%p = torch_ones_c(ndims, tensor_shape, dtype, device, device_index_value)
+    tensor%p = torch_ones_c(ndims, tensor_shape, dtype, device_type, device_index_value)
   end function torch_tensor_ones
 
   ! Torch Tensor API
   !| Exposes the given data as a tensor without taking ownership of the original data.
   !  This routine will take an (i, j, k) array and return an (k, j, i) tensor.
-  function torch_tensor_from_blob(data, ndims, tensor_shape, layout, dtype, device, device_index) result(tensor)
+  function torch_tensor_from_blob(data, ndims, tensor_shape, layout, dtype, device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_ptr
     type(c_ptr), intent(in)        :: data       !! Pointer to data
     integer(c_int), intent(in)     :: ndims      !! Number of dimensions of the tensor
     integer(c_int64_t), intent(in) :: tensor_shape(*)   !! Shape of the tensor
     integer(c_int), intent(in)     :: dtype      !! Data type of the tensor
-    integer(c_int), intent(in)     :: device     !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in)     :: device_type  !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index     !! device index to use for `torch_kCUDA` case
     integer(c_int), intent(in)     :: layout(*)  !! Layout for strides for accessing data
     type(torch_tensor)             :: tensor     !! Returned tensor
@@ -191,7 +191,7 @@ function torch_tensor_from_blob(data, ndims, tensor_shape, layout, dtype, device
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device, device_index_value)
+    tensor%p = torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device_type, device_index_value)
   end function torch_tensor_from_blob
 
   !> Prints the contents of a tensor.
@@ -244,31 +244,31 @@ end subroutine torch_tensor_delete
 
   ! Torch Module API
   !> Loads a TorchScript module (pre-trained PyTorch model saved with TorchScript)
-  function torch_module_load(filename, device, device_index) result(module)
+  function torch_module_load(filename, device_type, device_index) result(module)
     use, intrinsic :: iso_c_binding, only : c_int, c_null_char
     character(*), intent(in)   :: filename !! Filename of TorchScript module
-    integer(c_int), optional, intent(in) :: device !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), optional, intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case
     type(torch_module)         :: module   !! Returned deserialized module
-    integer(c_int) :: device_value
+    integer(c_int) :: device_type_value
     integer(c_int) :: device_index_value
 
     interface
-      function torch_jit_load_c(filename, device, device_index) result(module) &
+      function torch_jit_load_c(filename, device_type, device_index) result(module) &
           bind(c, name = 'torch_jit_load')
         use, intrinsic :: iso_c_binding, only : c_char, c_int, c_ptr
         character(c_char), intent(in) :: filename(*)
-        integer(c_int), value, intent(in)    :: device
+        integer(c_int), value, intent(in)    :: device_type
         integer(c_int), value, intent(in)    :: device_index
         type(c_ptr)                   :: module
       end function torch_jit_load_c
     end interface
 
     ! Process optional arguments
-    if (present(device)) then
-      device_value = device
+    if (present(device_type)) then
+      device_type_value = device_type
     else
-      device_value = 0
+      device_type_value = 0
     endif
     if (present(device_index)) then
       device_index_value = device_index
@@ -277,7 +277,7 @@ end function torch_jit_load_c
     endif
 
     ! Need to append c_null_char at end of filename
-    module%p = torch_jit_load_c(trim(adjustl(filename))//c_null_char, device_value, device_index_value)
+    module%p = torch_jit_load_c(trim(adjustl(filename))//c_null_char, device_type_value, device_index_value)
   end function torch_module_load
 
   !> Performs a forward pass of the module with the input tensors
@@ -327,14 +327,14 @@ end subroutine torch_jit_module_delete_c
   end subroutine torch_module_delete
 
   !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `int8`
-  function torch_tensor_from_array_int8_1d(data_in, layout, c_device, device_index) result(tensor)
+  function torch_tensor_from_array_int8_1d(data_in, layout, c_device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int8
 
     ! inputs
     integer(kind=int8), intent(in), target :: data_in(:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(1) !! Control order of indices
-    integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: c_device_type    !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
@@ -362,19 +362,19 @@ function torch_tensor_from_array_int8_1d(data_in, layout, c_device, device_index
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device_type, device_index_value)
 
   end function torch_tensor_from_array_int8_1d
 
   !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `int8`
-  function torch_tensor_from_array_int8_2d(data_in, layout, c_device, device_index) result(tensor)
+  function torch_tensor_from_array_int8_2d(data_in, layout, c_device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int8
 
     ! inputs
     integer(kind=int8), intent(in), target :: data_in(:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(2) !! Control order of indices
-    integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: c_device_type    !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
@@ -402,19 +402,19 @@ function torch_tensor_from_array_int8_2d(data_in, layout, c_device, device_index
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device_type, device_index_value)
 
   end function torch_tensor_from_array_int8_2d
 
   !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `int8`
-  function torch_tensor_from_array_int8_3d(data_in, layout, c_device, device_index) result(tensor)
+  function torch_tensor_from_array_int8_3d(data_in, layout, c_device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int8
 
     ! inputs
     integer(kind=int8), intent(in), target :: data_in(:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(3) !! Control order of indices
-    integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: c_device_type    !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
@@ -442,19 +442,19 @@ function torch_tensor_from_array_int8_3d(data_in, layout, c_device, device_index
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device_type, device_index_value)
 
   end function torch_tensor_from_array_int8_3d
 
   !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `int8`
-  function torch_tensor_from_array_int8_4d(data_in, layout, c_device, device_index) result(tensor)
+  function torch_tensor_from_array_int8_4d(data_in, layout, c_device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int8
 
     ! inputs
     integer(kind=int8), intent(in), target :: data_in(:,:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(4) !! Control order of indices
-    integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: c_device_type    !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
@@ -482,19 +482,19 @@ function torch_tensor_from_array_int8_4d(data_in, layout, c_device, device_index
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device_type, device_index_value)
 
   end function torch_tensor_from_array_int8_4d
 
   !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `int16`
-  function torch_tensor_from_array_int16_1d(data_in, layout, c_device, device_index) result(tensor)
+  function torch_tensor_from_array_int16_1d(data_in, layout, c_device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int16
 
     ! inputs
     integer(kind=int16), intent(in), target :: data_in(:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(1) !! Control order of indices
-    integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: c_device_type    !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
@@ -522,19 +522,19 @@ function torch_tensor_from_array_int16_1d(data_in, layout, c_device, device_inde
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device_type, device_index_value)
 
   end function torch_tensor_from_array_int16_1d
 
   !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `int16`
-  function torch_tensor_from_array_int16_2d(data_in, layout, c_device, device_index) result(tensor)
+  function torch_tensor_from_array_int16_2d(data_in, layout, c_device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int16
 
     ! inputs
     integer(kind=int16), intent(in), target :: data_in(:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(2) !! Control order of indices
-    integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: c_device_type    !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
@@ -562,19 +562,19 @@ function torch_tensor_from_array_int16_2d(data_in, layout, c_device, device_inde
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device_type, device_index_value)
 
   end function torch_tensor_from_array_int16_2d
 
   !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `int16`
-  function torch_tensor_from_array_int16_3d(data_in, layout, c_device, device_index) result(tensor)
+  function torch_tensor_from_array_int16_3d(data_in, layout, c_device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int16
 
     ! inputs
     integer(kind=int16), intent(in), target :: data_in(:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(3) !! Control order of indices
-    integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: c_device_type    !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
@@ -602,19 +602,19 @@ function torch_tensor_from_array_int16_3d(data_in, layout, c_device, device_inde
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device_type, device_index_value)
 
   end function torch_tensor_from_array_int16_3d
 
   !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `int16`
-  function torch_tensor_from_array_int16_4d(data_in, layout, c_device, device_index) result(tensor)
+  function torch_tensor_from_array_int16_4d(data_in, layout, c_device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int16
 
     ! inputs
     integer(kind=int16), intent(in), target :: data_in(:,:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(4) !! Control order of indices
-    integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: c_device_type    !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
@@ -642,19 +642,19 @@ function torch_tensor_from_array_int16_4d(data_in, layout, c_device, device_inde
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device_type, device_index_value)
 
   end function torch_tensor_from_array_int16_4d
 
   !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `int32`
-  function torch_tensor_from_array_int32_1d(data_in, layout, c_device, device_index) result(tensor)
+  function torch_tensor_from_array_int32_1d(data_in, layout, c_device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int32
 
     ! inputs
     integer(kind=int32), intent(in), target :: data_in(:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(1) !! Control order of indices
-    integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: c_device_type    !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
@@ -682,19 +682,19 @@ function torch_tensor_from_array_int32_1d(data_in, layout, c_device, device_inde
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device_type, device_index_value)
 
   end function torch_tensor_from_array_int32_1d
 
   !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `int32`
-  function torch_tensor_from_array_int32_2d(data_in, layout, c_device, device_index) result(tensor)
+  function torch_tensor_from_array_int32_2d(data_in, layout, c_device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int32
 
     ! inputs
     integer(kind=int32), intent(in), target :: data_in(:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(2) !! Control order of indices
-    integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: c_device_type    !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
@@ -722,19 +722,19 @@ function torch_tensor_from_array_int32_2d(data_in, layout, c_device, device_inde
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device_type, device_index_value)
 
   end function torch_tensor_from_array_int32_2d
 
   !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `int32`
-  function torch_tensor_from_array_int32_3d(data_in, layout, c_device, device_index) result(tensor)
+  function torch_tensor_from_array_int32_3d(data_in, layout, c_device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int32
 
     ! inputs
     integer(kind=int32), intent(in), target :: data_in(:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(3) !! Control order of indices
-    integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: c_device_type    !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
@@ -762,19 +762,19 @@ function torch_tensor_from_array_int32_3d(data_in, layout, c_device, device_inde
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device_type, device_index_value)
 
   end function torch_tensor_from_array_int32_3d
 
   !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `int32`
-  function torch_tensor_from_array_int32_4d(data_in, layout, c_device, device_index) result(tensor)
+  function torch_tensor_from_array_int32_4d(data_in, layout, c_device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int32
 
     ! inputs
     integer(kind=int32), intent(in), target :: data_in(:,:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(4) !! Control order of indices
-    integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: c_device_type    !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
@@ -802,19 +802,19 @@ function torch_tensor_from_array_int32_4d(data_in, layout, c_device, device_inde
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device_type, device_index_value)
 
   end function torch_tensor_from_array_int32_4d
 
   !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `int64`
-  function torch_tensor_from_array_int64_1d(data_in, layout, c_device, device_index) result(tensor)
+  function torch_tensor_from_array_int64_1d(data_in, layout, c_device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int64
 
     ! inputs
     integer(kind=int64), intent(in), target :: data_in(:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(1) !! Control order of indices
-    integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: c_device_type    !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
@@ -842,19 +842,19 @@ function torch_tensor_from_array_int64_1d(data_in, layout, c_device, device_inde
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device_type, device_index_value)
 
   end function torch_tensor_from_array_int64_1d
 
   !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `int64`
-  function torch_tensor_from_array_int64_2d(data_in, layout, c_device, device_index) result(tensor)
+  function torch_tensor_from_array_int64_2d(data_in, layout, c_device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int64
 
     ! inputs
     integer(kind=int64), intent(in), target :: data_in(:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(2) !! Control order of indices
-    integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: c_device_type    !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
@@ -882,19 +882,19 @@ function torch_tensor_from_array_int64_2d(data_in, layout, c_device, device_inde
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device_type, device_index_value)
 
   end function torch_tensor_from_array_int64_2d
 
   !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `int64`
-  function torch_tensor_from_array_int64_3d(data_in, layout, c_device, device_index) result(tensor)
+  function torch_tensor_from_array_int64_3d(data_in, layout, c_device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int64
 
     ! inputs
     integer(kind=int64), intent(in), target :: data_in(:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(3) !! Control order of indices
-    integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: c_device_type    !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
@@ -922,19 +922,19 @@ function torch_tensor_from_array_int64_3d(data_in, layout, c_device, device_inde
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device_type, device_index_value)
 
   end function torch_tensor_from_array_int64_3d
 
   !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `int64`
-  function torch_tensor_from_array_int64_4d(data_in, layout, c_device, device_index) result(tensor)
+  function torch_tensor_from_array_int64_4d(data_in, layout, c_device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : int64
 
     ! inputs
     integer(kind=int64), intent(in), target :: data_in(:,:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(4) !! Control order of indices
-    integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: c_device_type    !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
@@ -962,19 +962,19 @@ function torch_tensor_from_array_int64_4d(data_in, layout, c_device, device_inde
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device_type, device_index_value)
 
   end function torch_tensor_from_array_int64_4d
 
   !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `real32`
-  function torch_tensor_from_array_real32_1d(data_in, layout, c_device, device_index) result(tensor)
+  function torch_tensor_from_array_real32_1d(data_in, layout, c_device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : real32
 
     ! inputs
     real(kind=real32), intent(in), target :: data_in(:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(1) !! Control order of indices
-    integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: c_device_type    !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
@@ -1002,19 +1002,19 @@ function torch_tensor_from_array_real32_1d(data_in, layout, c_device, device_ind
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device_type, device_index_value)
 
   end function torch_tensor_from_array_real32_1d
 
   !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `real32`
-  function torch_tensor_from_array_real32_2d(data_in, layout, c_device, device_index) result(tensor)
+  function torch_tensor_from_array_real32_2d(data_in, layout, c_device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : real32
 
     ! inputs
     real(kind=real32), intent(in), target :: data_in(:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(2) !! Control order of indices
-    integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: c_device_type    !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
@@ -1042,19 +1042,19 @@ function torch_tensor_from_array_real32_2d(data_in, layout, c_device, device_ind
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device_type, device_index_value)
 
   end function torch_tensor_from_array_real32_2d
 
   !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `real32`
-  function torch_tensor_from_array_real32_3d(data_in, layout, c_device, device_index) result(tensor)
+  function torch_tensor_from_array_real32_3d(data_in, layout, c_device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : real32
 
     ! inputs
     real(kind=real32), intent(in), target :: data_in(:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(3) !! Control order of indices
-    integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: c_device_type    !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
@@ -1082,19 +1082,19 @@ function torch_tensor_from_array_real32_3d(data_in, layout, c_device, device_ind
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device_type, device_index_value)
 
   end function torch_tensor_from_array_real32_3d
 
   !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `real32`
-  function torch_tensor_from_array_real32_4d(data_in, layout, c_device, device_index) result(tensor)
+  function torch_tensor_from_array_real32_4d(data_in, layout, c_device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : real32
 
     ! inputs
     real(kind=real32), intent(in), target :: data_in(:,:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(4) !! Control order of indices
-    integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: c_device_type    !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
@@ -1122,19 +1122,19 @@ function torch_tensor_from_array_real32_4d(data_in, layout, c_device, device_ind
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device_type, device_index_value)
 
   end function torch_tensor_from_array_real32_4d
 
   !> Return a Torch tensor pointing to data_in array of rank 1 containing data of type `real64`
-  function torch_tensor_from_array_real64_1d(data_in, layout, c_device, device_index) result(tensor)
+  function torch_tensor_from_array_real64_1d(data_in, layout, c_device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : real64
 
     ! inputs
     real(kind=real64), intent(in), target :: data_in(:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(1) !! Control order of indices
-    integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: c_device_type    !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
@@ -1162,19 +1162,19 @@ function torch_tensor_from_array_real64_1d(data_in, layout, c_device, device_ind
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device_type, device_index_value)
 
   end function torch_tensor_from_array_real64_1d
 
   !> Return a Torch tensor pointing to data_in array of rank 2 containing data of type `real64`
-  function torch_tensor_from_array_real64_2d(data_in, layout, c_device, device_index) result(tensor)
+  function torch_tensor_from_array_real64_2d(data_in, layout, c_device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : real64
 
     ! inputs
     real(kind=real64), intent(in), target :: data_in(:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(2) !! Control order of indices
-    integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: c_device_type    !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
@@ -1202,19 +1202,19 @@ function torch_tensor_from_array_real64_2d(data_in, layout, c_device, device_ind
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device_type, device_index_value)
 
   end function torch_tensor_from_array_real64_2d
 
   !> Return a Torch tensor pointing to data_in array of rank 3 containing data of type `real64`
-  function torch_tensor_from_array_real64_3d(data_in, layout, c_device, device_index) result(tensor)
+  function torch_tensor_from_array_real64_3d(data_in, layout, c_device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : real64
 
     ! inputs
     real(kind=real64), intent(in), target :: data_in(:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(3) !! Control order of indices
-    integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: c_device_type    !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
@@ -1242,19 +1242,19 @@ function torch_tensor_from_array_real64_3d(data_in, layout, c_device, device_ind
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device_type, device_index_value)
 
   end function torch_tensor_from_array_real64_3d
 
   !> Return a Torch tensor pointing to data_in array of rank 4 containing data of type `real64`
-  function torch_tensor_from_array_real64_4d(data_in, layout, c_device, device_index) result(tensor)
+  function torch_tensor_from_array_real64_4d(data_in, layout, c_device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : real64
 
     ! inputs
     real(kind=real64), intent(in), target :: data_in(:,:,:,:)   !! Input data that tensor will point at
     integer, intent(in)        :: layout(4) !! Control order of indices
-    integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: c_device_type    !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
@@ -1282,7 +1282,7 @@ function torch_tensor_from_array_real64_4d(data_in, layout, c_device, device_ind
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device_type, device_index_value)
 
   end function torch_tensor_from_array_real64_4d
 
diff --git a/src/ftorch.fypp b/src/ftorch.fypp
index d76d2d81..4e32e1d0 100644
--- a/src/ftorch.fypp
+++ b/src/ftorch.fypp
@@ -73,7 +73,7 @@ module ftorch
   end interface
 
   interface
-    function torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device, device_index) result(tensor_p) &
+    function torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device_type, device_index) result(tensor_p) &
                                bind(c, name = 'torch_from_blob')
       use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_ptr
 
@@ -83,7 +83,7 @@ module ftorch
       integer(c_int64_t), intent(in)    :: tensor_shape(*)
       integer(c_int64_t), intent(in)    :: strides(*)
       integer(c_int), value, intent(in) :: dtype
-      integer(c_int), value, intent(in) :: device
+      integer(c_int), value, intent(in) :: device_type
       integer(c_int), value, intent(in) :: device_index
       type(c_ptr)                       :: tensor_p
     end function torch_from_blob_c
@@ -92,24 +92,24 @@ module ftorch
 contains
 
   !> Returns a tensor filled with the scalar value 0.
-  function torch_tensor_zeros(ndims, tensor_shape, dtype, device, device_index) result(tensor)
+  function torch_tensor_zeros(ndims, tensor_shape, dtype, device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t
     integer(c_int), intent(in)     :: ndims      !! Number of dimensions of the tensor
     integer(c_int64_t), intent(in) :: tensor_shape(*)   !! Shape of the tensor
     integer(c_int), intent(in)     :: dtype      !! Data type of the tensor
-    integer(c_int), intent(in)     :: device     !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in)     :: device_type  !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index     !! device index to use for `torch_kCUDA` case
     type(torch_tensor)             :: tensor     !! Returned tensor
     integer(c_int)                 :: device_index_value  !! device index used
 
     interface
-      function torch_zeros_c(ndims, tensor_shape, dtype, device, device_index) result(tensor) &
+      function torch_zeros_c(ndims, tensor_shape, dtype, device_type, device_index) result(tensor) &
           bind(c, name = 'torch_zeros')
         use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_ptr
         integer(c_int), value, intent(in) :: ndims
         integer(c_int64_t), intent(in)    :: tensor_shape(*)
         integer(c_int), value, intent(in) :: dtype
-        integer(c_int), value, intent(in) :: device
+        integer(c_int), value, intent(in) :: device_type
         integer(c_int), value, intent(in) :: device_index
         type(c_ptr)                       :: tensor
       end function torch_zeros_c
@@ -122,28 +122,28 @@ contains
       device_index_value = 0
     endif
 
-    tensor%p = torch_zeros_c(ndims, tensor_shape, dtype, device, device_index_value)
+    tensor%p = torch_zeros_c(ndims, tensor_shape, dtype, device_type, device_index_value)
   end function torch_tensor_zeros
 
   !> Returns a tensor filled with the scalar value 1.
-  function torch_tensor_ones(ndims, tensor_shape, dtype, device, device_index) result(tensor)
+  function torch_tensor_ones(ndims, tensor_shape, dtype, device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t
     integer(c_int), intent(in)     :: ndims      !! Number of dimensions of the tensor
     integer(c_int64_t), intent(in) :: tensor_shape(*)   !! Shape of the tensor
     integer(c_int), intent(in)     :: dtype      !! Data type of the tensor
-    integer(c_int), intent(in)     :: device     !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in)     :: device_type  !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index     !! device index to use for `torch_kCUDA` case
     type(torch_tensor)             :: tensor     !! Returned tensor
     integer(c_int)                 :: device_index_value  !! device index used
 
     interface
-      function torch_ones_c(ndims, tensor_shape, dtype, device, device_index) result(tensor) &
+      function torch_ones_c(ndims, tensor_shape, dtype, device_type, device_index) result(tensor) &
           bind(c, name = 'torch_ones')
         use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_ptr
         integer(c_int), value, intent(in) :: ndims
         integer(c_int64_t), intent(in)    :: tensor_shape(*)
         integer(c_int), value, intent(in) :: dtype
-        integer(c_int), value, intent(in) :: device
+        integer(c_int), value, intent(in) :: device_type
         integer(c_int), value, intent(in) :: device_index
         type(c_ptr)                       :: tensor
       end function torch_ones_c
@@ -156,19 +156,19 @@ contains
       device_index_value = 0
     endif
 
-    tensor%p = torch_ones_c(ndims, tensor_shape, dtype, device, device_index_value)
+    tensor%p = torch_ones_c(ndims, tensor_shape, dtype, device_type, device_index_value)
   end function torch_tensor_ones
 
   ! Torch Tensor API
   !| Exposes the given data as a tensor without taking ownership of the original data.
   !  This routine will take an (i, j, k) array and return an (k, j, i) tensor.
-  function torch_tensor_from_blob(data, ndims, tensor_shape, layout, dtype, device, device_index) result(tensor)
+  function torch_tensor_from_blob(data, ndims, tensor_shape, layout, dtype, device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_ptr
     type(c_ptr), intent(in)        :: data       !! Pointer to data
     integer(c_int), intent(in)     :: ndims      !! Number of dimensions of the tensor
     integer(c_int64_t), intent(in) :: tensor_shape(*)   !! Shape of the tensor
     integer(c_int), intent(in)     :: dtype      !! Data type of the tensor
-    integer(c_int), intent(in)     :: device     !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in)     :: device_type  !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index     !! device index to use for `torch_kCUDA` case
     integer(c_int), intent(in)     :: layout(*)  !! Layout for strides for accessing data
     type(torch_tensor)             :: tensor     !! Returned tensor
@@ -189,7 +189,7 @@ contains
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device, device_index_value)
+    tensor%p = torch_from_blob_c(data, ndims, tensor_shape, strides, dtype, device_type, device_index_value)
   end function torch_tensor_from_blob
 
   !> Prints the contents of a tensor.
@@ -242,31 +242,31 @@ contains
 
   ! Torch Module API
   !> Loads a TorchScript module (pre-trained PyTorch model saved with TorchScript)
-  function torch_module_load(filename, device, device_index) result(module)
+  function torch_module_load(filename, device_type, device_index) result(module)
     use, intrinsic :: iso_c_binding, only : c_int, c_null_char
     character(*), intent(in)   :: filename !! Filename of TorchScript module
-    integer(c_int), optional, intent(in) :: device !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), optional, intent(in) :: device_type !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index !! device index to use for `torch_kCUDA` case
     type(torch_module)         :: module   !! Returned deserialized module
-    integer(c_int) :: device_value
+    integer(c_int) :: device_type_value
     integer(c_int) :: device_index_value
 
     interface
-      function torch_jit_load_c(filename, device, device_index) result(module) &
+      function torch_jit_load_c(filename, device_type, device_index) result(module) &
           bind(c, name = 'torch_jit_load')
         use, intrinsic :: iso_c_binding, only : c_char, c_int, c_ptr
         character(c_char), intent(in) :: filename(*)
-        integer(c_int), value, intent(in)    :: device
+        integer(c_int), value, intent(in)    :: device_type
         integer(c_int), value, intent(in)    :: device_index
         type(c_ptr)                   :: module
       end function torch_jit_load_c
     end interface
 
     ! Process optional arguments
-    if (present(device)) then
-      device_value = device
+    if (present(device_type)) then
+      device_type_value = device_type
     else
-      device_value = 0
+      device_type_value = 0
     endif
     if (present(device_index)) then
       device_index_value = device_index
@@ -275,7 +275,7 @@ contains
     endif
 
     ! Need to append c_null_char at end of filename
-    module%p = torch_jit_load_c(trim(adjustl(filename))//c_null_char, device_value, device_index_value)
+    module%p = torch_jit_load_c(trim(adjustl(filename))//c_null_char, device_type_value, device_index_value)
   end function torch_module_load
 
   !> Performs a forward pass of the module with the input tensors
@@ -327,14 +327,14 @@ contains
   #:for PREC in PRECISIONS
   #:for RANK in RANKS
   !> Return a Torch tensor pointing to data_in array of rank ${RANK}$ containing data of type `${PREC}$`
-  function torch_tensor_from_array_${PREC}$_${RANK}$d(data_in, layout, c_device, device_index) result(tensor)
+  function torch_tensor_from_array_${PREC}$_${RANK}$d(data_in, layout, c_device_type, device_index) result(tensor)
     use, intrinsic :: iso_c_binding, only : c_int, c_int64_t, c_float, c_loc
     use, intrinsic :: iso_fortran_env, only : ${PREC}$
 
     ! inputs
     ${f_type(PREC)}$(kind=${PREC}$), intent(in), target :: data_in${ranksuffix(RANK)}$   !! Input data that tensor will point at
     integer, intent(in)        :: layout(${RANK}$) !! Control order of indices
-    integer(c_int), intent(in) :: c_device         !! Device on which the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
+    integer(c_int), intent(in) :: c_device_type    !! Device type the tensor will live on (`torch_kCPU` or `torch_kCUDA`)
     integer(c_int), optional, intent(in) :: device_index    !! device index to use for `torch_kCUDA` case
 
     ! output tensory
@@ -362,7 +362,7 @@ contains
       device_index_value = 0
     endif
 
-    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device, device_index_value)
+    tensor%p = torch_from_blob_c(c_loc(data_in), ndims, c_tensor_shape, strides, c_dtype, c_device_type, device_index_value)
 
   end function torch_tensor_from_array_${PREC}$_${RANK}$d
 

From 2d9698ccfa1a4d8a5b3b006b687e66a5b174d99b Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Wed, 20 Mar 2024 15:23:09 +0000
Subject: [PATCH 19/42] Device index defaults to -1 on CPU and 0 on GPU

---
 src/ctorch.cpp  | 13 ++++-------
 src/ftorch.f90  | 58 ++++++++++++++++++++++++++++++++++++++++++++++++-
 src/ftorch.fypp | 12 +++++++++-
 3 files changed, 72 insertions(+), 11 deletions(-)

diff --git a/src/ctorch.cpp b/src/ctorch.cpp
index 3a3b9145..64ab7c1f 100644
--- a/src/ctorch.cpp
+++ b/src/ctorch.cpp
@@ -31,24 +31,19 @@ constexpr auto get_dtype(torch_data_t dtype)
 
 const auto get_device(torch_device_t device_type, int device_index)
 {
-  if (device_index < 0) {
-    std::cerr << "[ERROR]: device index must be non-negative, using zero instead"
-              << std::endl;
-    device_index = 0;
-  }
   switch (device_type) {
   case torch_kCPU:
-    if (device_index > 0) {
+    if (device_index != -1) {
       std::cerr << "[ERROR]: device index unsupported for CPU-only runs"
                 << std::endl;
     }
     return torch::Device(torch::kCPU);
   case torch_kCUDA:
-    if (device_index < torch::cuda::device_count()) {
+    if (device_index >= 0 && device_index < torch::cuda::device_count()) {
       return torch::Device(torch::kCUDA, device_index);
     } else {
-      std::cerr << "[ERROR]: device index " << device_index
-                << " exceeds device count " << torch::cuda::device_count()
+      std::cerr << "[ERROR]: invalid device index " << device_index
+                << " for device count " << torch::cuda::device_count()
                 << ", using zero instead" << std::endl;
       return torch::Device(torch::kCUDA);
     }
diff --git a/src/ftorch.f90 b/src/ftorch.f90
index a8ac616f..6daea296 100644
--- a/src/ftorch.f90
+++ b/src/ftorch.f90
@@ -120,6 +120,8 @@ end function torch_zeros_c
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -154,6 +156,8 @@ end function torch_ones_c
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -187,6 +191,8 @@ function torch_tensor_from_blob(data, ndims, tensor_shape, layout, dtype, device
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -268,10 +274,12 @@ end function torch_jit_load_c
     if (present(device_type)) then
       device_type_value = device_type
     else
-      device_type_value = 0
+      device_type_value = torch_kCPU
     endif
     if (present(device_index)) then
       device_index_value = device_index
+    else if (device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -358,6 +366,8 @@ function torch_tensor_from_array_int8_1d(data_in, layout, c_device_type, device_
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (c_device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -398,6 +408,8 @@ function torch_tensor_from_array_int8_2d(data_in, layout, c_device_type, device_
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (c_device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -438,6 +450,8 @@ function torch_tensor_from_array_int8_3d(data_in, layout, c_device_type, device_
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (c_device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -478,6 +492,8 @@ function torch_tensor_from_array_int8_4d(data_in, layout, c_device_type, device_
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (c_device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -518,6 +534,8 @@ function torch_tensor_from_array_int16_1d(data_in, layout, c_device_type, device
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (c_device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -558,6 +576,8 @@ function torch_tensor_from_array_int16_2d(data_in, layout, c_device_type, device
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (c_device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -598,6 +618,8 @@ function torch_tensor_from_array_int16_3d(data_in, layout, c_device_type, device
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (c_device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -638,6 +660,8 @@ function torch_tensor_from_array_int16_4d(data_in, layout, c_device_type, device
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (c_device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -678,6 +702,8 @@ function torch_tensor_from_array_int32_1d(data_in, layout, c_device_type, device
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (c_device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -718,6 +744,8 @@ function torch_tensor_from_array_int32_2d(data_in, layout, c_device_type, device
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (c_device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -758,6 +786,8 @@ function torch_tensor_from_array_int32_3d(data_in, layout, c_device_type, device
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (c_device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -798,6 +828,8 @@ function torch_tensor_from_array_int32_4d(data_in, layout, c_device_type, device
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (c_device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -838,6 +870,8 @@ function torch_tensor_from_array_int64_1d(data_in, layout, c_device_type, device
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (c_device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -878,6 +912,8 @@ function torch_tensor_from_array_int64_2d(data_in, layout, c_device_type, device
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (c_device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -918,6 +954,8 @@ function torch_tensor_from_array_int64_3d(data_in, layout, c_device_type, device
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (c_device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -958,6 +996,8 @@ function torch_tensor_from_array_int64_4d(data_in, layout, c_device_type, device
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (c_device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -998,6 +1038,8 @@ function torch_tensor_from_array_real32_1d(data_in, layout, c_device_type, devic
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (c_device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -1038,6 +1080,8 @@ function torch_tensor_from_array_real32_2d(data_in, layout, c_device_type, devic
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (c_device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -1078,6 +1122,8 @@ function torch_tensor_from_array_real32_3d(data_in, layout, c_device_type, devic
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (c_device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -1118,6 +1164,8 @@ function torch_tensor_from_array_real32_4d(data_in, layout, c_device_type, devic
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (c_device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -1158,6 +1206,8 @@ function torch_tensor_from_array_real64_1d(data_in, layout, c_device_type, devic
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (c_device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -1198,6 +1248,8 @@ function torch_tensor_from_array_real64_2d(data_in, layout, c_device_type, devic
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (c_device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -1238,6 +1290,8 @@ function torch_tensor_from_array_real64_3d(data_in, layout, c_device_type, devic
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (c_device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -1278,6 +1332,8 @@ function torch_tensor_from_array_real64_4d(data_in, layout, c_device_type, devic
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (c_device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
diff --git a/src/ftorch.fypp b/src/ftorch.fypp
index 4e32e1d0..b9ee3674 100644
--- a/src/ftorch.fypp
+++ b/src/ftorch.fypp
@@ -118,6 +118,8 @@ contains
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -152,6 +154,8 @@ contains
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -185,6 +189,8 @@ contains
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -266,10 +272,12 @@ contains
     if (present(device_type)) then
       device_type_value = device_type
     else
-      device_type_value = 0
+      device_type_value = torch_kCPU
     endif
     if (present(device_index)) then
       device_index_value = device_index
+    else if (device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif
@@ -358,6 +366,8 @@ contains
     ! Process optional arguments
     if (present(device_index)) then
       device_index_value = device_index
+    else if (c_device_type == torch_kCPU) then
+      device_index_value = -1
     else
       device_index_value = 0
     endif

From ca40777eff6f99a9517442dc6b281282c325ef58 Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Wed, 20 Mar 2024 16:06:07 +0000
Subject: [PATCH 20/42] Make device type and index optional on C++ side

---
 examples/n_c_and_cpp/resnet_infer_c.c |  6 +++---
 src/ctorch.cpp                        | 12 ++++++------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/examples/n_c_and_cpp/resnet_infer_c.c b/examples/n_c_and_cpp/resnet_infer_c.c
index 4b60da56..852f831b 100644
--- a/examples/n_c_and_cpp/resnet_infer_c.c
+++ b/examples/n_c_and_cpp/resnet_infer_c.c
@@ -28,7 +28,7 @@ int main(int argc, const char* argv[])
   output_shape[0] = batch_size;
   output_shape[1] = 1000;
 
-  torch_jit_script_module_t model = torch_jit_load(argv[1], torch_kCPU, 0);
+  torch_jit_script_module_t model = torch_jit_load(argv[1]);
   int64_t input_size
       = input_shape[0] * input_shape[1] * input_shape[2] * input_shape[3];
   float* input_data = (float*)malloc(input_size * sizeof(float));
@@ -41,10 +41,10 @@ int main(int argc, const char* argv[])
   if (model) {
     torch_tensor_t inputs[1];
     inputs[0] = torch_from_blob(input_data, input_ndim, input_shape,
-                                           torch_kFloat32, torch_kCPU, 0);
+                                           torch_kFloat32, torch_kCPU);
     const int nin = 1;
     torch_tensor_t output = torch_from_blob(
-        output_data, output_ndim, output_shape, torch_kFloat32, torch_kCPU, 0);
+        output_data, output_ndim, output_shape, torch_kFloat32, torch_kCPU);
     torch_jit_module_forward(model, inputs, nin, output);
     torch_tensor_print(output);
     torch_jit_module_delete(model);
diff --git a/src/ctorch.cpp b/src/ctorch.cpp
index 64ab7c1f..8ddb794f 100644
--- a/src/ctorch.cpp
+++ b/src/ctorch.cpp
@@ -55,7 +55,7 @@ const auto get_device(torch_device_t device_type, int device_index)
 }
 
 torch_tensor_t torch_zeros(int ndim, const int64_t* shape, torch_data_t dtype,
-                           torch_device_t device_type, int device_index)
+                           torch_device_t device_type, int device_index = -1)
 {
   torch::Tensor* tensor = nullptr;
   try {
@@ -77,7 +77,7 @@ torch_tensor_t torch_zeros(int ndim, const int64_t* shape, torch_data_t dtype,
 }
 
 torch_tensor_t torch_ones(int ndim, const int64_t* shape, torch_data_t dtype,
-                          torch_device_t device_type, int device_index)
+                          torch_device_t device_type, int device_index = -1)
 {
   torch::Tensor* tensor = nullptr;
   try {
@@ -99,7 +99,7 @@ torch_tensor_t torch_ones(int ndim, const int64_t* shape, torch_data_t dtype,
 }
 
 torch_tensor_t torch_empty(int ndim, const int64_t* shape, torch_data_t dtype,
-                           torch_device_t device_type, int device_index)
+                           torch_device_t device_type, int device_index = -1)
 {
   torch::Tensor* tensor = nullptr;
   try {
@@ -124,7 +124,7 @@ torch_tensor_t torch_empty(int ndim, const int64_t* shape, torch_data_t dtype,
 // data
 torch_tensor_t torch_from_blob(void* data, int ndim, const int64_t* shape,
                                const int64_t* strides, torch_data_t dtype,
-                               torch_device_t device_type, int device_index)
+                               torch_device_t device_type, int device_index = -1)
 {
   torch::Tensor* tensor = nullptr;
 
@@ -168,8 +168,8 @@ void torch_tensor_delete(torch_tensor_t tensor)
 }
 
 torch_jit_script_module_t torch_jit_load(const char* filename,
-                                         const torch_device_t device_type,
-                                         const int device_index)
+                                         const torch_device_t device_type = torch_kCPU,
+                                         const int device_index = -1)
 {
   torch::jit::script::Module* module = nullptr;
   try {

From e37f743273b6b21205b8f88d40f7beec5484911d Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Wed, 20 Mar 2024 17:18:31 +0000
Subject: [PATCH 21/42] Fix typo in torch_module_load

---
 src/ftorch.f90  | 2 +-
 src/ftorch.fypp | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/ftorch.f90 b/src/ftorch.f90
index 6daea296..97b2ea0b 100644
--- a/src/ftorch.f90
+++ b/src/ftorch.f90
@@ -278,7 +278,7 @@ end function torch_jit_load_c
     endif
     if (present(device_index)) then
       device_index_value = device_index
-    else if (device_type == torch_kCPU) then
+    else if (device_type_value == torch_kCPU) then
       device_index_value = -1
     else
       device_index_value = 0
diff --git a/src/ftorch.fypp b/src/ftorch.fypp
index b9ee3674..2ccdeac8 100644
--- a/src/ftorch.fypp
+++ b/src/ftorch.fypp
@@ -276,7 +276,7 @@ contains
     endif
     if (present(device_index)) then
       device_index_value = device_index
-    else if (device_type == torch_kCPU) then
+    else if (device_type_value == torch_kCPU) then
       device_index_value = -1
     else
       device_index_value = 0

From 8b63dfea0afd23b6275755cf9e6ade4dd4bfb968 Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Fri, 22 Mar 2024 13:58:14 +0000
Subject: [PATCH 22/42] Fix typos in example 1

---
 examples/1_SimpleNet/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/1_SimpleNet/README.md b/examples/1_SimpleNet/README.md
index 7d77f44e..9536779b 100644
--- a/examples/1_SimpleNet/README.md
+++ b/examples/1_SimpleNet/README.md
@@ -9,7 +9,7 @@ covered in later examples.
 
 ## Description
 
-A python file `simplenet.py` is provided that defines a very simple pytorch 'net' that takes an input
+A python file `simplenet.py` is provided that defines a very simple PyTorch 'net' that takes an input
 vector of length 5 and applies a single `Linear` layer to multiply it by 2.
 
 A modified version of the `pt2ts.py` tool saves this simple net to TorchScript.
@@ -29,7 +29,7 @@ To run this example requires:
 ## Running
 
 To run this example install FTorch as described in the main documentation.
-Then from this directory create a virtual environment an install the necessary python
+Then from this directory create a virtual environment and install the necessary python
 modules:
 ```
 python3 -m venv venv
@@ -47,7 +47,7 @@ tensor([[0, 2, 4, 6, 8]])
 ```
 
 To save the SimpleNet model to TorchScript run the modified version of the
-`pt2ts.py` tool :
+`pt2ts.py` tool:
 ```
 python3 pt2ts.py
 ```

From 8982129403d7eb3e6ec1cf81c298b6833e630357 Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Fri, 22 Mar 2024 14:21:14 +0000
Subject: [PATCH 23/42] Initial draft of example 3_MultiGPU

---
 .../1_SimpleNet/simplenet_infer_python.py     |   1 -
 examples/3_MultiGPU/CMakeLists.txt            |  19 +++
 examples/3_MultiGPU/README.md                 |  98 +++++++++++
 examples/3_MultiGPU/pt2ts.py                  | 158 ++++++++++++++++++
 examples/3_MultiGPU/requirements.txt          |   1 +
 .../3_MultiGPU/simplenet_infer_fortran.f90    |  63 +++++++
 examples/3_MultiGPU/simplenet_infer_python.py |  56 +++++++
 examples/README.md                            |   3 +
 8 files changed, 398 insertions(+), 1 deletion(-)
 create mode 100644 examples/3_MultiGPU/CMakeLists.txt
 create mode 100644 examples/3_MultiGPU/README.md
 create mode 100644 examples/3_MultiGPU/pt2ts.py
 create mode 100644 examples/3_MultiGPU/requirements.txt
 create mode 100644 examples/3_MultiGPU/simplenet_infer_fortran.f90
 create mode 100644 examples/3_MultiGPU/simplenet_infer_python.py

diff --git a/examples/1_SimpleNet/simplenet_infer_python.py b/examples/1_SimpleNet/simplenet_infer_python.py
index 54570882..425873ac 100644
--- a/examples/1_SimpleNet/simplenet_infer_python.py
+++ b/examples/1_SimpleNet/simplenet_infer_python.py
@@ -45,7 +45,6 @@ def deploy(saved_model: str, device: str, batch_size: int = 1) -> torch.Tensor:
     saved_model_file = "saved_simplenet_model_cpu.pt"
 
     device_to_run = "cpu"
-    # device = "cuda"
 
     batch_size_to_run = 1
 
diff --git a/examples/3_MultiGPU/CMakeLists.txt b/examples/3_MultiGPU/CMakeLists.txt
new file mode 100644
index 00000000..8b5ac27f
--- /dev/null
+++ b/examples/3_MultiGPU/CMakeLists.txt
@@ -0,0 +1,19 @@
+cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
+#policy CMP0076 - target_sources source files are relative to file where target_sources is run
+cmake_policy (SET CMP0076 NEW)
+
+set(PROJECT_NAME MultiGPUExample)
+
+project(${PROJECT_NAME} LANGUAGES Fortran)
+
+# Build in Debug mode if not specified
+if(NOT CMAKE_BUILD_TYPE)
+    set(CMAKE_BUILD_TYPE Debug CACHE STRING "" FORCE)
+endif()
+
+find_package(FTorch)
+message(STATUS "Building with Fortran PyTorch coupling")
+
+# Fortran example
+add_executable(simplenet_infer_fortran simplenet_infer_fortran.f90)
+target_link_libraries(simplenet_infer_fortran PRIVATE FTorch::ftorch)
diff --git a/examples/3_MultiGPU/README.md b/examples/3_MultiGPU/README.md
new file mode 100644
index 00000000..ad9fd836
--- /dev/null
+++ b/examples/3_MultiGPU/README.md
@@ -0,0 +1,98 @@
+# Example 3 - MultiGPU
+
+This example revisits the SimpleNet example and demonstrates how to run it using
+multiple GPU devices.
+
+
+## Description
+
+The same python file `simplenet.py` is used from the earlier example. Recall that it
+defines a very simple PyTorch network that takes an input of length 5 and applies a
+single `Linear` layer to multiply it by 2.
+
+The same `pt2ts.py` tool is used to save the simple network to TorchScript.
+
+A series of files `simplenet_infer_<LANG>` then bind from other languages to run the
+TorchScript model in inference mode.
+
+## Dependencies
+
+To run this example requires:
+
+- cmake
+- mpif90
+- FTorch (installed as described in main package with mpicc, mpicxx, and mpif90)
+- python3
+
+## Running
+
+To run this example install FTorch as described in the main documentation. Then from
+this directory create a virtual environment and install the necessary python modules:
+```
+python3 -m venv venv
+source venv/bin/activate
+pip install -r requirements.txt
+```
+
+Copy over the `simplenet.py` script from the earlier example:
+```
+cp ../1_SimpleNet/simplenet.py .
+```
+
+You can check that everything is working by running `simplenet.py`:
+```
+python3 simplenet.py
+```
+As before, this defines the network and runs it with an input tensor
+[0.0, 1.0, 2.0, 3.0, 4.0] to produce the result:
+```
+tensor([[0, 2, 4, 6, 8]])
+```
+
+To save the SimpleNet model to TorchScript run the modified version of the `pt2ts.py`
+tool:
+```
+python3 pt2ts.py
+```
+which will generate `saved_simplenet_model_cuda.pt` - the TorchScript instance of the
+network. The only difference with the earlier example is that the model is built to
+be run using CUDA rather than on CPU.
+
+You can check that everything is working by running the `simplenet_infer_python.py`
+script:
+```
+python3 simplenet_infer_python.py
+```
+This reads the model in from the TorchScript file and runs it with an input tensor
+[0.0, 1.0, 2.0, 3.0, 4.0] to produce the result:
+```
+tensor([[0, 2, 4, 6, 8]])
+```
+
+At this point we no longer require python, so can deactivate the virtual environment:
+```
+deactivate
+```
+
+To call the saved SimpleNet model from Fortran we need to compile the `simplenet_infer`
+files. This can be done using the included `CMakeLists.txt` as follows, noting that we
+need to use an MPI-enabled Fortran compiler:
+```
+mkdir build
+cd build
+cmake .. -DCMAKE_PREFIX_PATH=<path/to/your/installation/of/library/> \
+	-DCMAKE_Fortran_COMPILER=<your/mpif90/compiler> -DCMAKE_BUILD_TYPE=Release
+cmake --build .
+```
+
+To run the compiled code calling the saved SimpleNet TorchScript from Fortran, run the
+executable with an argument of the saved model file:
+```
+./simplenet_infer_fortran ../saved_simplenet_model_cuda.pt
+```
+
+# TODO: Running
+
+# TODO: Make rather than CMake
+
+# TODO: Further options
diff --git a/examples/3_MultiGPU/pt2ts.py b/examples/3_MultiGPU/pt2ts.py
new file mode 100644
index 00000000..0e67669f
--- /dev/null
+++ b/examples/3_MultiGPU/pt2ts.py
@@ -0,0 +1,158 @@
+"""Load a PyTorch model and convert it to TorchScript."""
+
+from typing import Optional
+import torch
+
+# FPTLIB-TODO
+# Add a module import with your model here:
+# This example assumes the model architecture is in an adjacent module `my_ml_model.py`
+import simplenet
+
+
+def script_to_torchscript(
+    model: torch.nn.Module, filename: Optional[str] = "scripted_model.pt"
+) -> None:
+    """
+    Save PyTorch model to TorchScript using scripting.
+
+    Parameters
+    ----------
+    model : torch.NN.Module
+        a PyTorch model
+    filename : str
+        name of file to save to
+    """
+    print("Saving model using scripting...", end="")
+    scripted_model = torch.jit.script(model)
+    # print(scripted_model.code)
+    scripted_model.save(filename)
+    print("done.")
+
+
+def trace_to_torchscript(
+    model: torch.nn.Module,
+    dummy_input: torch.Tensor,
+    filename: Optional[str] = "traced_model.pt",
+) -> None:
+    """
+    Save PyTorch model to TorchScript using tracing.
+
+    Parameters
+    ----------
+    model : torch.NN.Module
+        a PyTorch model
+    dummy_input : torch.Tensor
+        appropriate size Tensor to act as input to model
+    filename : str
+        name of file to save to
+    """
+    print("Saving model using tracing...", end="")
+    traced_model = torch.jit.trace(model, dummy_input)
+    frozen_model = torch.jit.freeze(traced_model)
+    ## print(frozen_model.graph)
+    ## print(frozen_model.code)
+    frozen_model.save(filename)
+    print("done.")
+
+
+def load_torchscript(filename: Optional[str] = "saved_model.pt") -> torch.nn.Module:
+    """
+    Load a TorchScript from file.
+
+    Parameters
+    ----------
+    filename : str
+        name of file containing TorchScript model
+    """
+    model = torch.jit.load(filename)
+
+    return model
+
+
+if __name__ == "__main__":
+    # =====================================================
+    # Load model and prepare for saving
+    # =====================================================
+
+    # FPTLIB-TODO
+    # Load a pre-trained PyTorch model
+    # Insert code here to load your model as `trained_model`.
+    # This example assumes my_ml_model has a method `initialize` to load
+    # architecture, weights, and place in inference mode
+    trained_model = simplenet.SimpleNet()
+
+    # Switch off specific layers/parts of the model that behave
+    # differently during training and inference.
+    # This may have been done by the user already, so just make sure here.
+    trained_model.eval()
+
+    # =====================================================
+    # Prepare dummy input and check model runs
+    # =====================================================
+
+    # FPTLIB-TODO
+    # Generate a dummy input Tensor `dummy_input` to the model of appropriate size.
+    # This example assumes one input of size (5)
+    trained_model_dummy_input = torch.ones(5)
+
+    # FPTLIB-TODO
+    # Uncomment the following lines to save for inference on GPU (rather than CPU):
+    device = torch.device('cuda')
+    trained_model = trained_model.to(device)
+    trained_model.eval()
+    trained_model_dummy_input = trained_model_dummy_input.to(device)
+
+    # FPTLIB-TODO
+    # Run model for dummy inputs
+    # If something isn't working this will generate an error
+    trained_model_dummy_output = trained_model(
+        trained_model_dummy_input,
+    )
+
+    # =====================================================
+    # Save model
+    # =====================================================
+
+    # FPTLIB-TODO
+    # Set the name of the file you want to save the torchscript model to:
+    saved_ts_filename = "saved_simplenet_model_cuda.pt"
+
+    # FPTLIB-TODO
+    # Save the PyTorch model using either scripting (recommended where possible) or tracing
+    # -----------
+    # Scripting
+    # -----------
+    script_to_torchscript(trained_model, filename=saved_ts_filename)
+
+    # -----------
+    # Tracing
+    # -----------
+    # trace_to_torchscript(trained_model, trained_model_dummy_input, filename=saved_ts_filename)
+
+    print(f"Saved model to TorchScript in '{saved_ts_filename}'.")
+
+    # =====================================================
+    # Check model saved OK
+    # =====================================================
+
+    # Load torchscript and run model as a test
+    # FPTLIB-TODO
+    # Scale inputs as above and, if required, move inputs and model to GPU
+    trained_model_dummy_input = 2.0 * trained_model_dummy_input
+    trained_model_dummy_input = trained_model_dummy_input.to("cuda")
+    trained_model_testing_output = trained_model(
+        trained_model_dummy_input,
+    )
+    ts_model = load_torchscript(filename=saved_ts_filename)
+    ts_model_output = ts_model(
+        trained_model_dummy_input,
+    )
+
+    if torch.all(ts_model_output.eq(trained_model_testing_output)):
+        print("Saved TorchScript model working as expected in a basic test.")
+        print("Users should perform further validation as appropriate.")
+    else:
+        raise RuntimeError(
+            "Saved Torchscript model is not performing as expected.\n"
+            "Consider using scripting if you used tracing, or investigate further."
+        )
diff --git a/examples/3_MultiGPU/requirements.txt b/examples/3_MultiGPU/requirements.txt
new file mode 100644
index 00000000..12c6d5d5
--- /dev/null
+++ b/examples/3_MultiGPU/requirements.txt
@@ -0,0 +1 @@
+torch
diff --git a/examples/3_MultiGPU/simplenet_infer_fortran.f90 b/examples/3_MultiGPU/simplenet_infer_fortran.f90
new file mode 100644
index 00000000..ba176124
--- /dev/null
+++ b/examples/3_MultiGPU/simplenet_infer_fortran.f90
@@ -0,0 +1,63 @@
+program inference
+
+   ! Import precision info from iso
+   use, intrinsic :: iso_fortran_env, only : sp => real32
+
+   ! Import our library for interfacing with PyTorch
+   use ftorch
+
+   ! Import MPI
+   use mpi
+
+   implicit none
+  
+   ! Set precision for reals
+   integer, parameter :: wp = sp
+   
+   integer :: num_args, ix
+   character(len=128), dimension(:), allocatable :: args
+
+   ! Set up Fortran data structures
+   real(wp), dimension(5), target :: in_data
+   real(wp), dimension(5), target :: out_data
+   integer, parameter :: n_inputs = 1
+   integer :: tensor_layout(1) = [1]
+
+   ! Set up Torch data structures
+   type(torch_module) :: model
+   type(torch_tensor), dimension(1) :: in_tensor
+   type(torch_tensor) :: out_tensor
+
+   ! TODO: MPI setup
+
+   ! Get TorchScript model file as a command line argument
+   num_args = command_argument_count()
+   allocate(args(num_args))
+   do ix = 1, num_args
+       call get_command_argument(ix,args(ix))
+   end do
+
+   ! Initialise data
+   ! TODO: Different inputs for different ranks
+   in_data = [0.0, 1.0, 2.0, 3.0, 4.0]
+
+   ! Create Torch input/output tensors from the above arrays
+   ! TODO: Use GPU
+   in_tensor(1) = torch_tensor_from_array(in_data, tensor_layout, torch_kCPU)
+   out_tensor = torch_tensor_from_array(out_data, tensor_layout, torch_kCPU)
+
+   ! Load ML model
+   ! TODO: Use GPU
+   model = torch_module_load(args(1))
+
+   ! Infer
+   call torch_module_forward(model, in_tensor, n_inputs, out_tensor)
+   ! TODO: Write rank, too
+   write (*,*) out_data(:)
+
+   ! Cleanup
+   call torch_module_delete(model)
+   call torch_tensor_delete(in_tensor(1))
+   call torch_tensor_delete(out_tensor)
+
+end program inference
diff --git a/examples/3_MultiGPU/simplenet_infer_python.py b/examples/3_MultiGPU/simplenet_infer_python.py
new file mode 100644
index 00000000..eb6ef7de
--- /dev/null
+++ b/examples/3_MultiGPU/simplenet_infer_python.py
@@ -0,0 +1,56 @@
+"""Load saved SimpleNet to TorchScript and run inference example."""
+
+import torch
+
+
+def deploy(saved_model: str, device: str, batch_size: int = 1) -> torch.Tensor:
+    """
+    Load TorchScript SimpleNet and run inference with example Tensor.
+
+    Parameters
+    ----------
+    saved_model : str
+        location of SimpleNet model saved to Torchscript
+    device : str
+        Torch device to run model on, 'cpu' or 'cuda'
+    batch_size : int
+        batch size to run (default 1)
+
+    Returns
+    -------
+    output : torch.Tensor
+        result of running inference on model with example Tensor input
+    """
+    # TODO: Different input on different GPUs
+    input_tensor = torch.tensor([0.0, 1.0, 2.0, 3.0, 4.0]).repeat(batch_size, 1)
+
+    if device == "cpu":
+        # Load saved TorchScript model
+        model = torch.jit.load(saved_model)
+        # Inference
+        output = model.forward(input_tensor)
+
+    elif device == "cuda":
+        # All previously saved modules, no matter their device, are first
+        # loaded onto CPU, and then are moved to the devices they were saved
+        # from, so we don't need to manually transfer the model to the GPU
+        model = torch.jit.load(saved_model)
+        input_tensor_gpu = input_tensor.to(torch.device("cuda"))
+        output_gpu = model.forward(input_tensor_gpu)
+        output = output_gpu.to(torch.device("cpu"))
+
+    return output
+
+
+if __name__ == "__main__":
+    saved_model_file = "saved_simplenet_model_cuda.pt"
+
+    # TODO: cuda:{device_index}
+    device_to_run = "cuda"
+
+    batch_size_to_run = 1
+
+    with torch.no_grad():
+        result = deploy(saved_model_file, device_to_run, batch_size_to_run)
+
+    print(result)
diff --git a/examples/README.md b/examples/README.md
index 36ae79a5..9aa0c01f 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -9,3 +9,6 @@ This directory contains a number of examples of how to use the library:
 2. ResNet-18
     - More complex example demonstrating how to use the library with a multidimensional input.
     - Convert a pre-trained model to TorchScript and call from Fortran.
+
+3. MultiGPU
+	- Revisits the SimpleNet example but considering multiple GPUs.

From 1eec646fe9676a2f8cef404d386aa5c2b94d2d30 Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Mon, 25 Mar 2024 09:18:04 +0000
Subject: [PATCH 24/42] Differentiate between errors and warnings in C++ code

---
 src/ctorch.cpp | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/src/ctorch.cpp b/src/ctorch.cpp
index 8ddb794f..2de9d856 100644
--- a/src/ctorch.cpp
+++ b/src/ctorch.cpp
@@ -23,7 +23,7 @@ constexpr auto get_dtype(torch_data_t dtype)
   case torch_kFloat64:
     return torch::kFloat64;
   default:
-    std::cerr << "[ERROR]: unknown data type, setting to torch_kFloat32"
+    std::cerr << "[WARNING]: unknown data type, setting to torch_kFloat32"
               << std::endl;
     return torch::kFloat32;
   }
@@ -34,21 +34,26 @@ const auto get_device(torch_device_t device_type, int device_index)
   switch (device_type) {
   case torch_kCPU:
     if (device_index != -1) {
-      std::cerr << "[ERROR]: device index unsupported for CPU-only runs"
+      std::cerr << "[WARNING]: device index unused for CPU-only runs"
                 << std::endl;
     }
     return torch::Device(torch::kCPU);
   case torch_kCUDA:
+    if (device_index == -1) {
+      std::cerr << "[WARNING]: device index unset, setting to zero"
+                << std::endl;
+      device_index = 0;
+    }
     if (device_index >= 0 && device_index < torch::cuda::device_count()) {
       return torch::Device(torch::kCUDA, device_index);
     } else {
       std::cerr << "[ERROR]: invalid device index " << device_index
                 << " for device count " << torch::cuda::device_count()
-                << ", using zero instead" << std::endl;
-      return torch::Device(torch::kCUDA);
+                << std::endl;
+      exit(EXIT_FAILURE);
     }
   default:
-    std::cerr << "[ERROR]: unknown device type, setting to torch_kCPU"
+    std::cerr << "[WARNING]: unknown device type, setting to torch_kCPU"
               << std::endl;
     return torch::Device(torch::kCPU);
   }

From 2739c16a07354565794d06d8ddfb83ab87fdad9e Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Mon, 25 Mar 2024 09:22:18 +0000
Subject: [PATCH 25/42] Formatting

---
 examples/3_MultiGPU/pt2ts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/3_MultiGPU/pt2ts.py b/examples/3_MultiGPU/pt2ts.py
index 0e67669f..a699f860 100644
--- a/examples/3_MultiGPU/pt2ts.py
+++ b/examples/3_MultiGPU/pt2ts.py
@@ -97,7 +97,7 @@ def load_torchscript(filename: Optional[str] = "saved_model.pt") -> torch.nn.Mod
 
     # FPTLIB-TODO
     # Uncomment the following lines to save for inference on GPU (rather than CPU):
-    device = torch.device('cuda')
+    device = torch.device("cuda")
     trained_model = trained_model.to(device)
     trained_model.eval()
     trained_model_dummy_input = trained_model_dummy_input.to(device)

From fc18b528d4c769e3a6f5020ca62a1b982e8374ae Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Mon, 25 Mar 2024 09:49:31 +0000
Subject: [PATCH 26/42] Add mpi4py to requirements for example 3

---
 examples/3_MultiGPU/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/3_MultiGPU/requirements.txt b/examples/3_MultiGPU/requirements.txt
index 12c6d5d5..a9641ad5 100644
--- a/examples/3_MultiGPU/requirements.txt
+++ b/examples/3_MultiGPU/requirements.txt
@@ -1 +1,2 @@
+mpi4py
 torch

From 2b0086a28cd75562323686e329b6b7a6183a0b05 Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Mon, 25 Mar 2024 09:57:23 +0000
Subject: [PATCH 27/42] Use mpi4py to differ inputs in simplenet_infer_python

---
 examples/3_MultiGPU/simplenet_infer_python.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/examples/3_MultiGPU/simplenet_infer_python.py b/examples/3_MultiGPU/simplenet_infer_python.py
index eb6ef7de..5c74e9b5 100644
--- a/examples/3_MultiGPU/simplenet_infer_python.py
+++ b/examples/3_MultiGPU/simplenet_infer_python.py
@@ -1,5 +1,6 @@
 """Load saved SimpleNet to TorchScript and run inference example."""
 
+from mpi4py import MPI
 import torch
 
 
@@ -21,21 +22,23 @@ def deploy(saved_model: str, device: str, batch_size: int = 1) -> torch.Tensor:
     output : torch.Tensor
         result of running inference on model with example Tensor input
     """
-    # TODO: Different input on different GPUs
     input_tensor = torch.tensor([0.0, 1.0, 2.0, 3.0, 4.0]).repeat(batch_size, 1)
 
+    # Add the rank (device index) to each tensor to make them differ
+    input_tensor += MPI.COMM_WORLD.rank
+
     if device == "cpu":
         # Load saved TorchScript model
         model = torch.jit.load(saved_model)
         # Inference
         output = model.forward(input_tensor)
 
-    elif device == "cuda":
+    elif device.startswith("cuda"):
         # All previously saved modules, no matter their device, are first
         # loaded onto CPU, and then are moved to the devices they were saved
         # from, so we don't need to manually transfer the model to the GPU
         model = torch.jit.load(saved_model)
-        input_tensor_gpu = input_tensor.to(torch.device("cuda"))
+        input_tensor_gpu = input_tensor.to(torch.device(device))
         output_gpu = model.forward(input_tensor_gpu)
         output = output_gpu.to(torch.device("cpu"))
 
@@ -45,8 +48,7 @@ def deploy(saved_model: str, device: str, batch_size: int = 1) -> torch.Tensor:
 if __name__ == "__main__":
     saved_model_file = "saved_simplenet_model_cuda.pt"
 
-    # TODO: cuda:{device_index}
-    device_to_run = "cuda"
+    device_to_run = f"cuda:{MPI.COMM_WORLD.rank}"
 
     batch_size_to_run = 1
 

From fced4c16f715c666579bc97536d4ecb2542705fe Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Mon, 25 Mar 2024 10:00:29 +0000
Subject: [PATCH 28/42] Raise ValueError for Python inference with invalid
 device

---
 examples/1_SimpleNet/simplenet_infer_python.py | 3 +++
 examples/2_ResNet18/resnet_infer_python.py     | 3 +++
 examples/3_MultiGPU/simplenet_infer_python.py  | 3 +++
 3 files changed, 9 insertions(+)

diff --git a/examples/1_SimpleNet/simplenet_infer_python.py b/examples/1_SimpleNet/simplenet_infer_python.py
index 425873ac..4b18e6ac 100644
--- a/examples/1_SimpleNet/simplenet_infer_python.py
+++ b/examples/1_SimpleNet/simplenet_infer_python.py
@@ -38,6 +38,9 @@ def deploy(saved_model: str, device: str, batch_size: int = 1) -> torch.Tensor:
         output_gpu = model.forward(input_tensor_gpu)
         output = output_gpu.to(torch.device("cpu"))
 
+    else:
+        raise ValueError(f"Device '{device}' not recognised.")
+
     return output
 
 
diff --git a/examples/2_ResNet18/resnet_infer_python.py b/examples/2_ResNet18/resnet_infer_python.py
index e2ed8288..adf396c4 100644
--- a/examples/2_ResNet18/resnet_infer_python.py
+++ b/examples/2_ResNet18/resnet_infer_python.py
@@ -47,6 +47,9 @@ def deploy(saved_model: str, device: str, batch_size: int = 1) -> torch.Tensor:
         output_gpu = model.forward(input_tensor_gpu)
         output = output_gpu.to(torch.device("cpu"))
 
+    else:
+        raise ValueError(f"Device '{device}' not recognised.")
+
     return output
 
 
diff --git a/examples/3_MultiGPU/simplenet_infer_python.py b/examples/3_MultiGPU/simplenet_infer_python.py
index 5c74e9b5..e669f61a 100644
--- a/examples/3_MultiGPU/simplenet_infer_python.py
+++ b/examples/3_MultiGPU/simplenet_infer_python.py
@@ -42,6 +42,9 @@ def deploy(saved_model: str, device: str, batch_size: int = 1) -> torch.Tensor:
         output_gpu = model.forward(input_tensor_gpu)
         output = output_gpu.to(torch.device("cpu"))
 
+    else:
+        raise ValueError(f"Device '{device}' not recognised.")
+
     return output
 
 

From 188b30579c6c495185da37e95dd72af07f318df8 Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Mon, 25 Mar 2024 11:54:38 +0000
Subject: [PATCH 29/42] Print rank in Python case; updates to README

---
 examples/3_MultiGPU/README.md                 | 24 ++++++++++++-------
 examples/3_MultiGPU/simplenet_infer_python.py |  5 ++--
 2 files changed, 18 insertions(+), 11 deletions(-)

diff --git a/examples/3_MultiGPU/README.md b/examples/3_MultiGPU/README.md
index ad9fd836..d1d51afa 100644
--- a/examples/3_MultiGPU/README.md
+++ b/examples/3_MultiGPU/README.md
@@ -59,14 +59,20 @@ network. The only difference with the earlier example is that the model is built
 be run using CUDA rather than on CPU.
 
 You can check that everything is working by running the `simplenet_infer_python.py`
-script:
+script. It's set up with MPI such that a different GPU device is associated with each
+MPI rank. If you have four GPUs available then use the following, otherwise adjust the
+number of MPI ranks.
 ```
-python3 simplenet_infer_python.py
+mpiexec -np 4 python3 simplenet_infer_python.py
 ```
-This reads the model in from the TorchScript file and runs it with an input tensor
-[0.0, 1.0, 2.0, 3.0, 4.0] to produce the result:
+This reads the model in from the TorchScript file and runs it with a different input
+tensor on each GPU device: [0.0, 1.0, 2.0, 3.0, 4.0], plus the device index in each
+entry. The result should be (some permutation of):
 ```
-tensor([[0, 2, 4, 6, 8]])
+0: tensor([[0, 2, 4, 6, 8]])
+1: tensor([[2, 4, 6, 8, 10]])
+2: tensor([[4, 6, 8, 10, 12]])
+3: tensor([[6, 8, 10, 12, 14]])
 ```
 
 At this point we no longer require python, so can deactivate the virtual environment:
@@ -80,15 +86,15 @@ need to use an MPI-enabled Fortran compiler:
 ```
 mkdir build
 cd build
-cmake .. -DCMAKE_PREFIX_PATH=<path/to/your/installation/of/library/> \
-	-DCMAKE_Fortran_COMPILER=<your/mpif90/compiler> -DCMAKE_BUILD_TYPE=Release
+cmake .. -DCMAKE_PREFIX_PATH=<path/to/your/installation/of/library/> -DCMAKE_BUILD_TYPE=Release
 cmake --build .
 ```
 
 To run the compiled code calling the saved SimpleNet TorchScript from Fortran, run the
-executable with an argument of the saved model file:
+executable with an argument of the saved model file. Again, adjust the number of MPI
+processes, if required:
 ```
-./simplenet_infer_fortran ../saved_simplenet_model_cuda.pt
+mpiexec -np 4 ./simplenet_infer_fortran ../saved_simplenet_model_cuda.pt
 ```
 
 # TODO: Running
diff --git a/examples/3_MultiGPU/simplenet_infer_python.py b/examples/3_MultiGPU/simplenet_infer_python.py
index e669f61a..a2eb46c9 100644
--- a/examples/3_MultiGPU/simplenet_infer_python.py
+++ b/examples/3_MultiGPU/simplenet_infer_python.py
@@ -51,11 +51,12 @@ def deploy(saved_model: str, device: str, batch_size: int = 1) -> torch.Tensor:
 if __name__ == "__main__":
     saved_model_file = "saved_simplenet_model_cuda.pt"
 
-    device_to_run = f"cuda:{MPI.COMM_WORLD.rank}"
+    rank = MPI.COMM_WORLD.rank
+    device_to_run = f"cuda:{rank}"
 
     batch_size_to_run = 1
 
     with torch.no_grad():
         result = deploy(saved_model_file, device_to_run, batch_size_to_run)
 
-    print(result)
+    print(f"{rank}: {result}")

From dcfb153f35138fcd522482c51cbb1a53a493a854 Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Mon, 25 Mar 2024 13:01:06 +0000
Subject: [PATCH 30/42] Setup MPI for simplenet_infer_fortran, too

---
 examples/3_MultiGPU/CMakeLists.txt            |  2 ++
 .../3_MultiGPU/simplenet_infer_fortran.f90    | 20 +++++++++----------
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/examples/3_MultiGPU/CMakeLists.txt b/examples/3_MultiGPU/CMakeLists.txt
index 8b5ac27f..5ae80636 100644
--- a/examples/3_MultiGPU/CMakeLists.txt
+++ b/examples/3_MultiGPU/CMakeLists.txt
@@ -12,8 +12,10 @@ if(NOT CMAKE_BUILD_TYPE)
 endif()
 
 find_package(FTorch)
+find_package(MPI REQUIRED)
 message(STATUS "Building with Fortran PyTorch coupling")
 
 # Fortran example
 add_executable(simplenet_infer_fortran simplenet_infer_fortran.f90)
 target_link_libraries(simplenet_infer_fortran PRIVATE FTorch::ftorch)
+target_link_libraries(simplenet_infer_fortran PRIVATE MPI::MPI_Fortran)
diff --git a/examples/3_MultiGPU/simplenet_infer_fortran.f90 b/examples/3_MultiGPU/simplenet_infer_fortran.f90
index ba176124..38ba3ee9 100644
--- a/examples/3_MultiGPU/simplenet_infer_fortran.f90
+++ b/examples/3_MultiGPU/simplenet_infer_fortran.f90
@@ -28,7 +28,11 @@ program inference
    type(torch_tensor), dimension(1) :: in_tensor
    type(torch_tensor) :: out_tensor
 
-   ! TODO: MPI setup
+   ! MPI configuration
+   integer :: rank, ierr, i
+
+   call mpi_init(ierr)
+   call mpi_comm_rank(mpi_comm_world, rank, ierr)
 
    ! Get TorchScript model file as a command line argument
    num_args = command_argument_count()
@@ -38,22 +42,18 @@ program inference
    end do
 
    ! Initialise data
-   ! TODO: Different inputs for different ranks
-   in_data = [0.0, 1.0, 2.0, 3.0, 4.0]
+   in_data = [(rank + i, i=0,4)]
 
    ! Create Torch input/output tensors from the above arrays
-   ! TODO: Use GPU
-   in_tensor(1) = torch_tensor_from_array(in_data, tensor_layout, torch_kCPU)
-   out_tensor = torch_tensor_from_array(out_data, tensor_layout, torch_kCPU)
+   in_tensor(1) = torch_tensor_from_array(in_data, tensor_layout, torch_kCUDA, device_index=rank)
+   out_tensor = torch_tensor_from_array(out_data, tensor_layout, torch_kCUDA, device_index=rank)
 
    ! Load ML model
-   ! TODO: Use GPU
-   model = torch_module_load(args(1))
+   model = torch_module_load(args(1), device_type=torch_kCUDA, device_index=rank)
 
    ! Infer
    call torch_module_forward(model, in_tensor, n_inputs, out_tensor)
-   ! TODO: Write rank, too
-   write (*,*) out_data(:)
+   write (*,*) rank, ":", out_data(:)
 
    ! Cleanup
    call torch_module_delete(model)

From 392afb9ac7c9a8c33f964cd8da2ba897dca34dd0 Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Mon, 25 Mar 2024 13:06:17 +0000
Subject: [PATCH 31/42] Write formatting for example 3

---
 examples/3_MultiGPU/README.md                   | 17 ++++++++++++-----
 examples/3_MultiGPU/simplenet_infer_fortran.f90 |  3 ++-
 2 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/examples/3_MultiGPU/README.md b/examples/3_MultiGPU/README.md
index d1d51afa..a05890c2 100644
--- a/examples/3_MultiGPU/README.md
+++ b/examples/3_MultiGPU/README.md
@@ -69,10 +69,10 @@ This reads the model in from the TorchScript file and runs it with an different
 tensor on each GPU device: [0.0, 1.0, 2.0, 3.0, 4.0], plus the device index in each
 entry. The result should be (some permutation of):
 ```
-0: tensor([[0, 2, 4, 6, 8]])
-1: tensor([[2, 4, 6, 8, 10]])
-2: tensor([[4, 6, 8, 10, 12]])
-3: tensor([[6, 8, 10, 12, 14]])
+0: tensor([[0., 2., 4., 6., 8.]])
+1: tensor([[ 2., 4.,  6.,  8., 10.]])
+2: tensor([[ 4., 6.,  8., 10., 12.]])
+3: tensor([[ 6., 8., 10., 12., 14.]])
 ```
 
 At this point we no longer require python, so can deactivate the virtual environment:
@@ -97,7 +97,14 @@ processes, if required:
 mpiexec -np 4 ./simplenet_infer_fortran ../saved_simplenet_model_cuda.pt
 ```
 
-# TODO: Running
+This runs the model with the same inputs as described above and should produce (some
+permutation of) the output:
+```
+0: [  0.0,  2.0,  4.0,  6.0,  8.0]
+1: [  2.0,  4.0,  6.0,  8.0, 10.0]
+2: [  4.0,  6.0,  8.0, 10.0, 12.0]
+3: [  6.0,  8.0, 10.0, 12.0, 14.0]
+```
 
 # TODO: Make rather than CMake
 
diff --git a/examples/3_MultiGPU/simplenet_infer_fortran.f90 b/examples/3_MultiGPU/simplenet_infer_fortran.f90
index 38ba3ee9..b1b052f5 100644
--- a/examples/3_MultiGPU/simplenet_infer_fortran.f90
+++ b/examples/3_MultiGPU/simplenet_infer_fortran.f90
@@ -53,7 +53,8 @@ program inference
 
    ! Infer
    call torch_module_forward(model, in_tensor, n_inputs, out_tensor)
-   write (*,*) rank, ":", out_data(:)
+   write (6,100) rank, ": [", out_data(:), "]"
+   100 format(i1,a3,4(f5.1,","),f5.1,a1)
 
    ! Cleanup
    call torch_module_delete(model)

From 9fd3040cfde76cc7e2fc70a1d919e920964e8285 Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Mon, 25 Mar 2024 13:19:14 +0000
Subject: [PATCH 32/42] Add note on building with Make

---
 examples/3_MultiGPU/README.md | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/examples/3_MultiGPU/README.md b/examples/3_MultiGPU/README.md
index a05890c2..544da73f 100644
--- a/examples/3_MultiGPU/README.md
+++ b/examples/3_MultiGPU/README.md
@@ -106,6 +106,9 @@ permutation of) the output:
 3: [  6.0,  8.0, 10.0, 12.0, 14.0]
 ```
 
-# TODO: Make rather than CMake
-
-# TODO: Further options
+Alternatively, we can use `make`, instead of cmake, copying the Makefile over from the
+first example:
+```
+cp ../1_SimpleNet/Makefile .
+```
+See the instructions in that example directory for further details.

From 24d5b6a3f81a3e8670560bacab35e4a92c46471e Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Wed, 27 Mar 2024 13:38:17 +0000
Subject: [PATCH 33/42] Print before and after; mpi_finalise; output on CPU;
 comments

---
 .../3_MultiGPU/simplenet_infer_fortran.f90    | 30 +++++++++++++------
 1 file changed, 21 insertions(+), 9 deletions(-)

diff --git a/examples/3_MultiGPU/simplenet_infer_fortran.f90 b/examples/3_MultiGPU/simplenet_infer_fortran.f90
index b1b052f5..3f9fa721 100644
--- a/examples/3_MultiGPU/simplenet_infer_fortran.f90
+++ b/examples/3_MultiGPU/simplenet_infer_fortran.f90
@@ -41,24 +41,36 @@ program inference
        call get_command_argument(ix,args(ix))
    end do
 
-   ! Initialise data
-   in_data = [(rank + i, i=0,4)]
+   ! Initialise data and print the values used on each MPI rank.
+   in_data = [(rank + i, i = 0, 4)]
+   write (6, 100) rank, in_data(:)
+   100 format("input on rank ", i1,": [", 4(f5.1,","), f5.1,"]")
 
-   ! Create Torch input/output tensors from the above arrays
-   in_tensor(1) = torch_tensor_from_array(in_data, tensor_layout, torch_kCUDA, device_index=rank)
-   out_tensor = torch_tensor_from_array(out_data, tensor_layout, torch_kCUDA, device_index=rank)
+   ! Create Torch input tensor from the above array. We use the torch_kCUDA
+   ! device type and the device index corresponding to the MPI rank.
+   in_tensor(1) = torch_tensor_from_array(in_data, tensor_layout, torch_kCUDA, &
+                                          device_index=rank)
 
-   ! Load ML model
-   model = torch_module_load(args(1), device_type=torch_kCUDA, device_index=rank)
+   ! Create Torch output tensor from the above array. Here we use the
+   ! torch_kCPU device type since the tensor is for output only.
+   out_tensor = torch_tensor_from_array(out_data, tensor_layout, torch_kCPU)
+
+   ! Load ML model. Ensure that the same device type and device index are used
+   ! as for the input data.
+   model = torch_module_load(args(1), device_type=torch_kCUDA,                 &
+                             device_index=rank)
 
    ! Infer
    call torch_module_forward(model, in_tensor, n_inputs, out_tensor)
-   write (6,100) rank, ": [", out_data(:), "]"
-   100 format(i1,a3,4(f5.1,","),f5.1,a1)
+
+   ! Print the values computed on each MPI rank.
+   write (6, 200) rank, out_data(:)
+   200 format("output on rank ", i1,": [", 4(f5.1,","), f5.1,"]")
 
    ! Cleanup
    call torch_module_delete(model)
    call torch_tensor_delete(in_tensor(1))
    call torch_tensor_delete(out_tensor)
+   call mpi_finalize(ierr)
 
 end program inference

From 5ebe8459d74f4d460366ddc286b144ad2ba7b1c9 Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Wed, 27 Mar 2024 15:07:14 +0000
Subject: [PATCH 34/42] Docs: device->device_type for consistency

---
 pages/gpu.md | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/pages/gpu.md b/pages/gpu.md
index f1b9ba5c..6e073a24 100644
--- a/pages/gpu.md
+++ b/pages/gpu.md
@@ -12,20 +12,20 @@ For example, when using
 this can be done by uncommenting the following lines:
 
 ```python
-device = torch.device("cuda")
-trained_model = trained_model.to(device)
+device_type = torch.device("cuda")
+trained_model = trained_model.to(device_type)
 trained_model.eval()
-trained_model_dummy_input_1 = trained_model_dummy_input_1.to(device)
-trained_model_dummy_input_2 = trained_model_dummy_input_2.to(device)
+trained_model_dummy_input_1 = trained_model_dummy_input_1.to(device_type)
+trained_model_dummy_input_2 = trained_model_dummy_input_2.to(device_type)
 ```
 
 > Note: _This code also moves the dummy input tensors to the GPU.
 > Whilst not necessary for saving the model, but the tensors must also be on the GPU
 > to test that the models runs._
 
-2) When calling `torch_tensor_from_array` in Fortran, the device for the input
+2) When calling `torch_tensor_from_array` in Fortran, the device type for the input
    tensor(s) should be set to `torch_kCUDA`, rather than `torch_kCPU`.
-   This ensures that the inputs are on the same device as the model.
+   This ensures that the inputs are on the same device type as the model.
 
-> Note: _You do **not** need to change the device for the output tensors as we
+> Note: _You do **not** need to change the device type for the output tensors as we
 > want them to be on the CPU for subsequent use in Fortran._

From 18fca7b29801a4e86eafd439fae9be67e6f1078d Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Wed, 27 Mar 2024 15:13:12 +0000
Subject: [PATCH 35/42] Add docs on MultiGPU

---
 pages/examples.md |  6 ++++++
 pages/gpu.md      | 11 +++++++++++
 2 files changed, 17 insertions(+)

diff --git a/pages/examples.md b/pages/examples.md
index 963d6374..02788722 100644
--- a/pages/examples.md
+++ b/pages/examples.md
@@ -168,3 +168,9 @@ using ResNet-18 to classify an image.
 As the input to this model is four-dimensional (batch size, colour, x, y),
 care must be taken dealing with the data array in Python and Fortran.
 See [when to transpose arrays](transposing.html) for more details.
+
+#### 3) MultiGPU
+
+[This worked example](https://github.com/Cambridge-ICCS/FTorch/tree/main/examples/3_MultiGPU)
+builds on the SimpleNet demo and shows how to account for the case of sending different
+data to multiple GPU devices.
diff --git a/pages/gpu.md b/pages/gpu.md
index 6e073a24..2b04e111 100644
--- a/pages/gpu.md
+++ b/pages/gpu.md
@@ -29,3 +29,14 @@ trained_model_dummy_input_2 = trained_model_dummy_input_2.to(device_type)
 
 > Note: _You do **not** need to change the device type for the output tensors as we
 > want them to be on the CPU for subsequent use in Fortran._
+
+### Multi-GPU runs
+
+In the case of having multiple GPU devices, as well as setting `torch_kCUDA` as the
+device type for any input tensors and models, you should also specify their device index
+as the GPU device to be targeted. This argument is optional and will default to device
+index 0 if unset.
+
+See the
+[MultiGPU example](https://github.com/Cambridge-ICCS/FTorch/tree/main/examples/3_MultiGPU)
+for a worked example of running with multiple GPUs.

From 475a859ae71e24f2e9bf2d63a34c27b361e89134 Mon Sep 17 00:00:00 2001
From: Joe Wallwork <22053413+jwallwork23@users.noreply.github.com>
Date: Thu, 28 Mar 2024 09:01:11 +0000
Subject: [PATCH 36/42] Update warning text for defaulting to 0

Co-authored-by: jatkinson1000 <109271713+jatkinson1000@users.noreply.github.com>
---
 src/ctorch.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ctorch.cpp b/src/ctorch.cpp
index 2de9d856..c0b2e808 100644
--- a/src/ctorch.cpp
+++ b/src/ctorch.cpp
@@ -40,7 +40,7 @@ const auto get_device(torch_device_t device_type, int device_index)
     return torch::Device(torch::kCPU);
   case torch_kCUDA:
     if (device_index == -1) {
-      std::cerr << "[WARNING]: device index unset, setting to zero"
+      std::cerr << "[WARNING]: device index unset, defaulting to 0"
                 << std::endl;
       device_index = 0;
     }

From 3f264572d6451724aa16507cf9c1e9676c3a3f38 Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Thu, 28 Mar 2024 08:57:14 +0000
Subject: [PATCH 37/42] Mention MPI in requirements

---
 examples/3_MultiGPU/README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/3_MultiGPU/README.md b/examples/3_MultiGPU/README.md
index 544da73f..8eb78155 100644
--- a/examples/3_MultiGPU/README.md
+++ b/examples/3_MultiGPU/README.md
@@ -20,8 +20,9 @@ TorchScript model in inference mode.
 To run this example requires:
 
 - cmake
+- An MPI installation.
 - mpif90
-- FTorch (installed as described in main package with mpicc, mpicxx, and mpif90)
+- FTorch (installed as described in main package)
 - python3
 
 ## Running

From 3dba29a07b657eb520e64c67b51ffb6546af4e18 Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Thu, 28 Mar 2024 08:57:26 +0000
Subject: [PATCH 38/42] Update outputs for example 3

---
 examples/3_MultiGPU/README.md | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/examples/3_MultiGPU/README.md b/examples/3_MultiGPU/README.md
index 8eb78155..4d84f380 100644
--- a/examples/3_MultiGPU/README.md
+++ b/examples/3_MultiGPU/README.md
@@ -101,10 +101,14 @@ mpiexec -np 4 ./simplenet_infer_fortran ../saved_simplenet_model_cuda.pt
 This runs the model with the same inputs as described above and should produce (some
 permutation of) the output:
 ```
-0: [  0.0,  2.0,  4.0,  6.0,  8.0]
-1: [  2.0,  4.0,  6.0,  8.0, 10.0]
-2: [  4.0,  6.0,  8.0, 10.0, 12.0]
-3: [  6.0,  8.0, 10.0, 12.0, 14.0]
+input on rank 0: [  0.0,  1.0,  2.0,  3.0,  4.0]
+input on rank 1: [  1.0,  2.0,  3.0,  4.0,  5.0]
+input on rank 2: [  2.0,  3.0,  4.0,  5.0,  6.0]
+input on rank 3: [  3.0,  4.0,  5.0,  6.0,  7.0]
+output on rank 0: [  0.0,  2.0,  4.0,  6.0,  8.0]
+output on rank 1: [  2.0,  4.0,  6.0,  8.0, 10.0]
+output on rank 2: [  4.0,  6.0,  8.0, 10.0, 12.0]
+output on rank 3: [  6.0,  8.0, 10.0, 12.0, 14.0]
 ```
 
 Alternatively, we can use `make`, instead of cmake, copying the Makefile over from the

From 0e3272e498fd8cecef5c288d2f6e4ab0fe000b54 Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Thu, 28 Mar 2024 09:00:21 +0000
Subject: [PATCH 39/42] Use NP rather than 4 GPUs

---
 examples/3_MultiGPU/README.md | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/examples/3_MultiGPU/README.md b/examples/3_MultiGPU/README.md
index 4d84f380..25acb633 100644
--- a/examples/3_MultiGPU/README.md
+++ b/examples/3_MultiGPU/README.md
@@ -61,10 +61,9 @@ be run using CUDA rather than on CPU.
 
 You can check that everything is working by running the `simplenet_infer_python.py`
 script. It's set up with MPI such that a different GPU device is associated with each
-MPI rank. If you have four GPUs available then use the following, otherwise adjust the
-number of MPI ranks.
+MPI rank. You should substitute `<NP>` with the number of GPUs you wish to run with:
 ```
-mpiexec -np 4 python3 simplenet_infer_python.py
+mpiexec -np <NP> python3 simplenet_infer_python.py
 ```
 This reads the model in from the TorchScript file and runs it with an different input
 tensor on each GPU device: [0.0, 1.0, 2.0, 3.0, 4.0], plus the device index in each
@@ -92,10 +91,10 @@ cmake --build .
 ```
 
 To run the compiled code calling the saved SimpleNet TorchScript from Fortran, run the
-executable with an argument of the saved model file. Again, adjust the number of MPI
-processes, if required:
+executable with an argument of the saved model file. Again, specify the number of MPI
+processes according to the desired number of GPUs:
 ```
-mpiexec -np 4 ./simplenet_infer_fortran ../saved_simplenet_model_cuda.pt
+mpiexec -np <NP> ./simplenet_infer_fortran ../saved_simplenet_model_cuda.pt
 ```
 
 This runs the model with the same inputs as described above and should produce (some

From 99d3b5b458fea4b0c206ee228e75b85154fd258c Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Thu, 28 Mar 2024 09:15:04 +0000
Subject: [PATCH 40/42] Implement SimpleNet in example 3 but with a twist

---
 examples/3_MultiGPU/README.md    |  5 ---
 examples/3_MultiGPU/simplenet.py | 53 ++++++++++++++++++++++++++++++++
 2 files changed, 53 insertions(+), 5 deletions(-)
 create mode 100644 examples/3_MultiGPU/simplenet.py

diff --git a/examples/3_MultiGPU/README.md b/examples/3_MultiGPU/README.md
index 25acb633..c011210b 100644
--- a/examples/3_MultiGPU/README.md
+++ b/examples/3_MultiGPU/README.md
@@ -35,11 +35,6 @@ source venv/bin/activate
 pip install -r requirements.txt
 ```
 
-Copy over the `simplenet.py` script from the earlier example:
-```
-cp ../1_SimpleNet/simplenet.py .
-```
-
 You can check that everything is working by running `simplenet.py`:
 ```
 python3 simplenet.py
diff --git a/examples/3_MultiGPU/simplenet.py b/examples/3_MultiGPU/simplenet.py
new file mode 100644
index 00000000..81f65cbc
--- /dev/null
+++ b/examples/3_MultiGPU/simplenet.py
@@ -0,0 +1,53 @@
+"""Module defining a simple PyTorch 'Net' for coupling to Fortran."""
+
+import torch
+from torch import nn
+
+
+class SimpleNet(nn.Module):
+    """PyTorch module multiplying an input vector by 2."""
+
+    def __init__(
+        self,
+    ) -> None:
+        """
+        Initialize the SimpleNet model.
+
+        Consists of a single Linear layer with weights predefined to
+        multiply the input by 2.
+        """
+        super().__init__()
+        self._fwd_seq = nn.Sequential(
+            nn.Linear(5, 5, bias=False),
+        )
+        with torch.no_grad():
+            self._fwd_seq[0].weight = nn.Parameter(2.0 * torch.eye(5))
+
+    def forward(self, batch: torch.Tensor) -> torch.Tensor:
+        """
+        Pass ``batch`` through the model.
+
+        Parameters
+        ----------
+        batch : torch.Tensor
+            A mini-batch of input vectors of length 5.
+
+        Returns
+        -------
+        torch.Tensor
+            batch scaled by 2.
+
+        """
+        return self._fwd_seq(batch)
+
+
+if __name__ == "__main__":
+    model = SimpleNet()
+    model.eval()
+
+    input_tensor = torch.Tensor([0.0, 1.0, 2.0, 3.0, 4.0])
+    input_tensor_gpu = input_tensor.to(torch.device("cuda"))
+
+    print(f"SimpleNet forward pass on CUDA device {input_tensor_gpu.get_device()}")
+    with torch.no_grad():
+        print(model(input_tensor_gpu))

From 99002d5bb4a5cd22655cc99289b21805662ad1bd Mon Sep 17 00:00:00 2001
From: Joe Wallwork <jw2423@cam.ac.uk>
Date: Thu, 28 Mar 2024 09:22:51 +0000
Subject: [PATCH 41/42] Add code snippets for multi-GPU doc section

---
 pages/gpu.md | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/pages/gpu.md b/pages/gpu.md
index 2b04e111..0ae3c1fd 100644
--- a/pages/gpu.md
+++ b/pages/gpu.md
@@ -37,6 +37,20 @@ device type for any input tensors and models, you should also specify their devi
 as the GPU device to be targeted. This argument is optional and will default to device
 index 0 if unset.
 
+For example, the following code snippet sets up a Torch tensor with GPU device index 2:
+
+```fortran
+device_index = 2
+in_tensor(1) = torch_tensor_from_array(in_data, tensor_layout, torch_kCUDA,    &
+                                       device_index=device_index)
+```
+
+Whereas the following code snippet sets up a Torch tensor with (default) device index 0:
+
+```fortran
+in_tensor(1) = torch_tensor_from_array(in_data, tensor_layout, torch_kCUDA)
+```
+
 See the
 [MultiGPU example](https://github.com/Cambridge-ICCS/FTorch/tree/main/examples/3_MultiGPU)
 for a worked example of running with multiple GPUs.

From e2b68bddab3ce9f263b797558e15d8c0c3b48d2c Mon Sep 17 00:00:00 2001
From: Jack Atkinson <jwa34@cam.ac.uk>
Date: Thu, 28 Mar 2024 09:07:25 -0600
Subject: [PATCH 42/42] Add note about multiple GPU support to README.md.

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index efb52443..f0a79d43 100644
--- a/README.md
+++ b/README.md
@@ -187,7 +187,8 @@ adaptations to the code:
 2. When using FTorch in Fortran, set the device for the input
    tensor(s) to `torch_kCUDA`, rather than `torch_kCPU`.
 
-For detailed guidance about running on GPU please see the
+For detailed guidance about running on GPU, including instructions for using multiple
+devices, please see the
 [online GPU documentation](https://cambridge-iccs.github.io/FTorch/page/gpu.html).
 
 ## Examples