From b694832789c7e32c3d011418e067216dca8eabcb Mon Sep 17 00:00:00 2001
From: Onur Berk Töre
Date: Sat, 10 Jun 2023 12:23:22 +0300
Subject: [PATCH 01/13] Fix: amp_recipe.py fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Onur Berk Töre
---
 recipes_source/recipes/amp_recipe.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/recipes_source/recipes/amp_recipe.py b/recipes_source/recipes/amp_recipe.py
index 2cdd37c803..99c6fa2e0e 100644
--- a/recipes_source/recipes/amp_recipe.py
+++ b/recipes_source/recipes/amp_recipe.py
@@ -79,8 +79,8 @@ def make_model(in_size, out_size, num_layers):
 # Creates data in default precision.
 # The same data is used for both default and mixed precision trials below.
 # You don't need to manually change inputs' ``dtype`` when enabling mixed precision.
-data = [torch.randn(batch_size, in_size, device="cuda") for _ in range(num_batches)]
-targets = [torch.randn(batch_size, out_size, device="cuda") for _ in range(num_batches)]
+data = [torch.randn(batch_size, in_size, device=torch.device('cuda')) for _ in range(num_batches)]
+targets = [torch.randn(batch_size, out_size, device=torch.device('cuda')) for _ in range(num_batches)]
 
 loss_fn = torch.nn.MSELoss().cuda()
 
@@ -116,7 +116,7 @@ def make_model(in_size, out_size, num_layers):
 for epoch in range(0): # 0 epochs, this section is for illustration only
     for input, target in zip(data, targets):
         # Runs the forward pass under ``autocast``.
-        with torch.autocast(device_type='cuda', dtype=torch.float16):
+        with torch.autocast(device_type=torch.device('cuda'), dtype=torch.float16):
             output = net(input)
             # output is float16 because linear layers ``autocast`` to float16.
             assert output.dtype is torch.float16
@@ -151,7 +151,7 @@ def make_model(in_size, out_size, num_layers):
 
 for epoch in range(0): # 0 epochs, this section is for illustration only
     for input, target in zip(data, targets):
-        with torch.autocast(device_type='cuda', dtype=torch.float16):
+        with torch.autocast(device_type=torch.device('cuda'), dtype=torch.float16):
             output = net(input)
             loss = loss_fn(output, target)
 
@@ -184,7 +184,7 @@ def make_model(in_size, out_size, num_layers):
 start_timer()
 for epoch in range(epochs):
     for input, target in zip(data, targets):
-        with torch.autocast(device_type='cuda', dtype=torch.float16, enabled=use_amp):
+        with torch.autocast(device_type=torch.device('cuda'), dtype=torch.float16, enabled=use_amp):
             output = net(input)
             loss = loss_fn(output, target)
             scaler.scale(loss).backward()
@@ -202,7 +202,7 @@ def make_model(in_size, out_size, num_layers):
 
 for epoch in range(0): # 0 epochs, this section is for illustration only
     for input, target in zip(data, targets):
-        with torch.autocast(device_type='cuda', dtype=torch.float16):
+        with torch.autocast(device_type=torch.device('cuda'), dtype=torch.float16):
             output = net(input)
             loss = loss_fn(output, target)
             scaler.scale(loss).backward()
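Note on PATCH 01: ``torch.autocast`` documents ``device_type`` as a string such
as ``'cuda'`` or ``'cpu'``, not a ``torch.device`` object, which is why later
patches in this series back this change out again (PATCH 02 drops the argument
and PATCH 13 settles on a plain string). A minimal sketch of the string form,
with a CPU fallback added here purely for illustration::

    import torch

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    x = torch.randn(8, 8, device=device)
    w = torch.randn(8, 8, device=device)

    # float16 autocast is a GPU feature, so only enable it on CUDA.
    with torch.autocast(device_type=device, dtype=torch.float16,
                        enabled=(device == 'cuda')):
        y = x @ w  # the matmul runs in float16 while autocast is active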
From aa6f573e0428eb98a723a057fdc1e18a0005eb4d Mon Sep 17 00:00:00 2001
From: Onur Berk Töre
Date: Sat, 10 Jun 2023 12:27:35 +0300
Subject: [PATCH 02/13] Fix: amp_recipe fixed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Onur Berk Töre
---
 recipes_source/recipes/amp_recipe.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/recipes_source/recipes/amp_recipe.py b/recipes_source/recipes/amp_recipe.py
index 99c6fa2e0e..8fb024f229 100644
--- a/recipes_source/recipes/amp_recipe.py
+++ b/recipes_source/recipes/amp_recipe.py
@@ -76,11 +76,14 @@ def make_model(in_size, out_size, num_layers):
 num_batches = 50
 epochs = 3
 
+device = torch.device('cuda')
+torch.set_default_device(device)
+
 # Creates data in default precision.
 # The same data is used for both default and mixed precision trials below.
 # You don't need to manually change inputs' ``dtype`` when enabling mixed precision.
-data = [torch.randn(batch_size, in_size, device=torch.device('cuda')) for _ in range(num_batches)]
-targets = [torch.randn(batch_size, out_size, device=torch.device('cuda')) for _ in range(num_batches)]
+data = [torch.randn(batch_size, in_size) for _ in range(num_batches)]
+targets = [torch.randn(batch_size, out_size) for _ in range(num_batches)]
 
 loss_fn = torch.nn.MSELoss().cuda()
 
@@ -116,7 +119,7 @@ def make_model(in_size, out_size, num_layers):
 for epoch in range(0): # 0 epochs, this section is for illustration only
     for input, target in zip(data, targets):
         # Runs the forward pass under ``autocast``.
-        with torch.autocast(device_type=torch.device('cuda'), dtype=torch.float16):
+        with torch.autocast(dtype=torch.float16):
             output = net(input)
             # output is float16 because linear layers ``autocast`` to float16.
             assert output.dtype is torch.float16
@@ -151,7 +154,7 @@ def make_model(in_size, out_size, num_layers):
 
 for epoch in range(0): # 0 epochs, this section is for illustration only
     for input, target in zip(data, targets):
-        with torch.autocast(device_type=torch.device('cuda'), dtype=torch.float16):
+        with torch.autocast(dtype=torch.float16):
             output = net(input)
             loss = loss_fn(output, target)
 
@@ -184,7 +187,7 @@ def make_model(in_size, out_size, num_layers):
 start_timer()
 for epoch in range(epochs):
     for input, target in zip(data, targets):
-        with torch.autocast(device_type=torch.device('cuda'), dtype=torch.float16, enabled=use_amp):
+        with torch.autocast(dtype=torch.float16, enabled=use_amp):
             output = net(input)
             loss = loss_fn(output, target)
             scaler.scale(loss).backward()
@@ -202,7 +205,7 @@ def make_model(in_size, out_size, num_layers):
 
 for epoch in range(0): # 0 epochs, this section is for illustration only
     for input, target in zip(data, targets):
-        with torch.autocast(device_type=torch.device('cuda'), dtype=torch.float16):
+        with torch.autocast(dtype=torch.float16):
             output = net(input)
             loss = loss_fn(output, target)
             scaler.scale(loss).backward()

From a92053403fd7c91cc34599421020dfd047f8a15c Mon Sep 17 00:00:00 2001
From: Onur Berk Töre
Date: Sat, 10 Jun 2023 12:34:05 +0300
Subject: [PATCH 03/13] Fix: beginner/examples_autograd/polynomial_autograd.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Onur Berk Töre
---
 recipes_source/recipes/amp_recipe.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes_source/recipes/amp_recipe.py b/recipes_source/recipes/amp_recipe.py
index 8fb024f229..94a68285f9 100644
--- a/recipes_source/recipes/amp_recipe.py
+++ b/recipes_source/recipes/amp_recipe.py
@@ -76,7 +76,7 @@ def make_model(in_size, out_size, num_layers):
 num_batches = 50
 epochs = 3
 
-device = torch.device('cuda')
+device = 'cuda' if torch.cuda.is_available() else 'cpu'
 torch.set_default_device(device)
 
 # Creates data in default precision.
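The idiom PATCH 02 and PATCH 03 converge on, ``torch.set_default_device``
(available since PyTorch 2.0) plus factory calls without an explicit
``device=``, behaves as in this standalone sketch (the tensor names are
illustrative only)::

    import torch

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    torch.set_default_device(device)

    a = torch.randn(4, 4)                # allocated on the default device
    b = torch.randn(4, 4, device='cpu')  # an explicit device= still wins
    print(a.device, b.device)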
From 340cbd9b2e348e4d60ab70fc97175bbe1ca1b26f Mon Sep 17 00:00:00 2001
From: Onur Berk Töre
Date: Sat, 10 Jun 2023 16:57:12 +0300
Subject: [PATCH 04/13] Polynomial autograd fixed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Onur Berk Töre
---
 .../examples_autograd/polynomial_autograd.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/beginner_source/examples_autograd/polynomial_autograd.py b/beginner_source/examples_autograd/polynomial_autograd.py
index 05744ff560..9c992d2ca4 100755
--- a/beginner_source/examples_autograd/polynomial_autograd.py
+++ b/beginner_source/examples_autograd/polynomial_autograd.py
@@ -18,23 +18,23 @@
 import math
 
 dtype = torch.float
-device = torch.device("cpu")
-# device = torch.device("cuda:0")  # Uncomment this to run on GPU
+device = "cuda" if torch.cuda.is_available() else "cpu"
+torch.set_default_device(device)
 
 # Create Tensors to hold input and outputs.
 # By default, requires_grad=False, which indicates that we do not need to
 # compute gradients with respect to these Tensors during the backward pass.
-x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
+x = torch.linspace(-math.pi, math.pi, 2000, dtype=dtype)
 y = torch.sin(x)
 
 # Create random Tensors for weights. For a third order polynomial, we need
 # 4 weights: y = a + b x + c x^2 + d x^3
 # Setting requires_grad=True indicates that we want to compute gradients with
 # respect to these Tensors during the backward pass.
-a = torch.randn((), device=device, dtype=dtype, requires_grad=True)
-b = torch.randn((), device=device, dtype=dtype, requires_grad=True)
-c = torch.randn((), device=device, dtype=dtype, requires_grad=True)
-d = torch.randn((), device=device, dtype=dtype, requires_grad=True)
+a = torch.randn((), dtype=dtype, requires_grad=True)
+b = torch.randn((), dtype=dtype, requires_grad=True)
+c = torch.randn((), dtype=dtype, requires_grad=True)
+d = torch.randn((), dtype=dtype, requires_grad=True)
 
 learning_rate = 1e-6
 for t in range(2000):

From 5f3b837f534a637d68e584148286c2ab72af2957 Mon Sep 17 00:00:00 2001
From: Onur Berk Töre
Date: Sat, 10 Jun 2023 17:01:11 +0300
Subject: [PATCH 05/13] Fix tuning_guide
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Onur Berk Töre
---
 recipes_source/recipes/tuning_guide.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/recipes_source/recipes/tuning_guide.py b/recipes_source/recipes/tuning_guide.py
index 7c8aa135b1..0f82fb76d3 100644
--- a/recipes_source/recipes/tuning_guide.py
+++ b/recipes_source/recipes/tuning_guide.py
@@ -357,7 +357,7 @@ def fused_gelu(x):
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 # Instead of calling ``torch.rand(size).cuda()`` to generate a random tensor,
 # produce the output directly on the target device:
-# ``torch.rand(size, device=torch.device('cuda'))``.
+# ``torch.rand(size, device='cuda')``.
 #
 # This is applicable to all functions which create new tensors and accept
 # ``device`` argument:
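The tuning-guide hunk above is about avoiding a round trip through host
memory; as a standalone comparison (a sketch, assuming a CUDA device is
present)::

    import torch

    size = (1024, 1024)

    t_slow = torch.rand(size).cuda()          # CPU allocation, then a copy to GPU
    t_fast = torch.rand(size, device='cuda')  # allocated directly on the GPU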
From 7173e8b01cf297b37348e6ae0956527cf2cc0db0 Mon Sep 17 00:00:00 2001
From: Onur Berk Töre
Date: Sat, 10 Jun 2023 17:05:52 +0300
Subject: [PATCH 06/13] Fix nestedtensor
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Onur Berk Töre
---
 prototype_source/nestedtensor.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/prototype_source/nestedtensor.py b/prototype_source/nestedtensor.py
index 0d2898cc4a..582f19c433 100644
--- a/prototype_source/nestedtensor.py
+++ b/prototype_source/nestedtensor.py
@@ -25,6 +25,7 @@
 import torch.nn.functional as F
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+torch.set_default_device(device)
 
 ######################################################################
 # NestedTensor Initialization
@@ -35,7 +36,7 @@
 # From the Python frontend, a nestedtensor can be created from a list of tensors.
 # We denote nt[i] as the ith tensor component of a nestedtensor.
 nt = torch.nested.nested_tensor([torch.arange(12).reshape(
-    2, 6), torch.arange(18).reshape(3, 6)], dtype=torch.float, device=device)
+    2, 6), torch.arange(18).reshape(3, 6)], dtype=torch.float)
 print(f"{nt=}")
 
 ######################################################################
@@ -111,7 +112,7 @@
 # Applying the operation on a nestedtensor is equivalent to
 # applying the operation to the underlying tensor components,
 # with the result being a nestedtensor as well.
-nt_mm = torch.nested.nested_tensor([torch.randn((2, 3, 4)), torch.randn((2, 3, 5))], device=device)
+nt_mm = torch.nested.nested_tensor([torch.randn((2, 3, 4)), torch.randn((2, 3, 5))])
 nt3 = torch.matmul(nt_transposed, nt_mm)
 print(f"Result of Matmul:\n {nt3}")
 
@@ -318,7 +319,7 @@ def mha_padded(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, nhea
 
     # Have to manipulate masks in order to apply them to the attention weights
     key_padding_mask = attn_mask_q.view(N, 1, 1, L_t).expand(-1, nheads, -1, -1).reshape(N*nheads, 1, L_t).to(device=device)
-    attn_mask = torch.zeros(key_padding_mask.shape, device=device, dtype=torch.float32)
+    attn_mask = torch.zeros(key_padding_mask.shape, dtype=torch.float32)
     attn_mask = attn_mask.masked_fill_(key_padding_mask, float("-inf"))
 
     # Zero out the attention weights where the mask is True by adding -inf prior to softmax
@@ -384,10 +385,10 @@ def zipf_sentence_lengths(alpha: float, batch_size: int) -> np.ndarray:
 # create inputs
 
 # create parameters
-W_q, b_q = torch.randn((E_total, E_q), device=device), torch.randn(E_total, device=device)
-W_k, b_k = torch.randn((E_total, E_k), device=device), torch.randn(E_total, device=device)
-W_v, b_v = torch.randn((E_total, E_v), device=device), torch.randn(E_total, device=device)
-W_out, b_out = torch.randn((E_out, E_total), device=device), torch.randn(E_out, device=device)
+W_q, b_q = torch.randn((E_total, E_q)), torch.randn(E_total)
+W_k, b_k = torch.randn((E_total, E_k)), torch.randn(E_total)
+W_v, b_v = torch.randn((E_total, E_v)), torch.randn(E_total)
+W_out, b_out = torch.randn((E_out, E_total)), torch.randn(E_out)
 
 # create nested input
 queries = []
@@ -396,9 +397,9 @@ def zipf_sentence_lengths(alpha: float, batch_size: int) -> np.ndarray:
 for i in range(N):
     l = sentence_lengths[i]
     s = l
-    queries.append(torch.randn((l, E_q), device=device))
-    keys   .append(torch.randn((s, E_k), device=device))
-    values .append(torch.randn((s, E_v), device=device))
+    queries.append(torch.randn((l, E_q)))
+    keys   .append(torch.randn((s, E_k)))
+    values .append(torch.randn((s, E_v)))
 query = torch.nested.nested_tensor(queries)
 key = torch.nested.nested_tensor(keys)
 value = torch.nested.nested_tensor(values)
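For reference, nested tensor construction also accepts an explicit
``device=``, the form this patch removes in favor of the default device and
that PATCH 11 below reinstates; a minimal sketch::

    import torch

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    nt = torch.nested.nested_tensor(
        [torch.arange(12, dtype=torch.float).reshape(2, 6),
         torch.arange(18, dtype=torch.float).reshape(3, 6)],
        device=device)
    print(nt)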
From cf21c1de062fb57413c7590864db7434b0135647 Mon Sep 17 00:00:00 2001
From: Onur Berk Töre
Date: Sat, 10 Jun 2023 17:26:35 +0300
Subject: [PATCH 07/13] Fix polynomial tensor
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Onur Berk Töre
---
 advanced_source/cpp_extension.rst | 1 +
 prototype_source/nestedtensor.py  | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/advanced_source/cpp_extension.rst b/advanced_source/cpp_extension.rst
index cb0e990797..6f6fa02f0e 100644
--- a/advanced_source/cpp_extension.rst
+++ b/advanced_source/cpp_extension.rst
@@ -553,6 +553,7 @@ creation time or using ``.to(cuda_device)`` after creation::
 
     import torch
     assert torch.cuda.is_available()
+    device = "cuda" if torch.cuda.is_available() else "cpu"
     cuda_device = torch.device("cuda")  # device object representing GPU
 
     batch_size = 16
diff --git a/prototype_source/nestedtensor.py b/prototype_source/nestedtensor.py
index 582f19c433..15bfb51b32 100644
--- a/prototype_source/nestedtensor.py
+++ b/prototype_source/nestedtensor.py
@@ -454,7 +454,7 @@ def zipf_sentence_lengths(alpha: float, batch_size: int) -> np.ndarray:
 
 # embeddings are assumed to be the same
 E = E_total
-mha_lib = torch.nn.MultiheadAttention(E, nheads, batch_first=True, device=device)
+mha_lib = torch.nn.MultiheadAttention(E, nheads, batch_first=True)
 mha_lib.eval()
 
 ######################################################################

From 3c19f993f6be4f2a209adeee56f3b88ee0b03dce Mon Sep 17 00:00:00 2001
From: Onur Berk Töre
Date: Sat, 10 Jun 2023 17:30:42 +0300
Subject: [PATCH 08/13] Fix neural-style tutorial
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Onur Berk Töre
---
 advanced_source/neural_style_tutorial.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/advanced_source/neural_style_tutorial.py b/advanced_source/neural_style_tutorial.py
index 3d84fc508b..d606fa09be 100644
--- a/advanced_source/neural_style_tutorial.py
+++ b/advanced_source/neural_style_tutorial.py
@@ -72,6 +72,7 @@
 # method is used to move tensors or modules to a desired device.
 
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+torch.set_default_device(device)
 
 ######################################################################
 # Loading the Images
@@ -107,7 +108,7 @@ def image_loader(image_name):
     image = Image.open(image_name)
     # fake batch dimension required to fit network's input dimensions
     image = loader(image).unsqueeze(0)
-    return image.to(device, torch.float)
+    return image
 
 
 style_img = image_loader("./data/images/neural-style/picasso.jpg")
@@ -263,7 +264,7 @@ def forward(self, input):
 # network to evaluation mode using ``.eval()``.
 #
 
-cnn = models.vgg19(pretrained=True).features.to(device).eval()
+cnn = models.vgg19(pretrained=True).features.eval()
 
 
 
@@ -273,8 +274,8 @@ def forward(self, input):
 # We will use them to normalize the image before sending it into the network.
 #
 
-cnn_normalization_mean = torch.tensor([0.485, 0.456, 0.406]).to(device)
-cnn_normalization_std = torch.tensor([0.229, 0.224, 0.225]).to(device)
+cnn_normalization_mean = torch.tensor([0.485, 0.456, 0.406])
+cnn_normalization_std = torch.tensor([0.229, 0.224, 0.225])
 
 # create a module to normalize input image so we can easily put it in a
 # ``nn.Sequential``
@@ -310,7 +311,7 @@ def get_style_model_and_losses(cnn, normalization_mean, normalization_std,
                                content_layers=content_layers_default,
                                style_layers=style_layers_default):
     # normalization module
-    normalization = Normalization(normalization_mean, normalization_std).to(device)
+    normalization = Normalization(normalization_mean, normalization_std)
 
     # just in order to have an iterable access to or list of content/style
     # losses
@@ -375,7 +376,7 @@ def get_style_model_and_losses(cnn, normalization_mean, normalization_std,
 #
 # ::
 #
-#    input_img = torch.randn(content_img.data.size(), device=device)
+#    input_img = torch.randn(content_img.data.size())
 
 # add the original input image to the figure:
 plt.figure()
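A detail PATCH 08 leans on and PATCH 09 revisits: ``Tensor.to`` accepts a
device and a dtype in one call, which is how ``image_loader`` moves and casts
in a single step. A standalone sketch with an illustrative tensor::

    import torch

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    t = torch.randint(0, 256, (3, 128, 128), dtype=torch.uint8)
    t = t.to(device, torch.float)  # moves to ``device`` and casts to float32
    print(t.device, t.dtype)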
From 671960cd883d798b3267e6c1dda774a2d35261f1 Mon Sep 17 00:00:00 2001
From: Onur Berk Töre
Date: Sat, 10 Jun 2023 18:38:41 +0300
Subject: [PATCH 09/13] Fix cpp_extension.rst
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Onur Berk Töre
---
 advanced_source/cpp_extension.rst        | 1 -
 advanced_source/neural_style_tutorial.py | 5 ++---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/advanced_source/cpp_extension.rst b/advanced_source/cpp_extension.rst
index 6f6fa02f0e..cb0e990797 100644
--- a/advanced_source/cpp_extension.rst
+++ b/advanced_source/cpp_extension.rst
@@ -553,7 +553,6 @@ creation time or using ``.to(cuda_device)`` after creation::
 
     import torch
     assert torch.cuda.is_available()
-    device = "cuda" if torch.cuda.is_available() else "cpu"
     cuda_device = torch.device("cuda")  # device object representing GPU
 
     batch_size = 16
diff --git a/advanced_source/neural_style_tutorial.py b/advanced_source/neural_style_tutorial.py
index d606fa09be..9c7f0f8487 100644
--- a/advanced_source/neural_style_tutorial.py
+++ b/advanced_source/neural_style_tutorial.py
@@ -14,7 +14,7 @@
 developed by Leon A. Gatys, Alexander S. Ecker and Matthias Bethge.
 Neural-Style, or Neural-Transfer, allows you to take an image and
 reproduce it with a new artistic style. The algorithm takes three images,
-an input image, a content-image, and a style-image, and changes the input 
+an input image, a content-image, and a style-image, and changes the input
 to resemble the content of the content-image and the artistic style of the
 style-image.
 
@@ -107,8 +107,7 @@ def image_loader(image_name):
     image = Image.open(image_name)
     # fake batch dimension required to fit network's input dimensions
-    image = loader(image).unsqueeze(0)
-    return image
+    return image.to(device, torch.float)
 
 
 style_img = image_loader("./data/images/neural-style/picasso.jpg")
 
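The ``cpp_extension.rst`` line removed here was redundant rather than wrong:
the snippet asserts ``torch.cuda.is_available()`` on the line above it, so the
CPU branch of the fallback could never be taken. After the revert the snippet
keeps its original CUDA-only contract (condensed sketch; the final tensor is
illustrative)::

    import torch

    assert torch.cuda.is_available()
    cuda_device = torch.device("cuda")  # device object representing GPU
    x = torch.randn(16, 32, device=cuda_device)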
From 2a34c3ccc94cff3db37e21413927c23b7c1fc7bf Mon Sep 17 00:00:00 2001
From: Onur Berk Töre
Date: Sat, 10 Jun 2023 18:39:25 +0300
Subject: [PATCH 10/13] Fix neural_style_tutorial
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Onur Berk Töre
---
 advanced_source/neural_style_tutorial.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/advanced_source/neural_style_tutorial.py b/advanced_source/neural_style_tutorial.py
index 9c7f0f8487..9b76b1bbcf 100644
--- a/advanced_source/neural_style_tutorial.py
+++ b/advanced_source/neural_style_tutorial.py
@@ -107,6 +107,7 @@ def image_loader(image_name):
     image = Image.open(image_name)
     # fake batch dimension required to fit network's input dimensions
+    image = loader(image).unsqueeze(0)
     return image.to(device, torch.float)
 
 
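Taken together, PATCH 09 and PATCH 10 leave ``image_loader`` in the following
shape (reassembled here for readability; ``imsize`` and ``loader`` are the
tutorial's own definitions, reproduced as an assumption)::

    from PIL import Image
    import torch
    import torchvision.transforms as transforms

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    imsize = 512 if torch.cuda.is_available() else 128
    loader = transforms.Compose([transforms.Resize(imsize), transforms.ToTensor()])

    def image_loader(image_name):
        image = Image.open(image_name)
        # fake batch dimension required to fit network's input dimensions
        image = loader(image).unsqueeze(0)
        return image.to(device, torch.float)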
From bc65968d1220d9257294fa96f48521fdee95688f Mon Sep 17 00:00:00 2001
From: Onur Berk Töre
Date: Sat, 10 Jun 2023 18:54:39 +0300
Subject: [PATCH 11/13] Fix nested style
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Onur Berk Töre
---
 prototype_source/nestedtensor.py | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/prototype_source/nestedtensor.py b/prototype_source/nestedtensor.py
index 15bfb51b32..4385e963d3 100644
--- a/prototype_source/nestedtensor.py
+++ b/prototype_source/nestedtensor.py
@@ -25,7 +25,6 @@
 import torch.nn.functional as F
 
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-torch.set_default_device(device)
 
 ######################################################################
 # NestedTensor Initialization
@@ -36,7 +35,7 @@
 # From the Python frontend, a nestedtensor can be created from a list of tensors.
 # We denote nt[i] as the ith tensor component of a nestedtensor.
 nt = torch.nested.nested_tensor([torch.arange(12).reshape(
-    2, 6), torch.arange(18).reshape(3, 6)], dtype=torch.float)
+    2, 6), torch.arange(18).reshape(3, 6)], dtype=torch.float, device=device)
 print(f"{nt=}")
 
 ######################################################################
@@ -112,7 +111,7 @@
 # Applying the operation on a nestedtensor is equivalent to
 # applying the operation to the underlying tensor components,
 # with the result being a nestedtensor as well.
-nt_mm = torch.nested.nested_tensor([torch.randn((2, 3, 4)), torch.randn((2, 3, 5))])
+nt_mm = torch.nested.nested_tensor([torch.randn((2, 3, 4)), torch.randn((2, 3, 5))], device=device)
 nt3 = torch.matmul(nt_transposed, nt_mm)
 print(f"Result of Matmul:\n {nt3}")
 
@@ -319,7 +318,7 @@ def mha_padded(query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, nhea
 
     # Have to manipulate masks in order to apply them to the attention weights
     key_padding_mask = attn_mask_q.view(N, 1, 1, L_t).expand(-1, nheads, -1, -1).reshape(N*nheads, 1, L_t).to(device=device)
-    attn_mask = torch.zeros(key_padding_mask.shape, dtype=torch.float32)
+    attn_mask = torch.zeros(key_padding_mask.shape, device=device, dtype=torch.float32)
     attn_mask = attn_mask.masked_fill_(key_padding_mask, float("-inf"))
 
     # Zero out the attention weights where the mask is True by adding -inf prior to softmax
@@ -385,10 +384,10 @@ def zipf_sentence_lengths(alpha: float, batch_size: int) -> np.ndarray:
 # create inputs
 
 # create parameters
-W_q, b_q = torch.randn((E_total, E_q)), torch.randn(E_total)
-W_k, b_k = torch.randn((E_total, E_k)), torch.randn(E_total)
-W_v, b_v = torch.randn((E_total, E_v)), torch.randn(E_total)
-W_out, b_out = torch.randn((E_out, E_total)), torch.randn(E_out)
+W_q, b_q = torch.randn((E_total, E_q), device=device), torch.randn(E_total, device=device)
+W_k, b_k = torch.randn((E_total, E_k), device=device), torch.randn(E_total, device=device)
+W_v, b_v = torch.randn((E_total, E_v), device=device), torch.randn(E_total, device=device)
+W_out, b_out = torch.randn((E_out, E_total), device=device), torch.randn(E_out, device=device)
 
 # create nested input
 queries = []
@@ -397,9 +396,9 @@ def zipf_sentence_lengths(alpha: float, batch_size: int) -> np.ndarray:
 for i in range(N):
     l = sentence_lengths[i]
     s = l
-    queries.append(torch.randn((l, E_q)))
-    keys   .append(torch.randn((s, E_k)))
-    values .append(torch.randn((s, E_v)))
+    queries.append(torch.randn((l, E_q), device=device))
+    keys   .append(torch.randn((s, E_k), device=device))
+    values .append(torch.randn((s, E_v), device=device))
 query = torch.nested.nested_tensor(queries)
 key = torch.nested.nested_tensor(keys)
 value = torch.nested.nested_tensor(values)
@@ -454,7 +453,7 @@ def zipf_sentence_lengths(alpha: float, batch_size: int) -> np.ndarray:
 
 # embeddings are assumed to be the same
 E = E_total
-mha_lib = torch.nn.MultiheadAttention(E, nheads, batch_first=True)
+mha_lib = torch.nn.MultiheadAttention(E, nheads, batch_first=True, device=device) 
 mha_lib.eval()
 
 ######################################################################

From b1a589de64da6f06d8c2c16eaec8eb3abff481ea Mon Sep 17 00:00:00 2001
From: Onur Berk Töre
Date: Sat, 10 Jun 2023 18:55:08 +0300
Subject: [PATCH 12/13] fix
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Onur Berk Töre
---
 prototype_source/nestedtensor.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/prototype_source/nestedtensor.py b/prototype_source/nestedtensor.py
index 4385e963d3..0d2898cc4a 100644
--- a/prototype_source/nestedtensor.py
+++ b/prototype_source/nestedtensor.py
@@ -453,7 +453,7 @@ def zipf_sentence_lengths(alpha: float, batch_size: int) -> np.ndarray:
 
 # embeddings are assumed to be the same
 E = E_total
-mha_lib = torch.nn.MultiheadAttention(E, nheads, batch_first=True, device=device) 
+mha_lib = torch.nn.MultiheadAttention(E, nheads, batch_first=True, device=device)
 mha_lib.eval()
 
 ######################################################################

From a76c9546033bd0da4306698966a39fd8a5f6f1a0 Mon Sep 17 00:00:00 2001
From: Onur Berk Töre
Date: Sat, 10 Jun 2023 19:05:43 +0300
Subject: [PATCH 13/13] Fix amp
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: Onur Berk Töre
---
 recipes_source/recipes/amp_recipe.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/recipes_source/recipes/amp_recipe.py b/recipes_source/recipes/amp_recipe.py
index 94a68285f9..141bc41a03 100644
--- a/recipes_source/recipes/amp_recipe.py
+++ b/recipes_source/recipes/amp_recipe.py
@@ -119,7 +119,7 @@ def make_model(in_size, out_size, num_layers):
 for epoch in range(0): # 0 epochs, this section is for illustration only
     for input, target in zip(data, targets):
         # Runs the forward pass under ``autocast``.
-        with torch.autocast(dtype=torch.float16):
+        with torch.autocast(device_type=device, dtype=torch.float16):
             output = net(input)
             # output is float16 because linear layers ``autocast`` to float16.
             assert output.dtype is torch.float16
@@ -154,7 +154,7 @@ def make_model(in_size, out_size, num_layers):
 
 for epoch in range(0): # 0 epochs, this section is for illustration only
     for input, target in zip(data, targets):
-        with torch.autocast(dtype=torch.float16):
+        with torch.autocast(device_type=device, dtype=torch.float16):
             output = net(input)
             loss = loss_fn(output, target)
 
@@ -187,7 +187,7 @@ def make_model(in_size, out_size, num_layers):
 start_timer()
 for epoch in range(epochs):
     for input, target in zip(data, targets):
-        with torch.autocast(dtype=torch.float16, enabled=use_amp):
+        with torch.autocast(device_type=device, dtype=torch.float16, enabled=use_amp):
             output = net(input)
             loss = loss_fn(output, target)
             scaler.scale(loss).backward()
@@ -205,7 +205,7 @@ def make_model(in_size, out_size, num_layers):
 
 for epoch in range(0): # 0 epochs, this section is for illustration only
     for input, target in zip(data, targets):
-        with torch.autocast(dtype=torch.float16):
+        with torch.autocast(device_type=device, dtype=torch.float16):
             output = net(input)
             loss = loss_fn(output, target)
             scaler.scale(loss).backward()
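One API detail worth keeping in mind when reading PATCH 02 and PATCH 13 side
by side: ``device_type`` is the first positional parameter of
``torch.autocast`` and has no default, so the intermediate form
``torch.autocast(dtype=torch.float16)`` raises a ``TypeError``; passing the
device string back in, as PATCH 13 does, is what makes the recipe run again.
The shape the series settles on, condensed into one runnable sketch (``net``,
``opt``, and the sizes are stand-ins for the recipe's own definitions)::

    import torch

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    torch.set_default_device(device)

    net = torch.nn.Linear(64, 64)   # parameters land on the default device
    opt = torch.optim.SGD(net.parameters(), lr=1e-3)
    scaler = torch.cuda.amp.GradScaler(enabled=(device == 'cuda'))
    loss_fn = torch.nn.MSELoss()

    data = [torch.randn(16, 64) for _ in range(4)]
    targets = [torch.randn(16, 64) for _ in range(4)]

    for input, target in zip(data, targets):
        with torch.autocast(device_type=device, dtype=torch.float16,
                            enabled=(device == 'cuda')):
            output = net(input)
            loss = loss_fn(output, target)
        # Gradient scaling and the backward pass happen outside the autocast region.
        scaler.scale(loss).backward()
        scaler.step(opt)
        scaler.update()
        opt.zero_grad(set_to_none=True)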