diff --git a/llvm/lib/SYCLLowerIR/LowerESIMDVecArg.cpp b/llvm/lib/SYCLLowerIR/LowerESIMDVecArg.cpp index cb07af7ae4a8..e34e8d677d42 100644 --- a/llvm/lib/SYCLLowerIR/LowerESIMDVecArg.cpp +++ b/llvm/lib/SYCLLowerIR/LowerESIMDVecArg.cpp @@ -89,9 +89,6 @@ class ESIMDLowerVecArgPass { Function *rewriteFunc(Function &F); Type *getSimdArgPtrTyOrNull(Value *arg); void fixGlobals(Module &M); - void replaceConstExprWithGlobals(Module &M); - ConstantExpr *createNewConstantExpr(GlobalVariable *newGlobalVar, - Type *oldGlobalType, Value *old); void removeOldGlobals(); }; @@ -229,41 +226,6 @@ Function *ESIMDLowerVecArgPass::rewriteFunc(Function &F) { return NF; } -// Replace ConstantExpr if it contains old global variable. -ConstantExpr * -ESIMDLowerVecArgPass::createNewConstantExpr(GlobalVariable *NewGlobalVar, - Type *OldGlobalType, Value *Old) { - ConstantExpr *NewConstantExpr = nullptr; - - if (isa(Old)) { - NewConstantExpr = cast( - ConstantExpr::getBitCast(NewGlobalVar, OldGlobalType)); - return NewConstantExpr; - } - - auto InnerMost = createNewConstantExpr( - NewGlobalVar, OldGlobalType, cast(Old)->getOperand(0)); - - NewConstantExpr = cast( - cast(Old)->getWithOperandReplaced(0, InnerMost)); - - return NewConstantExpr; -} - -// Globals are part of ConstantExpr. This loop iterates over -// all such instances and replaces them with a new ConstantExpr -// consisting of new global vector* variable. -void ESIMDLowerVecArgPass::replaceConstExprWithGlobals(Module &M) { - for (auto &GlobalVars : OldNewGlobal) { - auto &G = *GlobalVars.first; - for (auto UseOfG : G.users()) { - auto NewGlobal = GlobalVars.second; - auto NewConstExpr = createNewConstantExpr(NewGlobal, G.getType(), UseOfG); - UseOfG->replaceAllUsesWith(NewConstExpr); - } - } -} - // This function creates new global variables of type vector* type // when old one is of simd* type. 
void ESIMDLowerVecArgPass::fixGlobals(Module &M) { @@ -288,16 +250,17 @@ void ESIMDLowerVecArgPass::fixGlobals(Module &M) { } } - replaceConstExprWithGlobals(M); - removeOldGlobals(); } // Remove old global variables from the program. void ESIMDLowerVecArgPass::removeOldGlobals() { for (auto &G : OldNewGlobal) { - G.first->removeDeadConstantUsers(); - G.first->eraseFromParent(); + auto OldGlob = G.first; + auto NewGlobal = G.second; + OldGlob->replaceAllUsesWith( + ConstantExpr::getBitCast(NewGlobal, OldGlob->getType())); + OldGlob->eraseFromParent(); } } diff --git a/llvm/test/SYCLLowerIR/esimd_global_crash.ll b/llvm/test/SYCLLowerIR/esimd_global_crash.ll new file mode 100644 index 000000000000..c25f1e494528 --- /dev/null +++ b/llvm/test/SYCLLowerIR/esimd_global_crash.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -ESIMDLowerVecArg -S | FileCheck %s +
+; This test checks that there is no compiler crash when a Global
+; is used in a simple instruction, not directly in a ConstantExpr.
+ +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown-sycldevice" + +%"class.cl::sycl::INTEL::gpu::simd" = type { <2512 x i32> } + +; CHECK: @Global = dso_local global <2512 x i32> undef, align 16384 +@Global = dso_local global %"class.cl::sycl::INTEL::gpu::simd" undef, align 16384 + +define void @no_crash(<2512 x i32> %simd_val) { +; CHECK-LABEL: @no_crash( +; CHECK-NEXT: [[CAST:%.*]] = addrspacecast %"class.cl::sycl::INTEL::gpu::simd"* bitcast (<2512 x i32>* @Global to %"class.cl::sycl::INTEL::gpu::simd"*) to %"class.cl::sycl::INTEL::gpu::simd" addrspace(4)* +; CHECK-NEXT: [[GEP:%.*]] = getelementptr %"class.cl::sycl::INTEL::gpu::simd", %"class.cl::sycl::INTEL::gpu::simd" addrspace(4)* [[CAST]], i64 0, i32 0 +; CHECK-NEXT: store <2512 x i32> [[SIMD_VAL:%.*]], <2512 x i32> addrspace(4)* [[GEP]], align 16384 +; CHECK-NEXT: ret void +; + %cast = addrspacecast %"class.cl::sycl::INTEL::gpu::simd"* @Global to %"class.cl::sycl::INTEL::gpu::simd" addrspace(4)* + %gep = getelementptr %"class.cl::sycl::INTEL::gpu::simd", %"class.cl::sycl::INTEL::gpu::simd" addrspace(4)* %cast, i64 0, i32 0 + store <2512 x i32> %simd_val, <2512 x i32> addrspace(4)* %gep, align 16384 + ret void +} diff --git a/sycl/include/CL/sycl/detail/type_traits.hpp b/sycl/include/CL/sycl/detail/type_traits.hpp index df480f58f99f..ee2a3bd9a327 100644 --- a/sycl/include/CL/sycl/detail/type_traits.hpp +++ b/sycl/include/CL/sycl/detail/type_traits.hpp @@ -207,6 +207,19 @@ template struct is_vector_arithmetic : bool_constant::value && is_arithmetic::value> {}; +// is_bool +template +struct is_scalar_bool + : bool_constant, bool>::value> {}; + +template +struct is_vector_bool + : bool_constant::value && + is_scalar_bool>::value> {}; + +template +struct is_bool : bool_constant>::value> {}; + // is_pointer template struct is_pointer_impl : std::false_type {}; diff --git a/sycl/include/CL/sycl/group.hpp 
b/sycl/include/CL/sycl/group.hpp index 9692f33b094e..52b5e61d1da7 100644 --- a/sycl/include/CL/sycl/group.hpp +++ b/sycl/include/CL/sycl/group.hpp @@ -274,58 +274,99 @@ template class group { __spirv_MemoryBarrier(__spv::Scope::Workgroup, flags); } + /// Asynchronously copies a number of elements specified by \p numElements + /// from the source pointed by \p src to destination pointed by \p dest + /// with a source stride specified by \p srcStride, and returns a SYCL + /// device_event which can be used to wait on the completion of the copy. + /// Permitted types for dataT are all scalar and vector types, except boolean. template - device_event async_work_group_copy(local_ptr dest, - global_ptr src, - size_t numElements) const { + detail::enable_if_t::value, device_event> + async_work_group_copy(local_ptr dest, global_ptr src, + size_t numElements, size_t srcStride) const { using DestT = detail::ConvertToOpenCLType_t; using SrcT = detail::ConvertToOpenCLType_t; - __ocl_event_t e = OpGroupAsyncCopyGlobalToLocal( + __ocl_event_t E = OpGroupAsyncCopyGlobalToLocal( __spv::Scope::Workgroup, DestT(dest.get()), SrcT(src.get()), - numElements, 1, 0); - return device_event(&e); + numElements, srcStride, 0); + return device_event(&E); } + /// Asynchronously copies a number of elements specified by \p numElements + /// from the source pointed by \p src to destination pointed by \p dest with + /// the destination stride specified by \p destStride, and returns a SYCL + /// device_event which can be used to wait on the completion of the copy. + /// Permitted types for dataT are all scalar and vector types, except boolean. 
template - device_event async_work_group_copy(global_ptr dest, - local_ptr src, - size_t numElements) const { + detail::enable_if_t::value, device_event> + async_work_group_copy(global_ptr dest, local_ptr src, + size_t numElements, size_t destStride) const { using DestT = detail::ConvertToOpenCLType_t; using SrcT = detail::ConvertToOpenCLType_t; - __ocl_event_t e = OpGroupAsyncCopyLocalToGlobal( + __ocl_event_t E = OpGroupAsyncCopyLocalToGlobal( __spv::Scope::Workgroup, DestT(dest.get()), SrcT(src.get()), - numElements, 1, 0); - return device_event(&e); + numElements, destStride, 0); + return device_event(&E); + } + + /// Specialization for scalar bool type. + /// Asynchronously copies a number of elements specified by \p NumElements + /// from the source pointed by \p Src to destination pointed by \p Dest + /// with a stride specified by \p Stride, and returns a SYCL device_event + /// which can be used to wait on the completion of the copy. + template + detail::enable_if_t::value, device_event> + async_work_group_copy(multi_ptr Dest, multi_ptr Src, + size_t NumElements, size_t Stride) const { + static_assert(sizeof(bool) == sizeof(uint8_t), + "Async copy to/from bool memory is not supported."); + auto DestP = + multi_ptr(reinterpret_cast(Dest.get())); + auto SrcP = + multi_ptr(reinterpret_cast(Src.get())); + return async_work_group_copy(DestP, SrcP, NumElements, Stride); + } + + /// Specialization for vector bool type. + /// Asynchronously copies a number of elements specified by \p NumElements + /// from the source pointed by \p Src to destination pointed by \p Dest + /// with a stride specified by \p Stride, and returns a SYCL device_event + /// which can be used to wait on the completion of the copy. 
+ template + detail::enable_if_t::value, device_event> + async_work_group_copy(multi_ptr Dest, multi_ptr Src, + size_t NumElements, size_t Stride) const { + static_assert(sizeof(bool) == sizeof(uint8_t), + "Async copy to/from bool memory is not supported."); + using VecT = detail::change_base_type_t; + auto DestP = multi_ptr(reinterpret_cast(Dest.get())); + auto SrcP = multi_ptr(reinterpret_cast(Src.get())); + return async_work_group_copy(DestP, SrcP, NumElements, Stride); } + /// Asynchronously copies a number of elements specified by \p numElements + /// from the source pointed by \p src to destination pointed by \p dest and + /// returns a SYCL device_event which can be used to wait on the completion + /// of the copy. + /// Permitted types for dataT are all scalar and vector types. template device_event async_work_group_copy(local_ptr dest, global_ptr src, - size_t numElements, - size_t srcStride) const { - using DestT = detail::ConvertToOpenCLType_t; - using SrcT = detail::ConvertToOpenCLType_t; - - __ocl_event_t e = OpGroupAsyncCopyGlobalToLocal( - __spv::Scope::Workgroup, DestT(dest.get()), SrcT(src.get()), - numElements, srcStride, 0); - return device_event(&e); + size_t numElements) const { + return async_work_group_copy(dest, src, numElements, 1); } + /// Asynchronously copies a number of elements specified by \p numElements + /// from the source pointed by \p src to destination pointed by \p dest and + /// returns a SYCL device_event which can be used to wait on the completion + /// of the copy. + /// Permitted types for dataT are all scalar and vector types. 
template device_event async_work_group_copy(global_ptr dest, local_ptr src, - size_t numElements, - size_t destStride) const { - using DestT = detail::ConvertToOpenCLType_t; - using SrcT = detail::ConvertToOpenCLType_t; - - __ocl_event_t e = OpGroupAsyncCopyLocalToGlobal( - __spv::Scope::Workgroup, DestT(dest.get()), SrcT(src.get()), - numElements, destStride, 0); - return device_event(&e); + size_t numElements) const { + return async_work_group_copy(dest, src, numElements, 1); } template diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index 77db78115de0..370abcfb0b01 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -1589,7 +1589,7 @@ class __SYCL_EXPORT handler { // Make sure data shared_ptr points to is not released until we finish // work with it. MSharedPtrStorage.push_back(Dst); - T_Dst *RawDstPtr = Dst.get(); + typename shared_ptr_class::element_type *RawDstPtr = Dst.get(); copy(Src, RawDstPtr); } @@ -1612,7 +1612,7 @@ class __SYCL_EXPORT handler { // Make sure data shared_ptr points to is not released until we finish // work with it. MSharedPtrStorage.push_back(Src); - T_Src *RawSrcPtr = Src.get(); + typename shared_ptr_class::element_type *RawSrcPtr = Src.get(); copy(RawSrcPtr, Dst); } diff --git a/sycl/test/basic_tests/group_async_copy.cpp b/sycl/test/basic_tests/group_async_copy.cpp new file mode 100644 index 000000000000..e41b7414a19c --- /dev/null +++ b/sycl/test/basic_tests/group_async_copy.cpp @@ -0,0 +1,160 @@ +// RUN: %clangxx -fsycl -fsycl-targets=%sycl_triple %s -o %t.run +// RUN: %GPU_RUN_PLACEHOLDER %t.run +// RUN: %CPU_RUN_PLACEHOLDER %t.run +// RUN: %ACC_RUN_PLACEHOLDER %t.run +// RUN: env SYCL_DEVICE_FILTER=host %t.run + +#include +#include +#include + +using namespace cl::sycl; + +template class KernelName; + +// Define the number of work items to enqueue. 
+const size_t NElems = 32; +const size_t WorkGroupSize = 8; +const size_t NWorkGroups = NElems / WorkGroupSize; + +template void initInputBuffer(buffer &Buf, size_t Stride) { + auto Acc = Buf.template get_access(); + for (size_t I = 0; I < Buf.get_count(); I += WorkGroupSize) { + for (size_t J = 0; J < WorkGroupSize; J++) + Acc[I + J] = I + J + ((J % Stride == 0) ? 100 : 0); + } +} + +template void initOutputBuffer(buffer &Buf) { + auto Acc = Buf.template get_access(); + for (size_t I = 0; I < Buf.get_count(); I++) + Acc[I] = 0; +} + +template struct is_vec : std::false_type {}; +template struct is_vec> : std::true_type {}; + +template bool checkEqual(vec A, size_t B) { + T TB = B; + return A.s0() == TB; +} + +template bool checkEqual(vec A, size_t B) { + T TB = B; + return A.x() == TB && A.y() == TB && A.z() == TB && A.w() == TB; +} + +template +typename std::enable_if::value, bool>::type checkEqual(T A, + size_t B) { + T TB = B; + return A == TB; +} + +template std::string toString(vec A) { + std::string R("("); + return R + std::to_string(A.s0()) + ")"; +} + +template std::string toString(vec A) { + std::string R("("); + R += std::to_string(A.x()) + "," + std::to_string(A.y()) + "," + + std::to_string(A.z()) + "," + std::to_string(A.w()) + ")"; + return R; +} + +template +typename std::enable_if::value, std::string>::type toString(T A) { + return std::to_string(A); +} + +template int checkResults(buffer &OutBuf, size_t Stride) { + auto Out = OutBuf.template get_access(); + int EarlyFailout = 20; + + for (size_t I = 0; I < OutBuf.get_count(); I += WorkGroupSize) { + for (size_t J = 0; J < WorkGroupSize; J++) { + size_t ExpectedVal = (J % Stride == 0) ? 
(100 + I + J) : 0; + if (!checkEqual(Out[I + J], ExpectedVal)) { + std::cerr << std::string(typeid(T).name()) + ": Stride=" << Stride + << " : Incorrect value at index " << I + J + << " : Expected: " << toString(ExpectedVal) + << ", Computed: " << toString(Out[I + J]) << "\n"; + if (--EarlyFailout == 0) + return 1; + } + } + } + return EarlyFailout - 20; +} + +template int test(size_t Stride) { + queue Q; + + buffer InBuf(NElems); + buffer OutBuf(NElems); + + initInputBuffer(InBuf, Stride); + initOutputBuffer(OutBuf); + + Q.submit([&](handler &CGH) { + auto In = InBuf.template get_access(CGH); + auto Out = OutBuf.template get_access(CGH); + accessor Local( + range<1>{WorkGroupSize}, CGH); + + nd_range<1> NDR{range<1>(NElems), range<1>(WorkGroupSize)}; + CGH.parallel_for>(NDR, [=](nd_item<1> NDId) { + auto GrId = NDId.get_group_linear_id(); + auto Group = NDId.get_group(); + size_t NElemsToCopy = + WorkGroupSize / Stride + ((WorkGroupSize % Stride) ? 1 : 0); + size_t Offset = GrId * WorkGroupSize; + if (Stride == 1) { // Check the version without stride arg. + auto E = NDId.async_work_group_copy( + Local.get_pointer(), In.get_pointer() + Offset, NElemsToCopy); + E.wait(); + } else { + auto E = NDId.async_work_group_copy(Local.get_pointer(), + In.get_pointer() + Offset, + NElemsToCopy, Stride); + E.wait(); + } + + if (Stride == 1) { // Check the version without stride arg. 
+ auto E = Group.async_work_group_copy( + Out.get_pointer() + Offset, Local.get_pointer(), NElemsToCopy); + Group.wait_for(E); + } else { + auto E = Group.async_work_group_copy(Out.get_pointer() + Offset, + Local.get_pointer(), NElemsToCopy, + Stride); + Group.wait_for(E); + } + }); + }).wait(); + + return checkResults(OutBuf, Stride); +} + +int main() { + for (int Stride = 1; Stride < WorkGroupSize; Stride++) { + if (test(Stride)) + return 1; + if (test>(Stride)) + return 1; + if (test(Stride)) + return 1; + if (test(Stride)) + return 1; + if (test>(Stride)) + return 1; + if (test>(Stride)) + return 1; + if (test(Stride)) + return 1; + } + + std::cout << "Test passed.\n"; + return 0; +} diff --git a/sycl/test/basic_tests/handler/handler_mem_op.cpp b/sycl/test/basic_tests/handler/handler_mem_op.cpp index 2f9460051ed2..c0d3b1b10d4e 100644 --- a/sycl/test/basic_tests/handler/handler_mem_op.cpp +++ b/sycl/test/basic_tests/handler/handler_mem_op.cpp @@ -348,7 +348,7 @@ template void test_copy_acc_ptr() { template void test_copy_shared_ptr_acc() { const size_t Size = 10; T Data[Size] = {0}; - std::shared_ptr Values(new T[Size]()); + std::shared_ptr Values(new T[Size]()); for (size_t I = 0; I < Size; ++I) { Values.get()[I] = I; } @@ -369,7 +369,7 @@ template void test_copy_shared_ptr_acc() { template void test_copy_shared_ptr_const_acc() { constexpr size_t Size = 10; T Data[Size] = {0}; - std::shared_ptr Values(new T[Size]{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); + std::shared_ptr Values(new T[Size]{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}); { buffer Buffer(Data, range<1>(Size)); queue Queue; @@ -390,7 +390,7 @@ template void test_copy_acc_shared_ptr() { for (size_t I = 0; I < Size; ++I) { Data[I] = I; } - std::shared_ptr Values(new T[Size]()); + std::shared_ptr Values(new T[Size]()); { buffer Buffer(Data, range<1>(Size)); queue Queue;