diff --git a/.gitignore b/.gitignore index 83b135eb0..329246fbb 100644 --- a/.gitignore +++ b/.gitignore @@ -89,7 +89,7 @@ torch/share/ torch/test/ torch/version.py -intel_pytorch_extension_py/version.py +torch_ipex/version.py torch_ipex/csrc/version.cpp torch_ipex/csrc/aten_ipex_sparse_type_default.* torch_ipex/csrc/cpu/SparseOPs* diff --git a/.gitmodules b/.gitmodules index 3569e75e4..7761a5ad7 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,12 +1,6 @@ -[submodule "third_party/pybind11"] - path = third_party/pybind11 - url = https://github.com/pybind/pybind11.git [submodule "third_party/mkl-dnn"] path = third_party/mkl-dnn url = https://github.com/oneapi-src/oneDNN [submodule "third_party/xsmm"] path = third_party/xsmm url = https://github.com/hfp/libxsmm.git -[submodule "third_party/torch_ccl"] - path = third_party/torch_ccl - url = https://github.com/intel/torch-ccl.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 6ec7a5117..3b9e49828 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -5,12 +5,11 @@ set(CMAKE_INSTALL_MESSAGE NEVER) # set(CMAKE_VERBOSE_MAKEFILE ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON) -set(PLUGIN_NAME _torch_ipex) +set(PLUGIN_NAME torch_ipex) set(RPATH_VALUE $ORIGIN) set(CMAKE_SKIP_BUILD_RPATH FALSE) set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) -set(CMAKE_INSTALL_RPATH "${RPATH_VALUE}/lib/") set(CMAKE_INSTALL_RPATH_USE_LINK_PATH FALSE) set(DPCPP_ROOT "${PROJECT_SOURCE_DIR}/torch_ipex/csrc") @@ -20,6 +19,4 @@ list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) # Common dependencies -add_subdirectory(${DPCPP_THIRD_PARTY_ROOT}/pybind11) - include(cmake/CPU.cmake) diff --git a/README.md b/README.md index 9655180ac..3c03b6bb1 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,7 @@ Intel® Extension for PyTorch (IPEX) is a Python package to extend official PyTo ### Install PyTorch (Optional) |IPEX Version|PyTorch Version| |--|--| + |[v1.9.0](https://github.com/intel/intel-extension-for-pytorch/tree/v1.9.0)|[v1.9.0](https://github.com/pytorch/pytorch/tree/v1.9.0 "v1.9.0")| |[v1.8.0](https://github.com/intel/intel-extension-for-pytorch/tree/v1.8.0)|[v1.8.0](https://github.com/pytorch/pytorch/tree/v1.8.0 "v1.8.0")| |[v1.2.0](https://github.com/intel/intel-extension-for-pytorch/tree/v1.2.0)|[v1.7.0](https://github.com/pytorch/pytorch/tree/v1.7.0 "v1.7.0")| |[v1.1.0](https://github.com/intel/intel-extension-for-pytorch/tree/v1.1.0)|[v1.5.0-rc3](https://github.com/pytorch/pytorch/tree/v1.5.0-rc3 "v1.5.0-rc3")| @@ -38,16 +39,15 @@ From IPEX 1.8.0, compiling PyTorch from source is not required. If you still wan ### Install IPEX via wheel file ``` -python -m pip install torch_ipex==1.8.0 -f https://software.intel.com/ipex-whl-stable +python -m pip install torch_ipex==1.9.0 -f https://software.intel.com/ipex-whl-stable ``` :information_source: Wheel files availability for Python versions | IPEX Version | Python 3.6 | Python 3.7 | Python 3.8 | Python 3.9 | | :--: | :--: | :--: | :--: | :--: | -| 1.8.0 | | :heavy_check_mark: | | | - -**Note**: Currently we only provide wheel file for Python 3.7. For other Python versions, please follow instructions in the following section to compile from source.
+| 1.9.0 | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | :heavy_check_mark: | +| 1.8.0 | | :heavy_check_mark: | | | ### Install IPEX by compiling from source diff --git a/cmake/CPU.cmake b/cmake/CPU.cmake index b9056d057..eb675d5fe 100644 --- a/cmake/CPU.cmake +++ b/cmake/CPU.cmake @@ -11,8 +11,7 @@ SET(DNNL_ENABLE_PRIMITIVE_CACHE TRUE CACHE BOOL "" FORCE) SET(DNNL_LIBRARY_TYPE STATIC CACHE STRING "" FORCE) set(DPCPP_CPU_ROOT "${PROJECT_SOURCE_DIR}/torch_ipex/csrc/cpu") -add_subdirectory(${DPCPP_THIRD_PARTY_ROOT}/mkl-dnn) -find_package(TorchCCL REQUIRED) +add_subdirectory(${DPCPP_THIRD_PARTY_ROOT}/mkl-dnn EXCLUDE_FROM_ALL) list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) FIND_PACKAGE(AVX) @@ -141,9 +140,7 @@ endif() include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_SOURCE_DIR}/torch_ipex) include_directories(${PROJECT_SOURCE_DIR}/torch_ipex/csrc/) -include_directories(${DPCPP_THIRD_PARTY_ROOT}/pybind11/include) include_directories(${DPCPP_THIRD_PARTY_ROOT}/xsmm/include) -include_directories(${TORCHCCL_INCLUDE_DIR}) # sources set(DPCPP_SRCS) @@ -167,9 +164,8 @@ ExternalProject_Add(xsmm "-j" INSTALL_COMMAND "" ) -# Compile code with pybind11 set(DPCPP_SRCS ${DPCPP_ATEN_SRCS} ${DPCPP_COMMON_SRCS} ${DPCPP_CPU_SRCS} ${DPCPP_JIT_SRCS}) -pybind11_add_module(${PLUGIN_NAME} SHARED ${DPCPP_SRCS}) +add_library(${PLUGIN_NAME} SHARED ${DPCPP_SRCS}) target_link_libraries(${PLUGIN_NAME} PRIVATE ${DPCPP_THIRD_PARTY_ROOT}/xsmm/lib/libxsmm.a) #link_directories(${PYTORCH_INSTALL_DIR}/lib) @@ -188,15 +184,15 @@ else() message(FATAL_ERROR "Unknown ATen parallel backend: ${ATEN_THREADING}") endif() -add_dependencies(${PLUGIN_NAME} pybind11) -add_dependencies(${PLUGIN_NAME} torch_ccl) add_dependencies(${PLUGIN_NAME} dnnl) target_link_libraries(${PLUGIN_NAME} PUBLIC dnnl) add_dependencies(${PLUGIN_NAME} xsmm) -target_link_libraries(${PLUGIN_NAME} PUBLIC torch_ccl) link_directories(${PYTORCH_INSTALL_DIR}/lib) -target_link_libraries(${PLUGIN_NAME} PUBLIC ${PYTORCH_INSTALL_DIR}/lib/libtorch_python.so) target_link_libraries(${PLUGIN_NAME} PUBLIC ${PYTORCH_INSTALL_DIR}/lib/libtorch_cpu.so) target_link_libraries(${PLUGIN_NAME} PUBLIC ${PYTORCH_INSTALL_DIR}/lib/libc10.so) target_compile_options(${PLUGIN_NAME} PRIVATE "-DC10_BUILD_MAIN_LIB") + +#set_property(TARGET ${PLUGIN_NAME} PROPERTY VERSION "${IPEX_VERSION}") +#set_property(TARGET ${PLUGIN_NAME} PROPERTY SOVERSION "${IPEX_VERSION}") +install(TARGETS ${PLUGIN_NAME} LIBRARY DESTINATION lib) diff --git a/cmake/Modules/FindTorchCCL.cmake b/cmake/Modules/FindTorchCCL.cmake index 64435eb82..dc1259707 100644 --- a/cmake/Modules/FindTorchCCL.cmake +++ b/cmake/Modules/FindTorchCCL.cmake @@ -17,7 +17,10 @@ SET(TORCHCCL_INCLUDE_DIR) SET(TORCHCCL_ROOT "${PROJECT_SOURCE_DIR}/third_party/torch_ccl") +SET(CMAKE_INSTALL_PREFIX_SAVED "${CMAKE_INSTALL_PREFIX}") +SET(CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX_SAVED}/../torch_ccl") ADD_SUBDIRECTORY(${TORCHCCL_ROOT}) +SET(CMAKE_INSTALL_PREFIX "${CMAKE_INSTALL_PREFIX_SAVED}") IF(NOT TARGET torch_ccl) MESSAGE(FATAL_ERROR "Failed to include torch_ccl target") ENDIF() diff --git a/docker/Dockerfile b/docker/Dockerfile index d40caec7f..8cc915aff 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,12 +1,12 @@ # syntax = docker/dockerfile:experimental # based onhttps://github.com/pytorch/pytorch/blob/master/Dockerfile -# +# # NOTE: To build this you will need a docker version > 18.06 with # experimental enabled and DOCKER_BUILDKIT=1 # # If you do not use buildkit you are not 
going to have a good time # -# For reference: +# For reference: # https://docs.docker.com/develop/develop-images/build_enhancements/ ARG BASE_IMAGE=ubuntu:20.04 @@ -26,6 +26,7 @@ RUN --mount=type=cache,id=apt-dev,target=/var/cache/apt \ libjpeg-dev \ pybind11-dev \ libpng-dev \ + pybind11-dev \ && rm -rf /var/lib/apt/lists/* RUN /usr/sbin/update-ccache-symlinks RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache @@ -41,24 +42,29 @@ RUN curl -fsSL -v -o ~/miniconda.sh -O https://repo.anaconda.com/miniconda/Mini /opt/conda/bin/conda clean -ya FROM dev-base AS build -ARG IPEX_VERSION=v1.8.0 -ARG PYTORCH_VERSION=v1.8.0 +ARG IPEX_VERSION=v1.9.0 +ARG PYTORCH_VERSION=v1.9.0 +ARG TORCHVISION_VERSION=0.10.0+cpu +ARG TORCHAUDIO_VERSION=0.9.0 COPY --from=conda /opt/conda /opt/conda RUN --mount=type=cache,target=/opt/ccache \ - pip3 install torch==${PYTORCH_VERSION}+cpu torchvision \ - -f https://download.pytorch.org/whl/torch_stable.html && \ - git clone -b ${IPEX_VERSION} --single-branch https://github.com/intel/intel-extension-for-pytorch && \ - cd intel-extension-for-pytorch && git submodule sync && \ + pip install torch==${PYTORCH_VERSION}+cpu torchvision==${TORCHVISION_VERSION} torchaudio==${TORCHAUDIO_VERSION} -f https://download.pytorch.org/whl/torch_stable.html && \ + git clone https://github.com/intel/intel-extension-for-pytorch && \ + cd intel-extension-for-pytorch && \ + git checkout ${IPEX_VERSION} && \ + git submodule sync && \ git submodule update --init --recursive && \ pip3 install -r requirements.txt && \ - pip3 install -v . && rm -rf * + python setup.py bdist_wheel && \ + pip3 install dist/*.whl && \ + cd .. && rm -rf intel-extension-for-pytorch FROM dev-base as dev COPY --from=build /opt/conda /opt/conda ARG OMP_NUM_THREADS=1 ENV OMP_NUM_THREADS ${OMP_NUM_THREADS} ARG KMP_BLOCKTIME=1 -ENV KMP_BLOCKTIME ${KMP_BLOCKTIME} +ENV KMP_BLOCKTIME ${KMP_BLOCKTIME} ARG KMP_HW_SUBSET=1T ENV KMP_HW_SUBSET ${KMP_HW_SUBSET} ENV LD_PRELOAD "/opt/conda/lib/libiomp5.so /usr/lib/x86_64-linux-gnu/libtcmalloc.so" diff --git a/docker/README.md b/docker/README.md index 6a58f7822..f85bcce7d 100644 --- a/docker/README.md +++ b/docker/README.md @@ -10,6 +10,6 @@ ```console $ cd $DOCKERFILE_DIR - $ DOCKER_BUILDKIT=1 docker build --build-arg IPEX_VERSION=v1.8.0 --build-arg PYTORCH_VERSION=v1.8.0 -t intel-extension-for-pytorch:test . + $ DOCKER_BUILDKIT=1 docker build -t intel-extension-for-pytorch:test . 
$ docker run intel-extension-for-pytorch:test python -c "import torch;import intel_pytorch_extension as ipex;print('torch:', torch.__version__,' ipex:',ipex.__version__)" ``` diff --git a/intel_pytorch_extension_py/ops/embeddingbag.py b/intel_pytorch_extension_py/ops/embeddingbag.py deleted file mode 100644 index 03fa33d33..000000000 --- a/intel_pytorch_extension_py/ops/embeddingbag.py +++ /dev/null @@ -1,14 +0,0 @@ -import torch -from torch import nn -from torch.autograd import Function -import _torch_ipex as core - -# # extension for BF16 fast path only - - -def embeddingbag(weights, indices, offsets, scale_grad_by_freq, mode, sparse, per_sample_weights, include_last_offset): - ret = torch.ops.torch_ipex.embedding_bag(weights, indices, offsets, scale_grad_by_freq, mode, sparse, per_sample_weights, include_last_offset) - if len(ret)==1: - ret += [torch.Tensor(), torch.Tensor(), torch.Tensor()] - return ret -torch.embedding_bag = embeddingbag diff --git a/scripts/cpu/gen-dense-cpu-ops.py b/scripts/cpu/gen-dense-cpu-ops.py index 6fcac50c5..0c7b1cf72 100755 --- a/scripts/cpu/gen-dense-cpu-ops.py +++ b/scripts/cpu/gen-dense-cpu-ops.py @@ -32,6 +32,7 @@ 'aten::mul_.Tensor(Tensor(a!) self, Tensor other) -> Tensor(a!)', 'aten::mul.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)', 'aten::linear(Tensor input, Tensor weight, Tensor? bias=None) -> Tensor', + # 'aten::batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> Tensor', 'aten::native_batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps) -> (Tensor, Tensor, Tensor)', 'aten::native_batch_norm_backward(Tensor grad_out, Tensor input, Tensor? weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_invstd, bool train, float eps, bool[3] output_mask) -> (Tensor, Tensor, Tensor)', 'aten::avg_pool2d(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None) -> Tensor', @@ -75,7 +76,7 @@ 'aten::clone(Tensor self, *, MemoryFormat? memory_format=None) -> Tensor', 'aten::gelu(Tensor self) -> Tensor', 'aten::gelu_backward(Tensor grad, Tensor self) -> Tensor', - 'aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=0, int? end=9223372036854775807, int step=1) -> Tensor(a)', + 'aten::slice.Tensor(Tensor(a) self, int dim=0, int? start=None, int? end=None, int step=1) -> Tensor(a)', 'aten::select.int(Tensor(a) self, int dim, int index) -> Tensor(a)', 'aten::select.Dimname(Tensor(a) self, Dimname dim, int index) -> Tensor(a)', 'aten::unbind.int(Tensor(a) self, int dim=0) -> Tensor(a)[]', @@ -112,6 +113,10 @@ 'aten::div.Scalar(Tensor self, Scalar other) -> Tensor', 'aten::div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)', 'aten::permute(Tensor(a) self, int[] dims) -> Tensor(a)', + 'aten::to.dtype_layout(Tensor self, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor', + 'aten::to.device(Tensor self, Device device, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor', + 'aten::to.dtype(Tensor self, ScalarType dtype, bool non_blocking=False, bool copy=False, MemoryFormat? 
memory_format=None) -> Tensor', + 'aten::to.other(Tensor self, Tensor other, bool non_blocking=False, bool copy=False, MemoryFormat? memory_format=None) -> Tensor', ] _FN_IPEX_FUNCS_WITH_SIMPLE_ATEN_SIG = [ @@ -126,6 +131,41 @@ 'aten::div.out(Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!)', ] +_FN_EXCLUDE_FUNCS_WITH_SIMPLE_ATEN_SIG = [ + "aten::conv1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, int[1] padding=0, int[1] dilation=1, int groups=1) -> Tensor", + "aten::conv2d(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] dilation=1, int groups=1) -> Tensor", + "aten::conv3d(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] dilation=1, int groups=1) -> Tensor", + "aten::conv1d.padding(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, str padding=\"valid\", int[1] dilation=1, int groups=1) -> Tensor", + "aten::conv2d.padding(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, str padding=\"valid\", int[2] dilation=1, int groups=1) -> Tensor", + "aten::conv3d.padding(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, str padding=\"valid\", int[3] dilation=1, int groups=1) -> Tensor", + "aten::convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups) -> Tensor", + "aten::_convolution(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled, bool allow_tf32) -> Tensor", + "aten::_convolution.deprecated(Tensor input, Tensor weight, Tensor? bias, int[] stride, int[] padding, int[] dilation, bool transposed, int[] output_padding, int groups, bool benchmark, bool deterministic, bool cudnn_enabled) -> Tensor", + "aten::conv_transpose1d(Tensor input, Tensor weight, Tensor? bias=None, int[1] stride=1, int[1] padding=0, int[1] output_padding=0, int groups=1, int[1] dilation=1) -> Tensor", + "aten::conv_transpose2d.input(Tensor input, Tensor weight, Tensor? bias=None, int[2] stride=1, int[2] padding=0, int[2] output_padding=0, int groups=1, int[2] dilation=1) -> Tensor", + "aten::conv_transpose3d.input(Tensor input, Tensor weight, Tensor? bias=None, int[3] stride=1, int[3] padding=0, int[3] output_padding=0, int groups=1, int[3] dilation=1) -> Tensor", + "aten::log_softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor", + "aten::cross_entropy_loss(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100) -> Tensor", + "aten::log_softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor", + "aten::softmax.int(Tensor self, int dim, ScalarType? dtype=None) -> Tensor", + "aten::softmax.Dimname(Tensor self, Dimname dim, *, ScalarType? dtype=None) -> Tensor", + "aten::contiguous(Tensor(a) self, *, MemoryFormat memory_format=contiguous_format) -> Tensor(a)", + "aten::flatten.using_ints(Tensor(a) self, int start_dim=0, int end_dim=-1) -> Tensor(a)", + "aten::dropout(Tensor input, float p, bool train) -> Tensor", + "aten::dropout_(Tensor(a!) self, float p, bool train) -> Tensor(a!)", + "aten::nll_loss_nd(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100) -> Tensor", + "aten::nll_loss(Tensor self, Tensor target, Tensor? 
weight=None, int reduction=Mean, int ignore_index=-100) -> Tensor", + "aten::nll_loss.out(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100, *, Tensor(a!) out) -> Tensor(a!)", + "aten::batch_norm(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> Tensor", + "aten::_batch_norm_impl_index(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, bool training, float momentum, float eps, bool cudnn_enabled) -> (Tensor, Tensor, Tensor, Tensor, int)", + "aten::reshape(Tensor(a) self, int[] shape) -> Tensor(a)", + "aten::where.self(Tensor condition, Tensor self, Tensor other) -> Tensor", + "aten::where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor", + "aten::where.ScalarOther(Tensor condition, Tensor self, Scalar other) -> Tensor", + "aten::where.Scalar(Tensor condition, Scalar self, Scalar other) -> Tensor", + "aten::nll_loss2d(Tensor self, Tensor target, Tensor? weight=None, int reduction=Mean, int ignore_index=-100) -> Tensor", +] + _SHALLOW_FALLBACK_TO_CPU_TENSOR_LIST = 'shallowFallbackToCPUTensorList' _SHALLOW_FALLBACK_TO_CPU_TENSOR = 'shallowFallbackToCPUTensor' _SHALLOW_UPGRADE_TO_DPCPP_TENSOR = 'shallowUpgradeToDPCPPTensor' @@ -221,6 +261,13 @@ def is_dnnl_func(self, simple_aten_sig): return True return False + def is_exclude_func(self, simple_aten_sig): + stripped_str = simple_aten_sig.replace(' ', '') + for item in _FN_EXCLUDE_FUNCS_WITH_SIMPLE_ATEN_SIG: + if stripped_str == item.replace(' ', ''): + return True + return False + def is_ipex_func(self, simple_aten_sig): stripped_str = simple_aten_sig.replace(' ', '') for item in _FN_IPEX_FUNCS_WITH_SIMPLE_ATEN_SIG: @@ -580,6 +627,9 @@ def is_conv_overrideable_func(fname): func_defs = [] for cpp_sig, aten_sig, native_cpp_sig, cpp_func_sig_str, aten_func_sig_str in self._sigs: + if self.is_exclude_func(aten_func_sig_str): + continue + # The operator name should be unique because the new registration mechanism of PyTorch 1.7 new_cpp_func_name = aten_sig.def_name.replace('.', '_') cpp_func_str_h, cpp_func_str_cpp = self.gen_func_signature(cpp_func_sig_str, cpp_sig.def_name, new_cpp_func_name) diff --git a/scripts/cpu/pytorch_headers/SparseCPUType.h b/scripts/cpu/pytorch_headers/SparseCPUType.h index 05d2fc47d..96628b78f 100644 --- a/scripts/cpu/pytorch_headers/SparseCPUType.h +++ b/scripts/cpu/pytorch_headers/SparseCPUType.h @@ -35,8 +35,8 @@ namespace SparseCPUType { Tensor empty(IntArrayRef size, optional names, optional dtype, optional layout, optional device, optional pin_memory, optional memory_format); Tensor empty(IntArrayRef size, optional dtype, optional layout, optional device, optional pin_memory, optional memory_format); Tensor add(const Tensor & self, const Tensor & other, Scalar alpha); - Tensor & add_(Tensor & self, const Tensor & other, Scalar alpha); - Tensor & add_out(const Tensor & self, const Tensor & other, Scalar alpha, Tensor & out); + Tensor & add_(Tensor & self, const Tensor & other, const Scalar & alpha); + Tensor & add_out(const Tensor & self, const Tensor & other, const Scalar & alpha, Tensor & out); Tensor div(const Tensor & self, const Tensor & other); Tensor & div_(Tensor & self, const Tensor & other); Tensor & div_out(const Tensor & self, const Tensor & other, Tensor & out); @@ -53,22 +53,23 @@ namespace SparseCPUType { Tensor & mul_out(const Tensor & self, const Tensor & other, Tensor & out); Tensor narrow_copy(const 
Tensor & self, int64_t dim, int64_t start, int64_t length); Tensor & narrow_copy_out(const Tensor & self, int64_t dim, int64_t start, int64_t length, Tensor & out); - Tensor & sspaddmm_out(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha, Tensor & out); + Tensor & sspaddmm_out(const Tensor & self, const Tensor & mat1, const Tensor & mat2, const Scalar & beta, const Scalar & alpha, Tensor & out); Tensor true_divide(const Tensor & self, const Tensor & other); Tensor & true_divide_(Tensor & self, const Tensor & other); Tensor & true_divide_out(const Tensor & self, const Tensor & other, Tensor & out); - Tensor native_norm(const Tensor & self, Scalar p); + Tensor native_norm(const Tensor & self, const Scalar & p); Tensor _sparse_sum_backward(const Tensor & grad, const Tensor & self, IntArrayRef dim); Tensor clone(const Tensor & self, optional memory_format); Tensor & pow_out(const Tensor & self, const Tensor & exponent, Tensor & out); - Tensor pow(const Tensor & self, Scalar exponent); + Tensor pow(const Tensor & self, const Scalar & exponent); Tensor & zero_(Tensor & self); - Tensor & sub_out(const Tensor & self, const Tensor & other, Scalar alpha, Tensor & out); - Tensor sub(const Tensor & self, const Tensor & other, Scalar alpha); - Tensor & sub_(Tensor & self, const Tensor & other, Scalar alpha); - Tensor & addmm_out(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha, Tensor & out); - Tensor addmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha); - Tensor & addmm_(Tensor & self, const Tensor & mat1, const Tensor & mat2, Scalar beta, Scalar alpha); + Tensor & sub_out(const Tensor & self, const Tensor & other, const Scalar & alpha, Tensor & out); + Tensor sub(const Tensor & self, const Tensor & other, const Scalar & alpha); + Tensor & sub_(Tensor & self, const Tensor & other, const Scalar & alpha); + Tensor & addmm_out(const Tensor & self, const Tensor & mat1, const Tensor & mat2, const Scalar & beta, const Scalar & alpha, Tensor & out); + Tensor addmm(const Tensor & self, const Tensor & mat1, const Tensor & mat2, const Scalar & beta, const Scalar & alpha); + Tensor & addmm_(Tensor & self, const Tensor & mat1, const Tensor & mat2, const Scalar & beta, const Scalar & alpha); + Tensor _sparse_coo_tensor_unsafe(const Tensor & indices, const Tensor & values, IntArrayRef size, optional dtype, optional layout, optional device, optional pin_memory); Tensor _sparse_coo_tensor_with_dims(int64_t sparse_dim, int64_t dense_dim, IntArrayRef size, optional dtype, optional layout, optional device, optional pin_memory); Tensor _sparse_coo_tensor_with_dims_and_tensors(int64_t sparse_dim, int64_t dense_dim, IntArrayRef size, const Tensor & indices, const Tensor & values, optional dtype, optional layout, optional device, optional pin_memory); Tensor & sparse_resize_(Tensor & self, IntArrayRef size, int64_t sparse_dim, int64_t dense_dim); @@ -81,6 +82,7 @@ namespace SparseCPUType { int64_t _dimV(const Tensor & self); int64_t _nnz(const Tensor & self); Tensor coalesce(const Tensor & self); + Tensor _coalesce(const Tensor & self); bool is_coalesced(const Tensor & self); Tensor _indices(const Tensor & self); Tensor _values(const Tensor & self); diff --git a/setup.py b/setup.py index 89b9cb306..6c325a5cf 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,8 @@ #!/usr/bin/env python from __future__ import print_function -TORCH_VERSION = '1.8.0' -TORCH_IPEX_VERSION = '1.8.0' +TORCH_VERSION = '1.9.0' 
+TORCH_IPEX_VERSION = '1.9.0' # import torch import platform @@ -15,54 +15,56 @@ import urllib.request try: - from packaging import version + from packaging import version except Exception: - subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'packaging']) - from packaging import version + subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'packaging']) + from packaging import version installed_raw = {pkg for pkg in pkg_resources.working_set} installed = {} for i in installed_raw: - installed[i.key] = i.version + installed[i.key] = i.version requires = {} requires_raw = {} try: - with open('requirements.txt', 'r') as reader: - for line in reader.readlines(): - line_raw = line.replace('\n', '') - line = line_raw.replace('=', '') - tmp = re.split('[=<>]', line) - if len(tmp) == 2: - requires[tmp[0]] = tmp[1] - else: - requires[tmp[0]] = '' - requires_raw[tmp[0]] = line_raw + with open('requirements.txt', 'r') as reader: + for line in reader.readlines(): + line_raw = line.replace('\n', '') + line = line_raw.replace('=', '') + tmp = re.split('[=<>]', line) + if len(tmp) == 2: + requires[tmp[0]] = tmp[1] + else: + requires[tmp[0]] = '' + requires_raw[tmp[0]] = line_raw except Exception: - pass + pass restart = False for k in requires.keys(): - if k in installed.keys(): - if requires[k] != '' and version.parse(installed[k]) < version.parse(requires[k]): - subprocess.check_call([sys.executable, '-m', 'pip', 'install', requires_raw[k]]) - if k == 'wheel': - restart = True - else: - subprocess.check_call([sys.executable, '-m', 'pip', 'install', k]) - if k == 'wheel': - restart = True + if k in installed.keys(): + if requires[k] != '' and version.parse(installed[k]) < version.parse(requires[k]): + subprocess.check_call([sys.executable, '-m', 'pip', 'install', requires_raw[k]]) + if k == 'wheel': + restart = True + else: + subprocess.check_call([sys.executable, '-m', 'pip', 'install', k]) + if k == 'wheel': + restart = True if restart: - os.execv(sys.executable, ['python'] + sys.argv) - exit(1) + os.execv(sys.executable, ['python'] + sys.argv) + exit(1) TORCH_VERSION = os.getenv('TORCH_VERSION', TORCH_VERSION) try: - import torch + import torch + from torch.utils.cpp_extension import include_paths, library_paths except ImportError as e: - subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'torch=='+TORCH_VERSION+'+cpu', '-f', 'https://download.pytorch.org/whl/torch_stable.html']) - import torch + subprocess.check_call([sys.executable, '-m', 'pip', 'install', 'torch=='+TORCH_VERSION+'+cpu', '-f', 'https://download.pytorch.org/whl/torch_stable.html']) + import torch + from torch.utils.cpp_extension import include_paths, library_paths PYTHON_VERSION = sys.version_info IS_WINDOWS = (platform.system() == 'Windows') @@ -71,27 +73,27 @@ TORCH_URL = 'torch @ https://download.pytorch.org/whl/cpu/torch-{0}%2Bcpu-cp{1}{2}-cp{1}{2}-linux_x86_64.whl'.format(TORCH_VERSION, PYTHON_VERSION.major, PYTHON_VERSION.minor) if IS_DARWIN: - TORCH_URL = 'torch=={}'.format(TORCH_VERSION) + TORCH_URL = 'torch=={}'.format(TORCH_VERSION) else: - OS_VER = 'linux_x86_64' - if IS_WINDOWS: - TORCH_URL = 'torch @ https://download.pytorch.org/whl/cpu/torch-{0}%2Bcpu-cp{1}{2}-cp{1}{2}-win_amd64.whl'.format(TORCH_VERSION, PYTHON_VERSION.major, PYTHON_VERSION.minor) - OS_VER = 'win_amd64' + OS_VER = 'linux_x86_64' + if IS_WINDOWS: + TORCH_URL = 'torch @ https://download.pytorch.org/whl/cpu/torch-{0}%2Bcpu-cp{1}{2}-cp{1}{2}-win_amd64.whl'.format(TORCH_VERSION, PYTHON_VERSION.major, 
PYTHON_VERSION.minor) + OS_VER = 'win_amd64' - try: - fp = urllib.request.urlopen('https://download.pytorch.org/whl/torch_stable.html', timeout=30) - cont_bytes = fp.read() - cont = cont_bytes.decode('utf8').replace('\n', '') - fp.close() - lines = re.split(r'
', cont) - - for line in lines: - matches = re.match('(.*)<\/a>'.format(TORCH_VERSION, PYTHON_VERSION.major, PYTHON_VERSION.minor, OS_VER), line) - if matches and len(matches.groups()) == 2: - TORCH_URL = 'torch @ https://download.pytorch.org/whl/{}'.format(matches.group(2)) - break - except Exception: - pass + try: + fp = urllib.request.urlopen('https://download.pytorch.org/whl/torch_stable.html', timeout=30) + cont_bytes = fp.read() + cont = cont_bytes.decode('utf8').replace('\n', '') + fp.close() + lines = re.split(r'
', cont) + + for line in lines: + matches = re.match('
(.*)<\/a>'.format(TORCH_VERSION, PYTHON_VERSION.major, PYTHON_VERSION.minor, OS_VER), line) + if matches and len(matches.groups()) == 2: + TORCH_URL = 'torch @ https://download.pytorch.org/whl/{}'.format(matches.group(2)) + break + except Exception: + pass from subprocess import check_call, check_output from setuptools import setup, Extension, find_packages, distutils @@ -107,6 +109,8 @@ import multiprocessing import multiprocessing.pool import shutil +import pathlib + pytorch_install_dir = os.path.dirname(os.path.abspath(torch.__file__)) base_dir = os.path.dirname(os.path.abspath(__file__)) @@ -114,37 +118,37 @@ # from https://github.com/pytorch/pytorch/blob/master/tools/setup_helpers/__init__.py def which(thefile): - path = os.environ.get("PATH", os.defpath).split(os.pathsep) - for d in path: - fname = os.path.join(d, thefile) - fnames = [fname] - if sys.platform == 'win32': - exts = os.environ.get('PATHEXT', '').split(os.pathsep) - fnames += [fname + ext for ext in exts] - for name in fnames: - if os.access(name, os.F_OK | os.X_OK) and not os.path.isdir(name): - return name - return None + path = os.environ.get("PATH", os.defpath).split(os.pathsep) + for d in path: + fname = os.path.join(d, thefile) + fnames = [fname] + if sys.platform == 'win32': + exts = os.environ.get('PATHEXT', '').split(os.pathsep) + fnames += [fname + ext for ext in exts] + for name in fnames: + if os.access(name, os.F_OK | os.X_OK) and not os.path.isdir(name): + return name + return None def get_cmake_command(): - def _get_version(cmd): - for line in check_output([cmd, '--version']).decode('utf-8').split('\n'): - if 'version' in line: - return LooseVersion(line.strip().split(' ')[2]) - raise RuntimeError('no version found') - "Returns cmake command." - cmake_command = 'cmake' - if platform.system() == 'Windows': - return cmake_command - cmake3 = which('cmake3') - cmake = which('cmake') - if cmake3 is not None and _get_version(cmake3) >= LooseVersion("3.13.0"): - cmake_command = 'cmake3' - return cmake_command - elif cmake is not None and _get_version(cmake) >= LooseVersion("3.13.0"): - return cmake_command - else: - raise RuntimeError('no cmake or cmake3 with version >= 3.13.0 found') + def _get_version(cmd): + for line in check_output([cmd, '--version']).decode('utf-8').split('\n'): + if 'version' in line: + return LooseVersion(line.strip().split(' ')[2]) + raise RuntimeError('no version found') + "Returns cmake command." 
+ cmake_command = 'cmake' + if platform.system() == 'Windows': + return cmake_command + cmake3 = which('cmake3') + cmake = which('cmake') + if cmake3 is not None and _get_version(cmake3) >= LooseVersion("3.13.0"): + cmake_command = 'cmake3' + return cmake_command + elif cmake is not None and _get_version(cmake) >= LooseVersion("3.13.0"): + return cmake_command + else: + raise RuntimeError('no cmake or cmake3 with version >= 3.13.0 found') def _check_env_flag(name, default=''): return os.getenv(name, default).upper() in ['ON', '1', 'YES', 'TRUE', 'Y'] @@ -163,18 +167,19 @@ def _get_env_backend(): else: return env_backend_val +debug = _check_env_flag('DEBUG') def get_git_head_sha(base_dir): ipex_git_sha = '' torch_git_sha = '' try: ipex_git_sha = subprocess.check_output(['git', 'rev-parse', 'HEAD'], - cwd=base_dir).decode('ascii').strip() + cwd=base_dir).decode('ascii').strip() if os.path.isdir(os.path.join(base_dir, '..', '.git')): - torch_git_sha = subprocess.check_output(['git', 'rev-parse', 'HEAD'], - cwd=os.path.join( - base_dir, - '..')).decode('ascii').strip() + torch_git_sha = subprocess.check_output( + ['git', 'rev-parse', 'HEAD'], + cwd=os.path.join(base_dir, '..') + ).decode('ascii').strip() except Exception: pass return ipex_git_sha, torch_git_sha @@ -192,12 +197,16 @@ def get_build_version(ipex_git_sha): def create_version_files(base_dir, version, ipex_git_sha, torch_git_sha): print('Building torch_ipex version: {}'.format(version)) - py_version_path = os.path.join(base_dir, 'intel_pytorch_extension_py', 'version.py') + py_version_path = os.path.join(base_dir, 'torch_ipex', 'version.py') with open(py_version_path, 'w') as f: f.write('# Autogenerated file, do not edit!\n') f.write("__version__ = '{}'\n".format(version)) f.write("__ipex_gitrev__ = '{}'\n".format(ipex_git_sha)) f.write("__torch_gitrev__ = '{}'\n".format(torch_git_sha)) + if debug: + f.write("__mode__ = 'debug'\n") + else: + f.write("__mode__ = 'release'\n") cpp_version_path = os.path.join(base_dir, 'torch_ipex', 'csrc', 'version.cpp') with open(cpp_version_path, 'w') as f: @@ -235,14 +244,13 @@ class IPEXExt(Extension, object): def __init__(self, name, project_dir=os.path.dirname(__file__)): Extension.__init__(self, name, sources=[]) self.project_dir = os.path.abspath(project_dir) - self.build_dir = os.path.join(project_dir, 'build') + #self.build_dir = os.path.join(project_dir, 'build_' + self.name) class IPEXClean(distutils.command.clean.clean, object): def run(self): import glob - import re with open('.gitignore', 'r') as f: ignores = f.read() pat = re.compile(r'^#( BEGIN NOT-CLEAN-FILES )?') @@ -276,74 +284,82 @@ def run(self): if cmake is None: raise RuntimeError( "CMake must be installed to build the following extensions: " + - ", ".join(e.name for e in self.extensions)) + ", ".join(e.name for e in self.extensions)) self.cmake = cmake if platform.system() == "Windows": raise RuntimeError("Does not support windows") - for ext in self.extensions: - self.build_extension(ext) + ipex_exts = [ext for ext in self.extensions if isinstance(ext, IPEXExt)] + for ext in ipex_exts: + self.build_ipex_extension(ext) - def build_extension(self, ext): - ext_dir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) - if not os.path.exists(ext.build_dir): - os.mkdir(ext.build_dir) + self.extensions = [ext for ext in self.extensions if not isinstance(ext, IPEXExt)] + super(IPEXBuild, self).run() + + def build_ipex_extension(self, ext): + if not isinstance(ext, IPEXExt): + return super(IPEXBuild, 
self).build_extension(ext) + build_dir = os.path.join(ext.project_dir, 'build', 'build_' + ext.name) + if not os.path.exists(build_dir): + os.makedirs(build_dir) build_type = 'Release' use_ninja = False - if _check_env_flag('DEBUG'): + if debug: build_type = 'Debug' # install _torch_ipex.so as python module - if ext.name == 'torch_ipex' and _check_env_flag("USE_SYCL"): - ext_dir = ext_dir + '/torch_ipex' + ext_dir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name))) + if ext.name == 'torch_ipex': + ext_dir = os.path.join(ext_dir, ext.name) + if not os.path.exists(ext_dir): + os.makedirs(ext_dir) cmake_args = [ '-DCMAKE_BUILD_TYPE=' + build_type, - '-DPYTORCH_INSTALL_DIR=' + pytorch_install_dir, - '-DPYTHON_EXECUTABLE=' + sys.executable, '-DCMAKE_INSTALL_PREFIX=' + ext_dir, - '-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=' + ext_dir, - '-DCMAKE_ARCHIVE_OUTPUT_DIRECTORY=' + ext_dir, - '-DPYTHON_INCLUDE_DIR=' + python_include_dir, + '-DCMAKE_CXX_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=' + str(int(torch._C._GLIBCXX_USE_CXX11_ABI)), + '-DPYTHON_INCLUDE_DIRS=' + python_include_dir, + '-DPYTHON_EXECUTABLE=' + sys.executable, + '-DPYTORCH_INSTALL_DIR=' + pytorch_install_dir, '-DPYTORCH_INCLUDE_DIRS=' + pytorch_install_dir + "/include", '-DPYTORCH_LIBRARY_DIRS=' + pytorch_install_dir + "/lib", - '-DCMAKE_CXX_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=' + str(int(torch._C._GLIBCXX_USE_CXX11_ABI)), + '-DIPEX_VERSION=' + TORCH_IPEX_VERSION ] - if _check_env_flag("IPEX_DISP_OP"): + if _check_env_flag('IPEX_DISP_OP'): cmake_args += ['-DIPEX_DISP_OP=1'] - if _check_env_flag("IPEX_PROFILE_OP"): + if os.getenv('IPEX_PROFILE_OP', 'UNSET') == 'UNSET' or _check_env_flag('IPEX_PROFILE_OP'): cmake_args += ['-DIPEX_PROFILE_OP=1'] - if _check_env_flag("USE_SYCL"): + if _check_env_flag('USE_SYCL'): cmake_args += ['-DUSE_SYCL=1'] - if _check_env_flag("DPCPP_ENABLE_PROFILING"): + if os.getenv('DPCPP_ENABLE_PROFILING', 'UNSET') == 'UNSET' or _check_env_flag('DPCPP_ENABLE_PROFILING'): cmake_args += ['-DDPCPP_ENABLE_PROFILING=1'] - if _check_env_flag("USE_NINJA"): + if _check_env_flag('USE_NINJA'): use_ninja = True cmake_args += ['-GNinja'] build_args = ['-j', str(multiprocessing.cpu_count())] env = os.environ.copy() - if _check_env_flag("USE_SYCL"): + if _check_env_flag('USE_SYCL'): os.environ['CXX'] = 'compute++' - check_call([self.cmake, ext.project_dir] + cmake_args, cwd=ext.build_dir, env=env) + check_call([self.cmake, ext.project_dir] + cmake_args, cwd=build_dir, env=env) else: - check_call([self.cmake, ext.project_dir] + cmake_args, cwd=ext.build_dir, env=env) + check_call([self.cmake, ext.project_dir] + cmake_args, cwd=build_dir, env=env) # build_args += ['VERBOSE=1'] if use_ninja: - check_call(['ninja'] + build_args, cwd=ext.build_dir, env=env) + check_call(['ninja'] + build_args, cwd=build_dir, env=env) else: - check_call(['make'] + build_args, cwd=ext.build_dir, env=env) - check_call(['make', 'install'] + build_args, cwd=ext.build_dir, env=env) + check_call(['make'] + build_args, cwd=build_dir, env=env) + check_call(['make', 'install'] + build_args, cwd=build_dir, env=env) ipex_git_sha, torch_git_sha = get_git_head_sha(base_dir) version = get_build_version(ipex_git_sha) @@ -363,28 +379,101 @@ def make_relative_rpath(path): else: return '-Wl,-rpath,$ORIGIN/' + path -install_requires=[ - TORCH_URL, -] +def get_c_module(): + main_compile_args = ['-D_GLIBCXX_USE_CXX11_ABI=' + str(int(torch._C._GLIBCXX_USE_CXX11_ABI))] + main_libraries = ['torch_ipex'] + main_link_args = [ + '-ltorch_python', + '-ldnnl' + ] + main_sources = 
[os.path.join("torch_ipex", "csrc", "_C.cpp")] + cwd = os.path.dirname(os.path.abspath(__file__)) + include_dirs = [ + ".", + os.path.join("torch_ipex", "csrc"), + os.path.join("third_party", "mkl-dnn", "include"), + os.path.join("build", "build_torch_ipex", "third_party", "mkl-dnn", "include"), + os.path.join(pytorch_install_dir, "include"), + os.path.join(pytorch_install_dir, "include", "torch", "csrc", "api", "include") + ] + #lib_path = os.path.join(cwd, "torch_ipex", "lib") + #lib_path = os.path.join(cwd, "build") + #lib_path = os.path.join(cwd, "build", "build_torch_ipex") + library_dirs = [ + os.path.join(cwd, "build", "build_torch_ipex"), + os.path.join(cwd, "build", "build_torch_ipex", "third_party", "mkl-dnn", "src"), + os.path.join(pytorch_install_dir, "lib") + ] + #lib_path_1 = os.path.join(cwd, "build", "lib.linux-x86_64-3.8") + #library_dirs = [lib_path, lib_path_1] + extra_link_args = [] + extra_compile_args = [ + '-Wall', + '-Wextra', + '-Wno-strict-overflow', + '-Wno-unused-parameter', + '-Wno-missing-field-initializers', + '-Wno-write-strings', + '-Wno-unknown-pragmas', + # This is required for Python 2 declarations that are deprecated in 3. + '-Wno-deprecated-declarations', + # Python 2.6 requires -fno-strict-aliasing, see + # http://legacy.python.org/dev/peps/pep-3123/ + # We also depend on it in our code (even Python 3). + '-fno-strict-aliasing', + # Clang has an unfixed bug leading to spurious missing + # braces warnings, see + # https://bugs.llvm.org/show_bug.cgi?id=21629 + '-Wno-missing-braces', + ] + + C_ext = Extension("torch_ipex._C", + libraries=main_libraries, + sources=main_sources, + language='c', + extra_compile_args=main_compile_args + extra_compile_args, + include_dirs=include_dirs, + library_dirs=library_dirs, + extra_link_args=extra_link_args + main_link_args + [make_relative_rpath('lib')]) + # extra_link_args=extra_link_args + main_link_args + [make_relative_rpath('..')]) + return C_ext + +install_requires=[] +if not debug: + install_requires.append([ + TORCH_URL, + ]) setup( - name='torch_ipex', - version=version, - description='Intel PyTorch Extension', - url='https://github.com/intel/intel-extension-for-pytorch', - author='Intel/PyTorch Dev Team', - install_requires=install_requires, - # Exclude the build files. - #packages=find_packages(exclude=['build']), - packages=[ - 'torch_ipex', - 'intel_pytorch_extension', - 'intel_pytorch_extension.optim', - 'intel_pytorch_extension.ops'], - package_dir={'intel_pytorch_extension': 'intel_pytorch_extension_py'}, - zip_safe=False, - ext_modules=[IPEXExt('_torch_ipex')], - cmdclass={ - 'build_ext': IPEXBuild, - 'clean': IPEXClean, - }) + name='torch_ipex', + version=version, + description='Intel PyTorch Extension', + url='https://github.com/intel/intel-extension-for-pytorch', + author='Intel/PyTorch Dev Team', + install_requires=install_requires, + # Exclude the build files. 
+ #packages=find_packages(exclude=['build']), + packages=[ + 'torch_ipex', + 'torch_ipex.ops', + 'torch_ipex.optim', + 'intel_pytorch_extension', + 'intel_pytorch_extension.ops', + 'intel_pytorch_extension.optim'], + package_dir={'intel_pytorch_extension': 'torch_ipex'}, + #package_data={ + # 'torch_ipex':[ + # 'README.md', + # 'requirements.txt', + # '*.py', + # 'lib/*.so', + # 'include/*.h', + # 'include/core/*.h', + # 'include/utils/*.h'] + # }, + zip_safe=False, + ext_modules=[IPEXExt('torch_ipex'), get_c_module()], + cmdclass={ + 'build_ext': IPEXBuild, + 'clean': IPEXClean, + }) diff --git a/tests/cpu/common_device_type.py b/tests/cpu/common_device_type.py index 805a493bd..fc5c71eca 100644 --- a/tests/cpu/common_device_type.py +++ b/tests/cpu/common_device_type.py @@ -49,7 +49,7 @@ from functools import wraps import unittest import torch -import intel_pytorch_extension as ipex +import torch_ipex as ipex import copy from common_utils import TestCase, TEST_WITH_ROCM, TEST_MKL, \ skipCUDANonDefaultStreamIf diff --git a/tests/cpu/common_ipex_conf.py b/tests/cpu/common_ipex_conf.py index ee0e9ae1b..b9d19fc0e 100644 --- a/tests/cpu/common_ipex_conf.py +++ b/tests/cpu/common_ipex_conf.py @@ -1,5 +1,5 @@ import torch -import intel_pytorch_extension as ipex +import torch_ipex as ipex class AutoMixPrecision(object): def __init__(self, enable_or_not = False, train = False): diff --git a/tests/cpu/common_utils.py b/tests/cpu/common_utils.py index fbd42eb37..39a6a2333 100644 --- a/tests/cpu/common_utils.py +++ b/tests/cpu/common_utils.py @@ -1,6 +1,5 @@ ''' From PyTorch: - Copyright (c) 2016- Facebook, Inc (Adam Paszke) Copyright (c) 2014- Facebook, Inc (Soumith Chintala) Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert) @@ -10,37 +9,28 @@ Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston) Copyright (c) 2006 Idiap Research Institute (Samy Bengio) Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz) - From Caffe2: - Copyright (c) 2016-present, Facebook Inc. All rights reserved. - All contributions by Facebook: Copyright (c) 2016 Facebook Inc. - All contributions by Google: Copyright (c) 2015 Google Inc. All rights reserved. - All contributions by Yangqing Jia: Copyright (c) 2015 Yangqing Jia All rights reserved. - All contributions from Caffe: Copyright(c) 2013, 2014, 2015, the respective contributors All rights reserved. - All other contributions: Copyright(c) 2015, 2016 the respective contributors All rights reserved. - Caffe2 uses a copyright model similar to Caffe: each contributor holds copyright over their contributions to Caffe2. The project versioning records all such contribution and copyright details. If a contributor wants to further mark their specific copyright on a particular contribution, they should indicate their copyright solely in the commit message of the change when it is committed. - All rights reserved. ''' @@ -48,7 +38,6 @@ r"""Importing this file must **not** initialize CUDA context. test_distributed relies on this assumption to properly run. This means that when this is imported no CUDA calls shall be made, including torch.cuda.device_count(), etc. - torch.testing._internal.common_cuda.py can freely initialize CUDA context when imported. 
""" @@ -88,9 +77,8 @@ from typing import cast, Any, Dict, Iterable, Iterator, Optional from torch.testing._internal import expecttest -from torch.testing import \ - (_compare_tensors_internal, _compare_scalars_internal, _compare_return_type, - floating_types_and, integral_types, complex_types) +from torch.testing._core import \ + (_compare_tensors_internal, _compare_scalars_internal, _compare_return_type) import torch import torch.cuda @@ -574,11 +562,9 @@ def wrapper(*args, **kwargs): def skipIfNotRegistered(op_name, message): """Wraps the decorator to hide the import of the `core`. - Args: op_name: Check if this op is registered in `core._REGISTERED_OPERATORS`. message: message to fail with. - Usage: @skipIfNotRegistered('MyOp', 'MyOp is not linked!') This will check if 'MyOp' is in the caffe2.python.core @@ -1315,7 +1301,6 @@ def assertNotWarn(self, callable, msg=''): @contextmanager def maybeWarnsRegex(self, category, regex=''): """Context manager for code that *may* warn, e.g. ``TORCH_WARN_ONCE``. - This filters expected warnings from the test log and fails the test if any unexpected warnings are caught. """ @@ -1341,7 +1326,6 @@ def assertExpected(self, s, subname=None): is placed in the 'expect' directory in the same directory as the test script. You can automatically update the recorded test output using --accept. - If you call this multiple times in a single function, you must give a unique subname each time. """ @@ -1444,6 +1428,24 @@ def runWithPytorchAPIUsageStderr(code): return stderr.decode('ascii') + def get_src_dtype_from_auto_dtype_conversion_info(self, line): + return line.strip().split(",")[1].split("=")[1] + + def get_dst_dtype_from_auto_dtype_conversion_info(self, line): + return line.strip().split(",")[2].split("=")[1] + + def get_op_name_from_auto_dtype_conversion_info(self, line): + return line.strip().split(",")[4].split("=")[1] + + def get_src_dtype_from_auto_dtype_conversion_info(self, line): + return line.strip().split(",")[1].split("=")[1] + + def get_dst_dtype_from_auto_dtype_conversion_info(self, line): + return line.strip().split(",")[2].split("=")[1] + + def get_op_name_from_auto_dtype_conversion_info(self, line): + return line.strip().split(",")[4].split("=")[1] + def download_file(url, binary=True): from urllib.parse import urlsplit from urllib import request, error @@ -1683,7 +1685,6 @@ def random_fullrank_matrix_distinct_singular_value(matrix_size, *batch_dims, def random_matrix(rows, columns, *batch_dims, **kwargs): """Return rectangular matrix or batches of rectangular matrices. - Parameters: dtype - the data type device - the device kind @@ -1723,7 +1724,6 @@ def random_lowrank_matrix(rank, rows, columns, *batch_dims, **kwargs): def random_sparse_matrix(rows, columns, density=0.01, **kwargs): """Return rectangular random sparse matrix within given density. - The density of the result approaches to given density as the size of the matrix is increased and a relatively small value of density is specified but higher than min(rows, columns)/(rows * columns) @@ -1750,10 +1750,8 @@ def random_sparse_matrix(rows, columns, density=0.01, **kwargs): def random_sparse_pd_matrix(matrix_size, density=0.01, **kwargs): """Return random sparse positive-definite matrix with given density. 
- The eigenvalues of the matrix are defined as:: arange(1, matrix_size+1)/matrix_size - Algorithm: A = diag(arange(1, matrix_size+1)/matrix_size) while : @@ -1994,4 +1992,4 @@ def set_cwd(path: str) -> Iterator[None]: dtype2prec_DONTUSE = {torch.float: 1e-5, torch.double: 1e-5, torch.half: 1e-2, - torch.bfloat16: 1e-1} + torch.bfloat16: 1e-1} \ No newline at end of file diff --git a/tests/cpu/linear_prepack.py b/tests/cpu/linear_prepack.py index d2ab6540d..9c12fec83 100644 --- a/tests/cpu/linear_prepack.py +++ b/tests/cpu/linear_prepack.py @@ -1,5 +1,5 @@ import torch -import intel_pytorch_extension as ipex +import torch_ipex as ipex from common_utils import int8_calibration ipex.core.enable_auto_dnnl() @@ -22,24 +22,24 @@ def run_linear(auto_mix_conf=None): LL(get_input()) if __name__ == "__main__": - print(f"fp32, {'*' * 50}") + print(f"fp32, {'*' * 50}") run_linear() - print(f"auto-mix for bf16, {'*' * 50}") + print(f"auto-mix for bf16, {'*' * 50}") bf16_conf = ipex.AmpConf(torch.bfloat16) run_linear(bf16_conf) - print(f"back to fp32, {'*' * 50}") + print(f"back to fp32, {'*' * 50}") ipex.core.reorder_to_float32(LL.weight) ipex.core.reorder_to_float32(LL.bias) run_linear() - print(f"auto-mix for int8, {'*' * 50}") + print(f"auto-mix for int8, {'*' * 50}") int8_calibration(LL, [get_input() for i in range(3)], "./int8.config") int8_conf = ipex.AmpConf(torch.int8, "./int8.config") run_linear(int8_conf) - print(f"back to fp32, {'*' * 50}") + print(f"back to fp32, {'*' * 50}") ipex.core.reorder_to_float32(LL.weight) ipex.core.reorder_to_float32(LL.bias) - run_linear() \ No newline at end of file + run_linear() diff --git a/tests/cpu/override.py b/tests/cpu/override.py index 32e1995b9..456b54c65 100644 --- a/tests/cpu/override.py +++ b/tests/cpu/override.py @@ -1,5 +1,5 @@ import torch -import intel_pytorch_extension as ipex +import torch_ipex as ipex torch_function = ['rand', 'randint', 'arange', 'bartlett_window', 'blackman_window', \ 'empty', '_empty_affine_quantized', '_empty_per_channel_affine_quantized', \ diff --git a/tests/cpu/test_bf16_lazy_reorder.py b/tests/cpu/test_bf16_lazy_reorder.py index 04b979de3..b1a7a3562 100644 --- a/tests/cpu/test_bf16_lazy_reorder.py +++ b/tests/cpu/test_bf16_lazy_reorder.py @@ -12,7 +12,7 @@ import sys import itertools import torch -import intel_pytorch_extension as ipex +import torch_ipex as ipex import torch.nn as nn import torch.backends.cudnn as cudnn @@ -293,7 +293,7 @@ def _test_deconv(self, dims): self.assertEqual( y_aten, y_auto_mix_train, atol=1e-1, rtol=1e-5) self.assertEqual( - module.weight.grad, module_auto_mix_train.weight.grad, atol=1e-1, rtol=1e-5) + module.weight.grad, module_auto_mix_train.weight.grad, atol=2e-1, rtol=1e-3) self.assertEqual( x_aten.grad, x_auto_mix_train.grad, atol=1e-1, rtol=1e-5) if bias: @@ -2488,7 +2488,7 @@ def test__pack_padded_sequence(self): seqs = [torch.FloatTensor(random.randint(1, 6)).to(ipex.DEVICE) for _ in range(5)] seqs = [s.random_(-128, 128) for s in seqs] ordered = sorted(seqs, key=len, reverse=True) - lengths = list(map(len, ordered)) + lengths = torch.as_tensor(list(map(len, ordered)), dtype=torch.int64).to(ipex.DEVICE) padded_tensor = rnn_utils.pad_sequence(ordered) with AutoDNNL(True): for enforce_sorted in [True, False]: @@ -2508,7 +2508,7 @@ def _lstm_params_list(self, cell): "bias": [False, True], "empty_state": [False, True], "batch_first": [False, True], - "dropout": [0, 1], # [0, 0.5, 1] # TODO 0.5 will fail + "dropout": [0], # [0, 0.5, 1] # TODO 0.5 will fail "batch_size": [1, 2], 
"seq_len": [1, 3] } diff --git a/tests/cpu/test_conf.py b/tests/cpu/test_conf.py index 2c8dad988..628787b56 100644 --- a/tests/cpu/test_conf.py +++ b/tests/cpu/test_conf.py @@ -4,7 +4,7 @@ from functools import reduce import torch -import _torch_ipex as ipex +import torch_ipex._C as ipex import torch.nn as nn import torch.backends.cudnn as cudnn diff --git a/tests/cpu/test_emb.py b/tests/cpu/test_emb.py index 64c92d27b..8a64337ab 100644 --- a/tests/cpu/test_emb.py +++ b/tests/cpu/test_emb.py @@ -1,6 +1,6 @@ import torch import torch.nn as nn -import intel_pytorch_extension as ipex +import torch_ipex as ipex import unittest import copy from common_utils import TestCase diff --git a/tests/cpu/test_int8.py b/tests/cpu/test_int8.py index 975f0fb36..f91efe7fc 100644 --- a/tests/cpu/test_int8.py +++ b/tests/cpu/test_int8.py @@ -15,7 +15,7 @@ from torch.jit._recursive import wrap_cpp_module import copy -import intel_pytorch_extension as ipex +import torch_ipex as ipex import torch.nn as nn from torch.nn import Parameter @@ -191,13 +191,13 @@ def _lstm_int8(self, seq_len, batch_size, input_size, hidden_size, num_layers, b def test_lstm(self): self._lstm_int8(seq_len=5, batch_size=2, input_size=16, hidden_size=16, num_layers=1, bidirectional=False, bias=True, empty_state=False) - + self._lstm_int8(seq_len=5, batch_size=2, input_size=16, hidden_size=16, num_layers=1, bidirectional=True, bias=True, empty_state=False) - + self._lstm_int8(seq_len=5, batch_size=2, input_size=16, hidden_size=16, num_layers=1, bidirectional=False, bias=False, empty_state=False) - + self._lstm_int8(seq_len=5, batch_size=2, input_size=16, hidden_size=16, num_layers=1, bidirectional=True, bias=False, empty_state=False) - + if __name__ == '__main__': rand_seed = int(time.time() * 1000000000) torch.manual_seed(rand_seed) diff --git a/tests/cpu/test_interaction.py b/tests/cpu/test_interaction.py index 8904fdd37..a8d12ef56 100644 --- a/tests/cpu/test_interaction.py +++ b/tests/cpu/test_interaction.py @@ -5,7 +5,7 @@ import torch -import intel_pytorch_extension as ipex +import torch_ipex as ipex import torch.nn as nn import torch.backends.cudnn as cudnn diff --git a/tests/cpu/test_jit.py b/tests/cpu/test_jit.py index 9d61d781b..3a73cc6a3 100644 --- a/tests/cpu/test_jit.py +++ b/tests/cpu/test_jit.py @@ -60,8 +60,8 @@ from torch.jit._recursive import wrap_cpp_module import copy -import intel_pytorch_extension as ipex -from intel_pytorch_extension import core +import torch_ipex as ipex +from torch_ipex import core import torch.nn as nn import torch.backends.cudnn as cudnn diff --git a/tests/cpu/test_lazy_reorder.py b/tests/cpu/test_lazy_reorder.py index 76165f559..5cf9d8c4c 100644 --- a/tests/cpu/test_lazy_reorder.py +++ b/tests/cpu/test_lazy_reorder.py @@ -12,7 +12,7 @@ import sys import itertools import torch -import intel_pytorch_extension as ipex +import torch_ipex as ipex import contextlib import io @@ -1603,7 +1603,7 @@ def _lstm_params_list(self, cell): "bias": [False, True], "empty_state": [False, True], "batch_first": [False, True], - "dropout": [0, 1], # [0, 0.5, 1] # TODO 0.5 will fail + "dropout": [0], # [0, 0.5, 1] # TODO 0.5 will fail "batch_size": [1, 2], "seq_len": [1, 3] } diff --git a/tests/cpu/test_mlp.py b/tests/cpu/test_mlp.py index 62d085095..f01b9d4b1 100644 --- a/tests/cpu/test_mlp.py +++ b/tests/cpu/test_mlp.py @@ -5,7 +5,7 @@ from functools import reduce import torch -import intel_pytorch_extension as ipex +import torch_ipex as ipex import torch.nn as nn import torch.backends.cudnn as cudnn diff --git 
a/tests/cpu/test_rn50_cpu_ops.py b/tests/cpu/test_rn50_cpu_ops.py index a43db2bd7..586503e5b 100644 --- a/tests/cpu/test_rn50_cpu_ops.py +++ b/tests/cpu/test_rn50_cpu_ops.py @@ -55,7 +55,7 @@ from functools import reduce import torch -import intel_pytorch_extension as ipex +import torch_ipex as ipex from common_ipex_conf import AutoMixPrecision, AutoDNNL import torch.nn as nn diff --git a/tests/cpu/test_sparse.py b/tests/cpu/test_sparse.py index 6b89ebc23..53f494d7c 100644 --- a/tests/cpu/test_sparse.py +++ b/tests/cpu/test_sparse.py @@ -2,7 +2,7 @@ import copy import torch -import intel_pytorch_extension as ipex +import torch_ipex as ipex import torch.nn as nn from common_utils import TestCase from numbers import Number diff --git a/tests/cpu/test_torch.py b/tests/cpu/test_torch.py index a108b7cac..9543640ae 100644 --- a/tests/cpu/test_torch.py +++ b/tests/cpu/test_torch.py @@ -83,7 +83,7 @@ skipIf, skipCPUIfNoLapack, skipCUDAIfNoMagma, skipCUDAIfRocm, onlyCUDA, onlyCPU, \ dtypes, dtypesIfCUDA, deviceCountAtLeast, skipCUDAIf, precisionOverride, ipex import torch.backends.quantized -import intel_pytorch_extension as ipex +import torch_ipex as ipex # load_tests from common_utils is used to automatically filter tests for diff --git a/tests/cpu/utils/test_lazy_reorder_with_pattern.py b/tests/cpu/utils/test_lazy_reorder_with_pattern.py index fcbeafc6a..1e2237fc8 100644 --- a/tests/cpu/utils/test_lazy_reorder_with_pattern.py +++ b/tests/cpu/utils/test_lazy_reorder_with_pattern.py @@ -5,7 +5,7 @@ import sys import unittest import torch -import intel_pytorch_extension as ipex +import torch_ipex as ipex sys.path.append("..") from common_utils import TestCase diff --git a/tests/cpu/utils/utils.py b/tests/cpu/utils/utils.py index 7e754a353..4a6b13885 100644 --- a/tests/cpu/utils/utils.py +++ b/tests/cpu/utils/utils.py @@ -2,7 +2,7 @@ import unittest from torch.testing._internal import expecttest from functools import wraps -import intel_pytorch_extension as ipex +import torch_ipex as ipex class VerboseTestCase(expecttest.TestCase): def __init__(self, method_name='runTest'): diff --git a/third_party/pybind11 b/third_party/pybind11 deleted file mode 160000 index 373524912..000000000 --- a/third_party/pybind11 +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 37352491225358b97ce302273bf2d887a477efb0 diff --git a/third_party/torch_ccl b/third_party/torch_ccl deleted file mode 160000 index 064d9eb3a..000000000 --- a/third_party/torch_ccl +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 064d9eb3aeeb10ed37a349e6175161bb3da36104 diff --git a/intel_pytorch_extension_py/__init__.py b/torch_ipex/__init__.py similarity index 95% rename from intel_pytorch_extension_py/__init__.py rename to torch_ipex/__init__.py index cbf83ca69..af625b912 100644 --- a/intel_pytorch_extension_py/__init__.py +++ b/torch_ipex/__init__.py @@ -6,8 +6,10 @@ from .tensor import * from .optim import * from .ops import * -import _torch_ipex as core -core.enable_torch_ccl() + +base_dir = os.path.basename(os.path.dirname(os.path.abspath(__file__))) +if base_dir == 'intel_pytorch_extension': + print('[WARNING] "import intel_pytorch_extension" will be deprecated in future releases. 
Please use "import torch_ipex" instead.') DEVICE = 'xpu:0' diff --git a/torch_ipex/csrc/CMakeLists.txt b/torch_ipex/csrc/CMakeLists.txt index bb482c5df..8d85998cc 100644 --- a/torch_ipex/csrc/CMakeLists.txt +++ b/torch_ipex/csrc/CMakeLists.txt @@ -5,12 +5,10 @@ LIST(APPEND DPCPP_COMMON_SRCS ${DPCPP_ROOT}/aten_ipex_bridge.cpp ${DPCPP_ROOT}/aten_ipex_type.cpp ${DPCPP_ROOT}/dpcpp_allocator.cpp - ${DPCPP_ROOT}/init_python_bindings.cpp ${DPCPP_ROOT}/ipex_tensor_impl.cpp ${DPCPP_ROOT}/ipex_sparse_tensor_impl.cpp ${DPCPP_ROOT}/version.cpp ${DPCPP_ROOT}/utils.cpp - ${DPCPP_ROOT}/distributed/xpu_ccl.cpp ) # Pass to parent diff --git a/torch_ipex/csrc/init_python_bindings.cpp b/torch_ipex/csrc/_C.cpp similarity index 92% rename from torch_ipex/csrc/init_python_bindings.cpp rename to torch_ipex/csrc/_C.cpp index 139109133..bdd8df60a 100644 --- a/torch_ipex/csrc/init_python_bindings.cpp +++ b/torch_ipex/csrc/_C.cpp @@ -1,4 +1,3 @@ -#include "init_python_bindings.h" #include "version.h" #include @@ -30,8 +29,6 @@ #include "cpu/FusionOPs.h" #include "cpu/int8/Config.h" #include "cpu/int8/quantization/Observer.h" -#include "ProcessGroupCCL.hpp" -#include #include #include #include @@ -223,18 +220,7 @@ void InitIpexModuleBindings(py::module m) { } Int8OptConfig::get_config().set_indicators(indicators); }); - - m.def("enable_torch_ccl", [=]() { - py::object module = py::module::import("torch.distributed"); - py::object register_backend = module.attr("Backend").attr("register_backend"); - register_backend("ccl", py::cpp_function(&c10d::ProcessGroupCCL::createProcessGroupCCL, - py::arg("store"), - py::arg("rank"), - py::arg("size"), - py::arg("timeout") = std::chrono::milliseconds( - ::c10d::ProcessGroupCCL::OP_TIMEOUT_MILLIS))); - - }); + m.def("set_xpu_mode", [=](std::string mode){ AutoOptConfig::singleton().set_xpu_mode(torch_ipex::stringToXPUMode(mode));}); @@ -260,4 +246,4 @@ void InitIpexBindings(py::module m) { } // namespace torch_ipex -PYBIND11_MODULE(_torch_ipex, m) { torch_ipex::InitIpexBindings(m); } +PYBIND11_MODULE(_C, m) { torch_ipex::InitIpexBindings(m); } diff --git a/torch_ipex/csrc/aten_ipex_bridge.cpp b/torch_ipex/csrc/aten_ipex_bridge.cpp index d32cb4fef..9460a56a0 100644 --- a/torch_ipex/csrc/aten_ipex_bridge.cpp +++ b/torch_ipex/csrc/aten_ipex_bridge.cpp @@ -221,7 +221,8 @@ at::Tensor shallowUpgradeToDPCPPTensor(const at::Tensor& cpuTensor) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(_tensor.device().type() == at::DeviceType::XPU); IPEXTensorImpl* ipex_impl = (IPEXTensorImpl *)_tensor.unsafeGetTensorImpl(); ipex_impl->copy_meta_info(cpu_tensor_impl); - TORCH_INTERNAL_ASSERT_DEBUG_ONLY(! cpuTensor.requires_grad()); + ipex_impl->copy_auto_grad(cpu_tensor_impl); + // TORCH_INTERNAL_ASSERT_DEBUG_ONLY(! 
cpuTensor.requires_grad()); CHECK_TENSOR_CRITICAL(_tensor, cpuTensor, true); //TODO: Cannot set reserved_ // dest_impl->reserved_ = src_impl->reserved_; diff --git a/torch_ipex/csrc/cpu/CustomOPs.h b/torch_ipex/csrc/cpu/CustomOPs.h index 92f35b05f..2b6f574c0 100644 --- a/torch_ipex/csrc/cpu/CustomOPs.h +++ b/torch_ipex/csrc/cpu/CustomOPs.h @@ -778,7 +778,7 @@ class NewEmbeddingBagOp : public torch::autograd::Function { _ipex_bag_size, num_weights, scale_grad_by_freq, mode, _ipex_per_sample_weights) : at::_embedding_bag_dense_backward( - _ipex_grad, _ipex_indices, _ipex_offsets, _ipex_offset2bag_, + _ipex_grad, _ipex_indices, _ipex_offset2bag_, _ipex_bag_size, _ipex_maximum_indices, num_weights, scale_grad_by_freq, mode, _ipex_per_sample_weights); auto &&_ipex_per_sample_weights_grad = @@ -806,7 +806,7 @@ class NewEmbeddingBagOp : public torch::autograd::Function { grad, indices, offsets, offset2bag_, bag_size, num_weights, scale_grad_by_freq, mode, per_sample_weights) : at::_embedding_bag_dense_backward( - grad, indices, offsets, offset2bag_, bag_size, + grad, indices, offset2bag_, bag_size, maximum_indices, num_weights, scale_grad_by_freq, mode, per_sample_weights); auto per_sample_weights_grad = diff --git a/torch_ipex/csrc/cpu/DevOPs.cpp b/torch_ipex/csrc/cpu/DevOPs.cpp index 1904d5d7c..549c05e87 100644 --- a/torch_ipex/csrc/cpu/DevOPs.cpp +++ b/torch_ipex/csrc/cpu/DevOPs.cpp @@ -1232,6 +1232,45 @@ at::Tensor AtenIpexCPUDev::dil_dropout_backward( return dbl::comm::gen_aten_tensor_by(std::move(dX)); } +at::Tensor AtenIpexCPUDev::dil_batch_norm( + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& bias, + const at::Tensor& running_mean, + const at::Tensor& running_var, + bool train, + double momentum, + double eps, + bool cudnn_enabled) { + + #define CHECK_MISMATCH(arg_name, expected, actual) \ + IPEX_CHECK(actual == expected, arg_name, " should contain ", expected, " elements not ", actual) + + auto num_features = input.sizes()[1]; + if (running_mean.defined()) { + CHECK_MISMATCH("running_mean", num_features, running_mean.numel()); + } + if (running_var.defined()) { + CHECK_MISMATCH("running_var", num_features, running_var.numel()); + } + if (weight.defined()) { + CHECK_MISMATCH("weight", num_features, weight.numel()); + } + if (bias.defined()) { + CHECK_MISMATCH("bias", num_features, bias.numel()); + } + + return std::get<0>(at::native_batch_norm( + input, + weight, + bias, + running_mean, + running_var, + train, + momentum, + eps)); +} + std::tuple AtenIpexCPUDev::dil_native_batch_norm( const at::Tensor& input, const at::Tensor& weight, @@ -2074,9 +2113,8 @@ at::Tensor& AtenIpexCPUDev::dil_cat_out(at::Tensor& result, at::TensorList tenso dim = at::legacy_cat_wrap_dim(dim, tensors); std::vector x; for (auto i =0; i< tensors.size(); i++) { - IPEX_CHECK(!(tensors[i].dim() == 1 && tensors[i].sizes()[0] == 0), - "Currently Mkldnn cat operators do not support empty tensor."); - + if(tensors[i].numel() == 0) + continue; dbl::comm::reorder_to_bf16_for_mix_prec(tensors[i], true); x.push_back(dbl::comm::try_gen_dil_tensor(tensors[i])); @@ -2102,8 +2140,8 @@ at::Tensor AtenIpexCPUDev::dil_cat(at::TensorList tensors, int64_t dim) { std::vector data_shift; for (auto i = 0; i < tensors.size(); i++) { - IPEX_CHECK(!(tensors[i].dim() == 1 && tensors[i].sizes()[0] == 0), - "Currently Mkldnn cat operators do not support empty tensor."); + if(tensors[i].numel() == 0) + continue; tensors_contiguous[i] = IS_CONTIGUOUS_ANY(tensors[i]) ? 
tensors[i] : tensors[i].contiguous(); dbl::comm::reorder_to_bf16_for_mix_prec(tensors_contiguous[i], true); @@ -2924,7 +2962,7 @@ at::Tensor AtenIpexCPUDev::dil_div(const at::Tensor &self, return at::Tensor(); } -at::Tensor AtenIpexCPUDev::dil_div(const at::Tensor &self, at::Scalar &other) { +at::Tensor AtenIpexCPUDev::dil_div(const at::Tensor &self, const at::Scalar &other) { auto tensor = at::scalar_to_tensor(other); DEBUG("AtenIpexCPUDev::dil_div_Scalar\n"); auto impl = tensor.unsafeGetTensorImpl(); @@ -2940,7 +2978,7 @@ at::Tensor &AtenIpexCPUDev::dil_div_(at::Tensor &self, return AtenIpexCPUDev::dil_div_out(self, self, other); } -at::Tensor &AtenIpexCPUDev::dil_div_(at::Tensor &self, at::Scalar &other) { +at::Tensor &AtenIpexCPUDev::dil_div_(at::Tensor &self, const at::Scalar &other) { auto tensor = at::scalar_to_tensor(other); DEBUG("AtenIpexCPUDev::dil_div_Scalar\n"); auto impl = tensor.unsafeGetTensorImpl(); @@ -2996,5 +3034,88 @@ at::Tensor AtenIpexCPUDev::dil_permute(const at::Tensor & self, at::IntArrayRef return dil_as_strided(self, newSizes, newStrides, self.storage_offset()); } +inline at::Tensor to_impl(const at::Tensor& self, const at::TensorOptions& options, bool non_blocking, bool copy) { + auto memory_format = options.memory_format_opt().value_or(at::MemoryFormat::Preserve); + if (self.dtype() == options.dtype() && + self.layout() == options.layout() && + self.device() == options.device() && + !copy && + (memory_format == at::MemoryFormat::Preserve || self.suggest_memory_format() == memory_format)) { + return self; + } + + bool pin_out = false; + if (memory_format == at::MemoryFormat::Preserve) { + if (self.is_non_overlapping_and_dense()) { + // Copy all strides + auto r = at::empty_strided(self.sizes(), + self.strides(), + options.memory_format(c10::nullopt).pinned_memory(pin_out)); + r.copy_(self, non_blocking); + return r; + } else { + memory_format = self.suggest_memory_format(); + } + } + // See Note [Explicit nullopt MemoryFormat argument] + auto r = at::empty(self.sizes(), + options.memory_format(memory_format).pinned_memory(pin_out), + c10::nullopt); + r.copy_(self, non_blocking); + return r; +} + +at::Tensor AtenIpexCPUDev::dil_to(const at::Tensor & self, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, bool non_blocking, bool copy, c10::optional optional_memory_format){ + DEBUG("AtenIpexCPUDev::dil_to_dtype_layout\n"); + // See [Note: hacky wrapper removal for TensorOptions] + auto options_ = at::TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); + + TORCH_CHECK( + !(options_.has_memory_format() && optional_memory_format.has_value()), + "Cannot set memory_format both in TensorOptions and explicit argument; please delete " + "the redundant setter."); + auto options = options_.merge_memory_format(optional_memory_format); + + TORCH_CHECK(options.requires_grad_opt() == c10::nullopt, + "to(options) expects unset requires_grad flag, but got " + "options.requires_grad set as ", options.requires_grad()); + + TORCH_CHECK(!options.has_layout() || self.layout() == options.layout(), + "to(options) doesn't support converting to a different layout, " + "but got self.layout being ", self.layout(), + " and options.layout set as ", options.layout()); + + auto specified_options = self.options().merge_in(options); + return to_impl(self, specified_options, non_blocking, copy); +} + +at::Tensor AtenIpexCPUDev::dil_to(const at::Tensor & self, c10::Device device, at::ScalarType dtype, bool non_blocking, bool 
copy, c10::optional optional_memory_format){ + DEBUG("AtenIpexCPUDev::dil_to_device\n"); + return to_impl( + self, + self.options().device(device).dtype(dtype).memory_format(optional_memory_format), + non_blocking, + copy); +} + +at::Tensor AtenIpexCPUDev::dil_to(const at::Tensor & self, at::ScalarType dtype, bool non_blocking, bool copy, c10::optional optional_memory_format) { + DEBUG("AtenIpexCPUDev::dil_to_dtype\n"); + return to_impl( + self, + self.options().dtype(dtype).memory_format(optional_memory_format), + non_blocking, + copy); +} + +at::Tensor AtenIpexCPUDev::dil_to(const at::Tensor& self, const at::Tensor& other, bool non_blocking, bool copy, c10::optional optional_memory_format) { + DEBUG("AtenIpexCPUDev::dil_to_other\n"); + auto options = other.options(); + return to_impl( + self, + options.memory_format(optional_memory_format), + non_blocking, + copy); +} + } // namespace cpu } // namespace torch_ipex diff --git a/torch_ipex/csrc/cpu/DevOPs.h b/torch_ipex/csrc/cpu/DevOPs.h index 5e7bc9b79..151a77942 100644 --- a/torch_ipex/csrc/cpu/DevOPs.h +++ b/torch_ipex/csrc/cpu/DevOPs.h @@ -45,6 +45,7 @@ class AtenIpexCPUDev { static std::tuple dil_linear_backward(const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, std::array output_mask); static at::Tensor dil_dropout(const at::Tensor& self, double ratio, bool train); static at::Tensor dil_dropout_backward(const at::Tensor& grady, const at::Tensor& mask, double ratio); + static at::Tensor dil_batch_norm(const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias, const at::Tensor& running_mean, const at::Tensor& running_var, bool train, double momentum, double eps, bool cudnn_enabled); static std::tuple dil_native_batch_norm(const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias, const at::Tensor& running_mean, const at::Tensor& running_var, bool train, double momentum, double eps); static std::tuple dil_native_batch_norm_backward(const at::Tensor& grad_output, const at::Tensor& input, const at::Tensor& weight, const at::Tensor& running_mean, const at::Tensor& running_var, const at::Tensor& save_mean, const at::Tensor& save_invstd, bool train,double eps, std::array grad_input_mask); static at::Tensor dil_frozen_batch_norm(const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias, const at::Tensor& running_mean, const at::Tensor& running_var, double eps); @@ -115,12 +116,16 @@ class AtenIpexCPUDev { static at::Tensor dil_upsample_trilinear3d_backward(const at::Tensor & grad_output, at::IntArrayRef output_size, at::IntArrayRef input_size, bool align_corners, c10::optional scales_d, c10::optional scales_h, c10::optional scales_w); static at::Tensor dil_unsqueeze(const at::Tensor& self, int64_t dim); static at::Tensor dil_div(const at::Tensor &self, const at::Tensor &other); - static at::Tensor dil_div(const at::Tensor &self, at::Scalar &other); + static at::Tensor dil_div(const at::Tensor &self, const at::Scalar &other); static at::Tensor &dil_div_(at::Tensor &self, const at::Tensor &other); - static at::Tensor &dil_div_(at::Tensor &self, at::Scalar &other); + static at::Tensor &dil_div_(at::Tensor &self, const at::Scalar &other); static at::Tensor &dil_div_out(at::Tensor &out, const at::Tensor &self, const at::Tensor &other); static at::Tensor dil_permute(const at::Tensor & self, at::IntArrayRef dims); + static at::Tensor dil_to(const at::Tensor & self, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory, bool 
non_blocking, bool copy, c10::optional memory_format); + static at::Tensor dil_to(const at::Tensor & self, c10::Device device, at::ScalarType dtype, bool non_blocking, bool copy, c10::optional memory_format); + static at::Tensor dil_to(const at::Tensor & self, at::ScalarType dtype, bool non_blocking, bool copy, c10::optional memory_format); + static at::Tensor dil_to(const at::Tensor& self, const at::Tensor& other, bool non_blocking, bool copy, c10::optional optional_memory_format); }; } // namespace cpu diff --git a/torch_ipex/csrc/cpu/aten/operators/embedding_bag.cpp b/torch_ipex/csrc/cpu/aten/operators/embedding_bag.cpp index f46d32f4c..1061744ca 100755 --- a/torch_ipex/csrc/cpu/aten/operators/embedding_bag.cpp +++ b/torch_ipex/csrc/cpu/aten/operators/embedding_bag.cpp @@ -181,7 +181,7 @@ static inline at::Tensor embedding_bag_dense_backward_sum_fast(const at::Tensor auto offset_numel = offsets.numel(); at::Tensor offset2bag_ ; if (offset_numel != indices_numel) { - offset2bag_ = at::native::full({indices.sizes()[0] + 1}, 0, indices.options()); + offset2bag_ = at::empty({indices.sizes()[0] + 1}, indices.options()).zero_(); make_offset2bag(offsets, indices, offset2bag_); offset2bag_.resize_({indices.sizes()[0]}); } else { @@ -261,7 +261,7 @@ embedding_bag_get_offset2bag(const at::Tensor indices, const at::Tensor & offset int64_t indices_numel = indices.numel(); at::Tensor offset2bag_ ; if (indices_numel != 0 && offset2bag.numel() == 0) { - offset2bag_ = at::native::full({indices.sizes()[0] + 1}, 0, indices.options()); + offset2bag_ = at::empty({indices.sizes()[0] + 1}, indices.options()).zero_(); make_offset2bag(offsets, indices, offset2bag_); offset2bag_.resize_({indices.sizes()[0]}); } else { @@ -279,7 +279,7 @@ at::Tensor embedding_bag_backward_impl(const at::Tensor & grad, const at::Tensor return embedding_bag_sparse_backward_sum_fast(grad, indices, offsets, num_weights, mode); } else { return embedding_bag_sparse_backward_sum_fast(grad, indices, offsets, num_weights, mode); - } + } } else { auto grad_c = grad.contiguous(); if (is_bfloat16_tensor(grad)) { diff --git a/torch_ipex/csrc/init_python_bindings.h b/torch_ipex/csrc/init_python_bindings.h deleted file mode 100644 index f0ee26e9a..000000000 --- a/torch_ipex/csrc/init_python_bindings.h +++ /dev/null @@ -1,12 +0,0 @@ -#pragma once - -#include - -namespace py = pybind11; - -namespace torch_ipex { - -// Initialize bindings for IPE module, tensor and optimization passes. -void InitIpexBindings(py::module m); - -} // namespace torch_ipex diff --git a/torch_ipex/csrc/ipex_tensor_impl.cpp b/torch_ipex/csrc/ipex_tensor_impl.cpp index a01b56a53..5f07cb984 100644 --- a/torch_ipex/csrc/ipex_tensor_impl.cpp +++ b/torch_ipex/csrc/ipex_tensor_impl.cpp @@ -68,7 +68,7 @@ void IPEXTensorImpl::copy_auto_grad(c10::TensorImpl *src_impl) { return; } - if (! this->requires_grad()){ + if (! 
this->requires_grad()) { auto cpu_autograd_meta = static_cast(src_impl->autograd_meta()); if (cpu_autograd_meta->is_view_){ auto cpu_view_meta = static_cast(src_impl->autograd_meta()); @@ -76,16 +76,20 @@ void IPEXTensorImpl::copy_auto_grad(c10::TensorImpl *src_impl) { c10::optional backward_info_; c10::optional forward_info_; - if (cpu_view_meta->has_fw_view()) { + if (cpu_view_meta->has_fw_view() && (!cpu_view_meta->shared_view_info())) { auto fw_view_info = cpu_view_meta->get_forward_view(); torch::autograd::ViewInfo fw_view_info_copy(fw_view_info.base_, fw_view_info.view_fn_); forward_info_ = fw_view_info_copy; + } else { + forward_info_ = c10::nullopt; } if (cpu_view_meta->has_bw_view()) { auto bw_view_info = cpu_view_meta->get_backward_view(); torch::autograd::ViewInfo bw_view_info_copy(bw_view_info.base_, bw_view_info.view_fn_); backward_info_ = bw_view_info_copy; + } else { + backward_info_ = c10::nullopt; } this->set_autograd_meta( @@ -93,6 +97,7 @@ void IPEXTensorImpl::copy_auto_grad(c10::TensorImpl *src_impl) { this, backward_info_, forward_info_, + cpu_view_meta->shared_view_info(), cpu_view_meta->get_creation_meta() ) ); diff --git a/torch_ipex/csrc/utils.h b/torch_ipex/csrc/utils.h index 649066aa5..870e44860 100644 --- a/torch_ipex/csrc/utils.h +++ b/torch_ipex/csrc/utils.h @@ -72,16 +72,15 @@ void set_ipex_func_status(IPEXFuncStatus ipex_fun_status); // A light-weight TORCH_CHECK that does not collect any backtrace info #if defined(_DEBUG) -#define IPEX_CHECK(cond, ...) \ + #define IPEX_CHECK(cond, ...) \ if (!(cond)) { \ throw std::runtime_error( \ - c10::detail::if_empty_then( \ - c10::str(__VA_ARGS__), \ - "Expected " #cond " to be true, but got false.")); \ + c10::detail::torchCheckMsgImpl( \ + "Expected " #cond " to be true, but got false.", ##__VA_ARGS__)); \ } #else -// quick path of IPEX_CHECK without reporting message -#define IPEX_CHECK(cond, ...) \ + // quick path of IPEX_CHECK without reporting message + #define IPEX_CHECK(cond, ...) \ if (!(cond)) { throw std::exception(); } #endif diff --git a/intel_pytorch_extension_py/launch.py b/torch_ipex/launch.py similarity index 90% rename from intel_pytorch_extension_py/launch.py rename to torch_ipex/launch.py index a7241d0c0..fdd4ada88 100644 --- a/intel_pytorch_extension_py/launch.py +++ b/torch_ipex/launch.py @@ -17,51 +17,51 @@ r""" This is a script for launching PyTorch training and inference on Intel Xeon CPU with optimal configurations. -Now, single instance inference/training, multi-instance inference/training and distributed training +Now, single instance inference/training, multi-instance inference/training and distributed training with oneCCL backend is enabled. -To get the peak performance on Intel Xeon CPU, the script optimizes the configuration of thread and memory -management. For thread management, the script configures thread affinity and the preload of Intel OMP library. +To get the peak performance on Intel Xeon CPU, the script optimizes the configuration of thread and memory +management. For thread management, the script configures thread affinity and the preload of Intel OMP library. For memory management, it configures NUMA binding and preload optimized memory allocation library (e.g. tcmalloc, jemalloc). - + **How to use this module:** -*** Single instance inference/training *** +*** Single instance inference/training *** 1. Run single-instance inference or training on a single node with all CPU sockets. 
:: - >>> python -m intel_pytorch_extension.launch script.py args + >>> python -m torch_ipex.launch script.py args 2. Run single-instance inference or training on a single CPU socket. :: - >>> python -m intel_pytorch_extension.launch --socket_id 1 script.py args + >>> python -m torch_ipex.launch --socket_id 1 script.py args + +*** Multi-instance inference *** -*** Multi-instance inference *** +1. Multi-instance + By default, one instance per socket. if you want to set the instance numbers and core per instance, + --nintances and --ncore_per_instance should be set. -1. Multi-instance - By default, one instance per socket. if you want to set the instance numbers and core per instance, - --nintances and --ncore_per_instance should be set. - - >>> python -m intel_pytorch_extension.launch --multi_instance python_script args + >>> python -m torch_ipex.launch --multi_instance python_script args - eg: on CLX8280 with 14 instance, 4 cores per instance + eg: on CLX8280 with 14 instance, 4 cores per instance :: - >>> python -m intel_pytorch_extension.launch --multi_instance --nintances 14 --ncore_per_instance 4 python_script args + >>> python -m torch_ipex.launch --multi_instance --nintances 14 --ncore_per_instance 4 python_script args *** Distributed Training *** -spawns up multiple distributed training processes on each of the training nodes. For intel_pytorch_extension, oneCCL -is used as the communication backend and MPI used to launch multi-proc. To get the better -performance, you should specify the different cores for oneCCL communication and computation +spawns up multiple distributed training processes on each of the training nodes. For torch_ipex, oneCCL +is used as the communication backend and MPI used to launch multi-proc. To get the better +performance, you should specify the different cores for oneCCL communication and computation process seperately. This tool can automatically set these ENVs(such as I_MPI_PIN_DOMIN) and launch -multi-proc for you. +multi-proc for you. The utility can be used for single-node distributed training, in which one or more processes per node will be spawned. It can also be used in @@ -73,7 +73,7 @@ :: - >>> python -m intel_pytorch_extension.launch --distributed python_script --arg1 --arg2 --arg3 and all other + >>> python -m torch_ipex.launch --distributed python_script --arg1 --arg2 --arg3 and all other arguments of your training script 2. Multi-Node multi-process distributed training: (e.g. two nodes) @@ -83,8 +83,8 @@ :: - >>> python -m intel_pytorch_extension.launch --distributed --nproc_per_node=xxx - --nnodes=2 --hostfile hostfile python_sript --arg1 --arg2 --arg3 + >>> python -m torch_ipex.launch --distributed --nproc_per_node=xxx + --nnodes=2 --hostfile hostfile python_sript --arg1 --arg2 --arg3 and all other arguments of your training script) @@ -92,11 +92,11 @@ :: - >>> python -m intel_pytorch_extension.launch --help + >>> python -m torch_ipex.launch --help *** Memory allocator *** -"--enable_tcmalloc" and "--enable_jemalloc" can be used to enable different memory allcator. +"--enable_tcmalloc" and "--enable_jemalloc" can be used to enable different memory allcator. 
""" @@ -129,8 +129,8 @@ def _get_socket_info(self): for line in self.cpuinfo: if socket_id == int(line[2]): if line[1] not in cur_socket_physical_core: - cur_socket_physical_core.append(line[1]) - cur_socket_logical_core.append(line[0]) + cur_socket_physical__C.append(line[1]) + cur_socket_logical__C.append(line[0]) self.socket_physical_cores.append(cur_socket_physical_core) self.socket_logical_cores.append(cur_socket_logical_core) @@ -143,7 +143,7 @@ def physical_core_nums(self): def logical_core_nums(self): return len(self.socket_logical_cores) * len(self.socket_logical_cores[0]) - + def get_socket_physical_cores(self, socket_id): if socket_id < 0 or socket_id > self.sockets - 1: logger.error("Invalid socket id") @@ -156,14 +156,14 @@ def get_socket_logical_cores(self, socket_id): def get_all_physical_cores(self): return np.array(self.socket_physical_cores).flatten().tolist() - + def get_all_logical_cores(self): return np.array(self.socket_logical_cores).flatten().tolist() - + def set_mpi_pin_domain(args): ''' - I_MPI_PIN_DOMAIN specify the cores used for every MPI process. + I_MPI_PIN_DOMAIN specify the cores used for every MPI process. The first ccl_worker_count cores of every rank for ccl communication and the other cores will be used to do computation. For example: on CascadeLake 8280 CPU, 2 ranks on one node. ccl_worker_count=4 @@ -181,7 +181,7 @@ def set_mpi_pin_domain(args): for proc in range(ppn): domain_binary = 0 begin = proc * cores_per_rank + args.ccl_worker_count - end = proc * cores_per_rank + cores_per_rank -1 + end = proc * cores_per_rank + cores_per_rank -1 for i in range(begin, end + 1): domain_binary |= (1 << i) pin_domain += hex(domain_binary) + "," @@ -190,7 +190,7 @@ def set_mpi_pin_domain(args): def set_ccl_worker_affinity(args): ''' computation and communication use different cores when using oneCCL - backend for distributed training. we use first ccl_worker_count cores of + backend for distributed training. 
we use first ccl_worker_count cores of every rank for ccl communication ''' cpuinfo = CPUinfo() @@ -202,18 +202,18 @@ def set_ccl_worker_affinity(args): affinity = '' for proc in range(ppn): for ccl_worker in range(args.ccl_worker_count): - affinity += str(proc * cores_per_rank + ccl_worker)+ "," + affinity += str(proc * cores_per_rank + ccl_worker)+ "," os.environ["CCL_WORKER_AFFINITY"] = affinity def add_lib_preload(lib_type=None): ''' - Enale TCMalloc/JeMalloc/iomp + Enale TCMalloc/JeMalloc/iomp ''' library_paths = [] if "CONDA_PREFIX" in os.environ: library_paths.append(os.environ["CONDA_PREFIX"] + "/lib/") - + library_paths += ["{}/.local/lib/".format(expanduser("~")), "/usr/local/lib/", "/usr/local/lib64/", "/usr/lib/", "/usr/lib64/"] lib_find = False @@ -234,7 +234,7 @@ def set_memory_allocator(args): logger.error("Unable to enable TCMalloc and JEMalloc at the same time") exit(-1) - if args.enable_tcmalloc: + if args.enable_tcmalloc: find_tc = add_lib_preload(lib_type="tcmalloc") if not find_tc: logger.warning("Unable to find the {} library file lib{}.so in $CONDA_PREFIX/lib or /.local/lib/" @@ -261,38 +261,38 @@ def set_memory_allocator(args): find_tc = add_lib_preload(lib_type="tcmalloc") if find_tc: logger.info("Use TCMalloc memory allocator") - return + return find_je = add_lib_preload(lib_type="jemalloc") if find_je: logger.info("Use JeMallocl memory allocator") - return + return logger.warning("Both TCMalloc and JeMalloc are not fount in $CONDA_PREFIX/lib or /.local/lib/" " or /usr/local/lib/ or /usr/local/lib64/ or /usr/lib or /usr/lib64 or " "~/.local/lib/ so the LD_PRELOAD environment variable will not be set. This may drop the performance" .format(expanduser("~"))) - + def set_multi_thread_and_allcator(args): - + set_memory_allocator(args) if "OMP_NUM_THREADS" not in os.environ: os.environ["OMP_NUM_THREADS"] = str(args.ncore_per_instance) elif "OMP_NUM_THREADS" in os.environ: args.ncore_per_instance = int(os.environ["OMP_NUM_THREADS"]) - + if "KMP_AFFINITY" not in os.environ: os.environ["KMP_AFFINITY"] = args.kmp_affinity - + if "KMP_BLOCKTIME" not in os.environ: os.environ["KMP_BLOCKTIME"] = "1" - - if "DNNL_PRIMITIVE_CACHE_CAPACITY" not in os.environ: + + if "DNNL_PRIMITIVE_CACHE_CAPACITY" not in os.environ: os.environ["DNNL_PRIMITIVE_CACHE_CAPACITY"] = '1024' logger.info("OMP_NUM_THREADS={} ".format(os.environ["OMP_NUM_THREADS"])) logger.info("KMP_AFFINITY={}".format(os.environ["KMP_AFFINITY"])) logger.info("KMP_BLOCKTIME={}".format(os.environ["KMP_BLOCKTIME"])) logger.info("DNNL_PRIMITIVE_CACHE_CAPACITY={}".format(os.environ["DNNL_PRIMITIVE_CACHE_CAPACITY"])) - + if args.enable_iomp: find_iomp = add_lib_preload(lib_type="iomp") if not find_iomp: @@ -301,21 +301,21 @@ def set_multi_thread_and_allcator(args): "~/.local/lib/ so the LD_PRELOAD environment variable will not be set." 
.format("iomp", "iomp", expanduser("~"))) else: - logger.info("User iomp") - + logger.info("User iomp") + def launch(args): ''' - single-instance / multi-instance launcher - ''' + single-instance / multi-instance launcher + ''' processes = [] cores = [] - + cpuinfo = CPUinfo() if args.core_list:#user specify what cores will be used by params cores = args.core_list.strip().split(",") if args.ncore_per_instance == -1: logger.error("please specify the '--ncore_per_instance' if you have pass the --core_list params") - exit(-1) + exit(-1) elif args.ninstances > 1 and args.ncore_per_instance * args.ninstances < len(cores): logger.warning("only first {} cores will be used, but you specify {} cores in core_list".format (args.ncore_per_instance * args.ninstances, len(cores))) @@ -324,14 +324,14 @@ def launch(args): else: if args.use_logical_core: if args.socket_id != -1: - cores = cpuinfo.get_socket_logical_cores(args.socket_id) + cores = cpuinfo.get_socket_logical_cores(args.socket_id) else: - cores = cpuinfo.get_all_logical_cores() + cores = cpuinfo.get_all_logical_cores() else: if args.socket_id != -1: cores = cpuinfo.get_socket_physical_cores(args.socket_id) else: - cores = cpuinfo.get_all_physical_cores() + cores = cpuinfo.get_all_physical_cores() if not args.multi_instance and args.ninstances == -1 and args.ncore_per_instance == -1: args.ninstances = 1; args.ncore_per_instance = len(cores) @@ -383,8 +383,8 @@ def launch(args): process.wait() if process.returncode != 0: raise subprocess.CalledProcessError(returncode=process.returncode, - cmd=cmd) - + cmd=cmd) + def mpi_dist_launch(args): ''' Set ENVs and launch MPI process for distributed training. @@ -417,13 +417,13 @@ def mpi_dist_launch(args): if not master_check: logger.error("MASTER_ADDR is not right. 
Please make sure the first ip {} in your hostfile is the current node".format(ip_list[0])) exit(-1) - + logger.info("Begin to validate the ip connect") args.master_addr = ip_list[0] for ip in ip_list[1:]: completed_process = subprocess.run("ssh -o PasswordAuthentication=no {} ':'".format(ip), shell=True) if completed_process.returncode != 0: - logger.error("Passwordless SSH login to {} failed, please make sure you have setup SSH public key right") + logger.error("Passwordless SSH login to {} failed, please make sure you have setup SSH public key right") exit(-1) else: logger.info("connection from master node {} to slave node {} is OK".format(args.master_addr, ip)) @@ -436,12 +436,12 @@ def mpi_dist_launch(args): mpi_pin_domain = set_mpi_pin_domain(args) else: mpi_pin_domain = os.environ["I_MPI_PIN_DOMAIN"] - + cpuinfo = CPUinfo() - ppn = args.nproc_per_node + ppn = args.nproc_per_node total_cores = len(cpuinfo.get_all_physical_cores()) cores_per_rank = total_cores // ppn - + if "OMP_NUM_THREADS" not in os.environ: opm_num_threads = cores_per_rank - args.ccl_worker_count else: @@ -454,7 +454,7 @@ def mpi_dist_launch(args): if "CCL_ATL_TRANSPORT" not in os.environ: os.environ["CCL_ATL_TRANSPORT"] = "ofi" - + if args.enable_iomp: find_iomp = add_lib_preload(lib_type="iomp") if not find_iomp: @@ -494,7 +494,7 @@ def mpi_dist_launch(args): os.environ["LAUNCH_CMD"] = os.environ["LAUNCH_CMD"][:-2] def add_distributed_training_params(parser): - + cpuinfo = CPUinfo() socket_nums = cpuinfo.socket_nums() @@ -504,7 +504,7 @@ def add_distributed_training_params(parser): "training") group.add_argument("--nproc_per_node", metavar='\b', type=int, default=socket_nums, help="The number of processes to launch on each node") - #ccl control + #ccl control group.add_argument("--ccl_worker_count", metavar='\b', default=4, type=int, help="Core numbers per rank used for ccl communication") #mpi control @@ -528,7 +528,7 @@ def add_distributed_training_params(parser): def add_memory_allocator_params(parser): - group = parser.add_argument_group("Memory Allocator Parameters") + group = parser.add_argument_group("Memory Allocator Parameters") #allocator control group.add_argument("--enable_tcmalloc", action='store_true', default=False, help="Enable tcmalloc allocator") @@ -536,12 +536,12 @@ def add_memory_allocator_params(parser): help="Enable jemalloc allocator") group.add_argument("--use_default_allocator", action='store_true', default=False, help="Use default memory allocator") - + def add_multi_instance_params(parser): - + group = parser.add_argument_group("Multi-instance Parameters") #multi-instance control - group.add_argument("--ncore_per_instance", metavar='\b', default=-1, type=int, + group.add_argument("--ncore_per_instance", metavar='\b', default=-1, type=int, help="Cores per instance") group.add_argument("--ninstances", metavar='\b', default=-1, type=int, help="For multi-instance, you should give the cores number you used for per insantance.") @@ -557,16 +557,16 @@ def add_multi_instance_params(parser): help="Disable numactl") group.add_argument("--core_list", metavar='\b', default=None, type=str, help="Specify the core list as 'core_id, core_id, ....', otherwise, all the cores will be used.") - -def add_kmp_iomp_params(parser): - group = parser.add_argument_group("KMP/IOMP Affinity Parameters") +def add_kmp_iomp_params(parser): + + group = parser.add_argument_group("KMP/IOMP Affinity Parameters") group.add_argument("--kmp_affinity", metavar='\b', default="granularity=fine,compact,1,0", type=str, 
help="KMP_AFFINITY setup, environment variable has higher priority than this args." "defualt value is : granularity=fine,compact,1,0") group.add_argument("--enable_iomp", action='store_true', default=False, - help="Enable iomp and libiomp.so will be add to LD_PRELOAD") - + help="Enable iomp and libiomp.so will be add to LD_PRELOAD") + def parse_args(): """ @@ -578,23 +578,23 @@ def parse_args(): "inference/training and distributed training with oneCCL backend is enabled. " "To get the peak performance on Intel Xeon CPU, the script optimizes the configuration " "of thread and memory management. For thread management, the script configures thread " - "affinity and the preload of Intel OMP library. For memory management, it configures " + "affinity and the preload of Intel OMP library. For memory management, it configures " "NUMA binding and preload optimized memory allocation library (e.g. tcmalloc, jemalloc) " "\n################################# Basic usage ############################# \n" - "\n 1. single instance\n" - "\n >>> python -m intel_pytorch_extension.launch python_script args \n" + "\n 1. single instance\n" + "\n >>> python -m torch_ipex.launch python_script args \n" "\n2. multi-instance \n" - "\n >>> python -m intel_pytorch_extension.launch --multi_instance python_script args\n" + "\n >>> python -m torch_ipex.launch --multi_instance python_script args\n" "\n3. Single-Node multi-process distributed training\n" - "\n >>> python -m intel_pytorch_extension.launch --distributed python_script args\n" + "\n >>> python -m torch_ipex.launch --distributed python_script args\n" "\n4. Multi-Node multi-process distributed training: (e.g. two nodes)\n" "\n rank 0: *(IP: 192.168.10.10, and has a free port: 295000)*\n" - "\n >>> python -m intel_pytorch_extension.launch --distributed --nproc_per_node=2\n" + "\n >>> python -m torch_ipex.launch --distributed --nproc_per_node=2\n" "\n --nnodes=2 --hostfile hostfile python_script args\n", formatter_class=RawTextHelpFormatter) - + parser.add_argument("--multi_instance", action='store_true', default=False, - help="Enable multi-instance, by default one instance per socket") + help="Enable multi-instance, by default one instance per socket") parser.add_argument('--distributed', action='store_true', default=False, help='Enable distributed training.') @@ -608,7 +608,7 @@ def parse_args(): "it directly. 
Useful when the script is not a Python script.") add_memory_allocator_params(parser) add_kmp_iomp_params(parser) - + add_distributed_training_params(parser) add_multi_instance_params(parser) # positional @@ -630,7 +630,7 @@ def main(): if args.distributed and args.multi_instance: raise RuntimeError("Either args.distributed or args.multi_instance should be set") - + if args.latency_performance and args.throughput_performance: raise RuntimeError("Either args.latency_performance or args.throughput_performance should be set") @@ -644,7 +644,7 @@ def main(): for x in sorted(set(os.environ.keys()) - env_before): logger.debug(f'{x}={os.environ[x]}') - + if __name__ == "__main__": main() diff --git a/intel_pytorch_extension_py/ops/__init__.py b/torch_ipex/ops/__init__.py similarity index 88% rename from intel_pytorch_extension_py/ops/__init__.py rename to torch_ipex/ops/__init__.py index 277184b8f..339356524 100644 --- a/intel_pytorch_extension_py/ops/__init__.py +++ b/torch_ipex/ops/__init__.py @@ -1,5 +1,5 @@ from .interaction import interaction -from .embeddingbag import embeddingbag +from .embeddingbag import ipex_embedding_bag from .linear import * from .pooling import * from .mlp import * diff --git a/torch_ipex/ops/embeddingbag.py b/torch_ipex/ops/embeddingbag.py new file mode 100644 index 000000000..2a1b64ee6 --- /dev/null +++ b/torch_ipex/ops/embeddingbag.py @@ -0,0 +1,30 @@ +import torch +from torch import nn +from torch.autograd import Function +import torch_ipex as ipex +import torch_ipex._C as core +from typing import Callable, List, Optional, Tuple + +# # extension for BF16 fast path only +Tensor = torch.Tensor +torch_embedding_bag = torch.embedding_bag + +def ipex_embedding_bag( + weight: Tensor, + input: Tensor, + offsets: Optional[Tensor] = None, + scale_grad_by_freq: bool = False, + mode: int = 0, + sparse: bool = False, + per_sample_weights: Optional[Tensor] = None, + include_last_offset: bool = False, + padding_idx: Optional[int] = None, +) -> Tuple[Tensor, Tensor, Tensor, Tensor]: + if weight.device.type in ipex.DEVICE: + assert padding_idx == None + ret = torch.ops.torch_ipex.embedding_bag(weight, input, offsets, scale_grad_by_freq, mode, sparse, per_sample_weights, include_last_offset) + return ret[0], torch.rand(0), torch.rand(0), torch.rand(0) + else: + return torch_embedding_bag(weight, input, offsets, scale_grad_by_freq, mode, sparse, per_sample_weights, include_last_offset, padding_idx) + +torch.embedding_bag = ipex_embedding_bag diff --git a/intel_pytorch_extension_py/ops/frozen_batch_norm.py b/torch_ipex/ops/frozen_batch_norm.py similarity index 100% rename from intel_pytorch_extension_py/ops/frozen_batch_norm.py rename to torch_ipex/ops/frozen_batch_norm.py diff --git a/intel_pytorch_extension_py/ops/gru.py b/torch_ipex/ops/gru.py similarity index 71% rename from intel_pytorch_extension_py/ops/gru.py rename to torch_ipex/ops/gru.py index a8412f5ff..bea095958 100644 --- a/intel_pytorch_extension_py/ops/gru.py +++ b/torch_ipex/ops/gru.py @@ -10,6 +10,9 @@ def ipex_gru(input, hx, _flat_weights, bias, num_layers, dropout, training, bidi if input.device.type == 'xpu' and (dropout == 0 or training == False): return torch.ops.torch_ipex.gru(input, hx, _flat_weights, bias, num_layers, dropout, training, bidirectional, batch_first) else: + if training and input.device.type == 'xpu': + raise Exception("IPEX does not support LSTM training if its dropout is not 0. 
\ + Please explicity convert the gru module and its tensors to CPU and convert the output tensor back to ipex.DEVICE.") return VF_gru(input, hx, _flat_weights, bias, num_layers, dropout, training, bidirectional, batch_first) def gru(*args): @@ -18,4 +21,4 @@ def gru(*args): else: return ipex_gru(*args) -_VF.gru = gru \ No newline at end of file +_VF.gru = gru diff --git a/intel_pytorch_extension_py/ops/interaction.py b/torch_ipex/ops/interaction.py similarity index 96% rename from intel_pytorch_extension_py/ops/interaction.py rename to torch_ipex/ops/interaction.py index 0194cb3b3..2fc9033f0 100644 --- a/intel_pytorch_extension_py/ops/interaction.py +++ b/torch_ipex/ops/interaction.py @@ -1,7 +1,7 @@ import torch from torch import nn from torch.autograd import Function -import _torch_ipex as core +import torch_ipex._C as core def interaction(*args): # Current pytorch dose not support vector input for c++ custom function diff --git a/intel_pytorch_extension_py/ops/jit.py b/torch_ipex/ops/jit.py similarity index 98% rename from intel_pytorch_extension_py/ops/jit.py rename to torch_ipex/ops/jit.py index b2d882d05..216bc9d04 100644 --- a/intel_pytorch_extension_py/ops/jit.py +++ b/torch_ipex/ops/jit.py @@ -1,5 +1,5 @@ import torch -import _torch_ipex as core +import torch_ipex._C as core from torch.jit._recursive import wrap_cpp_module torch._C._jit_set_profiling_mode(False) diff --git a/intel_pytorch_extension_py/ops/layer_norm.py b/torch_ipex/ops/layer_norm.py similarity index 93% rename from intel_pytorch_extension_py/ops/layer_norm.py rename to torch_ipex/ops/layer_norm.py index 3c3186499..c9f32342b 100644 --- a/intel_pytorch_extension_py/ops/layer_norm.py +++ b/torch_ipex/ops/layer_norm.py @@ -1,5 +1,5 @@ import torch -import _torch_ipex as core +import torch_ipex._C as core from typing import Optional torch_layer_norm = torch.layer_norm diff --git a/intel_pytorch_extension_py/ops/linear.py b/torch_ipex/ops/linear.py similarity index 94% rename from intel_pytorch_extension_py/ops/linear.py rename to torch_ipex/ops/linear.py index 9d89fed1a..b92cf2910 100644 --- a/intel_pytorch_extension_py/ops/linear.py +++ b/torch_ipex/ops/linear.py @@ -1,7 +1,7 @@ import torch from torch.autograd import Function import torch.nn.functional as F -import _torch_ipex as core +import torch_ipex._C as core from typing import Optional def linear(input, weight, bias: Optional[torch.Tensor] = None): diff --git a/intel_pytorch_extension_py/ops/lstm.py b/torch_ipex/ops/lstm.py similarity index 87% rename from intel_pytorch_extension_py/ops/lstm.py rename to torch_ipex/ops/lstm.py index 25ad8ccfd..e6d05d7f5 100644 --- a/intel_pytorch_extension_py/ops/lstm.py +++ b/torch_ipex/ops/lstm.py @@ -6,6 +6,10 @@ def ipex_lstm(input, hx, _flat_weights, bias, num_layers, dropout, training, bidirectional, batch_first, device): # For LSTM training with dropout, fallback to cpu due to performance issue in oneDNN mode if training and dropout != 0: + if input.device.type == 'xpu': + raise Exception("IPEX does not support LSTM training if its dropout is not 0. 
\ + Please explicity convert the gru module and its tensors to CPU and convert the output tensor back to ipex.DEVICE.") + assert input.device.type != 'xpu' return fallback_lstm(input, hx, _flat_weights, bias, num_layers, dropout, training, bidirectional, batch_first, device=device) else: return torch.ops.torch_ipex.lstm(input, hx, _flat_weights, bias, num_layers, dropout, training, bidirectional, batch_first) @@ -49,7 +53,7 @@ def lstm(*args): device = get_device(*args) if device == "cpu": return VF_lstm(*args) - + # For LSTM with pack_padded_sequence as input, fallback to cpu due to performance issue in oneDNN mode if isinstance(args[1], torch.Tensor): return fallback_lstm(*args, device=device) diff --git a/intel_pytorch_extension_py/ops/mlp.py b/torch_ipex/ops/mlp.py similarity index 99% rename from intel_pytorch_extension_py/ops/mlp.py rename to torch_ipex/ops/mlp.py index 04e354641..ec8a31799 100644 --- a/intel_pytorch_extension_py/ops/mlp.py +++ b/torch_ipex/ops/mlp.py @@ -4,7 +4,7 @@ from torch.nn.parameter import Parameter from torch.nn import init from torch.autograd import Function -import _torch_ipex as core +import torch_ipex._C as core class IpexMLPHandle: def __init__(self, N, C, K, bn, bc, bk, dtype, fuse_bias, act_type): diff --git a/intel_pytorch_extension_py/ops/nms.py b/torch_ipex/ops/nms.py similarity index 65% rename from intel_pytorch_extension_py/ops/nms.py rename to torch_ipex/ops/nms.py index 1c8b2730c..bb2629391 100644 --- a/intel_pytorch_extension_py/ops/nms.py +++ b/torch_ipex/ops/nms.py @@ -1,4 +1,4 @@ -import _torch_ipex as core +import torch_ipex._C as core nms = core.nms batch_score_nms = core.batch_score_nms \ No newline at end of file diff --git a/intel_pytorch_extension_py/ops/pooling.py b/torch_ipex/ops/pooling.py similarity index 97% rename from intel_pytorch_extension_py/ops/pooling.py rename to torch_ipex/ops/pooling.py index 1e41dc35c..f13a33b38 100644 --- a/intel_pytorch_extension_py/ops/pooling.py +++ b/torch_ipex/ops/pooling.py @@ -1,7 +1,7 @@ import torch from torch.autograd import Function import torch.nn.functional as F -import _torch_ipex as core +import torch_ipex._C as core from torch.nn.modules.utils import _single, _pair from typing import List diff --git a/intel_pytorch_extension_py/ops/rnn.py b/torch_ipex/ops/rnn.py similarity index 97% rename from intel_pytorch_extension_py/ops/rnn.py rename to torch_ipex/ops/rnn.py index 7f710c720..ee9f24492 100644 --- a/intel_pytorch_extension_py/ops/rnn.py +++ b/torch_ipex/ops/rnn.py @@ -13,12 +13,18 @@ def rnn_tanh(input, hx, _flat_weights, bias, num_layers, dropout, training, bidi if input.device.type == 'xpu' and (dropout == 0 or training == False): return torch.ops.torch_ipex.rnn_tanh(input, hx, _flat_weights, bias, num_layers, dropout, training, bidirectional, batch_first) else: + if training and input.device.type == 'xpu': + raise Exception("IPEX does not support RNN-Tanh training if its dropout is not 0. 
\ + Please explicity convert the gru module and its tensors to CPU and convert the output tensor back to ipex.DEVICE.") return _VF.rnn_tanh(input, hx, _flat_weights, bias, num_layers, dropout, training, bidirectional, batch_first) def rnn_relu(input, hx, _flat_weights, bias, num_layers, dropout, training, bidirectional, batch_first): if input.device.type == 'xpu' and (dropout == 0 or training == False): return torch.ops.torch_ipex.rnn_relu(input, hx, _flat_weights, bias, num_layers, dropout, training, bidirectional, batch_first) else: + if training and input.device.type == 'xpu': + raise Exception("IPEX does not support RNN-ReLU training if its dropout is not 0. \ + Please explicity convert the gru module and its tensors to CPU and convert the output tensor back to ipex.DEVICE.") return _VF.rnn_relu(input, hx, _flat_weights, bias, num_layers, dropout, training, bidirectional, batch_first) _rnn_impls = { @@ -412,4 +418,4 @@ def __init__(self, *args, **kwargs): raise ValueError("Unknown nonlinearity '{}'".format(self.nonlinearity)) super(RNN, self).__init__(mode, *args, **kwargs) -torch.nn.RNN = RNN \ No newline at end of file +torch.nn.RNN = RNN diff --git a/intel_pytorch_extension_py/ops/roi_align.py b/torch_ipex/ops/roi_align.py similarity index 98% rename from intel_pytorch_extension_py/ops/roi_align.py rename to torch_ipex/ops/roi_align.py index 43bc08a3b..f91136dce 100644 --- a/intel_pytorch_extension_py/ops/roi_align.py +++ b/torch_ipex/ops/roi_align.py @@ -5,7 +5,7 @@ from torch.autograd.function import once_differentiable from torch.nn.modules.utils import _pair -import _torch_ipex as core +import torch_ipex._C as core class _ROIAlign(Function): diff --git a/intel_pytorch_extension_py/ops/save.py b/torch_ipex/ops/save.py similarity index 100% rename from intel_pytorch_extension_py/ops/save.py rename to torch_ipex/ops/save.py diff --git a/intel_pytorch_extension_py/ops/to.py b/torch_ipex/ops/to.py similarity index 95% rename from intel_pytorch_extension_py/ops/to.py rename to torch_ipex/ops/to.py index 7ea3d79e7..6fa6c3889 100644 --- a/intel_pytorch_extension_py/ops/to.py +++ b/torch_ipex/ops/to.py @@ -1,5 +1,5 @@ import torch -import _torch_ipex as core +import torch_ipex._C as core torch_to = torch.nn.Module.to diff --git a/intel_pytorch_extension_py/optim/__init__.py b/torch_ipex/optim/__init__.py similarity index 100% rename from intel_pytorch_extension_py/optim/__init__.py rename to torch_ipex/optim/__init__.py diff --git a/intel_pytorch_extension_py/optim/split_sgd.py b/torch_ipex/optim/split_sgd.py similarity index 97% rename from intel_pytorch_extension_py/optim/split_sgd.py rename to torch_ipex/optim/split_sgd.py index 422edac44..898a3d1e3 100644 --- a/intel_pytorch_extension_py/optim/split_sgd.py +++ b/torch_ipex/optim/split_sgd.py @@ -1,10 +1,10 @@ import torch from torch.optim.optimizer import Optimizer, required -import _torch_ipex +import torch_ipex _available = False try: - from _torch_ipex import packed_add_ + from torch_ipex._C import packed_add_ _available = True except ImportError as e: pass diff --git a/intel_pytorch_extension_py/tensor.py b/torch_ipex/tensor.py similarity index 100% rename from intel_pytorch_extension_py/tensor.py rename to torch_ipex/tensor.py diff --git a/tutorials/Performance_Tuning.md b/tutorials/Performance_Tuning.md index f06b5406f..82f809bbf 100644 --- a/tutorials/Performance_Tuning.md +++ b/tutorials/Performance_Tuning.md @@ -21,6 +21,7 @@ Although by default primitives of PyTorch and IPEX are highly optimized, there a - Memory Allocator 
- Jemalloc - TCMalloc + - Denormal Number # Hardware Configuration @@ -214,3 +215,11 @@ cd gperftools- make make install ``` + +## Denormal Number + +[Denormal numbers](https://en.wikipedia.org/wiki/Denormal_number) are used to store extremely small numbers that are close to 0. Computations with denormal numbers are remarkably slower than with normalized numbers. To avoid the performance penalty caused by denormal numbers, users can call the following PyTorch API: + +``` +torch.set_flush_denormal(True) +```
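A minimal sketch of how this setting behaves, assuming a CPU that supports flush-to-zero mode (the call returns `False` on hardware that does not, so the result can be checked before relying on it):

```
import torch

# Enable flush-to-zero / denormals-are-zero mode; returns True only when the
# CPU supports it (e.g. x86 with SSE3).
supported = torch.set_flush_denormal(True)
print("flush denormal supported:", supported)

# 1e-323 is far below the smallest normalized float64, so it would normally be
# stored as a denormal; with flush-to-zero enabled it reads back as 0.
print(torch.tensor([1e-323], dtype=torch.float64))  # tensor([0.]) when supported
```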