intel · xytintel · Jan 26, 2025 · Jan 23, 2025 · Jan 23, 2025 · Jan 23, 2025
diff --git a/src/ATen/CMakeLists.txt b/src/ATen/CMakeLists.txt
@@ -3,7 +3,17 @@
 file(GLOB xpu_h "xpu/*.h")
 file(GLOB xpu_cpp "xpu/*.cpp")
 file(GLOB xpu_mkl "native/xpu/mkl/*.cpp")
-file(GLOB xpu_native_cpp "native/xpu/*.cpp" "native/sparse/*.cpp" "native/sparse/xpu/*.cpp" "native/nested/xpu/*.cpp" "native/transformers/*.cpp" "native/quantized/*.cpp")
+# Collect the .cpp files that sit directly in each listed directory (GLOB is
+# not recursive, so the sycl/ subdirectories are gathered separately below).
+file(GLOB xpu_native_cpp
+  "native/xpu/*.cpp"
+  "native/sparse/*.cpp"
+  "native/sparse/xpu/*.cpp"
+  "native/nested/*.cpp"
+  "native/nested/xpu/*.cpp"
+  "native/transformers/*.cpp"
+  "native/quantized/*.cpp"
+)
 file(GLOB xpu_sycl "native/xpu/sycl/*.cpp" "native/sparse/xpu/sycl/*.cpp" "native/nested/xpu/sycl/*.cpp" "native/transformers/sycl/*.cpp" "native/quantized/sycl/*.cpp")
 
 list(APPEND ATen_XPU_CPP_SRCS ${xpu_cpp})

diff --git a/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp b/src/ATen/native/nested/NestedTensorTransformerFunctions.cpp
@@ -0,0 +1,20 @@
+#include <ATen/ATen.h>
+#include <ATen/native/nested/NestedTensorTransformerFunctions.h>
+
+namespace at::native {
+
+// Applies _masked_softmax over the last dimension of `self`, using the mask
+// that NestedTensor_to_mask(query, 2, self.size(2)) builds from `query`.
+Tensor NestedTensor_softmax_dropout_xpu(
+    const Tensor& self,
+    const Tensor& query) {
+  // Build the mask and move it to query's device with a non-blocking copy.
+  const Tensor query_mask =
+      NestedTensor_to_mask(query, 2, self.size(2))
+          .to(query.device(), /*non-blocking=*/true);
+  const auto last_dim = self.dim() - 1;
+  // NestedTensor_to_mask produces a BxT mask, which is mask type 1.
+  return _masked_softmax(self, query_mask, last_dim, /*mask type */ 1);
+}
+
+} // namespace at::native
diff --git a/yaml/native/native_functions.yaml b/yaml/native/native_functions.yaml
@@ -4437,6 +4437,11 @@
     XPU: nested_from_padded_xpu
   autogen: _nested_from_padded.out
 
+- func: _nested_tensor_softmax_with_shape(Tensor self, Tensor query) -> Tensor
+  dispatch:
+    NestedTensorXPU: NestedTensor_softmax_dropout_xpu  # defined in native/nested/NestedTensorTransformerFunctions.cpp
+  tags: nondeterministic_seeded  # NOTE(review): the XPU kernel in this change only calls _masked_softmax; confirm this tag is intended
+
 - func: avg_pool2d.out(Tensor self, int[2] kernel_size, int[2] stride=[], int[2] padding=0, bool ceil_mode=False, bool count_include_pad=True, int? divisor_override=None, *, Tensor(a!) out) -> Tensor(a!)
   python_module: nn
   structured: True