
Commit

Merge branch 'main' into zhiwei/codegen
ZhiweiYan-96 committed Aug 29, 2024
2 parents 5badf42 + e04c892 commit 283c6f7
Showing 4 changed files with 276 additions and 2 deletions.
4 changes: 2 additions & 2 deletions src/ATen/native/xpu/sycl/UnaryKernels.cpp
@@ -236,8 +236,8 @@ struct Expm1Functor {
 template <typename T>
 struct Expm1Functor<c10::complex<T>> {
   c10::complex<T> operator()(c10::complex<T> x) const {
-    auto a = std::sin(.5 * x.imag());
-    auto re = std::expm1(x.real()) * std::cos(x.imag()) - 2 * a * a;
+    auto a = std::sin(T(.5) * x.imag());
+    auto re = std::expm1(x.real()) * std::cos(x.imag()) - T(2) * a * a;
     auto im = std::exp(x.real()) * std::sin(x.imag());
     return c10::complex<T>(re, im);
   }
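
For context: the fix casts the literals .5 and 2 to T so the arithmetic stays in the input precision instead of being promoted to double, which matters on devices without fp64 support. The decomposition itself uses the identity cos(b) - 1 = -2*sin(b/2)^2 to avoid cancellation in the real part. A minimal Python sketch of the same formula (hypothetical helper expm1_complex, not part of this patch):

import cmath
import math

def expm1_complex(x: complex) -> complex:
    # Re(exp(x) - 1) = expm1(a)*cos(b) - 2*sin(b/2)**2, using the
    # identity cos(b) - 1 = -2*sin(b/2)**2 to avoid cancellation.
    # Im(exp(x) - 1) = exp(a)*sin(b).
    a, b = x.real, x.imag
    s = math.sin(0.5 * b)
    re = math.expm1(a) * math.cos(b) - 2.0 * s * s
    im = math.exp(a) * math.sin(b)
    return complex(re, im)

x = complex(1e-9, 1e-9)
print(expm1_complex(x))  # accurate near zero
print(cmath.exp(x) - 1)  # naive form loses digits to cancellation
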
3 changes: 3 additions & 0 deletions test/xpu/run_test_with_skip_arc.py
@@ -3,6 +3,7 @@
from skip_list_common import skip_dict
from skip_list_arc import skip_dict as skip_dict_specifical
from skip_list_win import skip_dict as skip_dict_win
from skip_list_win_arc import skip_dict as skip_dict_win_arc
from xpu_test_utils import launch_test


@@ -15,6 +16,8 @@
        skip_list += skip_dict_specifical[key]
    if IS_WINDOWS and key in skip_dict_win:
        skip_list += skip_dict_win[key]
    if IS_WINDOWS and key in skip_dict_win_arc:
        skip_list += skip_dict_win_arc[key]
    res += launch_test(key, skip_list)

exit_code = os.WEXITSTATUS(res)
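
How the merged skip list is consumed is up to launch_test in xpu_test_utils; one plausible shape, shown purely as a hedged sketch (the real implementation may differ), is to deselect every skipped case through a single pytest -k expression and return an os.system wait status, matching the os.WEXITSTATUS call above:

import os

def launch_test(test_file, skip_list):
    # Hypothetical sketch: deselect each skipped test by name.
    expr = " and ".join(f"not {name}" for name in skip_list)
    cmd = f'pytest -v {test_file} -k "{expr}"' if expr else f"pytest -v {test_file}"
    return os.system(cmd)  # wait status, unpacked by os.WEXITSTATUS
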
187 changes: 187 additions & 0 deletions test/xpu/skip_list_win_arc.py
@@ -0,0 +1,187 @@
skip_dict = {
# The SYCL compiler on Windows removes the following operations when
# '-cl-poison-unsupported-fp64-kernels' is enabled; skip the resulting
# Windows-specific failures.
"test_ops_xpu.py": (
"test_compare_cpu_sqrt_xpu_complex64",
"test_backward_nn_functional_adaptive_avg_pool2d_xpu_float32",
),
"test_binary_ufuncs_xpu": (
"test_batch_vs_slicing___rpow___xpu_complex64",
"test_batch_vs_slicing__refs_pow_xpu_complex64",
"test_batch_vs_slicing_pow_xpu_complex64",
"test_contig_size1___rpow___xpu_complex64",
"test_contig_size1__refs_pow_xpu_complex64",
"test_contig_size1_large_dim___rpow___xpu_complex64",
"test_contig_size1_large_dim__refs_pow_xpu_complex64",
"test_contig_size1_large_dim_pow_xpu_complex32",
"test_contig_size1_large_dim_pow_xpu_complex64",
"test_contig_size1_pow_xpu_complex32",
"test_contig_size1_pow_xpu_complex64",
"test_contig_vs_every_other___rpow___xpu_complex64",
"test_contig_vs_every_other__refs_pow_xpu_complex64",
"test_contig_vs_every_other_pow_xpu_complex32",
"test_contig_vs_every_other_pow_xpu_complex64",
"test_contig_vs_transposed___rpow___xpu_complex64",
"test_contig_vs_transposed__refs_pow_xpu_complex64",
"test_contig_vs_transposed_pow_xpu_complex32",
"test_contig_vs_transposed_pow_xpu_complex64",
"test_non_contig___rpow___xpu_complex64",
"test_non_contig__refs_pow_xpu_complex64",
"test_non_contig_expand___rpow___xpu_complex64",
"test_non_contig_expand__refs_pow_xpu_complex64",
"test_non_contig_expand_pow_xpu_complex32",
"test_non_contig_expand_pow_xpu_complex64",
"test_non_contig_index___rpow___xpu_complex64",
"test_non_contig_index__refs_pow_xpu_complex64",
"test_non_contig_index_pow_xpu_complex32",
"test_non_contig_index_pow_xpu_complex64",
"test_non_contig_pow_xpu_complex64",
),
"test_nn_xpu.py": (
"test_adaptiveavg_pool1d_shmem_xpu",
),
"test_unary_ufuncs_xpu.py": (
"test_batch_vs_slicing__refs_acos_xpu_complex64",
"test_batch_vs_slicing__refs_acosh_xpu_complex64",
"test_batch_vs_slicing__refs_log_xpu_complex64",
"test_batch_vs_slicing__refs_sqrt_xpu_complex64",
"test_batch_vs_slicing_acos_xpu_complex32",
"test_batch_vs_slicing_acos_xpu_complex64",
"test_batch_vs_slicing_acosh_xpu_complex32",
"test_batch_vs_slicing_acosh_xpu_complex64",
"test_batch_vs_slicing_log_xpu_complex32",
"test_batch_vs_slicing_log_xpu_complex64",
"test_batch_vs_slicing_sqrt_xpu_complex32",
"test_batch_vs_slicing_sqrt_xpu_complex64",
"test_batch_vs_slicing_square_xpu_complex64",
"test_contig_size1__refs_acos_xpu_complex64",
"test_contig_size1__refs_acosh_xpu_complex64",
"test_contig_size1__refs_log_xpu_complex64",
"test_contig_size1__refs_sqrt_xpu_complex64",
"test_contig_size1_acos_xpu_complex32",
"test_contig_size1_acos_xpu_complex64",
"test_contig_size1_acosh_xpu_complex32",
"test_contig_size1_acosh_xpu_complex64",
"test_contig_size1_large_dim__refs_acos_xpu_complex64",
"test_contig_size1_large_dim__refs_acosh_xpu_complex64",
"test_contig_size1_large_dim__refs_log_xpu_complex64",
"test_contig_size1_large_dim__refs_sqrt_xpu_complex64",
"test_contig_size1_large_dim_acos_xpu_complex32",
"test_contig_size1_large_dim_acos_xpu_complex64",
"test_contig_size1_large_dim_acosh_xpu_complex32",
"test_contig_size1_large_dim_acosh_xpu_complex64",
"test_contig_size1_large_dim_log_xpu_complex32",
"test_contig_size1_large_dim_log_xpu_complex64",
"test_contig_size1_large_dim_sqrt_xpu_complex32",
"test_contig_size1_large_dim_sqrt_xpu_complex64",
"test_contig_size1_large_dim_square_xpu_complex64",
"test_contig_size1_log_xpu_complex32",
"test_contig_size1_log_xpu_complex64",
"test_contig_size1_sqrt_xpu_complex32",
"test_contig_size1_sqrt_xpu_complex64",
"test_contig_size1_square_xpu_complex64",
"test_contig_vs_every_other__refs_acos_xpu_complex64",
"test_contig_vs_every_other__refs_acosh_xpu_complex64",
"test_contig_vs_every_other__refs_log_xpu_complex64",
"test_contig_vs_every_other__refs_sqrt_xpu_complex64",
"test_contig_vs_every_other_acos_xpu_complex32",
"test_contig_vs_every_other_acos_xpu_complex64",
"test_contig_vs_every_other_acosh_xpu_complex32",
"test_contig_vs_every_other_acosh_xpu_complex64",
"test_contig_vs_every_other_log_xpu_complex32",
"test_contig_vs_every_other_log_xpu_complex64",
"test_contig_vs_every_other_sqrt_xpu_complex32",
"test_contig_vs_every_other_sqrt_xpu_complex64",
"test_contig_vs_every_other_square_xpu_complex64",
"test_contig_vs_transposed__refs_acos_xpu_complex64",
"test_contig_vs_transposed__refs_acosh_xpu_complex64",
"test_contig_vs_transposed__refs_log_xpu_complex64",
"test_contig_vs_transposed__refs_sqrt_xpu_complex64",
"test_contig_vs_transposed_acos_xpu_complex32",
"test_contig_vs_transposed_acos_xpu_complex64",
"test_contig_vs_transposed_acosh_xpu_complex32",
"test_contig_vs_transposed_acosh_xpu_complex64",
"test_contig_vs_transposed_log_xpu_complex32",
"test_contig_vs_transposed_log_xpu_complex64",
"test_contig_vs_transposed_sqrt_xpu_complex32",
"test_contig_vs_transposed_sqrt_xpu_complex64",
"test_contig_vs_transposed_square_xpu_complex64",
"test_non_contig__refs_acos_xpu_complex64",
"test_non_contig__refs_acosh_xpu_complex64",
"test_non_contig__refs_log_xpu_complex64",
"test_non_contig__refs_sqrt_xpu_complex64",
"test_non_contig_acos_xpu_complex32",
"test_non_contig_acos_xpu_complex64",
"test_non_contig_acosh_xpu_complex32",
"test_non_contig_acosh_xpu_complex64",
"test_non_contig_expand__refs_acos_xpu_complex64",
"test_non_contig_expand__refs_acosh_xpu_complex64",
"test_non_contig_expand__refs_log_xpu_complex64",
"test_non_contig_expand__refs_sqrt_xpu_complex64",
"test_non_contig_expand_acos_xpu_complex32",
"test_non_contig_expand_acos_xpu_complex64",
"test_non_contig_expand_acosh_xpu_complex32",
"test_non_contig_expand_acosh_xpu_complex64",
"test_non_contig_expand_log_xpu_complex32",
"test_non_contig_expand_log_xpu_complex64",
"test_non_contig_expand_sqrt_xpu_complex32",
"test_non_contig_expand_sqrt_xpu_complex64",
"test_non_contig_expand_square_xpu_complex64",
"test_non_contig_index__refs_acos_xpu_complex64",
"test_non_contig_index__refs_acosh_xpu_complex64",
"test_non_contig_index__refs_log_xpu_complex64",
"test_non_contig_index__refs_sqrt_xpu_complex64",
"test_non_contig_index_acos_xpu_complex32",
"test_non_contig_index_acos_xpu_complex64",
"test_non_contig_index_acosh_xpu_complex32",
"test_non_contig_index_acosh_xpu_complex64",
"test_non_contig_index_log_xpu_complex32",
"test_non_contig_index_log_xpu_complex64",
"test_non_contig_index_sqrt_xpu_complex32",
"test_non_contig_index_sqrt_xpu_complex64",
"test_non_contig_index_square_xpu_complex64",
"test_non_contig_log_xpu_complex32",
"test_non_contig_log_xpu_complex64",
"test_non_contig_sqrt_xpu_complex32",
"test_non_contig_sqrt_xpu_complex64",
"test_non_contig_square_xpu_complex64",
"test_reference_numerics_extremal__refs_sqrt_xpu_complex64",
"test_reference_numerics_extremal_sqrt_xpu_complex64",
"test_reference_numerics_large__refs_acos_xpu_complex64",
"test_reference_numerics_large__refs_log_xpu_complex64",
"test_reference_numerics_large__refs_sqrt_xpu_complex64",
"test_reference_numerics_large_acos_xpu_complex32",
"test_reference_numerics_large_acos_xpu_complex64",
"test_reference_numerics_large_acosh_xpu_complex32",
"test_reference_numerics_large_log_xpu_complex32",
"test_reference_numerics_large_log_xpu_complex64",
"test_reference_numerics_large_sqrt_xpu_complex32",
"test_reference_numerics_large_sqrt_xpu_complex64",
"test_reference_numerics_normal__refs_acos_xpu_complex64",
"test_reference_numerics_normal__refs_acosh_xpu_complex64",
"test_reference_numerics_normal__refs_log_xpu_complex64",
"test_reference_numerics_normal__refs_sqrt_xpu_complex64",
"test_reference_numerics_normal_acos_xpu_complex32",
"test_reference_numerics_normal_acos_xpu_complex64",
"test_reference_numerics_normal_acosh_xpu_complex32",
"test_reference_numerics_normal_acosh_xpu_complex64",
"test_reference_numerics_normal_log_xpu_complex32",
"test_reference_numerics_normal_log_xpu_complex64",
"test_reference_numerics_normal_sqrt_xpu_complex32",
"test_reference_numerics_normal_sqrt_xpu_complex64",
"test_reference_numerics_normal_square_xpu_complex64",
"test_reference_numerics_small__refs_acos_xpu_complex64",
"test_reference_numerics_small__refs_acosh_xpu_complex64",
"test_reference_numerics_small__refs_log_xpu_complex64",
"test_reference_numerics_small__refs_sqrt_xpu_complex64",
"test_reference_numerics_small_acos_xpu_complex32",
"test_reference_numerics_small_acos_xpu_complex64",
"test_reference_numerics_small_acosh_xpu_complex32",
"test_reference_numerics_small_acosh_xpu_complex64",
"test_reference_numerics_small_log_xpu_complex32",
"test_reference_numerics_small_log_xpu_complex64",
"test_reference_numerics_small_sqrt_xpu_complex32",
"test_reference_numerics_small_sqrt_xpu_complex64",
"test_reference_numerics_small_square_xpu_complex64",
),
}
84 changes: 84 additions & 0 deletions test/xpu/xpu_test_utils.py
@@ -569,6 +569,74 @@ def convert_dtype(obj, dtype, requires_grad=False):

CriterionTest.test_cuda = CriterionTest_test_xpu

from torch.testing._internal.common_methods_invocations import sample_inputs_cat_concat, S, M
from torch.testing._internal.common_methods_invocations import make_tensor
from functools import partial
from torch.testing._internal.opinfo.core import SampleInput

def reference_inputs_cat_nofp64(op, device, dtype, requires_grad, **kwargs):
    yield from sample_inputs_cat_concat(op, device, dtype, requires_grad, **kwargs)

    make_arg = partial(make_tensor, device=device, dtype=dtype, requires_grad=requires_grad)

    # Noncontiguous type-promoting tensors
    a = make_arg((3, 4, 2))
    # Original used dtype=torch.double here; use fp32 on platforms without fp64 support:
    # b = make_arg((3, 2, 2), noncontiguous=True, dtype=torch.double)
    b = make_arg((3, 2, 2), noncontiguous=True, dtype=torch.float)
    c = make_arg((3, 3, 2), dtype=torch.float16).permute(1, 0, 2)

    yield SampleInput((a, b, c), kwargs={'dim': 1})

    # Special case: 1D tensor with a dim length of 0
    a = make_arg((0,))
    b = make_arg((3, 2, 2))

    yield SampleInput((a, b, a))
    yield SampleInput((a, a, a))
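
The mixed-dtype case above can be sanity-checked with public torch APIs alone; a small sketch (shapes mirror the sampler, and the fp16 operand promotes to fp32):

import torch

a = torch.randn(3, 4, 2, dtype=torch.float32)
b = torch.randn(3, 2, 2, dtype=torch.float32).transpose(1, 2)  # noncontiguous, same shape
c = torch.randn(3, 3, 2, dtype=torch.float16).permute(1, 0, 2)
out = torch.cat((a, b, c), dim=1)  # type promotion: fp16 -> fp32
print(out.shape, out.dtype)  # torch.Size([3, 9, 2]) torch.float32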


def index_variable_nofp64(shape, max_indices, device=torch.device('cpu')):
    if not isinstance(shape, tuple):
        shape = (shape,)
    # Original used dtype=torch.double; use fp32 on platforms without fp64 support:
    # index = torch.rand(*shape, dtype=torch.double, device=device).mul_(max_indices).floor_().long()
    index = torch.rand(*shape, dtype=torch.float32, device=device).mul_(max_indices).floor_().long()
    return index
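
A quick check that the fp32 variant still yields valid integer indices (uses the helper defined above):

import torch

idx = index_variable_nofp64((2, 3), max_indices=5)
print(idx.dtype)  # torch.int64
print(bool((idx >= 0).all() and (idx < 5).all()))  # True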


def sample_inputs_softmax_variant_nofp64(
    op_info,
    device,
    dtype,
    requires_grad,
    with_dtype=False,
    use_zero_dimensions=True,
    **kwargs,
):
    make_arg = partial(
        make_tensor, device=device, dtype=dtype, requires_grad=requires_grad
    )
    cases = [
        ((S,), (0,)),
        ((S, S), (0,)),
        ((S, S), (1,)),
        ((S, S), (-1,)),
        ((S, M, S), (2,)),
        *([((S, 0, 0), (-1,))] if use_zero_dimensions else []),
    ]
    # Original used dtype=torch.float64; use fp32 on platforms without fp64 support:
    # kwargs = dict(dtype=torch.float64) if with_dtype else None
    kwargs = dict(dtype=torch.float32) if with_dtype else None

    # PyTorch on XLA throws an error when a dim argument is passed for a 0d tensor.
    # See https://github.com/pytorch/xla/issues/3061 for more details.
    if torch.device(device).type != "xla":
        cases.append(((), (0,)))

    return (
        SampleInput(make_arg(shape), args=dim, kwargs=kwargs) for shape, dim in cases
    )
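
A minimal smoke test for the sampler above (a sketch; op_info is unused by this sampler, so None suffices, and CPU keeps it self-contained):

import torch

samples = sample_inputs_softmax_variant_nofp64(
    None, "cpu", torch.float32, requires_grad=False, with_dtype=True
)
for s in samples:
    out = torch.softmax(s.input, *s.args, **(s.kwargs or {}))
    assert out.dtype == torch.float32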

class XPUPatchForImport:
    def __init__(self, patch_test_case=True) -> None:
@@ -603,6 +671,11 @@ def __init__(self, patch_test_case=True) -> None:
        self.cuda_is_available = cuda.is_available
        self.cuda_is_bf16_supported = cuda.is_bf16_supported

        if "has_fp64=0" in str(torch.xpu.get_device_properties(0)):
            self.sample_inputs_softmax_variant = common_methods_invocations.sample_inputs_softmax_variant
            self.index_variable = common_methods_invocations.index_variable
            self.reference_inputs_cat = common_methods_invocations.reference_inputs_cat

    def align_db_decorators(self, db):
        def gen_xpu_wrappers(op_name, wrappers):
            wrapper_xpu = []
@@ -669,6 +742,11 @@ def __enter__(self):

        common_device_type.onlyCUDA = common_device_type.onlyXPU

if "has_fp64=0" in str(torch.xpu.get_device_properties(0)):
common_methods_invocations.sample_inputs_softmax_variant = sample_inputs_softmax_variant_nofp64
common_methods_invocations.index_variable = index_variable_nofp64
common_methods_invocations.reference_inputs_cat = reference_inputs_cat_nofp64

        class dtypesIfXPU(common_device_type.dtypes):
            def __init__(self, *args):
                super().__init__(*args, device_type="xpu")
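
The "has_fp64=0" substring check above is what gates all three patches; a hedged helper form of the same test (hypothetical name device_lacks_fp64, assumes an XPU-enabled PyTorch build):

import torch

def device_lacks_fp64(index: int = 0) -> bool:
    # str(torch.xpu.get_device_properties(...)) includes "has_fp64=0"
    # on devices without native double support, e.g. Intel Arc GPUs.
    return "has_fp64=0" in str(torch.xpu.get_device_properties(index))
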
@@ -768,6 +846,7 @@ def __init__(self, *args):
        cuda.is_bf16_supported = lambda: True

        sys.path.extend(self.test_package)

        return self

def __exit__(self, exc_type, exc_value, traceback):
@@ -790,6 +869,11 @@ def __exit__(self, exc_type, exc_value, traceback):
        cuda.is_available = self.cuda_is_available
        cuda.is_bf16_supported = self.cuda_is_bf16_supported

if "has_fp64=0" in str(torch.xpu.get_device_properties(0)):
common_methods_invocations.sample_inputs_softmax_variant = self.sample_inputs_softmax_variant
common_methods_invocations.index_variable = self.index_variable
common_methods_invocations.reference_inputs_cat = self.reference_inputs_cat


# Copy the test cases from generic_base_class to generic_test_class.
# It serves to reuse test cases. Regarding some newly added hardware,
