From edbb80c9a4070bdfef5712a11f47d473a85d4bb3 Mon Sep 17 00:00:00 2001 From: Greg Roth Date: Tue, 6 Aug 2024 10:50:23 -0600 Subject: [PATCH] Add SPIRV generation for HLSL dot Use the new LLVM dot intrinsics to build SPIRV instructions. This involves generating multiply and add operations for integers and the existing OpDot operation for floating point. This includes adding some generic opcodes for signed, unsigned and floats. These require updating an existing test for all such opcodes. New tests for generating SPIRV float and integer dot intrinsics are added as well. Fixes #88056 --- llvm/include/llvm/Support/TargetOpcodes.def | 9 ++ llvm/include/llvm/Target/GenericOpcodes.td | 21 +++++ llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 6 ++ .../Target/SPIRV/SPIRVInstructionSelector.cpp | 78 ++++++++++++++++ llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp | 3 + .../GlobalISel/legalizer-info-validation.mir | 9 ++ .../CodeGen/SPIRV/hlsl-intrinsics/fdot.ll | 75 ++++++++++++++++ .../CodeGen/SPIRV/hlsl-intrinsics/idot.ll | 88 +++++++++++++++++++ 8 files changed, 289 insertions(+) create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fdot.ll create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/idot.ll diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index 9fb6de49fb2055..0808fd9d77be82 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -814,6 +814,15 @@ HANDLE_TARGET_OPCODE(G_FSINH) /// Floating point hyperbolic tangent. HANDLE_TARGET_OPCODE(G_FTANH) +/// Floating point vector dot product +HANDLE_TARGET_OPCODE(G_FDOTPROD) + +/// Unsigned integer vector dot product +HANDLE_TARGET_OPCODE(G_UDOTPROD) + +/// Signed integer vector dot product +HANDLE_TARGET_OPCODE(G_SDOTPROD) + /// Floating point square root. 
HANDLE_TARGET_OPCODE(G_FSQRT) diff --git a/llvm/include/llvm/Target/GenericOpcodes.td b/llvm/include/llvm/Target/GenericOpcodes.td index 36a0a087ba457c..648671f627d649 100644 --- a/llvm/include/llvm/Target/GenericOpcodes.td +++ b/llvm/include/llvm/Target/GenericOpcodes.td @@ -1057,6 +1057,27 @@ def G_FTANH : GenericInstruction { let hasSideEffects = false; } +/// Floating point vector dot product +def G_FDOTPROD : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src1, type0:$src2); + let hasSideEffects = false; +} + +/// Signed integer vector dot product +def G_SDOTPROD : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src1, type0:$src2); + let hasSideEffects = false; +} + +/// Unsigned integer vector dot product +def G_UDOTPROD : GenericInstruction { + let OutOperandList = (outs type0:$dst); + let InOperandList = (ins type0:$src1, type0:$src2); + let hasSideEffects = false; +} + // Floating point square root of a value. // This returns NaN for negative nonzero values. // NOTE: Unlike libm sqrt(), this never sets errno. 
In all other respects it's diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 0169a0e466d878..d471b172a7dcb2 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -1903,6 +1903,12 @@ unsigned IRTranslator::getSimpleIntrinsicOpcode(Intrinsic::ID ID) { return TargetOpcode::G_CTPOP; case Intrinsic::exp: return TargetOpcode::G_FEXP; + case Intrinsic::fdot: + return TargetOpcode::G_FDOTPROD; + case Intrinsic::sdot: + return TargetOpcode::G_SDOTPROD; + case Intrinsic::udot: + return TargetOpcode::G_UDOTPROD; case Intrinsic::exp2: return TargetOpcode::G_FEXP2; case Intrinsic::exp10: diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index ed786bd33aa05b..98b7847a611476 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -178,6 +178,9 @@ class SPIRVInstructionSelector : public InstructionSelector { bool selectRsqrt(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; + bool selectIntegerDot(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + void renderImm32(MachineInstrBuilder &MIB, const MachineInstr &I, int OpIdx) const; void renderFImm32(MachineInstrBuilder &MIB, const MachineInstr &I, @@ -380,6 +383,20 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg, MIB.addImm(V); return MIB.constrainAllUses(TII, TRI, RBI); } + + case TargetOpcode::G_FDOTPROD: { + MachineBasicBlock &BB = *I.getParent(); + return BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpDot)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(I.getOperand(1).getReg()) + .addUse(I.getOperand(2).getReg()) + .constrainAllUses(TII, TRI, RBI); + } + case TargetOpcode::G_SDOTPROD: + case TargetOpcode::G_UDOTPROD: + return selectIntegerDot(ResVReg, ResType, I); + case TargetOpcode::G_MEMMOVE: case 
TargetOpcode::G_MEMCPY: case TargetOpcode::G_MEMSET: @@ -1366,6 +1383,67 @@ bool SPIRVInstructionSelector::selectRsqrt(Register ResVReg, .constrainAllUses(TII, TRI, RBI); } +// There is no integer dot instruction to select, so expand: multiply the +// vectors elementwise, then extract each product component and sum them. +bool SPIRVInstructionSelector::selectIntegerDot(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I) const { + assert(I.getNumOperands() == 3); + assert(I.getOperand(1).isReg()); + assert(I.getOperand(2).isReg()); + MachineBasicBlock &BB = *I.getParent(); + + // Multiply the vectors, then sum the results + Register Vec0 = I.getOperand(1).getReg(); + Register Vec1 = I.getOperand(2).getReg(); + Register TmpVec = MRI->createVirtualRegister(&SPIRV::IDRegClass); + SPIRVType *VecType = GR.getSPIRVTypeForVReg(Vec0); + + bool Result = BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpIMulV)) + .addDef(TmpVec) + .addUse(GR.getSPIRVTypeID(VecType)) + .addUse(Vec0) + .addUse(Vec1) + .constrainAllUses(TII, TRI, RBI); + + assert(GR.getScalarOrVectorComponentCount(VecType) > 1 && + "dot product requires a vector of at least 2 components"); + + Register Res = MRI->createVirtualRegister(&SPIRV::IDRegClass); + Result &= BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpCompositeExtract)) + .addDef(Res) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(TmpVec) + .addImm(0) + .constrainAllUses(TII, TRI, RBI); + + for (unsigned i = 1; i < GR.getScalarOrVectorComponentCount(VecType); i++) { + Register Elt = MRI->createVirtualRegister(&SPIRV::IDRegClass); + + Result &= + BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpCompositeExtract)) + .addDef(Elt) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(TmpVec) + .addImm(i) + .constrainAllUses(TII, TRI, RBI); + + Register Sum = i < GR.getScalarOrVectorComponentCount(VecType) - 1 + ?
MRI->createVirtualRegister(&SPIRV::IDRegClass) + : ResVReg; + + Result &= BuildMI(BB, I, I.getDebugLoc(), TII.get(SPIRV::OpIAddS)) + .addDef(Sum) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(Res) + .addUse(Elt) + .constrainAllUses(TII, TRI, RBI); + Res = Sum; + } + + return Result; +} + bool SPIRVInstructionSelector::selectBitreverse(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const { diff --git a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp index e775f8c57b048e..fb39c107a417ad 100644 --- a/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVLegalizerInfo.cpp @@ -285,6 +285,9 @@ SPIRVLegalizerInfo::SPIRVLegalizerInfo(const SPIRVSubtarget &ST) { G_FCOSH, G_FSINH, G_FTANH, + G_FDOTPROD, + G_SDOTPROD, + G_UDOTPROD, G_FSQRT, G_FFLOOR, G_FRINT, diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index 87a415b45cca9a..57faf42a5156c6 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -716,6 +716,15 @@ # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} # DEBUG-NEXT: .. the first uncovered type index: 1, OK # DEBUG-NEXT: .. the first uncovered imm index: 0, OK +# DEBUG-NEXT: G_FDOTPROD (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_UDOTPROD (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: G_SDOTPROD (opcode {{[0-9]+}}): 1 type index, 0 imm indices +# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined +# DEBUG-NEXT: ..
imm index coverage check SKIPPED: no rules defined # DEBUG-NEXT: G_FSQRT (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fdot.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fdot.ll new file mode 100644 index 00000000000000..964decf237a5cf --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/fdot.ll @@ -0,0 +1,75 @@ +; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; Make sure SPIR-V operations for dot are generated for float type vectors. + +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#vec2_float_16:]] = OpTypeVector %[[#float_16]] 2 +; CHECK-DAG: %[[#vec3_float_16:]] = OpTypeVector %[[#float_16]] 3 +; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#vec2_float_32:]] = OpTypeVector %[[#float_32]] 2 +; CHECK-DAG: %[[#vec3_float_32:]] = OpTypeVector %[[#float_32]] 3 +; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 + + +define noundef half @dot_half2(<2 x half> noundef %a, <2 x half> noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec2_float_16]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec2_float_16]] +; CHECK: OpDot %[[#float_16]] %[[#arg0]] %[[#arg1]] + %dx.dot = call half @llvm.fdot.v2f16(<2 x half> %a, <2 x half> %b) + ret half %dx.dot +} + +define noundef half @dot_half3(<3 x half> noundef %a, <3 x half> noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_16]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec3_float_16]] +; CHECK: OpDot %[[#float_16]] %[[#arg0]] %[[#arg1]] + %dx.dot = call half @llvm.fdot.v3f16(<3 x half> %a, <3 x half> %b) + ret
half %dx.dot +} + +define noundef half @dot_half4(<4 x half> noundef %a, <4 x half> noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_16]] +; CHECK: OpDot %[[#float_16]] %[[#arg0]] %[[#arg1]] + %dx.dot = call half @llvm.fdot.v4f16(<4 x half> %a, <4 x half> %b) + ret half %dx.dot +} + +define noundef float @dot_float2(<2 x float> noundef %a, <2 x float> noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec2_float_32]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec2_float_32]] +; CHECK: OpDot %[[#float_32]] %[[#arg0]] %[[#arg1]] + %dx.dot = call float @llvm.fdot.v2f32(<2 x float> %a, <2 x float> %b) + ret float %dx.dot +} + +define noundef float @dot_float3(<3 x float> noundef %a, <3 x float> noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_32]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec3_float_32]] +; CHECK: OpDot %[[#float_32]] %[[#arg0]] %[[#arg1]] + %dx.dot = call float @llvm.fdot.v3f32(<3 x float> %a, <3 x float> %b) + ret float %dx.dot +} + +define noundef float @dot_float4(<4 x float> noundef %a, <4 x float> noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_32]] +; CHECK: OpDot %[[#float_32]] %[[#arg0]] %[[#arg1]] + %dx.dot = call float @llvm.fdot.v4f32(<4 x float> %a, <4 x float> %b) + ret float %dx.dot +} + +declare half @llvm.fdot.v2f16(<2 x half> , <2 x half> ) +declare half @llvm.fdot.v3f16(<3 x half> , <3 x half> ) +declare half @llvm.fdot.v4f16(<4 x half> , <4 x half> ) +declare float @llvm.fdot.v2f32(<2 x float>, <2 x float>) +declare float @llvm.fdot.v3f32(<3 x float>, <3 x float>) +declare float @llvm.fdot.v4f32(<4 x float>, <4 x float>) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/idot.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/idot.ll new file mode 100644 index
00000000000000..05f28920e205b7 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/idot.ll @@ -0,0 +1,88 @@ +; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; Make sure SPIR-V operations for dot are generated for int/uint vectors. + +; CHECK-DAG: %[[#int_16:]] = OpTypeInt 16 +; CHECK-DAG: %[[#vec2_int_16:]] = OpTypeVector %[[#int_16]] 2 +; CHECK-DAG: %[[#vec3_int_16:]] = OpTypeVector %[[#int_16]] 3 +; CHECK-DAG: %[[#int_32:]] = OpTypeInt 32 +; CHECK-DAG: %[[#vec4_int_32:]] = OpTypeVector %[[#int_32]] 4 +; CHECK-DAG: %[[#int_64:]] = OpTypeInt 64 +; CHECK-DAG: %[[#vec2_int_64:]] = OpTypeVector %[[#int_64]] 2 + +define noundef i16 @dot_int16_t2(<2 x i16> noundef %a, <2 x i16> noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec2_int_16]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec2_int_16]] +; CHECK: %[[#mul_vec:]] = OpIMul %[[#vec2_int_16]] %[[#arg0]] %[[#arg1]] +; CHECK: %[[#elt0:]] = OpCompositeExtract %[[#int_16]] %[[#mul_vec]] 0 +; CHECK: %[[#elt1:]] = OpCompositeExtract %[[#int_16]] %[[#mul_vec]] 1 +; CHECK: %[[#sum:]] = OpIAdd %[[#int_16]] %[[#elt0]] %[[#elt1]] + %dot = call i16 @llvm.sdot.v2i16(<2 x i16> %a, <2 x i16> %b) + ret i16 %dot +} + +define noundef i32 @dot_int4(<4 x i32> noundef %a, <4 x i32> noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_int_32]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_int_32]] +; CHECK: %[[#mul_vec:]] = OpIMul %[[#vec4_int_32]] %[[#arg0]] %[[#arg1]] +; CHECK: %[[#elt0:]] = OpCompositeExtract %[[#int_32]] %[[#mul_vec]] 0 +; CHECK: %[[#elt1:]] = OpCompositeExtract %[[#int_32]] %[[#mul_vec]] 1 +; CHECK: %[[#sum0:]] = OpIAdd %[[#int_32]] %[[#elt0]] %[[#elt1]] +; CHECK: %[[#elt2:]] = OpCompositeExtract %[[#int_32]] %[[#mul_vec]] 2 +; CHECK: %[[#sum1:]] = OpIAdd %[[#int_32]] %[[#sum0]] %[[#elt2]] +; CHECK: %[[#elt3:]] =
OpCompositeExtract %[[#int_32]] %[[#mul_vec]] 3 +; CHECK: %[[#sum2:]] = OpIAdd %[[#int_32]] %[[#sum1]] %[[#elt3]] + %dot = call i32 @llvm.sdot.v4i32(<4 x i32> %a, <4 x i32> %b) + ret i32 %dot +} + +define noundef i16 @dot_uint16_t3(<3 x i16> noundef %a, <3 x i16> noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_int_16]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec3_int_16]] +; CHECK: %[[#mul_vec:]] = OpIMul %[[#vec3_int_16]] %[[#arg0]] %[[#arg1]] +; CHECK: %[[#elt0:]] = OpCompositeExtract %[[#int_16]] %[[#mul_vec]] 0 +; CHECK: %[[#elt1:]] = OpCompositeExtract %[[#int_16]] %[[#mul_vec]] 1 +; CHECK: %[[#sum0:]] = OpIAdd %[[#int_16]] %[[#elt0]] %[[#elt1]] +; CHECK: %[[#elt2:]] = OpCompositeExtract %[[#int_16]] %[[#mul_vec]] 2 +; CHECK: %[[#sum1:]] = OpIAdd %[[#int_16]] %[[#sum0]] %[[#elt2]] + %dot = call i16 @llvm.udot.v3i16(<3 x i16> %a, <3 x i16> %b) + ret i16 %dot +} + +define noundef i32 @dot_uint4(<4 x i32> noundef %a, <4 x i32> noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_int_32]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_int_32]] +; CHECK: %[[#mul_vec:]] = OpIMul %[[#vec4_int_32]] %[[#arg0]] %[[#arg1]] +; CHECK: %[[#elt0:]] = OpCompositeExtract %[[#int_32]] %[[#mul_vec]] 0 +; CHECK: %[[#elt1:]] = OpCompositeExtract %[[#int_32]] %[[#mul_vec]] 1 +; CHECK: %[[#sum0:]] = OpIAdd %[[#int_32]] %[[#elt0]] %[[#elt1]] +; CHECK: %[[#elt2:]] = OpCompositeExtract %[[#int_32]] %[[#mul_vec]] 2 +; CHECK: %[[#sum1:]] = OpIAdd %[[#int_32]] %[[#sum0]] %[[#elt2]] +; CHECK: %[[#elt3:]] = OpCompositeExtract %[[#int_32]] %[[#mul_vec]] 3 +; CHECK: %[[#sum2:]] = OpIAdd %[[#int_32]] %[[#sum1]] %[[#elt3]] + %dot = call i32 @llvm.udot.v4i32(<4 x i32> %a, <4 x i32> %b) + ret i32 %dot +} + +define noundef i64 @dot_uint64_t4(<2 x i64> noundef %a, <2 x i64> noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec2_int_64]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec2_int_64]] +; CHECK: 
%[[#mul_vec:]] = OpIMul %[[#vec2_int_64]] %[[#arg0]] %[[#arg1]] +; CHECK: %[[#elt0:]] = OpCompositeExtract %[[#int_64]] %[[#mul_vec]] 0 +; CHECK: %[[#elt1:]] = OpCompositeExtract %[[#int_64]] %[[#mul_vec]] 1 +; CHECK: %[[#sum0:]] = OpIAdd %[[#int_64]] %[[#elt0]] %[[#elt1]] + %dot = call i64 @llvm.udot.v2i64(<2 x i64> %a, <2 x i64> %b) + ret i64 %dot +} + +declare i16 @llvm.sdot.v2i16(<2 x i16>, <2 x i16>) +declare i32 @llvm.sdot.v4i32(<4 x i32>, <4 x i32>) +declare i16 @llvm.udot.v3i16(<3 x i16>, <3 x i16>) +declare i32 @llvm.udot.v4i32(<4 x i32>, <4 x i32>) +declare i64 @llvm.udot.v2i64(<2 x i64>, <2 x i64>)