Skip to content

[WebAssembly] v8i8 mul support #151145

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: main
Choose a base branch
from

Conversation

sparker-arm
Copy link
Contributor

During DAG combine, widen the v8i8 operands to v16i8 by concatenating each with an undef vector, then use extmul_low to perform the mul at i16 (v8i16). Finally, shuffle the low bytes out of the i16 elements into the result vector.

During DAG combine, widen the v8i8 operands to v16i8 by concatenating
each with an undef vector, then use extmul_low to perform the mul at
i16 (v8i16). Finally, shuffle the low bytes out of the i16 elements
into the result vector.
@llvmbot
Copy link
Member

llvmbot commented Jul 29, 2025

@llvm/pr-subscribers-backend-webassembly

Author: Sam Parker (sparker-arm)

Changes

During DAG combine, widen the v8i8 operands to v16i8 by concatenating each with an undef vector, then use extmul_low to perform the mul at i16 (v8i16). Finally, shuffle the low bytes out of the i16 elements into the result vector.


Full diff: https://github.com/llvm/llvm-project/pull/151145.diff

2 Files Affected:

  • (modified) llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp (+38-19)
  • (modified) llvm/test/CodeGen/WebAssembly/narrow-simd-mul.ll (+3-65)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 3f80b2ab2bd6d..f6f0a2cf6c769 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -3535,31 +3535,50 @@ static SDValue performMulCombine(SDNode *N,
   // We don't natively support v16i8 mul, but we do support v8i16 so split the
   // inputs and extend them to v8i16. Only do this before legalization in case
   // a narrow vector is widened and may be simplified later.
-  if (!DCI.isBeforeLegalize() || VT != MVT::v16i8)
+  if (!DCI.isBeforeLegalize() || (VT != MVT::v8i8 && VT != MVT::v16i8))
     return SDValue();
 
   SDLoc DL(N);
   SelectionDAG &DAG = DCI.DAG;
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
-  SDValue LowLHS =
-      DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, LHS);
-  SDValue HighLHS =
-      DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, LHS);
-  SDValue LowRHS =
-      DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MVT::v8i16, RHS);
-  SDValue HighRHS =
-      DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MVT::v8i16, RHS);
-
-  SDValue MulLow =
-      DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, LowLHS, LowRHS));
-  SDValue MulHigh = DAG.getBitcast(
-      VT, DAG.getNode(ISD::MUL, DL, MVT::v8i16, HighLHS, HighRHS));
-
-  // Take the low byte of each lane.
-  return DAG.getVectorShuffle(
-      VT, DL, MulLow, MulHigh,
-      {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
+  EVT MulVT = MVT::v8i16;
+
+  if (VT == MVT::v8i8) {
+    SDValue PromotedLHS = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, LHS,
+                                      DAG.getUNDEF(MVT::v8i8));
+    SDValue PromotedRHS = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, RHS,
+                                      DAG.getUNDEF(MVT::v8i8));
+    SDValue LowLHS =
+        DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MulVT, PromotedLHS);
+    SDValue LowRHS =
+        DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MulVT, PromotedRHS);
+    SDValue MulLow = DAG.getBitcast(
+        MVT::v16i8, DAG.getNode(ISD::MUL, DL, MulVT, LowLHS, LowRHS));
+    // Take the low byte of each lane.
+    SDValue Shuffle = DAG.getVectorShuffle(
+        MVT::v16i8, DL, MulLow, DAG.getUNDEF(MVT::v16i8),
+        {0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1});
+    return extractSubVector(Shuffle, 0, DAG, DL, 64);
+  } else {
+    assert(VT == MVT::v16i8 && "Expected v16i8");
+    SDValue LowLHS = DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MulVT, LHS);
+    SDValue LowRHS = DAG.getNode(WebAssemblyISD::EXTEND_LOW_U, DL, MulVT, RHS);
+    SDValue HighLHS =
+        DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MulVT, LHS);
+    SDValue HighRHS =
+        DAG.getNode(WebAssemblyISD::EXTEND_HIGH_U, DL, MulVT, RHS);
+
+    SDValue MulLow =
+        DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MulVT, LowLHS, LowRHS));
+    SDValue MulHigh =
+        DAG.getBitcast(VT, DAG.getNode(ISD::MUL, DL, MulVT, HighLHS, HighRHS));
+
+    // Take the low byte of each lane.
+    return DAG.getVectorShuffle(
+        VT, DL, MulLow, MulHigh,
+        {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30});
+  }
 }
 
 SDValue
diff --git a/llvm/test/CodeGen/WebAssembly/narrow-simd-mul.ll b/llvm/test/CodeGen/WebAssembly/narrow-simd-mul.ll
index 1f6c960c27aa8..310636d4c07d3 100644
--- a/llvm/test/CodeGen/WebAssembly/narrow-simd-mul.ll
+++ b/llvm/test/CodeGen/WebAssembly/narrow-simd-mul.ll
@@ -5,71 +5,9 @@ define <8 x i8> @mul_v8i8(<8 x i8> %a, <8 x i8> %b) {
 ; CHECK-LABEL: mul_v8i8:
 ; CHECK:         .functype mul_v8i8 (v128, v128) -> (v128)
 ; CHECK-NEXT:  # %bb.0:
-; CHECK-NEXT:    i8x16.extract_lane_u $push4=, $0, 0
-; CHECK-NEXT:    i8x16.extract_lane_u $push3=, $1, 0
-; CHECK-NEXT:    i32.mul $push5=, $pop4, $pop3
-; CHECK-NEXT:    i8x16.splat $push6=, $pop5
-; CHECK-NEXT:    i8x16.extract_lane_u $push1=, $0, 1
-; CHECK-NEXT:    i8x16.extract_lane_u $push0=, $1, 1
-; CHECK-NEXT:    i32.mul $push2=, $pop1, $pop0
-; CHECK-NEXT:    i8x16.replace_lane $push7=, $pop6, 1, $pop2
-; CHECK-NEXT:    i8x16.extract_lane_u $push9=, $0, 2
-; CHECK-NEXT:    i8x16.extract_lane_u $push8=, $1, 2
-; CHECK-NEXT:    i32.mul $push10=, $pop9, $pop8
-; CHECK-NEXT:    i8x16.replace_lane $push11=, $pop7, 2, $pop10
-; CHECK-NEXT:    i8x16.extract_lane_u $push13=, $0, 3
-; CHECK-NEXT:    i8x16.extract_lane_u $push12=, $1, 3
-; CHECK-NEXT:    i32.mul $push14=, $pop13, $pop12
-; CHECK-NEXT:    i8x16.replace_lane $push15=, $pop11, 3, $pop14
-; CHECK-NEXT:    i8x16.extract_lane_u $push17=, $0, 4
-; CHECK-NEXT:    i8x16.extract_lane_u $push16=, $1, 4
-; CHECK-NEXT:    i32.mul $push18=, $pop17, $pop16
-; CHECK-NEXT:    i8x16.replace_lane $push19=, $pop15, 4, $pop18
-; CHECK-NEXT:    i8x16.extract_lane_u $push21=, $0, 5
-; CHECK-NEXT:    i8x16.extract_lane_u $push20=, $1, 5
-; CHECK-NEXT:    i32.mul $push22=, $pop21, $pop20
-; CHECK-NEXT:    i8x16.replace_lane $push23=, $pop19, 5, $pop22
-; CHECK-NEXT:    i8x16.extract_lane_u $push25=, $0, 6
-; CHECK-NEXT:    i8x16.extract_lane_u $push24=, $1, 6
-; CHECK-NEXT:    i32.mul $push26=, $pop25, $pop24
-; CHECK-NEXT:    i8x16.replace_lane $push27=, $pop23, 6, $pop26
-; CHECK-NEXT:    i8x16.extract_lane_u $push29=, $0, 7
-; CHECK-NEXT:    i8x16.extract_lane_u $push28=, $1, 7
-; CHECK-NEXT:    i32.mul $push30=, $pop29, $pop28
-; CHECK-NEXT:    i8x16.replace_lane $push31=, $pop27, 7, $pop30
-; CHECK-NEXT:    i8x16.extract_lane_u $push33=, $0, 8
-; CHECK-NEXT:    i8x16.extract_lane_u $push32=, $1, 8
-; CHECK-NEXT:    i32.mul $push34=, $pop33, $pop32
-; CHECK-NEXT:    i8x16.replace_lane $push35=, $pop31, 8, $pop34
-; CHECK-NEXT:    i8x16.extract_lane_u $push37=, $0, 9
-; CHECK-NEXT:    i8x16.extract_lane_u $push36=, $1, 9
-; CHECK-NEXT:    i32.mul $push38=, $pop37, $pop36
-; CHECK-NEXT:    i8x16.replace_lane $push39=, $pop35, 9, $pop38
-; CHECK-NEXT:    i8x16.extract_lane_u $push41=, $0, 10
-; CHECK-NEXT:    i8x16.extract_lane_u $push40=, $1, 10
-; CHECK-NEXT:    i32.mul $push42=, $pop41, $pop40
-; CHECK-NEXT:    i8x16.replace_lane $push43=, $pop39, 10, $pop42
-; CHECK-NEXT:    i8x16.extract_lane_u $push45=, $0, 11
-; CHECK-NEXT:    i8x16.extract_lane_u $push44=, $1, 11
-; CHECK-NEXT:    i32.mul $push46=, $pop45, $pop44
-; CHECK-NEXT:    i8x16.replace_lane $push47=, $pop43, 11, $pop46
-; CHECK-NEXT:    i8x16.extract_lane_u $push49=, $0, 12
-; CHECK-NEXT:    i8x16.extract_lane_u $push48=, $1, 12
-; CHECK-NEXT:    i32.mul $push50=, $pop49, $pop48
-; CHECK-NEXT:    i8x16.replace_lane $push51=, $pop47, 12, $pop50
-; CHECK-NEXT:    i8x16.extract_lane_u $push53=, $0, 13
-; CHECK-NEXT:    i8x16.extract_lane_u $push52=, $1, 13
-; CHECK-NEXT:    i32.mul $push54=, $pop53, $pop52
-; CHECK-NEXT:    i8x16.replace_lane $push55=, $pop51, 13, $pop54
-; CHECK-NEXT:    i8x16.extract_lane_u $push57=, $0, 14
-; CHECK-NEXT:    i8x16.extract_lane_u $push56=, $1, 14
-; CHECK-NEXT:    i32.mul $push58=, $pop57, $pop56
-; CHECK-NEXT:    i8x16.replace_lane $push59=, $pop55, 14, $pop58
-; CHECK-NEXT:    i8x16.extract_lane_u $push61=, $0, 15
-; CHECK-NEXT:    i8x16.extract_lane_u $push60=, $1, 15
-; CHECK-NEXT:    i32.mul $push62=, $pop61, $pop60
-; CHECK-NEXT:    i8x16.replace_lane $push63=, $pop59, 15, $pop62
-; CHECK-NEXT:    return $pop63
+; CHECK-NEXT:    i16x8.extmul_low_i8x16_u $push0=, $0, $1
+; CHECK-NEXT:    i8x16.shuffle $push1=, $pop0, $1, 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
+; CHECK-NEXT:    return $pop1
   %mul = mul <8 x i8> %a, %b
   ret <8 x i8> %mul
 }

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants