halide · rootjalex · Jan 20, 2023 · Jan 16, 2023 · Jan 16, 2023 · Jan 17, 2023
diff --git a/src/CodeGen_WebAssembly.cpp b/src/CodeGen_WebAssembly.cpp
@@ -6,10 +6,12 @@
 #include "IRMatch.h"
 #include "IROperator.h"
 #include "LLVM_Headers.h"
+#include "Substitute.h"
 
 namespace Halide {
 namespace Internal {
 
+using std::pair;
 using std::string;
 using std::vector;
 
@@ -193,6 +195,12 @@ void CodeGen_WebAssembly::visit(const Call *op) {
         {"saturating_narrow", i16_sat(wild_i32x_), Target::WasmSimd128},
         {"saturating_narrow", u16_sat(wild_i32x_), Target::WasmSimd128},
     };
+    static const vector<pair<Expr, Expr>> cast_rewrites = {
+        // Each entry is {pattern, replacement}. A saturating i32 -> 8-bit cast narrows twice,
+        // so it is re-expressed as two single narrowings: i32 -> i16, then i16 -> 8-bit.
+        {u8_sat(wild_i32x_), u8_sat(i16_sat(wild_i32x_))},
+        {i8_sat(wild_i32x_), i8_sat(i16_sat(wild_i32x_))},
+    };
     // clang-format on
 
     if (op->type.is_vector()) {
@@ -208,6 +216,14 @@ void CodeGen_WebAssembly::visit(const Call *op) {
                 }
             }
         }
+
+        for (const auto &i : cast_rewrites) {  // i.first = pattern, i.second = its rewrite
+            if (expr_match(i.first, op, matches)) {
+                Expr replacement = substitute("*", matches[0], with_lanes(i.second, op->type.lanes()));  // size rewrite to op's lanes; "*" -> matched operand
+                value = codegen(replacement);  // generate code for the rewrite in place of op
+                return;
+            }
+        }
     }
 
     if (op->is_intrinsic(Call::round)) {

diff --git a/src/CodeGen_X86.cpp b/src/CodeGen_X86.cpp
@@ -7,11 +7,13 @@
 #include "IROperator.h"
 #include "LLVM_Headers.h"
 #include "Simplify.h"
+#include "Substitute.h"
 #include "Util.h"
 
 namespace Halide {
 namespace Internal {
 
+using std::pair;
 using std::string;
 using std::vector;
 
@@ -617,6 +619,20 @@ void CodeGen_X86::visit(const Call *op) {
         }
     }
 
+    static const vector<pair<Expr, Expr>> cast_rewrites = {
+        // Each entry is {pattern, replacement}. A saturating i32 -> 8-bit cast narrows twice,
+        // so it is re-expressed as two single narrowings: i32 -> i16, then i16 -> 8-bit.
+        {u8_sat(wild_i32x_), u8_sat(i16_sat(wild_i32x_))},
+        {i8_sat(wild_i32x_), i8_sat(i16_sat(wild_i32x_))},
+    };
+    for (const auto &i : cast_rewrites) {  // i.first = pattern, i.second = its rewrite
+        if (expr_match(i.first, op, matches)) {
+            Expr replacement = substitute("*", matches[0], with_lanes(i.second, op->type.lanes()));  // size rewrite to op's lanes; "*" -> matched operand
+            value = codegen(replacement);  // generate code for the rewrite in place of op
+            return;
+        }
+    }
+
     // Check for saturating_pmulhrs. On x86, pmulhrs is truncating, but it's still faster
     // to use pmulhrs than to lower (producing widening multiplication), and have a check
     // for the singular overflow case.

diff --git a/test/correctness/simd_op_check_wasm.cpp b/test/correctness/simd_op_check_wasm.cpp
@@ -511,6 +511,8 @@ class SimdOpCheckWASM : public SimdOpCheckTest {
                 check("i8x16.narrow_i16x8_u", 16 * w, u8_sat(i16_1));
                 check("i16x8.narrow_i32x4_s", 8 * w, i16_sat(i32_1));
                 check("i16x8.narrow_i32x4_u", 8 * w, u16_sat(i32_1));
+                check("i16x8.narrow_i32x4_s", 8 * w, i8_sat(i32_1));  // i32 -> 8-bit is rewritten to go through i16_sat first
+                check("i16x8.narrow_i32x4_s", 8 * w, u8_sat(i32_1));  // signed i32 -> i16 step is shared by the u8 case
 
                 // Integer to integer widening
                 check("i16x8.extend_low_i8x16_s", 16 * w, i16(i8_1));

diff --git a/test/correctness/simd_op_check_x86.cpp b/test/correctness/simd_op_check_x86.cpp
@@ -227,6 +227,8 @@ class SimdOpCheckX86 : public SimdOpCheckTest {
             check(std::string("packssdw") + check_suffix, 4 * w, i16_sat(i32_1));
             check(std::string("packsswb") + check_suffix, 8 * w, i8_sat(i16_1));
             check(std::string("packuswb") + check_suffix, 8 * w, u8_sat(i16_1));
+            check(std::string("packssdw") + check_suffix, 8 * w, u8_sat(i32_1));  // i32 -> 8-bit is rewritten to go through i16_sat first
+            check(std::string("packssdw") + check_suffix, 8 * w, i8_sat(i32_1));  // same signed i32 -> i16 step for the i8 case
 
             // Sum-of-absolute-difference ops
             {