AArch64: Update div-bitmask to implement new optab instead of target hook [PR108583]

This replaces the custom division hook with an implementation through
add_highpart.  For NEON we implement the add highpart (addition plus extraction
of the upper half of the result, in the same precision) as ADD + LSR.
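
As background (illustration only, not part of the patch): the identity behind
these sequences, described in the comments being removed from aarch64-simd.md
below, is that a 16-bit x divided by 0xff equals (x + ((x + 0x101) >> 8)) >> 8
when the intermediate additions are done in wider precision.  A minimal scalar
C sketch checking this exhaustively:

        /* Illustration only, not from this patch: the division-by-0xff
           identity the patterns rely on.  The intermediate arithmetic is
           done in 32 bits, i.e. wider than the 16-bit input, so the
           additions cannot overflow.  */
        #include <assert.h>
        #include <stdint.h>

        static uint16_t
        div255 (uint16_t x)
        {
          uint32_t hi = ((uint32_t) x + 0x101) >> 8;  /* the addhn step         */
          return ((uint32_t) x + hi) >> 8;            /* the uaddw + shift step */
        }

        int
        main (void)
        {
          for (uint32_t x = 0; x <= 0xffff; x++)
            assert (div255 (x) == x / 0xff);
          return 0;
        }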

This representation allows us to easily optimize the sequence using existing
patterns.  This gets us a pretty decent sequence using SRA:

        umull   v1.8h, v0.8b, v3.8b
        umull2  v0.8h, v0.16b, v3.16b
        add     v5.8h, v1.8h, v2.8h
        add     v4.8h, v0.8h, v2.8h
        usra    v1.8h, v5.8h, 8
        usra    v0.8h, v4.8h, 8
        uzp2    v1.16b, v1.16b, v0.16b

To get the optimal sequence, however, we match (a + ((b + c) >> n)), where n is
half the precision of the mode of the operation, into addhn + uaddw.  This is a
generally useful optimization on its own and gets us back to:

.L4:
        ldr     q0, [x3]
        umull   v1.8h, v0.8b, v5.8b
        umull2  v0.8h, v0.16b, v5.16b
        addhn   v3.8b, v1.8h, v4.8h
        addhn   v2.8b, v0.8h, v4.8h
        uaddw   v1.8h, v1.8h, v3.8b
        uaddw   v0.8h, v0.8h, v2.8b
        uzp2    v1.16b, v1.16b, v0.16b
        str     q1, [x3], 16
        cmp     x3, x4
        bne     .L4
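
Written with ACLE intrinsics purely as an illustration (this function is mine,
not part of the patch or testsuite), the core of that loop for one eight-lane
half is roughly:

        #include <arm_neon.h>

        /* Illustration only: (a * b) / 0xff for eight unsigned byte lanes
           using the addhn + uaddw form matched above.  The real loop does
           both halves (umull/umull2) and folds the final shift and narrow
           of both halves into the uzp2.  */
        static uint8x8_t
        mul_div255 (uint8x8_t a, uint8x8_t b)
        {
          uint16x8_t prod = vmull_u8 (a, b);           /* umull           */
          uint16x8_t c257 = vdupq_n_u16 (0x101);
          uint8x8_t  hi   = vaddhn_u16 (prod, c257);   /* addhn           */
          uint16x8_t sum  = vaddw_u8 (prod, hi);       /* uaddw           */
          return vshrn_n_u16 (sum, 8);                 /* shift + narrow  */
        }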

For SVE2 we optimize the initial sequence to the same ADD + LSR form, which gets us:

.L3:
        ld1b    z0.h, p0/z, [x0, x3]
        mul     z0.h, p1/m, z0.h, z2.h
        add     z1.h, z0.h, z3.h
        usra    z0.h, z1.h, #8
        lsr     z0.h, z0.h, #8
        st1b    z0.h, p0, [x0, x3]
        inch    x3
        whilelo p0.h, w3, w2
        b.any   .L3
.L1:
        ret

and to get the optimal sequence I match (a + b) >> n (with the same constraint
on n) to addhnb, which gets us to:

.L3:
        ld1b    z0.h, p0/z, [x0, x3]
        mul     z0.h, p1/m, z0.h, z2.h
        addhnb  z1.b, z0.h, z3.h
        addhnb  z0.b, z0.h, z1.h
        st1b    z0.h, p0, [x0, x3]
        inch    x3
        whilelo p0.h, w3, w2
        b.any   .L3
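
As an illustration only, and assuming the standard arm_sve.h SVE2 intrinsic
names, the two chained addhnb instructions correspond to roughly:

        #include <arm_sve.h>

        /* Illustration only: divide 16-bit products by 0xff with two
           chained ADDHNB.  ADDHNB writes the narrow result to the even
           byte lanes and zeroes the odd ones, so reinterpreting the first
           result as 16-bit lanes gives the zero-extended high parts for
           the second addition; the quotients end up in the bottom byte of
           each .h element, which is what the st1b above stores.  */
        static svuint8_t
        div255_sve (svuint16_t prod)
        {
          svuint16_t c257 = svdup_n_u16 (0x101);
          svuint8_t  hi   = svaddhnb_u16 (prod, c257);   /* first addhnb   */
          svuint16_t hi_w = svreinterpret_u16_u8 (hi);
          return svaddhnb_u16 (prod, hi_w);              /* second addhnb  */
        }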

There are multiple possible RTL representations for these optimizations.  I did
not represent them using a zero_extend because we seem very inconsistent about
this in the backend, and since they are unspecs we won't match them from vector
ops anyway.  I figured maintainers would prefer this, but my maintainer ouija
board is still out for repairs :)

There are no new tests, as correctness tests were added to the mid-end and
codegen tests for this already exist.
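
For reference, an illustrative kernel of the shape these patterns target (my
own example, not taken from the testsuite) is:

        /* Scale unsigned bytes and divide the widened product by 0xff,
           as in the generated loops above.  Illustration only.  */
        void
        scale (unsigned char *restrict dst, unsigned char level, int n)
        {
          for (int i = 0; i < n; i++)
            dst[i] = (dst[i] * level) / 0xff;
        }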

gcc/ChangeLog:

	PR target/108583
	* config/aarch64/aarch64-simd.md (@aarch64_bitmask_udiv<mode>3): Remove.
	(*bitmask_shift_plus<mode>): New.
	* config/aarch64/aarch64-sve2.md (*bitmask_shift_plus<mode>): New.
	(@aarch64_bitmask_udiv<mode>3): Remove.
	* config/aarch64/aarch64.cc
	(aarch64_vectorize_can_special_div_by_constant,
	TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST): Removed.
	(TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT,
	aarch64_vectorize_preferred_div_as_shifts_over_mult): New.

TamarChristinaArm committed Mar 12, 2023
1 parent 81fd62d commit f23dc72
Showing 3 changed files with 52 additions and 137 deletions.
71 changes: 19 additions & 52 deletions gcc/config/aarch64/aarch64-simd.md
@@ -4867,60 +4867,27 @@
}
)

;; div optimizations using narrowings
;; we can do the division e.g. shorts by 255 faster by calculating it as
;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
;; double the precision of x.
;;
;; If we imagine a short as being composed of two blocks of bytes then
;; adding 257 or 0b0000_0001_0000_0001 to the number is equivalent to
;; adding 1 to each sub component:
;;
;; short value of 16-bits
;; ┌──────────────┬────────────────┐
;; │ │ │
;; └──────────────┴────────────────┘
;; 8-bit part1 ▲ 8-bit part2 ▲
;; │ │
;; │ │
;; +1 +1
;;
;; after the first addition, we have to shift right by 8, and narrow the
;; results back to a byte. Remember that the addition must be done in
;; double the precision of the input. Since 8 is half the size of a short
;; we can use a narrowing halfing instruction in AArch64, addhn which also
;; does the addition in a wider precision and narrows back to a byte. The
;; shift itself is implicit in the operation as it writes back only the top
;; half of the result. i.e. bits 2*esize-1:esize.
;;
;; Since we have narrowed the result of the first part back to a byte, for
;; the second addition we can use a widening addition, uaddw.
;;
;; For the final shift, since it's unsigned arithmetic we emit an ushr by 8.
;;
;; The shift is later optimized by combine to a uzp2 with movi #0.
(define_expand "@aarch64_bitmask_udiv<mode>3"
[(match_operand:VQN 0 "register_operand")
(match_operand:VQN 1 "register_operand")
(match_operand:VQN 2 "immediate_operand")]
;; Optimize ((a + b) >> n) + c where n is half the bitsize of the vector
(define_insn_and_split "*bitmask_shift_plus<mode>"
[(set (match_operand:VQN 0 "register_operand" "=&w")
(plus:VQN
(lshiftrt:VQN
(plus:VQN (match_operand:VQN 1 "register_operand" "w")
(match_operand:VQN 2 "register_operand" "w"))
(match_operand:VQN 3 "aarch64_simd_shift_imm_vec_exact_top" ""))
(match_operand:VQN 4 "register_operand" "w")))]
"TARGET_SIMD"
"#"
"&& true"
[(const_int 0)]
{
unsigned HOST_WIDE_INT size
= (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode)) - 1;
rtx elt = unwrap_const_vec_duplicate (operands[2]);
if (!CONST_INT_P (elt) || UINTVAL (elt) != size)
FAIL;

rtx addend = gen_reg_rtx (<MODE>mode);
rtx val = aarch64_simd_gen_const_vector_dup (<VNARROWQ2>mode, 1);
emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROWQ2>mode));
rtx tmp1 = gen_reg_rtx (<VNARROWQ>mode);
rtx tmp2 = gen_reg_rtx (<MODE>mode);
emit_insn (gen_aarch64_addhn<mode> (tmp1, operands[1], addend));
unsigned bitsize = GET_MODE_UNIT_BITSIZE (<VNARROWQ>mode);
rtx shift_vector = aarch64_simd_gen_const_vector_dup (<MODE>mode, bitsize);
emit_insn (gen_aarch64_uaddw<Vnarrowq> (tmp2, operands[1], tmp1));
emit_insn (gen_aarch64_simd_lshr<mode> (operands[0], tmp2, shift_vector));
rtx tmp;
if (can_create_pseudo_p ())
tmp = gen_reg_rtx (<VNARROWQ>mode);
else
tmp = gen_rtx_REG (<VNARROWQ>mode, REGNO (operands[0]));
emit_insn (gen_aarch64_addhn<mode> (tmp, operands[1], operands[2]));
emit_insn (gen_aarch64_uaddw<Vnarrowq> (operands[0], operands[4], tmp));
DONE;
})

57 changes: 16 additions & 41 deletions gcc/config/aarch64/aarch64-sve2.md
@@ -71,7 +71,6 @@
;; ---- [INT] Reciprocal approximation
;; ---- [INT<-FP] Base-2 logarithm
;; ---- [INT] Polynomial multiplication
;; ---- [INT] Misc optab implementations
;;
;; == Permutation
;; ---- [INT,FP] General permutes
@@ -1600,6 +1599,22 @@
"<sve_int_op>\t%0.<Ventype>, %2.<Vetype>, %3.<Vetype>"
)

;; Optimize ((a + b) >> n) where n is half the bitsize of the vector
(define_insn "*bitmask_shift_plus<mode>"
[(set (match_operand:SVE_FULL_HSDI 0 "register_operand" "=w")
(unspec:SVE_FULL_HSDI
[(match_operand:<VPRED> 1)
(lshiftrt:SVE_FULL_HSDI
(plus:SVE_FULL_HSDI
(match_operand:SVE_FULL_HSDI 2 "register_operand" "w")
(match_operand:SVE_FULL_HSDI 3 "register_operand" "w"))
(match_operand:SVE_FULL_HSDI 4
"aarch64_simd_shift_imm_vec_exact_top" ""))]
UNSPEC_PRED_X))]
"TARGET_SVE2"
"addhnb\t%0.<Ventype>, %2.<Vetype>, %3.<Vetype>"
)

;; -------------------------------------------------------------------------
;; ---- [INT] Narrowing right shifts
;; -------------------------------------------------------------------------
@@ -2313,46 +2328,6 @@
"<sve_int_op>\t%0.<Vewtype>, %1.<Vetype>, %2.<Vetype>"
)

;; -------------------------------------------------------------------------
;; ---- [INT] Misc optab implementations
;; -------------------------------------------------------------------------
;; Includes:
;; - aarch64_bitmask_udiv
;; -------------------------------------------------------------------------

;; div optimizations using narrowings
;; we can do the division e.g. shorts by 255 faster by calculating it as
;; (x + ((x + 257) >> 8)) >> 8 assuming the operation is done in
;; double the precision of x.
;;
;; See aarch64-simd.md for bigger explanation.
(define_expand "@aarch64_bitmask_udiv<mode>3"
[(match_operand:SVE_FULL_HSDI 0 "register_operand")
(match_operand:SVE_FULL_HSDI 1 "register_operand")
(match_operand:SVE_FULL_HSDI 2 "immediate_operand")]
"TARGET_SVE2"
{
unsigned HOST_WIDE_INT size
= (1ULL << GET_MODE_UNIT_BITSIZE (<VNARROW>mode)) - 1;
rtx elt = unwrap_const_vec_duplicate (operands[2]);
if (!CONST_INT_P (elt) || UINTVAL (elt) != size)
FAIL;

rtx addend = gen_reg_rtx (<MODE>mode);
rtx tmp1 = gen_reg_rtx (<VNARROW>mode);
rtx tmp2 = gen_reg_rtx (<VNARROW>mode);
rtx val = aarch64_simd_gen_const_vector_dup (<VNARROW>mode, 1);
emit_move_insn (addend, lowpart_subreg (<MODE>mode, val, <VNARROW>mode));
emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp1, operands[1],
addend));
emit_insn (gen_aarch64_sve (UNSPEC_ADDHNB, <MODE>mode, tmp2, operands[1],
lowpart_subreg (<MODE>mode, tmp1,
<VNARROW>mode)));
emit_move_insn (operands[0],
lowpart_subreg (<MODE>mode, tmp2, <VNARROW>mode));
DONE;
})

;; =========================================================================
;; == Permutation
;; =========================================================================
61 changes: 17 additions & 44 deletions gcc/config/aarch64/aarch64.cc
Expand Up @@ -3847,6 +3847,19 @@ aarch64_vectorize_related_mode (machine_mode vector_mode,
return default_vectorize_related_mode (vector_mode, element_mode, nunits);
}

/* Implement TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT. */

static bool
aarch64_vectorize_preferred_div_as_shifts_over_mult (const_tree type)
{
machine_mode mode = TYPE_MODE (type);
unsigned int vec_flags = aarch64_classify_vector_mode (mode);
bool sve_p = (vec_flags & VEC_ANY_SVE);
bool simd_p = (vec_flags & VEC_ADVSIMD);

return (sve_p && TARGET_SVE2) || (simd_p && TARGET_SIMD);
}

/* Implement TARGET_PREFERRED_ELSE_VALUE. For binary operations,
prefer to use the first arithmetic operand as the else value if
the else value doesn't matter, since that exactly matches the SVE
@@ -24361,46 +24374,6 @@ aarch64_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,

return ret;
}

/* Implement TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST. */

bool
aarch64_vectorize_can_special_div_by_constant (enum tree_code code,
tree vectype, wide_int cst,
rtx *output, rtx in0, rtx in1)
{
if (code != TRUNC_DIV_EXPR
|| !TYPE_UNSIGNED (vectype))
return false;

machine_mode mode = TYPE_MODE (vectype);
unsigned int flags = aarch64_classify_vector_mode (mode);
if ((flags & VEC_ANY_SVE) && !TARGET_SVE2)
return false;

int pow = wi::exact_log2 (cst + 1);
auto insn_code = maybe_code_for_aarch64_bitmask_udiv3 (TYPE_MODE (vectype));
/* SVE actually has a div operator, we may have gotten here through
that route. */
if (pow != (int) (element_precision (vectype) / 2)
|| insn_code == CODE_FOR_nothing)
return false;

/* We can use the optimized pattern. */
if (in0 == NULL_RTX && in1 == NULL_RTX)
return true;

gcc_assert (output);

expand_operand ops[3];
create_output_operand (&ops[0], *output, mode);
create_input_operand (&ops[1], in0, mode);
create_fixed_operand (&ops[2], in1);
expand_insn (insn_code, 3, ops);
*output = ops[0].value;
return true;
}

/* Generate a byte permute mask for a register of mode MODE,
which has NUNITS units. */

@@ -27902,13 +27875,13 @@ aarch64_libgcc_floating_mode_supported_p
#undef TARGET_MAX_ANCHOR_OFFSET
#define TARGET_MAX_ANCHOR_OFFSET 4095

#undef TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT
#define TARGET_VECTORIZE_PREFERRED_DIV_AS_SHIFTS_OVER_MULT \
aarch64_vectorize_preferred_div_as_shifts_over_mult

#undef TARGET_VECTOR_ALIGNMENT
#define TARGET_VECTOR_ALIGNMENT aarch64_simd_vector_alignment

#undef TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST
#define TARGET_VECTORIZE_CAN_SPECIAL_DIV_BY_CONST \
aarch64_vectorize_can_special_div_by_constant

#undef TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT
#define TARGET_VECTORIZE_PREFERRED_VECTOR_ALIGNMENT \
aarch64_vectorize_preferred_vector_alignment