Skip to content

Commit

Permalink
Once more, with feeling
Browse files Browse the repository at this point in the history
Signed-off-by: Rich Ercolani <rincebrain@gmail.com>
  • Loading branch information
rincebrain committed Jul 17, 2022
1 parent 3e7c6a6 commit 1630bd4
Show file tree
Hide file tree
Showing 8 changed files with 225 additions and 200 deletions.
18 changes: 18 additions & 0 deletions include/sys/zfs_context.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,23 @@
extern "C" {
#endif

/*
 * Sometimes, it's an extremely bad idea to allow the compiler to vectorize
 * scalar code, either because we want the code to work even if SIMD is
 * broken, because the compiler is known to produce much worse results,
 * or because it's very unsafe to do.
 *
 * Of course, in kernel mode, all platforms have explicit CFLAGS for
 * "no, don't auto-vectorize random segments, not ever"...
 *
 * Clang has to be tested for first: it defines __GNUC__ as well, and it
 * does not implement GCC's optimize() function attribute (the attribute is
 * ignored with a warning). The closest function-level equivalent Clang
 * offers is optnone, which turns off optimization of the annotated function
 * altogether and therefore also keeps it from being vectorized.
 *
 * On any other compiler novector expands to nothing.
 */
#if defined(__clang__)
#define	novector __attribute__((optnone))
#elif defined(__GNUC__)
#define	novector __attribute__((optimize("no-tree-vectorize")))
#else
#define	novector
#endif

/*
* This code compiles in three different contexts. When __KERNEL__ is defined,
* the code uses "unix-like" kernel interfaces. When _STANDALONE is defined, the
Expand Down Expand Up @@ -127,6 +144,7 @@ extern "C" {
*/

/* Function attribute asking the compiler never to inline the function. */
#define noinline __attribute__((noinline))

/*
 * Branch-prediction hints: both evaluate to (x) while telling the compiler
 * which value (1 for likely, 0 for unlikely) the expression is expected
 * to have.
 */
#define likely(x) __builtin_expect((x), 1)
#define unlikely(x) __builtin_expect((x), 0)

Expand Down
16 changes: 0 additions & 16 deletions include/zfs_fletcher.h
Original file line number Diff line number Diff line change
Expand Up @@ -160,20 +160,4 @@ _ZFS_FLETCHER_H const fletcher_4_ops_t fletcher_4_aarch64_neon_ops;
}
#endif

/*
 * ZFS_NO_SANITIZE_UNDEFINED marks a function that UBSan must not instrument.
 *
 * When ZFS_UBSAN_ENABLED is not defined the macro expands to nothing.
 * Otherwise it expands to whichever of no_sanitize_undefined or
 * no_sanitize("undefined") the compiler reports via __has_attribute.  If
 * neither is available -- including the case where the compiler has no
 * __has_attribute at all -- the build stops with an explicit #error instead
 * of leaving the macro undefined.
 */
#if defined(ZFS_UBSAN_ENABLED)
#if defined(__has_attribute)
#if __has_attribute(no_sanitize_undefined)
#define ZFS_NO_SANITIZE_UNDEFINED __attribute__((no_sanitize_undefined))
#elif __has_attribute(no_sanitize)
#define ZFS_NO_SANITIZE_UNDEFINED __attribute__((no_sanitize("undefined")))
#endif /* __has_attribute(no_sanitize_undefined) */
#endif /* defined(__has_attribute) */
#if !defined(ZFS_NO_SANITIZE_UNDEFINED)
/* The directive ends at the newline, so the message needs continuations. */
#error "Compiler has to support attribute " \
	"`no_sanitize_undefined` or `no_sanitize(\"undefined\")` " \
	"when compiling with UBSan enabled"
#endif /* !defined(ZFS_NO_SANITIZE_UNDEFINED) */
#else
#define ZFS_NO_SANITIZE_UNDEFINED
#endif /* defined(ZFS_UBSAN_ENABLED) */

#endif /* _ZFS_FLETCHER_H */
73 changes: 37 additions & 36 deletions module/zcommon/zfs_fletcher.c
Original file line number Diff line number Diff line change
Expand Up @@ -300,37 +300,52 @@ fletcher_2_byteswap(const void *buf, uint64_t size,
(void) fletcher_2_incremental_byteswap((void *) buf, size, zcp);
}

#if defined(__GNUC__) && !defined(__clang__)
__attribute__((optimize("no-tree-vectorize")))
#elif defined(__clang__)
__attribute__((optnone))
#endif
static void
/*
* Below, we forcibly disable vectorization in the compiler, and explicitly
* cast our input from a fletcher_4_ctx_t * to a zio_cksum_t *.
*
* The former is because we would like at least one implementation that we
* can trust to keep working even without any alignment properties or
* if the SIMD infrastructure is on the fritz.
*
* The latter is because fletcher_4_ctx_t, depending on the toolchain at
* compile time, has alignment requirements, and when we call this
* implementation, we often casually cast a zio_cksum_t * into a
* fletcher_4_ctx_t *...
*
* ...but zio_cksum_t * has no such alignment properties.
*
* So it's UB to hand over something that violates that, and the compiler
* is permitted to generate instructions that assume the alignment
* properties are true. Casting over to zio_cksum_t * convinces the
* compiler that no, actually, the thing we're passing around has no
* alignment properties it can rely on.
*
* (This also happens to mean if you did decide to enable vectorization
* on these implementations, it would no longer crash from generating
* alignment-requiring instructions on some systems.)
*
*/

novector static void
fletcher_4_scalar_init(fletcher_4_ctx_t *ctx)
{
	/*
	 * Reach the context through a zio_cksum_t pointer (see the leading
	 * comment re: cast) and reset the checksum to (0, 0, 0, 0).
	 */
	zio_cksum_t *zcp = (zio_cksum_t *)ctx;

	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
}

#if defined(__GNUC__) && !defined(__clang__)
__attribute__((optimize("no-tree-vectorize")))
#elif defined(__clang__)
__attribute__((optnone))
#endif
static void
novector static void
fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
{
	/*
	 * View the context as a zio_cksum_t (see the leading comment
	 * re: cast) and copy exactly one checksum's worth of bytes out.
	 */
	const zio_cksum_t *src = (const zio_cksum_t *)ctx;

	memcpy(zcp, src, sizeof (*src));
}

#if defined(__GNUC__) && !defined(__clang__)
__attribute__((optimize("no-tree-vectorize")))
#elif defined(__clang__)
__attribute__((optnone))
#endif
static void
novector static void
fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf,
uint64_t size)
{
/* See leading comment re: cast. */
zio_cksum_t *zcp = (zio_cksum_t *)ctx;

const uint32_t *ip = buf;
Expand All @@ -352,15 +367,11 @@ fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf,
ZIO_SET_CHECKSUM(zcp, a, b, c, d);
}

#if defined(__GNUC__) && !defined(__clang__)
__attribute__((optimize("no-tree-vectorize")))
#elif defined(__clang__)
__attribute__((optnone))
#endif
static void
novector static void
fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
uint64_t size)
{
/* See leading comment re: cast. */
zio_cksum_t *zcp = (zio_cksum_t *)ctx;

const uint32_t *ip = buf;
Expand Down Expand Up @@ -830,12 +841,7 @@ fletcher_4_fini(void)

/* ABD adapters */

#if defined(__GNUC__) && !defined(__clang__)
__attribute__((optimize("no-tree-vectorize")))
#elif defined(__clang__)
__attribute__((optnone))
#endif
static void
novector static void
abd_fletcher_4_init(zio_abd_checksum_data_t *cdp)
{
const fletcher_4_ops_t *ops = fletcher_4_impl_get();
Expand All @@ -847,12 +853,7 @@ abd_fletcher_4_init(zio_abd_checksum_data_t *cdp)
ops->init_byteswap(cdp->acd_ctx);
}

#if defined(__GNUC__) && !defined(__clang__)
__attribute__((optimize("no-tree-vectorize")))
#elif defined(__clang__)
__attribute__((optnone))
#endif
static void
novector static void
abd_fletcher_4_fini(zio_abd_checksum_data_t *cdp)
{
fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private;
Expand Down
2 changes: 0 additions & 2 deletions module/zcommon/zfs_fletcher_avx512.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,14 +35,12 @@
#define __asm __asm__ __volatile__
#endif

ZFS_NO_SANITIZE_UNDEFINED
static void
fletcher_4_avx512f_init(fletcher_4_ctx_t *ctx)
{
	/* Zero four zfs_fletcher_avx512_t's worth of bytes at ctx->avx512. */
	const size_t nbytes = 4 * sizeof (zfs_fletcher_avx512_t);

	memset(ctx->avx512, 0, nbytes);
}

ZFS_NO_SANITIZE_UNDEFINED
static void
fletcher_4_avx512f_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
{
Expand Down
2 changes: 0 additions & 2 deletions module/zcommon/zfs_fletcher_intel.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,12 @@
#include <sys/simd.h>
#include <zfs_fletcher.h>

ZFS_NO_SANITIZE_UNDEFINED
static void
fletcher_4_avx2_init(fletcher_4_ctx_t *ctx)
{
	/* Zero four zfs_fletcher_avx_t's worth of bytes at ctx->avx. */
	const size_t nbytes = 4 * sizeof (zfs_fletcher_avx_t);

	memset(ctx->avx, 0, nbytes);
}

ZFS_NO_SANITIZE_UNDEFINED
static void
fletcher_4_avx2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
{
Expand Down
2 changes: 0 additions & 2 deletions module/zcommon/zfs_fletcher_sse.c
Original file line number Diff line number Diff line change
Expand Up @@ -49,14 +49,12 @@
#include <sys/byteorder.h>
#include <zfs_fletcher.h>

ZFS_NO_SANITIZE_UNDEFINED
static void
fletcher_4_sse2_init(fletcher_4_ctx_t *ctx)
{
	/* Zero four zfs_fletcher_sse_t's worth of bytes at ctx->sse. */
	const size_t nbytes = 4 * sizeof (zfs_fletcher_sse_t);

	memset(ctx->sse, 0, nbytes);
}

ZFS_NO_SANITIZE_UNDEFINED
static void
fletcher_4_sse2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
{
Expand Down
113 changes: 64 additions & 49 deletions module/zcommon/zfs_fletcher_superscalar.c
Original file line number Diff line number Diff line change
Expand Up @@ -45,49 +45,62 @@
#include <sys/byteorder.h>
#include <sys/spa_checksum.h>
#include <sys/string.h>
#include <sys/zfs_context.h>
#include <zfs_fletcher.h>

ZFS_NO_SANITIZE_UNDEFINED
static void
/*
* See the large block comment in zfs_fletcher.c for an explanation of
* the explicit casts strategically placed below;
* zfs_fletcher_superscalar_t has a similar lack of alignment
* requirement to zio_cksum_t.
*/

/*
 * Reset the superscalar context: zero four zfs_fletcher_superscalar_t's
 * worth of bytes starting at ctx->superscalar.
 *
 * NOTE(review): the same region is cleared twice below, once without and
 * once with an explicit cast; these look like the pre- and post-change
 * forms of a single statement shown together -- confirm that only the
 * cast form is meant to remain.
 */
novector static void
fletcher_4_superscalar_init(fletcher_4_ctx_t *ctx)
{
memset(ctx->superscalar, 0, 4 * sizeof (zfs_fletcher_superscalar_t));
/* Same clear, with the destination explicitly cast to the element type. */
memset((zfs_fletcher_superscalar_t *)ctx->superscalar, 0,
4 * sizeof (zfs_fletcher_superscalar_t));
}

ZFS_NO_SANITIZE_UNDEFINED
static void
/*
 * Combine the v[0] and v[1] halves of the four superscalar accumulators
 * into the final A, B, C, D words and store them in *zcp.
 *
 * ctx: context whose superscalar[0..3] entries hold the accumulators.
 * zcp: destination checksum, written via ZIO_SET_CHECKSUM().
 *
 * NOTE(review): A..D are computed twice below with the same formulas, first
 * through ctx->superscalar and then through ss_ctx (which points at the same
 * array); these look like the pre- and post-change forms shown together --
 * confirm that only the ss_ctx form is meant to remain.
 */
novector static void
fletcher_4_superscalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp)
{
/* Alias for ctx->superscalar with an explicit element-type cast. */
zfs_fletcher_superscalar_t *ss_ctx =
(zfs_fletcher_superscalar_t *)ctx->superscalar;
uint64_t A, B, C, D;
A = ctx->superscalar[0].v[0] + ctx->superscalar[0].v[1];
B = 2 * ctx->superscalar[1].v[0] + 2 * ctx->superscalar[1].v[1] -
ctx->superscalar[0].v[1];
C = 4 * ctx->superscalar[2].v[0] - ctx->superscalar[1].v[0] +
4 * ctx->superscalar[2].v[1] - 3 * ctx->superscalar[1].v[1];
D = 8 * ctx->superscalar[3].v[0] - 4 * ctx->superscalar[2].v[0] +
8 * ctx->superscalar[3].v[1] - 8 * ctx->superscalar[2].v[1] +
ctx->superscalar[1].v[1];

/*
 * Weighted merge of the two halves; the multipliers presumably account
 * for each half having accumulated every other input word -- confirm
 * against the native/byteswap loops.
 */
A = ss_ctx[0].v[0] + ss_ctx[0].v[1];
B = 2 * ss_ctx[1].v[0] + 2 * ss_ctx[1].v[1] -
ss_ctx[0].v[1];
C = 4 * ss_ctx[2].v[0] - ss_ctx[1].v[0] +
4 * ss_ctx[2].v[1] - 3 * ss_ctx[1].v[1];
D = 8 * ss_ctx[3].v[0] - 4 * ss_ctx[2].v[0] +
8 * ss_ctx[3].v[1] - 8 * ss_ctx[2].v[1] +
ss_ctx[1].v[1];

/* Publish the merged result to the caller's checksum. */
ZIO_SET_CHECKSUM(zcp, A, B, C, D);
}

ZFS_NO_SANITIZE_UNDEFINED
static void
novector static void
fletcher_4_superscalar_native(fletcher_4_ctx_t *ctx,
const void *buf, uint64_t size)
{
zfs_fletcher_superscalar_t *ss_ctx =
(zfs_fletcher_superscalar_t *)ctx->superscalar;

const uint32_t *ip = buf;
const uint32_t *ipend = ip + (size / sizeof (uint32_t));
uint64_t a, b, c, d;
uint64_t a2, b2, c2, d2;

a = ctx->superscalar[0].v[0];
b = ctx->superscalar[1].v[0];
c = ctx->superscalar[2].v[0];
d = ctx->superscalar[3].v[0];
a2 = ctx->superscalar[0].v[1];
b2 = ctx->superscalar[1].v[1];
c2 = ctx->superscalar[2].v[1];
d2 = ctx->superscalar[3].v[1];
a = ss_ctx[0].v[0];
b = ss_ctx[1].v[0];
c = ss_ctx[2].v[0];
d = ss_ctx[3].v[0];
a2 = ss_ctx[0].v[1];
b2 = ss_ctx[1].v[1];
c2 = ss_ctx[2].v[1];
d2 = ss_ctx[3].v[1];

for (; ip < ipend; ip += 2) {
a += ip[0];
Expand All @@ -100,34 +113,36 @@ fletcher_4_superscalar_native(fletcher_4_ctx_t *ctx,
d2 += c2;
}

ctx->superscalar[0].v[0] = a;
ctx->superscalar[1].v[0] = b;
ctx->superscalar[2].v[0] = c;
ctx->superscalar[3].v[0] = d;
ctx->superscalar[0].v[1] = a2;
ctx->superscalar[1].v[1] = b2;
ctx->superscalar[2].v[1] = c2;
ctx->superscalar[3].v[1] = d2;
ss_ctx[0].v[0] = a;
ss_ctx[1].v[0] = b;
ss_ctx[2].v[0] = c;
ss_ctx[3].v[0] = d;
ss_ctx[0].v[1] = a2;
ss_ctx[1].v[1] = b2;
ss_ctx[2].v[1] = c2;
ss_ctx[3].v[1] = d2;
}

ZFS_NO_SANITIZE_UNDEFINED
static void
novector static void
fletcher_4_superscalar_byteswap(fletcher_4_ctx_t *ctx,
const void *buf, uint64_t size)
{
zfs_fletcher_superscalar_t *ss_ctx =
(zfs_fletcher_superscalar_t *)ctx->superscalar;

const uint32_t *ip = buf;
const uint32_t *ipend = ip + (size / sizeof (uint32_t));
uint64_t a, b, c, d;
uint64_t a2, b2, c2, d2;

a = ctx->superscalar[0].v[0];
b = ctx->superscalar[1].v[0];
c = ctx->superscalar[2].v[0];
d = ctx->superscalar[3].v[0];
a2 = ctx->superscalar[0].v[1];
b2 = ctx->superscalar[1].v[1];
c2 = ctx->superscalar[2].v[1];
d2 = ctx->superscalar[3].v[1];
a = ss_ctx[0].v[0];
b = ss_ctx[1].v[0];
c = ss_ctx[2].v[0];
d = ss_ctx[3].v[0];
a2 = ss_ctx[0].v[1];
b2 = ss_ctx[1].v[1];
c2 = ss_ctx[2].v[1];
d2 = ss_ctx[3].v[1];

for (; ip < ipend; ip += 2) {
a += BSWAP_32(ip[0]);
Expand All @@ -140,14 +155,14 @@ fletcher_4_superscalar_byteswap(fletcher_4_ctx_t *ctx,
d2 += c2;
}

ctx->superscalar[0].v[0] = a;
ctx->superscalar[1].v[0] = b;
ctx->superscalar[2].v[0] = c;
ctx->superscalar[3].v[0] = d;
ctx->superscalar[0].v[1] = a2;
ctx->superscalar[1].v[1] = b2;
ctx->superscalar[2].v[1] = c2;
ctx->superscalar[3].v[1] = d2;
ss_ctx[0].v[0] = a;
ss_ctx[1].v[0] = b;
ss_ctx[2].v[0] = c;
ss_ctx[3].v[0] = d;
ss_ctx[0].v[1] = a2;
ss_ctx[1].v[1] = b2;
ss_ctx[2].v[1] = c2;
ss_ctx[3].v[1] = d2;
}

static boolean_t fletcher_4_superscalar_valid(void)
Expand Down
Loading

0 comments on commit 1630bd4

Please sign in to comment.