From ba89e8c1002f761ad09e3b12bdba7a33d2b6de42 Mon Sep 17 00:00:00 2001 From: han kuan chen Date: Mon, 18 Mar 2019 00:29:03 -0700 Subject: [PATCH 1/2] improve memcpy performance --- newlib/libc/machine/riscv/memcpy.c | 83 +--- newlib/libc/machine/riscv/memcpy32_speed.c | 288 +++++++++++++ newlib/libc/machine/riscv/memcpy64_speed.c | 469 +++++++++++++++++++++ newlib/libc/machine/riscv/memcpy_size.c | 26 ++ 4 files changed, 793 insertions(+), 73 deletions(-) create mode 100644 newlib/libc/machine/riscv/memcpy32_speed.c create mode 100644 newlib/libc/machine/riscv/memcpy64_speed.c create mode 100644 newlib/libc/machine/riscv/memcpy_size.c diff --git a/newlib/libc/machine/riscv/memcpy.c b/newlib/libc/machine/riscv/memcpy.c index a0ab78a0a0..2083a31323 100644 --- a/newlib/libc/machine/riscv/memcpy.c +++ b/newlib/libc/machine/riscv/memcpy.c @@ -1,4 +1,4 @@ -/* Copyright (c) 2017 SiFive Inc. All rights reserved. +/* Copyright (c) 2019 SiFive Inc. All rights reserved. This copyrighted material is made available to anyone wishing to use, modify, copy, or redistribute it subject to the terms and conditions @@ -9,75 +9,12 @@ http://www.opensource.org/licenses. 
*/ -#include -#include -#include "../../string/local.h" - -#define unlikely(X) __builtin_expect (!!(X), 0) - -void * -__inhibit_loop_to_libcall -memcpy(void *__restrict aa, const void *__restrict bb, size_t n) -{ - #define BODY(a, b, t) { \ - t tt = *b; \ - a++, b++; \ - *(a - 1) = tt; \ - } - - char *a = (char *)aa; - const char *b = (const char *)bb; - char *end = a + n; - uintptr_t msk = sizeof (long) - 1; - if (unlikely ((((uintptr_t)a & msk) != ((uintptr_t)b & msk)) - || n < sizeof (long))) - { -small: - if (__builtin_expect (a < end, 1)) - while (a < end) - BODY (a, b, char); - return aa; - } - - if (unlikely (((uintptr_t)a & msk) != 0)) - while ((uintptr_t)a & msk) - BODY (a, b, char); - - long *la = (long *)a; - const long *lb = (const long *)b; - long *lend = (long *)((uintptr_t)end & ~msk); - - if (unlikely (la < (lend - 8))) - { - while (la < (lend - 8)) - { - long b0 = *lb++; - long b1 = *lb++; - long b2 = *lb++; - long b3 = *lb++; - long b4 = *lb++; - long b5 = *lb++; - long b6 = *lb++; - long b7 = *lb++; - long b8 = *lb++; - *la++ = b0; - *la++ = b1; - *la++ = b2; - *la++ = b3; - *la++ = b4; - *la++ = b5; - *la++ = b6; - *la++ = b7; - *la++ = b8; - } - } - - while (la < lend) - BODY (la, lb, long); - - a = (char *)la; - b = (const char *)lb; - if (unlikely (a < end)) - goto small; - return aa; -} +#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__) +#include "memcpy_size.c" +#else +#if __riscv_xlen == 64 +#include "memcpy64_speed.c" +#else +#include "memcpy32_speed.c" +#endif +#endif diff --git a/newlib/libc/machine/riscv/memcpy32_speed.c b/newlib/libc/machine/riscv/memcpy32_speed.c new file mode 100644 index 0000000000..ec14248611 --- /dev/null +++ b/newlib/libc/machine/riscv/memcpy32_speed.c @@ -0,0 +1,288 @@ +/* Copyright (c) 2019 SiFive Inc. All rights reserved. + + This copyrighted material is made available to anyone wishing to use, + modify, copy, or redistribute it subject to the terms and conditions + of the FreeBSD License. 
This program is distributed in the hope that + it will be useful, but WITHOUT ANY WARRANTY expressed or implied, + including the implied warranties of MERCHANTABILITY or FITNESS FOR + A PARTICULAR PURPOSE. A copy of this license is available at + http://www.opensource.org/licenses. +*/ + +#include +#include +#include "../../string/local.h" + +#define ALIGNOF(addr) (((uintptr_t)(addr)) & (sizeof(void *) - 1)) + +static void memcpy_4_0(uint8_t *des, const uint8_t *src) +{ + ((uint32_t *)des)[0] = ((const uint32_t *)src)[0]; +} + +static void memcpy_4_1(uint8_t *des, const uint8_t *src) +{ + uint32_t temp2 = ((const uint32_t *)(src - 1))[0]; + uint32_t temp3 = src[3]; + temp2 >>= 8; + temp3 <<= 24; + temp2 |= temp3; + ((uint32_t *)des)[0] = temp2; +} + +static void memcpy_4_2(uint8_t *des, const uint8_t *src) +{ +#if 1 + uint16_t temp2 = ((const uint16_t *)src)[0]; + uint16_t temp3 = ((const uint16_t *)src)[1]; + ((uint16_t *)des)[0] = temp2; + ((uint16_t *)des)[1] = temp3; +#else + uint32_t temp2 = ((const uint16_t *)src)[0]; + uint32_t temp3 = ((const uint16_t *)src)[1]; + temp3 <<= 16; + temp2 |= temp3; + ((uint32_t *)des)[0] = temp2; +#endif +} + +static void memcpy_4_3(uint8_t *des, const uint8_t *src) +{ + uint32_t temp2 = src[0]; + uint32_t temp3 = ((const uint32_t *)(src + 1))[0]; + temp3 <<= 8; + temp2 |= temp3; + ((uint32_t *)des)[0] = temp2; +} + +static void memcpy_8_0(uint8_t *des, const uint8_t *src) +{ + uint32_t temp2 = ((const uint32_t *)src)[0]; + uint32_t temp3 = ((const uint32_t *)src)[1]; + ((uint32_t *)des)[0] = temp2; + ((uint32_t *)des)[1] = temp3; +} + +static void memcpy_8_1(uint8_t *des, const uint8_t *src) +{ + uint32_t temp2 = ((const uint32_t *)(src - 1))[0]; + uint32_t temp3 = ((const uint32_t *)(src + 3))[0]; + temp2 >>= 8; + uint32_t temp4 = temp3 << 24; + uint32_t temp5 = src[7]; + temp2 |= temp4; + temp3 >>= 8; + temp5 <<= 24; + ((uint32_t *)des)[0] = temp2; + temp3 |= temp5; + ((uint32_t *)des)[1] = temp3; +} + +static void 
memcpy_8_2(uint8_t *des, const uint8_t *src) +{ + uint16_t temp2 = ((const uint16_t *)src)[0]; + uint16_t temp3 = ((const uint16_t *)src)[1]; + uint16_t temp4 = ((const uint16_t *)src)[2]; + uint16_t temp5 = ((const uint16_t *)src)[3]; + ((uint16_t *)des)[0] = temp2; + ((uint16_t *)des)[1] = temp3; + ((uint16_t *)des)[2] = temp4; + ((uint16_t *)des)[3] = temp5; +} + +static void memcpy_8_3(uint8_t *des, const uint8_t *src) +{ + uint32_t temp2 = src[0]; + uint32_t temp3 = ((const uint32_t *)(src + 1))[0]; + uint32_t temp4 = ((const uint32_t *)(src + 5))[0]; + uint32_t temp5 = temp3 << 8; + temp4 <<= 8; + temp2 |= temp5; + temp3 >>= 24; + ((uint32_t *)des)[0] = temp2; + temp3 |= temp4; + ((uint32_t *)des)[1] = temp3; +} + +static void memcpy_16_0(uint8_t *des, const uint8_t *src) +{ + uint32_t temp2 = ((const uint32_t *)src)[0]; + uint32_t temp3 = ((const uint32_t *)src)[1]; + uint32_t temp4 = ((const uint32_t *)src)[2]; + uint32_t temp5 = ((const uint32_t *)src)[3]; + ((uint32_t *)des)[0] = temp2; + ((uint32_t *)des)[1] = temp3; + ((uint32_t *)des)[2] = temp4; + ((uint32_t *)des)[3] = temp5; +} + +static void memcpy_16_1(uint8_t *des, const uint8_t *src) +{ + uint32_t temp2 = ((const uint32_t *)(src - 1))[0]; + uint32_t temp3 = ((const uint32_t *)(src + 3))[0]; + temp2 >>= 8; + uint32_t temp4 = temp3 << 24; + uint32_t temp5 = ((const uint32_t *)(src + 7))[0]; + temp2 |= temp4; + temp3 >>= 8; + temp4 = temp5 << 24; + ((uint32_t *)des)[0] = temp2; + temp3 |= temp4; + temp2 = ((const uint32_t *)(src + 11))[0]; + temp5 >>= 8; + temp4 = temp2 << 24; + ((uint32_t *)des)[1] = temp3; + temp5 |= temp4; + temp3 = src[15]; + temp2 >>= 8; + temp3 <<= 24; + ((uint32_t *)des)[2] = temp5; + temp2 |= temp3; + ((uint32_t *)des)[3] = temp2; +} + +static void memcpy_16_2(uint8_t *des, const uint8_t *src) +{ + uint16_t temp2 = ((const uint16_t *)src)[0]; + uint16_t temp3 = ((const uint16_t *)src)[1]; + uint16_t temp4 = ((const uint16_t *)src)[2]; + uint16_t temp5 = ((const uint16_t 
*)src)[3]; + ((uint16_t *)des)[0] = temp2; + ((uint16_t *)des)[1] = temp3; + ((uint16_t *)des)[2] = temp4; + ((uint16_t *)des)[3] = temp5; + temp2 = ((const uint16_t *)src)[4]; + temp3 = ((const uint16_t *)src)[5]; + temp4 = ((const uint16_t *)src)[6]; + temp5 = ((const uint16_t *)src)[7]; + ((uint16_t *)des)[4] = temp2; + ((uint16_t *)des)[5] = temp3; + ((uint16_t *)des)[6] = temp4; + ((uint16_t *)des)[7] = temp5; +} + +static void memcpy_16_3(uint8_t *des, const uint8_t *src) +{ + uint32_t temp2 = src[0]; + uint32_t temp3 = ((const uint32_t *)(src + 1))[0]; + uint32_t temp4 = temp3 << 8; + uint32_t temp5 = ((const uint32_t *)(src + 5))[0]; + temp2 |= temp4; + temp3 >>= 24; + temp4 = temp5 << 8; + ((uint32_t *)des)[0] = temp2; + temp3 |= temp4; + temp2 = ((const uint32_t *)(src + 9))[0]; + temp5 >>= 24; + temp4 = temp2 << 8; + ((uint32_t *)des)[1] = temp3; + temp5 |= temp4; + temp3 = ((const uint32_t *)(src + 13))[0]; + temp2 >>= 24; + temp3 <<= 8; + ((uint32_t *)des)[2] = temp5; + temp2 |= temp3; + ((uint32_t *)des)[3] = temp2; +} + +#define MEMCPY_ALIGNMENT(align_num) \ + { \ + while (src != src_end) \ + { \ + memcpy_16_##align_num(des, src); \ + src += 16; \ + des += 16; \ + } \ + switch (residual) \ + { \ + case 15: \ + des[14] = src[14]; \ + case 14: \ + des[13] = src[13]; \ + case 13: \ + des[12] = src[12]; \ + case 12: \ + memcpy_8_##align_num(des, src); \ + memcpy_4_##align_num(des + 8, src + 8); \ + break; \ + case 11: \ + des[10] = src[10]; \ + case 10: \ + des[9] = src[9]; \ + case 9: \ + des[8] = src[8]; \ + case 8: \ + memcpy_8_##align_num(des, src); \ + break; \ + case 7: \ + des[6] = src[6]; \ + case 6: \ + des[5] = src[5]; \ + case 5: \ + des[4] = src[4]; \ + case 4: \ + memcpy_4_##align_num(des, src); \ + break; \ + case 3: \ + des[2] = src[2]; \ + case 2: \ + des[1] = src[1]; \ + case 1: \ + des[0] = src[0]; \ + break; \ + } \ + } + +void * + __inhibit_loop_to_libcall + memcpy(void *restrict d, const void *restrict s, size_t count) +{ + uint8_t 
*des = (uint8_t *)d; + const uint8_t *src = (const uint8_t *)s; + if (3 < count) + { + uintptr_t alignof_des = (4 - ALIGNOF(des)); + switch (alignof_des) + { + case 3: + des[2] = src[2]; + case 2: + des[1] = src[1]; + case 1: + des[0] = src[0]; + count -= alignof_des; + des += alignof_des; + src += alignof_des; + break; + } + size_t residual = count & 15; + count &= 0xFFFFFFF0; + const uint8_t *src_end = src + count; + switch (ALIGNOF(src)) + { + case 0: + MEMCPY_ALIGNMENT(0); + break; + case 1: + MEMCPY_ALIGNMENT(1); + break; + case 2: + MEMCPY_ALIGNMENT(2); + break; + case 3: + MEMCPY_ALIGNMENT(3); + break; + } + } + else + switch (count) + { + case 3: + des[2] = src[2]; + case 2: + des[1] = src[1]; + case 1: + des[0] = src[0]; + } + return d; +} diff --git a/newlib/libc/machine/riscv/memcpy64_speed.c b/newlib/libc/machine/riscv/memcpy64_speed.c new file mode 100644 index 0000000000..e4b63db7d1 --- /dev/null +++ b/newlib/libc/machine/riscv/memcpy64_speed.c @@ -0,0 +1,469 @@ +/* Copyright (c) 2019 SiFive Inc. All rights reserved. + + This copyrighted material is made available to anyone wishing to use, + modify, copy, or redistribute it subject to the terms and conditions + of the FreeBSD License. This program is distributed in the hope that + it will be useful, but WITHOUT ANY WARRANTY expressed or implied, + including the implied warranties of MERCHANTABILITY or FITNESS FOR + A PARTICULAR PURPOSE. A copy of this license is available at + http://www.opensource.org/licenses. 
+*/ + +#include +#include +#include "../../string/local.h" + +#define ALIGNOF(addr) (((uintptr_t)(addr)) & (sizeof(void *) - 1)) + +static void memcpy_4_0(uint8_t *des, const uint8_t *src) +{ + ((uint32_t *)des)[0] = ((const uint32_t *)src)[0]; +} + +#define memcpy_4_4 memcpy_4_0 + +static void memcpy_4_1(uint8_t *des, const uint8_t *src) +{ + uint32_t temp2 = ((const uint32_t *)(src - 1))[0]; + uint32_t temp3 = src[3]; + temp2 >>= 8; + temp3 <<= 24; + temp2 |= temp3; + ((uint32_t *)des)[0] = temp2; +} + +#define memcpy_4_5 memcpy_4_1 + +static void memcpy_4_2(uint8_t *des, const uint8_t *src) +{ +#if 1 + uint32_t temp2 = ((const uint16_t *)src)[0]; + uint32_t temp3 = ((const uint16_t *)src)[1]; + temp3 <<= 16; + temp2 |= temp3; + ((uint32_t *)des)[0] = temp2; +#else + uint16_t temp2 = ((const uint16_t *)src)[0]; + uint16_t temp3 = ((const uint16_t *)src)[1]; + ((uint16_t *)des)[0] = temp2; + ((uint16_t *)des)[1] = temp3; +#endif +} + +#define memcpy_4_6 memcpy_4_2 + +static void memcpy_4_3(uint8_t *des, const uint8_t *src) +{ + uint32_t temp2 = src[0]; + uint32_t temp3 = ((const uint32_t *)(src + 1))[0]; + temp3 <<= 8; + temp2 |= temp3; + ((uint32_t *)des)[0] = temp2; +} + +#define memcpy_4_7 memcpy_4_3 + +static void memcpy_8_0(uint8_t *des, const uint8_t *src) +{ + ((uint64_t *)des)[0] = ((const uint64_t *)src)[0]; +} + +static void memcpy_8_1(uint8_t *des, const uint8_t *src) +{ + uint64_t temp2 = ((const uint64_t *)(src - 1))[0]; + uint64_t temp3 = src[7]; + temp2 >>= 8; + temp3 <<= 56; + temp2 |= temp3; + ((uint64_t *)des)[0] = temp2; +} + +static void memcpy_8_2(uint8_t *des, const uint8_t *src) +{ +#if 1 + uint64_t temp2 = ((const uint64_t *)(src - 2))[0]; + uint64_t temp3 = ((const uint16_t *)src)[3]; + temp2 >>= 16; + temp3 <<= 48; + temp2 |= temp3; + ((uint64_t *)des)[0] = temp2; +#else + uint16_t temp2 = ((const uint16_t *)src)[0]; + uint16_t temp3 = ((const uint16_t *)src)[1]; + uint16_t temp4 = ((const uint16_t *)src)[2]; + uint16_t temp5 = ((const 
uint16_t *)src)[3]; + ((uint16_t *)des)[0] = temp2; + ((uint16_t *)des)[1] = temp3; + ((uint16_t *)des)[2] = temp4; + ((uint16_t *)des)[3] = temp5; +#endif +} + +static void memcpy_8_3(uint8_t *des, const uint8_t *src) +{ +#if 1 + uint64_t temp2 = ((const uint64_t *)(src - 3))[0]; + uint64_t temp4 = ((const uint32_t *)(src + 5))[0]; + temp2 >>= 24; + temp4 <<= 40; + temp2 |= temp4; + ((uint64_t *)des)[0] = temp2; +#else + uint64_t temp2 = src[0]; + uint64_t temp3 = ((const uint32_t *)(src + 1))[0]; + uint64_t temp4 = ((const uint32_t *)(src + 5))[0]; + temp3 <<= 8; + temp4 <<= 40; + temp2 |= temp3; + temp2 |= temp4; + ((uint64_t *)des)[0] = temp2; +#endif +} + +static void memcpy_8_4(uint8_t *des, const uint8_t *src) +{ +#if 1 + uint64_t temp2 = ((const uint32_t *)src)[0]; + uint64_t temp3 = ((const uint32_t *)(src + 4))[0]; + temp3 <<= 32; + temp2 |= temp3; + ((uint64_t *)des)[0] = temp2; +#else + uint32_t temp2 = ((const uint32_t *)src)[0]; + uint32_t temp3 = ((const uint32_t *)(src + 4))[0]; + ((uint32_t *)des)[0] = temp2; + ((uint32_t *)des)[1] = temp3; +#endif +} + +static void memcpy_8_5(uint8_t *des, const uint8_t *src) +{ +#if 1 + uint64_t temp2 = ((const uint32_t *)(src - 1))[0]; + uint64_t temp3 = ((const uint64_t *)(src + 3))[0]; + temp2 >>= 8; + temp3 <<= 24; + temp2 |= temp3; + ((uint64_t *)des)[0] = temp2; +#else + uint64_t temp2 = ((const uint32_t *)(src - 1))[0]; + uint64_t temp3 = ((const uint32_t *)(src + 3))[0]; + uint64_t temp4 = (src + 7)[0]; + temp2 >>= 8; + temp3 <<= 24; + temp4 <<= 56; + temp2 |= temp3; + temp2 |= temp4; + ((uint64_t *)des)[0] = temp2; +#endif +} + +static void memcpy_8_6(uint8_t *des, const uint8_t *src) +{ + uint64_t temp2 = ((const uint16_t *)src)[0]; + uint64_t temp3 = ((const uint64_t *)(src + 2))[0]; + temp3 <<= 16; + temp2 |= temp3; + ((uint64_t *)des)[0] = temp2; +} + +static void memcpy_8_7(uint8_t *des, const uint8_t *src) +{ + uint64_t temp2 = src[0]; + uint64_t temp3 = ((const uint64_t *)(src + 1))[0]; + temp3 
<<= 8;
+  temp2 |= temp3;
+  ((uint64_t *)des)[0] = temp2;
+}
+
+static void memcpy_16_0(uint8_t *des, const uint8_t *src)
+{
+  uint64_t temp2 = ((const uint64_t *)src)[0];
+  uint64_t temp3 = ((const uint64_t *)src)[1];
+  ((uint64_t *)des)[0] = temp2;
+  ((uint64_t *)des)[1] = temp3;
+}
+
+static void memcpy_16_1(uint8_t *des, const uint8_t *src)
+{
+  uint64_t temp2 = ((const uint64_t *)(src - 1))[0];
+  uint64_t temp3 = ((const uint64_t *)(src + 7))[0];
+  temp2 >>= 8;
+  uint64_t temp4 = temp3 << 56;
+  uint64_t temp5 = src[15];
+  temp2 |= temp4;
+  temp3 >>= 8;
+  temp5 <<= 56;
+  ((uint64_t *)des)[0] = temp2;
+  temp3 |= temp5;
+  ((uint64_t *)des)[1] = temp3;
+}
+
+static void memcpy_16_2(uint8_t *des, const uint8_t *src)
+{
+  uint64_t temp2 = ((const uint64_t *)(src - 2))[0];
+  uint64_t temp3 = ((const uint64_t *)(src + 6))[0];
+  temp2 >>= 16;
+  uint64_t temp4 = temp3 << 48;
+  uint64_t temp5 = ((const uint16_t *)(src + 14))[0];
+  temp2 |= temp4;
+  temp3 >>= 16;
+  temp5 <<= 48;
+  ((uint64_t *)des)[0] = temp2;
+  temp3 |= temp5;
+  ((uint64_t *)des)[1] = temp3;
+}
+
+static void memcpy_16_3(uint8_t *des, const uint8_t *src)
+{
+  uint64_t temp2 = ((const uint64_t *)(src - 3))[0];
+  uint64_t temp3 = ((const uint64_t *)(src + 5))[0];
+  temp2 >>= 24;
+  uint64_t temp4 = temp3 << 40;
+  uint64_t temp5 = ((const uint32_t *)(src + 13))[0];
+  temp2 |= temp4;
+  temp3 >>= 24;
+  temp5 <<= 40;
+  ((uint64_t *)des)[0] = temp2;
+  temp3 |= temp5;
+  ((uint64_t *)des)[1] = temp3;
+}
+
+static void memcpy_16_4(uint8_t *des, const uint8_t *src)
+{
+  uint64_t temp2 = ((const uint32_t *)src)[0];
+  uint64_t temp3 = ((const uint32_t *)(src + 4))[0];
+  uint64_t temp4 = temp3 << 32;
+  uint64_t temp5 = ((const uint32_t *)(src + 12))[0];
+  temp2 |= temp4; /* first output word: src[0..3] low half, src[4..7] high half (little-endian layout) */
+  temp3 = ((const uint32_t *)(src + 8))[0]; /* fetch src[8..11]: temp3 only ever held a 32-bit value, so the former "temp3 >>= 32" always yielded 0 and these four bytes were written as zero */
+  temp5 <<= 32; /* src[12..15] go in the high half of the second output word */
+  ((uint64_t *)des)[0] = temp2;
+  temp3 |= temp5;
+  ((uint64_t *)des)[1] = temp3;
+}
+
+static void memcpy_16_5(uint8_t *des, const uint8_t *src)
+{
+  uint64_t temp2 = ((const uint32_t *)(src - 1))[0];
+  uint64_t temp3 =
((const uint64_t *)(src + 3))[0]; + temp2 >>= 8; + uint64_t temp4 = temp3 << 24; + uint64_t temp5 = ((const uint64_t *)(src + 11))[0]; + temp2 |= temp4; + temp3 >>= 40; + temp5 <<= 24; + ((uint64_t *)des)[0] = temp2; + temp3 |= temp5; + ((uint64_t *)des)[1] = temp3; +} + +static void memcpy_16_6(uint8_t *des, const uint8_t *src) +{ + uint64_t temp2 = ((const uint16_t *)src)[0]; + uint64_t temp3 = ((const uint64_t *)(src + 2))[0]; + uint64_t temp4 = ((const uint64_t *)(src + 10))[0]; + uint64_t temp5 = temp3 << 16; + temp4 <<= 16; + temp2 |= temp5; + temp3 >>= 48; + ((uint64_t *)des)[0] = temp2; + temp3 |= temp4; + ((uint64_t *)des)[1] = temp3; +} + +static void memcpy_16_7(uint8_t *des, const uint8_t *src) +{ + uint64_t temp2 = src[0]; + uint64_t temp3 = ((const uint64_t *)(src + 1))[0]; + uint64_t temp4 = ((const uint64_t *)(src + 9))[0]; + uint64_t temp5 = temp3 << 8; + temp4 <<= 8; + temp2 |= temp5; + temp3 >>= 56; + ((uint64_t *)des)[0] = temp2; + temp3 |= temp4; + ((uint64_t *)des)[1] = temp3; +} + +#define I16_SET_I8(des, src) \ + { \ + uint16_t temp0 = (src)[0]; \ + uint16_t temp1 = (src)[1]; \ + temp1 <<= 8; \ + temp0 |= temp1; \ + (des) = temp0; \ + } + +#define I32_SET_I8(des, src) \ + { \ + uint32_t temp0 = (src)[0]; \ + uint32_t temp1 = (src)[1]; \ + uint32_t temp2 = (src)[2]; \ + uint32_t temp3 = (src)[3]; \ + temp1 <<= 8; \ + temp2 <<= 16; \ + temp3 <<= 24; \ + temp0 |= temp1; \ + temp2 |= temp3; \ + temp0 |= temp2; \ + (des) = temp0; \ + } + +#define MEMCPY_ALIGNMENT(align_num) \ + { \ + while (src != src_end) \ + { \ + memcpy_16_##align_num(des, src); \ + src += 16; \ + des += 16; \ + } \ + switch (residual) \ + { \ + case 15: \ + des[14] = src[14]; \ + case 14: \ + des[13] = src[13]; \ + case 13: \ + des[12] = src[12]; \ + case 12: \ + memcpy_8_##align_num(des, src); \ + memcpy_4_##align_num(des + 8, src + 8); \ + break; \ + case 11: \ + des[10] = src[10]; \ + case 10: \ + des[9] = src[9]; \ + case 9: \ + des[8] = src[8]; \ + case 8: \ + 
memcpy_8_##align_num(des, src); \
+        break; \
+      case 7: \
+        des[6] = src[6]; \
+      case 6: \
+        des[5] = src[5]; \
+      case 5: \
+        des[4] = src[4]; \
+      case 4: \
+        memcpy_4_##align_num(des, src); \
+        break; \
+      case 3: \
+        des[2] = src[2]; \
+      case 2: \
+        des[1] = src[1]; \
+      case 1: \
+        des[0] = src[0]; \
+        break; \
+      } \
+  }
+
+void *
+  __inhibit_loop_to_libcall
+  memcpy(void *restrict d, const void *restrict s, size_t count)
+{
+  uint8_t *des = (uint8_t *)d;
+  const uint8_t *src = (const uint8_t *)s;
+  if (7 < count) /* long path; copies of 7 bytes or fewer take the byte-wise switch in the else branch */
+  {
+    switch (ALIGNOF(des)) /* copy the first 8 - ALIGNOF(des) bytes so that des is aligned for the wide stores that follow */
+    {
+    case 1:
+      des[0] = src[0];
+      I16_SET_I8(((uint16_t *)(des + 1))[0], src + 1);
+      I32_SET_I8(((uint32_t *)(des + 3))[0], src + 3);
+      count -= 7;
+      des += 7;
+      src += 7;
+      break;
+    case 2:
+      I16_SET_I8(((uint16_t *)des)[0], src);
+      I32_SET_I8(((uint32_t *)(des + 2))[0], src + 2);
+      count -= 6;
+      des += 6;
+      src += 6;
+      break;
+    case 3:
+      des[0] = src[0];
+      I32_SET_I8(((uint32_t *)(des + 1))[0], src + 1);
+      count -= 5;
+      des += 5;
+      src += 5;
+      break;
+    case 4:
+      I32_SET_I8(((uint32_t *)des)[0], src);
+      count -= 4;
+      des += 4;
+      src += 4;
+      break;
+    case 5:
+      des[0] = src[0];
+      I16_SET_I8(((uint16_t *)(des + 1))[0], src + 1);
+      count -= 3;
+      des += 3;
+      src += 3;
+      break;
+    case 6:
+      I16_SET_I8(((uint16_t *)des)[0], src);
+      count -= 2;
+      des += 2;
+      src += 2;
+      break;
+    case 7:
+      des[0] = src[0];
+      --count;
+      ++des;
+      ++src;
+      break;
+    }
+    size_t residual = count & 15; /* tail bytes left over once the 16-byte loop has finished */
+    count &= ~(size_t)15; /* round down to a multiple of 16 at full size_t width; the former 0xFFFFFFF0 is an unsigned int constant that zero-extends and wiped bits 32..63 of count, truncating copies of 4 GiB or more */
+    const uint8_t *src_end = src + count;
+    switch (ALIGNOF(src))
+    {
+    case 0:
+      MEMCPY_ALIGNMENT(0);
+      break;
+    case 1:
+      MEMCPY_ALIGNMENT(1);
+      break;
+    case 2:
+      MEMCPY_ALIGNMENT(2);
+      break;
+    case 3:
+      MEMCPY_ALIGNMENT(3);
+      break;
+    case 4:
+      MEMCPY_ALIGNMENT(4);
+      break;
+    case 5:
+      MEMCPY_ALIGNMENT(5);
+      break;
+    case 6:
+      MEMCPY_ALIGNMENT(6);
+      break;
+    case 7:
+      MEMCPY_ALIGNMENT(7);
+      break;
+    }
+  }
+  else /* short copy: one byte per case label, each case deliberately falling through to the next */
+    switch (count)
+    {
+    case 7:
+      des[6] = src[6];
+    case 6:
+      des[5] = src[5];
+    case 5:
+      des[4] = src[4];
+    case 4:
+
des[3] = src[3]; + case 3: + des[2] = src[2]; + case 2: + des[1] = src[1]; + case 1: + des[0] = src[0]; + } + return d; +} diff --git a/newlib/libc/machine/riscv/memcpy_size.c b/newlib/libc/machine/riscv/memcpy_size.c new file mode 100644 index 0000000000..8c90bdb6dd --- /dev/null +++ b/newlib/libc/machine/riscv/memcpy_size.c @@ -0,0 +1,26 @@ +/* Copyright (c) 2019 SiFive Inc. All rights reserved. + + This copyrighted material is made available to anyone wishing to use, + modify, copy, or redistribute it subject to the terms and conditions + of the FreeBSD License. This program is distributed in the hope that + it will be useful, but WITHOUT ANY WARRANTY expressed or implied, + including the implied warranties of MERCHANTABILITY or FITNESS FOR + A PARTICULAR PURPOSE. A copy of this license is available at + http://www.opensource.org/licenses. +*/ + +#include +#include +#include "../../string/local.h" + +void * + __inhibit_loop_to_libcall + memcpy(void *restrict d, const void *restrict s, size_t count) +{ + uint8_t *des = (uint8_t *)d; + const uint8_t *src = (const uint8_t *)s; + const uint8_t *src_end = ((const uint8_t *)s) + count; + while (src != src_end) + *(des++) = *(src++); + return d; +} From 6b6825580e1f71b8837f1e11309e9fcf50e4aabf Mon Sep 17 00:00:00 2001 From: Han-Kuan Chen Date: Mon, 18 Mar 2019 21:00:00 -0700 Subject: [PATCH 2/2] rename functions --- newlib/libc/machine/riscv/memcpy32_speed.c | 137 ++++++++------- newlib/libc/machine/riscv/memcpy64_speed.c | 188 +++++++++++---------- 2 files changed, 175 insertions(+), 150 deletions(-) diff --git a/newlib/libc/machine/riscv/memcpy32_speed.c b/newlib/libc/machine/riscv/memcpy32_speed.c index ec14248611..39bfbdc4c8 100644 --- a/newlib/libc/machine/riscv/memcpy32_speed.c +++ b/newlib/libc/machine/riscv/memcpy32_speed.c @@ -15,12 +15,21 @@ #define ALIGNOF(addr) (((uintptr_t)(addr)) & (sizeof(void *) - 1)) -static void memcpy_4_0(uint8_t *des, const uint8_t *src) +// load: aligned load +// loadu: unaligned 
load +// loaduX: unaligned load with offset X +// u32: unsigned 32 bit integer +// u64: unsigned 64 bit integer +// u128: unsigned 128 bit integer + +static void load_u32(uint8_t *des, const uint8_t *src) { ((uint32_t *)des)[0] = ((const uint32_t *)src)[0]; } -static void memcpy_4_1(uint8_t *des, const uint8_t *src) +#define loadu0_u32 load_u32 + +static void loadu1_u32(uint8_t *des, const uint8_t *src) { uint32_t temp2 = ((const uint32_t *)(src - 1))[0]; uint32_t temp3 = src[3]; @@ -30,7 +39,7 @@ static void memcpy_4_1(uint8_t *des, const uint8_t *src) ((uint32_t *)des)[0] = temp2; } -static void memcpy_4_2(uint8_t *des, const uint8_t *src) +static void loadu2_u32(uint8_t *des, const uint8_t *src) { #if 1 uint16_t temp2 = ((const uint16_t *)src)[0]; @@ -46,7 +55,7 @@ static void memcpy_4_2(uint8_t *des, const uint8_t *src) #endif } -static void memcpy_4_3(uint8_t *des, const uint8_t *src) +static void loadu3_u32(uint8_t *des, const uint8_t *src) { uint32_t temp2 = src[0]; uint32_t temp3 = ((const uint32_t *)(src + 1))[0]; @@ -55,7 +64,7 @@ static void memcpy_4_3(uint8_t *des, const uint8_t *src) ((uint32_t *)des)[0] = temp2; } -static void memcpy_8_0(uint8_t *des, const uint8_t *src) +static void load_u64(uint8_t *des, const uint8_t *src) { uint32_t temp2 = ((const uint32_t *)src)[0]; uint32_t temp3 = ((const uint32_t *)src)[1]; @@ -63,7 +72,9 @@ static void memcpy_8_0(uint8_t *des, const uint8_t *src) ((uint32_t *)des)[1] = temp3; } -static void memcpy_8_1(uint8_t *des, const uint8_t *src) +#define loadu0_u64 load_u64 + +static void loadu1_u64(uint8_t *des, const uint8_t *src) { uint32_t temp2 = ((const uint32_t *)(src - 1))[0]; uint32_t temp3 = ((const uint32_t *)(src + 3))[0]; @@ -78,7 +89,7 @@ static void memcpy_8_1(uint8_t *des, const uint8_t *src) ((uint32_t *)des)[1] = temp3; } -static void memcpy_8_2(uint8_t *des, const uint8_t *src) +static void loadu2_u64(uint8_t *des, const uint8_t *src) { uint16_t temp2 = ((const uint16_t *)src)[0]; uint16_t temp3 = 
((const uint16_t *)src)[1]; @@ -90,7 +101,7 @@ static void memcpy_8_2(uint8_t *des, const uint8_t *src) ((uint16_t *)des)[3] = temp5; } -static void memcpy_8_3(uint8_t *des, const uint8_t *src) +static void loadu3_u64(uint8_t *des, const uint8_t *src) { uint32_t temp2 = src[0]; uint32_t temp3 = ((const uint32_t *)(src + 1))[0]; @@ -104,7 +115,7 @@ static void memcpy_8_3(uint8_t *des, const uint8_t *src) ((uint32_t *)des)[1] = temp3; } -static void memcpy_16_0(uint8_t *des, const uint8_t *src) +static void load_u128(uint8_t *des, const uint8_t *src) { uint32_t temp2 = ((const uint32_t *)src)[0]; uint32_t temp3 = ((const uint32_t *)src)[1]; @@ -116,7 +127,9 @@ static void memcpy_16_0(uint8_t *des, const uint8_t *src) ((uint32_t *)des)[3] = temp5; } -static void memcpy_16_1(uint8_t *des, const uint8_t *src) +#define loadu0_u128 load_u128 + +static void loadu1_u128(uint8_t *des, const uint8_t *src) { uint32_t temp2 = ((const uint32_t *)(src - 1))[0]; uint32_t temp3 = ((const uint32_t *)(src + 3))[0]; @@ -141,7 +154,7 @@ static void memcpy_16_1(uint8_t *des, const uint8_t *src) ((uint32_t *)des)[3] = temp2; } -static void memcpy_16_2(uint8_t *des, const uint8_t *src) +static void loadu2_u128(uint8_t *des, const uint8_t *src) { uint16_t temp2 = ((const uint16_t *)src)[0]; uint16_t temp3 = ((const uint16_t *)src)[1]; @@ -161,7 +174,7 @@ static void memcpy_16_2(uint8_t *des, const uint8_t *src) ((uint16_t *)des)[7] = temp5; } -static void memcpy_16_3(uint8_t *des, const uint8_t *src) +static void loadu3_u128(uint8_t *des, const uint8_t *src) { uint32_t temp2 = src[0]; uint32_t temp3 = ((const uint32_t *)(src + 1))[0]; @@ -185,52 +198,52 @@ static void memcpy_16_3(uint8_t *des, const uint8_t *src) ((uint32_t *)des)[3] = temp2; } -#define MEMCPY_ALIGNMENT(align_num) \ - { \ - while (src != src_end) \ - { \ - memcpy_16_##align_num(des, src); \ - src += 16; \ - des += 16; \ - } \ - switch (residual) \ - { \ - case 15: \ - des[14] = src[14]; \ - case 14: \ - des[13] = src[13]; 
\ - case 13: \ - des[12] = src[12]; \ - case 12: \ - memcpy_8_##align_num(des, src); \ - memcpy_4_##align_num(des + 8, src + 8); \ - break; \ - case 11: \ - des[10] = src[10]; \ - case 10: \ - des[9] = src[9]; \ - case 9: \ - des[8] = src[8]; \ - case 8: \ - memcpy_8_##align_num(des, src); \ - break; \ - case 7: \ - des[6] = src[6]; \ - case 6: \ - des[5] = src[5]; \ - case 5: \ - des[4] = src[4]; \ - case 4: \ - memcpy_4_##align_num(des, src); \ - break; \ - case 3: \ - des[2] = src[2]; \ - case 2: \ - des[1] = src[1]; \ - case 1: \ - des[0] = src[0]; \ - break; \ - } \ +#define MEMCPY_OFFSET(align_num) \ + { \ + while (src != src_end) \ + { \ + loadu##align_num##_u128(des, src); \ + src += 16; \ + des += 16; \ + } \ + switch (residual) \ + { \ + case 15: \ + des[14] = src[14]; \ + case 14: \ + des[13] = src[13]; \ + case 13: \ + des[12] = src[12]; \ + case 12: \ + loadu##align_num##_u64(des, src); \ + loadu##align_num##_u32(des + 8, src + 8); \ + break; \ + case 11: \ + des[10] = src[10]; \ + case 10: \ + des[9] = src[9]; \ + case 9: \ + des[8] = src[8]; \ + case 8: \ + loadu##align_num##_u64(des, src); \ + break; \ + case 7: \ + des[6] = src[6]; \ + case 6: \ + des[5] = src[5]; \ + case 5: \ + des[4] = src[4]; \ + case 4: \ + loadu##align_num##_u32(des, src); \ + break; \ + case 3: \ + des[2] = src[2]; \ + case 2: \ + des[1] = src[1]; \ + case 1: \ + des[0] = src[0]; \ + break; \ + } \ } void * @@ -261,16 +274,16 @@ void * switch (ALIGNOF(src)) { case 0: - MEMCPY_ALIGNMENT(0); + MEMCPY_OFFSET(0); break; case 1: - MEMCPY_ALIGNMENT(1); + MEMCPY_OFFSET(1); break; case 2: - MEMCPY_ALIGNMENT(2); + MEMCPY_OFFSET(2); break; case 3: - MEMCPY_ALIGNMENT(3); + MEMCPY_OFFSET(3); break; } } diff --git a/newlib/libc/machine/riscv/memcpy64_speed.c b/newlib/libc/machine/riscv/memcpy64_speed.c index e4b63db7d1..c53379d8c6 100644 --- a/newlib/libc/machine/riscv/memcpy64_speed.c +++ b/newlib/libc/machine/riscv/memcpy64_speed.c @@ -15,14 +15,22 @@ #define ALIGNOF(addr) 
(((uintptr_t)(addr)) & (sizeof(void *) - 1)) -static void memcpy_4_0(uint8_t *des, const uint8_t *src) +// load: aligned load +// loadu: unaligned load +// loaduX: unaligned load with offset X +// u32: unsigned 32 bit integer +// u64: unsigned 64 bit integer +// u128: unsigned 128 bit integer + +static void load_u32(uint8_t *des, const uint8_t *src) { ((uint32_t *)des)[0] = ((const uint32_t *)src)[0]; } -#define memcpy_4_4 memcpy_4_0 +#define loadu0_u32 load_u32 +#define loadu4_u32 load_u32 -static void memcpy_4_1(uint8_t *des, const uint8_t *src) +static void loadu1_u32(uint8_t *des, const uint8_t *src) { uint32_t temp2 = ((const uint32_t *)(src - 1))[0]; uint32_t temp3 = src[3]; @@ -32,9 +40,9 @@ static void memcpy_4_1(uint8_t *des, const uint8_t *src) ((uint32_t *)des)[0] = temp2; } -#define memcpy_4_5 memcpy_4_1 +#define loadu5_u32 loadu1_u32 -static void memcpy_4_2(uint8_t *des, const uint8_t *src) +static void loadu2_u32(uint8_t *des, const uint8_t *src) { #if 1 uint32_t temp2 = ((const uint16_t *)src)[0]; @@ -50,9 +58,9 @@ static void memcpy_4_2(uint8_t *des, const uint8_t *src) #endif } -#define memcpy_4_6 memcpy_4_2 +#define loadu6_u32 loadu2_u32 -static void memcpy_4_3(uint8_t *des, const uint8_t *src) +static void loadu3_u32(uint8_t *des, const uint8_t *src) { uint32_t temp2 = src[0]; uint32_t temp3 = ((const uint32_t *)(src + 1))[0]; @@ -61,14 +69,16 @@ static void memcpy_4_3(uint8_t *des, const uint8_t *src) ((uint32_t *)des)[0] = temp2; } -#define memcpy_4_7 memcpy_4_3 +#define loadu7_u32 loadu3_u32 -static void memcpy_8_0(uint8_t *des, const uint8_t *src) +static void load_u64(uint8_t *des, const uint8_t *src) { ((uint64_t *)des)[0] = ((const uint64_t *)src)[0]; } -static void memcpy_8_1(uint8_t *des, const uint8_t *src) +#define loadu0_u64 load_u64 + +static void loadu1_u64(uint8_t *des, const uint8_t *src) { uint64_t temp2 = ((const uint64_t *)(src - 1))[0]; uint64_t temp3 = src[7]; @@ -78,7 +88,7 @@ static void memcpy_8_1(uint8_t *des, const 
uint8_t *src) ((uint64_t *)des)[0] = temp2; } -static void memcpy_8_2(uint8_t *des, const uint8_t *src) +static void loadu2_u64(uint8_t *des, const uint8_t *src) { #if 1 uint64_t temp2 = ((const uint64_t *)(src - 2))[0]; @@ -99,7 +109,7 @@ static void memcpy_8_2(uint8_t *des, const uint8_t *src) #endif } -static void memcpy_8_3(uint8_t *des, const uint8_t *src) +static void loadu3_u64(uint8_t *des, const uint8_t *src) { #if 1 uint64_t temp2 = ((const uint64_t *)(src - 3))[0]; @@ -120,7 +130,7 @@ static void memcpy_8_3(uint8_t *des, const uint8_t *src) #endif } -static void memcpy_8_4(uint8_t *des, const uint8_t *src) +static void loadu4_u64(uint8_t *des, const uint8_t *src) { #if 1 uint64_t temp2 = ((const uint32_t *)src)[0]; @@ -136,7 +146,7 @@ static void memcpy_8_4(uint8_t *des, const uint8_t *src) #endif } -static void memcpy_8_5(uint8_t *des, const uint8_t *src) +static void loadu5_u64(uint8_t *des, const uint8_t *src) { #if 1 uint64_t temp2 = ((const uint32_t *)(src - 1))[0]; @@ -158,7 +168,7 @@ static void memcpy_8_5(uint8_t *des, const uint8_t *src) #endif } -static void memcpy_8_6(uint8_t *des, const uint8_t *src) +static void loadu6_u64(uint8_t *des, const uint8_t *src) { uint64_t temp2 = ((const uint16_t *)src)[0]; uint64_t temp3 = ((const uint64_t *)(src + 2))[0]; @@ -167,7 +177,7 @@ static void memcpy_8_6(uint8_t *des, const uint8_t *src) ((uint64_t *)des)[0] = temp2; } -static void memcpy_8_7(uint8_t *des, const uint8_t *src) +static void loadu7_u64(uint8_t *des, const uint8_t *src) { uint64_t temp2 = src[0]; uint64_t temp3 = ((const uint64_t *)(src + 1))[0]; @@ -176,7 +186,7 @@ static void memcpy_8_7(uint8_t *des, const uint8_t *src) ((uint64_t *)des)[0] = temp2; } -static void memcpy_16_0(uint8_t *des, const uint8_t *src) +static void load_u128(uint8_t *des, const uint8_t *src) { uint64_t temp2 = ((const uint64_t *)src)[0]; uint64_t temp3 = ((const uint64_t *)src)[1]; @@ -184,7 +194,9 @@ static void memcpy_16_0(uint8_t *des, const uint8_t *src) 
((uint64_t *)des)[1] = temp3; } -static void memcpy_16_1(uint8_t *des, const uint8_t *src) +#define loadu0_u128 load_u128 + +static void loadu1_u128(uint8_t *des, const uint8_t *src) { uint64_t temp2 = ((const uint64_t *)(src - 1))[0]; uint64_t temp3 = ((const uint64_t *)(src + 7))[0]; @@ -199,7 +211,7 @@ static void memcpy_16_1(uint8_t *des, const uint8_t *src) ((uint64_t *)des)[1] = temp3; } -static void memcpy_16_2(uint8_t *des, const uint8_t *src) +static void loadu2_u128(uint8_t *des, const uint8_t *src) { uint64_t temp2 = ((const uint64_t *)(src - 2))[0]; uint64_t temp3 = ((const uint64_t *)(src + 6))[0]; @@ -214,7 +226,7 @@ static void memcpy_16_2(uint8_t *des, const uint8_t *src) ((uint64_t *)des)[1] = temp3; } -static void memcpy_16_3(uint8_t *des, const uint8_t *src) +static void loadu3_u128(uint8_t *des, const uint8_t *src) { uint64_t temp2 = ((const uint64_t *)(src - 3))[0]; uint64_t temp3 = ((const uint64_t *)(src + 5))[0]; @@ -229,7 +241,7 @@ static void memcpy_16_3(uint8_t *des, const uint8_t *src) ((uint64_t *)des)[1] = temp3; } -static void memcpy_16_4(uint8_t *des, const uint8_t *src) +static void loadu4_u128(uint8_t *des, const uint8_t *src) { uint64_t temp2 = ((const uint32_t *)src)[0]; uint64_t temp3 = ((const uint32_t *)(src + 4))[0]; @@ -243,7 +255,7 @@ static void memcpy_16_4(uint8_t *des, const uint8_t *src) ((uint64_t *)des)[1] = temp3; } -static void memcpy_16_5(uint8_t *des, const uint8_t *src) +static void loadu5_u128(uint8_t *des, const uint8_t *src) { uint64_t temp2 = ((const uint32_t *)(src - 1))[0]; uint64_t temp3 = ((const uint64_t *)(src + 3))[0]; @@ -258,7 +270,7 @@ static void memcpy_16_5(uint8_t *des, const uint8_t *src) ((uint64_t *)des)[1] = temp3; } -static void memcpy_16_6(uint8_t *des, const uint8_t *src) +static void loadu6_u128(uint8_t *des, const uint8_t *src) { uint64_t temp2 = ((const uint16_t *)src)[0]; uint64_t temp3 = ((const uint64_t *)(src + 2))[0]; @@ -272,7 +284,7 @@ static void memcpy_16_6(uint8_t *des, const 
uint8_t *src) ((uint64_t *)des)[1] = temp3; } -static void memcpy_16_7(uint8_t *des, const uint8_t *src) +static void loadu7_u128(uint8_t *des, const uint8_t *src) { uint64_t temp2 = src[0]; uint64_t temp3 = ((const uint64_t *)(src + 1))[0]; @@ -286,7 +298,7 @@ static void memcpy_16_7(uint8_t *des, const uint8_t *src) ((uint64_t *)des)[1] = temp3; } -#define I16_SET_I8(des, src) \ +#define LOADU_U16(des, src) \ { \ uint16_t temp0 = (src)[0]; \ uint16_t temp1 = (src)[1]; \ @@ -295,7 +307,7 @@ static void memcpy_16_7(uint8_t *des, const uint8_t *src) (des) = temp0; \ } -#define I32_SET_I8(des, src) \ +#define LOADU_U32(des, src) \ { \ uint32_t temp0 = (src)[0]; \ uint32_t temp1 = (src)[1]; \ @@ -310,52 +322,52 @@ static void memcpy_16_7(uint8_t *des, const uint8_t *src) (des) = temp0; \ } -#define MEMCPY_ALIGNMENT(align_num) \ - { \ - while (src != src_end) \ - { \ - memcpy_16_##align_num(des, src); \ - src += 16; \ - des += 16; \ - } \ - switch (residual) \ - { \ - case 15: \ - des[14] = src[14]; \ - case 14: \ - des[13] = src[13]; \ - case 13: \ - des[12] = src[12]; \ - case 12: \ - memcpy_8_##align_num(des, src); \ - memcpy_4_##align_num(des + 8, src + 8); \ - break; \ - case 11: \ - des[10] = src[10]; \ - case 10: \ - des[9] = src[9]; \ - case 9: \ - des[8] = src[8]; \ - case 8: \ - memcpy_8_##align_num(des, src); \ - break; \ - case 7: \ - des[6] = src[6]; \ - case 6: \ - des[5] = src[5]; \ - case 5: \ - des[4] = src[4]; \ - case 4: \ - memcpy_4_##align_num(des, src); \ - break; \ - case 3: \ - des[2] = src[2]; \ - case 2: \ - des[1] = src[1]; \ - case 1: \ - des[0] = src[0]; \ - break; \ - } \ +#define MEMCPY_OFFSET(align_num) \ + { \ + while (src != src_end) \ + { \ + loadu##align_num##_u128(des, src); \ + src += 16; \ + des += 16; \ + } \ + switch (residual) \ + { \ + case 15: \ + des[14] = src[14]; \ + case 14: \ + des[13] = src[13]; \ + case 13: \ + des[12] = src[12]; \ + case 12: \ + loadu##align_num##_u64(des, src); \ + loadu##align_num##_u32(des + 8, src 
+ 8); \ + break; \ + case 11: \ + des[10] = src[10]; \ + case 10: \ + des[9] = src[9]; \ + case 9: \ + des[8] = src[8]; \ + case 8: \ + loadu##align_num##_u64(des, src); \ + break; \ + case 7: \ + des[6] = src[6]; \ + case 6: \ + des[5] = src[5]; \ + case 5: \ + des[4] = src[4]; \ + case 4: \ + loadu##align_num##_u32(des, src); \ + break; \ + case 3: \ + des[2] = src[2]; \ + case 2: \ + des[1] = src[1]; \ + case 1: \ + des[0] = src[0]; \ + break; \ + } \ } void * @@ -370,41 +382,41 @@ void * { case 1: des[0] = src[0]; - I16_SET_I8(((uint16_t *)(des + 1))[0], src + 1); - I32_SET_I8(((uint32_t *)(des + 3))[0], src + 3); + LOADU_U16(((uint16_t *)(des + 1))[0], src + 1); + LOADU_U32(((uint32_t *)(des + 3))[0], src + 3); count -= 7; des += 7; src += 7; break; case 2: - I16_SET_I8(((uint16_t *)des)[0], src); - I32_SET_I8(((uint32_t *)(des + 2))[0], src + 2); + LOADU_U16(((uint16_t *)des)[0], src); + LOADU_U32(((uint32_t *)(des + 2))[0], src + 2); count -= 6; des += 6; src += 6; break; case 3: des[0] = src[0]; - I32_SET_I8(((uint32_t *)(des + 1))[0], src + 1); + LOADU_U32(((uint32_t *)(des + 1))[0], src + 1); count -= 5; des += 5; src += 5; break; case 4: - I32_SET_I8(((uint32_t *)des)[0], src); + LOADU_U32(((uint32_t *)des)[0], src); count -= 4; des += 4; src += 4; break; case 5: des[0] = src[0]; - I16_SET_I8(((uint16_t *)(des + 1))[0], src + 1); + LOADU_U16(((uint16_t *)(des + 1))[0], src + 1); count -= 3; des += 3; src += 3; break; case 6: - I16_SET_I8(((uint16_t *)des)[0], src); + LOADU_U16(((uint16_t *)des)[0], src); count -= 2; des += 2; src += 2; @@ -422,28 +434,28 @@ void * switch (ALIGNOF(src)) { case 0: - MEMCPY_ALIGNMENT(0); + MEMCPY_OFFSET(0); break; case 1: - MEMCPY_ALIGNMENT(1); + MEMCPY_OFFSET(1); break; case 2: - MEMCPY_ALIGNMENT(2); + MEMCPY_OFFSET(2); break; case 3: - MEMCPY_ALIGNMENT(3); + MEMCPY_OFFSET(3); break; case 4: - MEMCPY_ALIGNMENT(4); + MEMCPY_OFFSET(4); break; case 5: - MEMCPY_ALIGNMENT(5); + MEMCPY_OFFSET(5); break; case 6: - 
MEMCPY_ALIGNMENT(6); + MEMCPY_OFFSET(6); break; case 7: - MEMCPY_ALIGNMENT(7); + MEMCPY_OFFSET(7); break; } }