Skip to content
This repository has been archived by the owner on Aug 17, 2022. It is now read-only.

improve memcpy performance #43

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 10 additions & 73 deletions newlib/libc/machine/riscv/memcpy.c
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/* Copyright (c) 2017 SiFive Inc. All rights reserved.
/* Copyright (c) 2019 SiFive Inc. All rights reserved.

This copyrighted material is made available to anyone wishing to use,
modify, copy, or redistribute it subject to the terms and conditions
Expand All @@ -9,75 +9,12 @@
http://www.opensource.org/licenses.
*/

#include <string.h>
#include <stdint.h>
#include "../../string/local.h"

/* Branch-prediction hint: yields 1 when X is non-zero and 0 otherwise,
   while telling the compiler that the 0 outcome is the expected one.  */
#define unlikely(X) __builtin_expect (((X) != 0), 0)

/* Copy n bytes from bb to aa and return aa.

   When both pointers sit at the same offset inside a long and at least
   sizeof (long) bytes are requested, the copy runs in three phases:
   bytes up to the first long boundary, whole longs (nine per iteration
   while enough remain, then one at a time), and finally any trailing
   bytes.  Otherwise everything is copied byte by byte.  */
void *
__inhibit_loop_to_libcall
memcpy(void *__restrict aa, const void *__restrict bb, size_t n)
{
  char *dst = (char *)aa;
  const char *src = (const char *)bb;
  char *const stop = dst + n;
  const uintptr_t word_mask = sizeof (long) - 1;

  /* Word-sized copies need identical low address bits on both sides
     and at least one long worth of data.  */
  const int word_path =
    ((((uintptr_t)dst ^ (uintptr_t)src) & word_mask) == 0)
    && n >= sizeof (long);

  if (__builtin_expect (word_path, 1))
    {
      /* Head: advance byte-wise until dst (and therefore src) is
         long-aligned.  */
      while ((uintptr_t)dst & word_mask)
        *dst++ = *src++;

      long *wdst = (long *)dst;
      const long *wsrc = (const long *)src;
      long *const wstop = (long *)((uintptr_t)stop & ~word_mask);

      /* Bulk: the condition guarantees at least nine longs remain, so
         nine are loaded before any is stored.  */
      while (wdst < (wstop - 8))
        {
          long w0 = wsrc[0];
          long w1 = wsrc[1];
          long w2 = wsrc[2];
          long w3 = wsrc[3];
          long w4 = wsrc[4];
          long w5 = wsrc[5];
          long w6 = wsrc[6];
          long w7 = wsrc[7];
          long w8 = wsrc[8];
          wsrc += 9;
          wdst[0] = w0;
          wdst[1] = w1;
          wdst[2] = w2;
          wdst[3] = w3;
          wdst[4] = w4;
          wdst[5] = w5;
          wdst[6] = w6;
          wdst[7] = w7;
          wdst[8] = w8;
          wdst += 9;
        }

      /* Remaining whole longs, one at a time.  */
      while (wdst < wstop)
        *wdst++ = *wsrc++;

      dst = (char *)wdst;
      src = (const char *)wsrc;
    }

  /* Tail (or the entire copy on the byte-only path).  */
  while (dst < stop)
    *dst++ = *src++;

  return aa;
}
/* Pick exactly one memcpy implementation: the size-optimised one when
   size is preferred, otherwise the speed-optimised one matching the
   register width given by __riscv_xlen.  */
#if defined(PREFER_SIZE_OVER_SPEED) || defined(__OPTIMIZE_SIZE__)
# include "memcpy_size.c"
#elif __riscv_xlen == 64
# include "memcpy64_speed.c"
#else
# include "memcpy32_speed.c"
#endif
301 changes: 301 additions & 0 deletions newlib/libc/machine/riscv/memcpy32_speed.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,301 @@
/* Copyright (c) 2019 SiFive Inc. All rights reserved.

This copyrighted material is made available to anyone wishing to use,
modify, copy, or redistribute it subject to the terms and conditions
of the FreeBSD License. This program is distributed in the hope that
it will be useful, but WITHOUT ANY WARRANTY expressed or implied,
including the implied warranties of MERCHANTABILITY or FITNESS FOR
A PARTICULAR PURPOSE. A copy of this license is available at
http://www.opensource.org/licenses.
*/

#include <stdint.h>
#include <string.h>
#include "../../string/local.h"

/* Offset of addr within a 4-byte word (0..3).  The mask is tied to the
   32-bit word size that every user in this file hard-codes (switch cases
   0..3, "4 - ALIGNOF (...)", uint32_t accesses) rather than to the
   pointer size, so the result stays in range on any target.  */
#define ALIGNOF(addr) (((uintptr_t)(addr)) & (sizeof(uint32_t) - 1))

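// Naming scheme for the helpers below (each one loads from src and
// stores the same bytes to des):
// load: aligned load (src is on a 4-byte boundary)
// loadu: unaligned load (src is not on a 4-byte boundary)
// loaduX: unaligned load with offset X (src is X bytes past a 4-byte boundary)
// u32: unsigned 32 bit integer (4 bytes per call)
// u64: unsigned 64 bit integer (8 bytes per call)
// u128: unsigned 128 bit integer (16 bytes per call)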

/* Copy 4 bytes from src to des with a single 32-bit load and store.
   Both pointers are accessed as uint32_t, so callers are expected to
   pass word-aligned addresses.  */
static void load_u32(uint8_t *des, const uint8_t *src)
{
  const uint32_t *in = (const uint32_t *)src;
  uint32_t *out = (uint32_t *)des;

  *out = *in;
}

#define loadu0_u32 load_u32

/* Copy 4 bytes to des from a source that sits 1 byte past a word
   boundary.  The word starting at src - 1 supplies src[0..2]; src[3] is
   fetched separately.  Lane comments assume little-endian byte order.  */
static void loadu1_u32(uint8_t *des, const uint8_t *src)
{
  const uint32_t *word = (const uint32_t *)(src - 1);
  uint32_t low = word[0];          /* src[-1], src[0..2] */
  uint32_t high = src[3];

  low >>= 8;                       /* drop src[-1] */
  high <<= 24;                     /* src[3] into the top lane */
  *(uint32_t *)des = low | high;
}

/* Copy 4 bytes to des from a source that sits 2 bytes past a word
   boundary, using two 16-bit loads followed by two 16-bit stores.
   (The never-compiled alternative that merged the halves into one
   32-bit store has been removed.)  */
static void loadu2_u32(uint8_t *des, const uint8_t *src)
{
  const uint16_t *in = (const uint16_t *)src;
  uint16_t *out = (uint16_t *)des;

  /* Load both halves before storing, as the original sequence did.  */
  uint16_t first = in[0];
  uint16_t second = in[1];

  out[0] = first;
  out[1] = second;
}

/* Copy 4 bytes to des from a source that sits 3 bytes past a word
   boundary.  src[0] is fetched as a byte; the word starting at src + 1
   supplies src[1..3] (its top byte, src[4], is shifted out).  Lane
   comments assume little-endian byte order.  */
static void loadu3_u32(uint8_t *des, const uint8_t *src)
{
  uint32_t first = src[0];
  const uint32_t *word = (const uint32_t *)(src + 1);
  uint32_t rest = word[0];         /* src[1..4] */

  rest <<= 8;                      /* src[1..3] into lanes 1..3 */
  *(uint32_t *)des = first | rest;
}

/* Copy 8 bytes from src to des as two 32-bit words.  Both words are
   loaded before either is stored.  Both pointers are accessed as
   uint32_t, so callers are expected to pass word-aligned addresses.  */
static void load_u64(uint8_t *des, const uint8_t *src)
{
  const uint32_t *in = (const uint32_t *)src;
  uint32_t *out = (uint32_t *)des;
  const uint32_t lo = in[0];
  const uint32_t hi = in[1];

  out[0] = lo;
  out[1] = hi;
}

#define loadu0_u64 load_u64

/* Copy 8 bytes to des from a source that sits 1 byte past a word
   boundary.  Two aligned words starting at src - 1 cover src[0..6];
   src[7] is fetched as a byte.  All loads happen before the stores.
   Lane comments assume little-endian byte order.  */
static void loadu1_u64(uint8_t *des, const uint8_t *src)
{
  const uint32_t *in = (const uint32_t *)(src - 1);
  uint32_t *out = (uint32_t *)des;

  uint32_t w0 = in[0];             /* src[-1], src[0..2] */
  uint32_t w1 = in[1];             /* src[3..6] */
  uint32_t last = src[7];

  out[0] = (w0 >> 8) | (w1 << 24);     /* src[0..3] */
  out[1] = (w1 >> 8) | (last << 24);   /* src[4..7] */
}

/* Copy 8 bytes to des from a source that sits 2 bytes past a word
   boundary, as four 16-bit halfwords.  All four are loaded before any
   is stored.  */
static void loadu2_u64(uint8_t *des, const uint8_t *src)
{
  const uint16_t *in = (const uint16_t *)src;
  uint16_t *out = (uint16_t *)des;
  uint16_t held[4];
  int i;

  for (i = 0; i < 4; i++)
    held[i] = in[i];
  for (i = 0; i < 4; i++)
    out[i] = held[i];
}

/* Copy 8 bytes to des from a source that sits 3 bytes past a word
   boundary.  src[0] is fetched as a byte; two aligned words starting at
   src + 1 cover src[1..7] (the top byte of the second word, src[8], is
   shifted out).  All loads happen before the stores.  Lane comments
   assume little-endian byte order.  */
static void loadu3_u64(uint8_t *des, const uint8_t *src)
{
  const uint32_t *in = (const uint32_t *)(src + 1);
  uint32_t *out = (uint32_t *)des;

  uint32_t first = src[0];
  uint32_t w0 = in[0];             /* src[1..4] */
  uint32_t w1 = in[1];             /* src[5..8] */

  out[0] = first | (w0 << 8);          /* src[0..3] */
  out[1] = (w0 >> 24) | (w1 << 8);     /* src[4..7] */
}

/* Copy 16 bytes from src to des as four 32-bit words.  All four words
   are loaded before any is stored.  Both pointers are accessed as
   uint32_t, so callers are expected to pass word-aligned addresses.  */
static void load_u128(uint8_t *des, const uint8_t *src)
{
  const uint32_t *in = (const uint32_t *)src;
  uint32_t *out = (uint32_t *)des;
  uint32_t held[4];
  int i;

  for (i = 0; i < 4; i++)
    held[i] = in[i];
  for (i = 0; i < 4; i++)
    out[i] = held[i];
}

#define loadu0_u128 load_u128

/* Copy 16 bytes to des from a source that sits 1 byte past a word
   boundary.  Four aligned words starting at src - 1 cover src[0..14];
   src[15] is fetched as a byte.  Each output word is stitched from the
   upper three bytes of one input word and the lowest byte of the next.
   Loads and stores are issued in the same relative order as before.
   Lane comments assume little-endian byte order.  */
static void loadu1_u128(uint8_t *des, const uint8_t *src)
{
  const uint32_t *in = (const uint32_t *)(src - 1);
  uint32_t *out = (uint32_t *)des;

  uint32_t w0 = in[0];             /* src[-1], src[0..2] */
  uint32_t w1 = in[1];             /* src[3..6] */
  uint32_t w2 = in[2];             /* src[7..10] */
  out[0] = (w0 >> 8) | (w1 << 24);     /* src[0..3] */

  uint32_t w3 = in[3];             /* src[11..14] */
  out[1] = (w1 >> 8) | (w2 << 24);     /* src[4..7] */

  uint32_t last = src[15];
  out[2] = (w2 >> 8) | (w3 << 24);     /* src[8..11] */
  out[3] = (w3 >> 8) | (last << 24);   /* src[12..15] */
}

/* Copy 16 bytes to des from a source that sits 2 bytes past a word
   boundary, as eight 16-bit halfwords handled in two groups of four:
   each group is fully loaded and then fully stored.  */
static void loadu2_u128(uint8_t *des, const uint8_t *src)
{
  const uint16_t *in = (const uint16_t *)src;
  uint16_t *out = (uint16_t *)des;
  int group;

  for (group = 0; group < 2; group++)
    {
      const uint16_t *from = in + 4 * group;
      uint16_t *to = out + 4 * group;
      uint16_t h0 = from[0];
      uint16_t h1 = from[1];
      uint16_t h2 = from[2];
      uint16_t h3 = from[3];

      to[0] = h0;
      to[1] = h1;
      to[2] = h2;
      to[3] = h3;
    }
}

/* Copy 16 bytes to des from a source that sits 3 bytes past a word
   boundary.  src[0] is fetched as a byte; four aligned words starting at
   src + 1 cover src[1..15] (the top byte of the last word, src[16], is
   shifted out).  Each output word is stitched from the top byte of one
   input word and the lower three bytes of the next.  Loads and stores
   are issued in the same relative order as before.  Lane comments
   assume little-endian byte order.  */
static void loadu3_u128(uint8_t *des, const uint8_t *src)
{
  const uint32_t *in = (const uint32_t *)(src + 1);
  uint32_t *out = (uint32_t *)des;

  uint32_t first = src[0];
  uint32_t w0 = in[0];             /* src[1..4] */
  uint32_t w1 = in[1];             /* src[5..8] */
  out[0] = first | (w0 << 8);          /* src[0..3] */

  uint32_t w2 = in[2];             /* src[9..12] */
  out[1] = (w0 >> 24) | (w1 << 8);     /* src[4..7] */

  uint32_t w3 = in[3];             /* src[13..16] */
  out[2] = (w1 >> 24) | (w2 << 8);     /* src[8..11] */
  out[3] = (w2 >> 24) | (w3 << 8);     /* src[12..15] */
}

/* Body of the copy for one source misalignment (align_num selects the
   loadu<align_num>_* helper family).  Uses des, src, src_end and
   residual from the enclosing scope.

   First copies 16 bytes per iteration until src reaches src_end, then
   finishes the last `residual` (0..15) bytes: the bytes beyond the
   largest multiple of 4 are copied individually, falling through to a
   u64 and/or u32 helper call (or single byte stores) for the rest.  */
#define MEMCPY_OFFSET(align_num)                              \
  {                                                           \
    for (; src != src_end; src += 16, des += 16)              \
      loadu##align_num##_u128(des, src);                      \
    switch (residual)                                         \
      {                                                       \
      /* 12..15 bytes: odd bytes, then 8 + 4 */               \
      case 15:                                                \
        des[14] = src[14];                                    \
        /* fallthrough */                                     \
      case 14:                                                \
        des[13] = src[13];                                    \
        /* fallthrough */                                     \
      case 13:                                                \
        des[12] = src[12];                                    \
        /* fallthrough */                                     \
      case 12:                                                \
        loadu##align_num##_u64(des, src);                     \
        loadu##align_num##_u32(des + 8, src + 8);             \
        break;                                                \
      /* 8..11 bytes: odd bytes, then 8 */                    \
      case 11:                                                \
        des[10] = src[10];                                    \
        /* fallthrough */                                     \
      case 10:                                                \
        des[9] = src[9];                                      \
        /* fallthrough */                                     \
      case 9:                                                 \
        des[8] = src[8];                                      \
        /* fallthrough */                                     \
      case 8:                                                 \
        loadu##align_num##_u64(des, src);                     \
        break;                                                \
      /* 4..7 bytes: odd bytes, then 4 */                     \
      case 7:                                                 \
        des[6] = src[6];                                      \
        /* fallthrough */                                     \
      case 6:                                                 \
        des[5] = src[5];                                      \
        /* fallthrough */                                     \
      case 5:                                                 \
        des[4] = src[4];                                      \
        /* fallthrough */                                     \
      case 4:                                                 \
        loadu##align_num##_u32(des, src);                     \
        break;                                                \
      /* 1..3 bytes: byte stores only */                      \
      case 3:                                                 \
        des[2] = src[2];                                      \
        /* fallthrough */                                     \
      case 2:                                                 \
        des[1] = src[1];                                      \
        /* fallthrough */                                     \
      case 1:                                                 \
        des[0] = src[0];                                      \
        break;                                                \
      }                                                       \
  }

/* Copy count bytes from s to d and return d (32-bit, speed-oriented).

   Copies of fewer than 4 bytes are done with byte stores.  Otherwise:
     1. up to 3 leading bytes are copied so that the destination reaches
        a 4-byte boundary;
     2. the remaining length is split into a multiple of 16 (bulk) and a
        0..15 byte residual;
     3. MEMCPY_OFFSET is expanded for the source's offset within a word,
        selecting the matching loadu<N>_* helpers for bulk and residual.

   The regions must not overlap (restrict-qualified parameters).  */
void *
__inhibit_loop_to_libcall
memcpy(void *restrict d, const void *restrict s, size_t count)
{
  uint8_t *des = (uint8_t *)d;
  const uint8_t *src = (const uint8_t *)s;

  if (count <= 3)
    {
      /* Tiny copy: 0..3 byte stores, nothing else to do.  */
      switch (count)
        {
        case 3:
          des[2] = src[2];
          /* fallthrough */
        case 2:
          des[1] = src[1];
          /* fallthrough */
        case 1:
          des[0] = src[0];
          break;
        default:
          break;
        }
      return d;
    }

  /* Number of bytes needed to bring des up to the next 4-byte boundary.
     When des is already aligned this evaluates to 4, which matches no
     case below, so nothing is copied or adjusted.  count is at least 4
     here, so subtracting at most 3 cannot underflow.  */
  uintptr_t alignof_des = (4 - ALIGNOF(des));
  switch (alignof_des)
    {
    case 3:
      des[2] = src[2];
      /* fallthrough */
    case 2:
      des[1] = src[1];
      /* fallthrough */
    case 1:
      des[0] = src[0];
      count -= alignof_des;
      des += alignof_des;
      src += alignof_des;
      break;
    default:
      break;
    }

  /* Split what is left into 16-byte chunks plus a 0..15 byte residual.
     The mask is built from size_t so no high bits of count are lost
     when size_t is wider than 32 bits.  */
  size_t residual = count & 15;
  count &= ~(size_t)15;
  const uint8_t *src_end = src + count;

  /* des is word aligned now; dispatch on where src sits within a word.  */
  switch (ALIGNOF(src))
    {
    case 0:
      MEMCPY_OFFSET(0);
      break;
    case 1:
      MEMCPY_OFFSET(1);
      break;
    case 2:
      MEMCPY_OFFSET(2);
      break;
    case 3:
      MEMCPY_OFFSET(3);
      break;
    default:
      break;
    }
  return d;
}
Loading