Skip to content

Commit

Permalink
Misc: Fix alignment errors on ARM32
Browse files Browse the repository at this point in the history
  • Loading branch information
stenzek committed Nov 23, 2024
1 parent bb24d40 commit 5c03e1d
Show file tree
Hide file tree
Showing 16 changed files with 230 additions and 57 deletions.
4 changes: 2 additions & 2 deletions src/common-tests/gsvector_yuvtorgb_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ static void YUVToRGB_Vector(const std::array<s16, 64>& Crblk, const std::array<s
const GSVector4i addval = signed_output ? GSVector4i::cxpr(0) : GSVector4i::cxpr(0x80808080);
for (u32 y = 0; y < 8; y++)
{
const GSVector4i Cr = GSVector4i::loadl(&Crblk[(y / 2) * 8]).s16to32();
const GSVector4i Cb = GSVector4i::loadl(&Cbblk[(y / 2) * 8]).s16to32();
const GSVector4i Cr = GSVector4i::loadl<false>(&Crblk[(y / 2) * 8]).s16to32();
const GSVector4i Cb = GSVector4i::loadl<false>(&Cbblk[(y / 2) * 8]).s16to32();
const GSVector4i Y = GSVector4i::load<true>(&Yblk[y * 8]);

// BT.601 YUV->RGB coefficients, rounding formula from Mednafen.
Expand Down
2 changes: 1 addition & 1 deletion src/common/gsvector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ GSMatrix2x2 GSMatrix2x2::Rotation(float angle_in_radians)

// Returns row i of the 2x2 matrix as a vector.
// E is declared alignas(8) in the header, so each 8-byte row can use the
// aligned load variant. (The scrape left the pre-commit, non-template call
// as a duplicate unreachable return; it is removed here.)
GSVector2 GSMatrix2x2::row(size_t i) const
{
  return GSVector2::load<true>(&E[i][0]);
}

GSVector2 GSMatrix2x2::col(size_t i) const
Expand Down
2 changes: 1 addition & 1 deletion src/common/gsvector.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,5 @@ class GSMatrix2x2

void store(void* m);

float E[2][2];
alignas(8) float E[2][2];
};
138 changes: 126 additions & 12 deletions src/common/gsvector_neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -690,15 +690,36 @@ class alignas(16) GSVector2i

// Zero-extends a 32-bit scalar into lane 0; lane 1 is cleared.
ALWAYS_INLINE static GSVector2i zext32(s32 v) { return GSVector2i(vset_lane_s32(v, vdup_n_s32(0), 0)); }

// Loads 8 bytes from p.
// On ARM32, element-sized loads (vld1_s32) let the compiler assume element
// alignment, which faults on unaligned pointers; the unaligned path therefore
// loads byte-granular and reinterprets (commit: "Fix alignment errors on
// ARM32"). The stale pre-commit one-liner that duplicated this overload in
// the scraped diff is removed.
template<bool aligned>
ALWAYS_INLINE static GSVector2i load(const void* p)
{
#ifdef CPU_ARCH_ARM32
  if constexpr (!aligned)
    return GSVector2i(vreinterpret_s32_s8(vld1_s8((const int8_t*)p)));
#endif

  return GSVector2i(vld1_s32((const int32_t*)p));
}

// Stores only the low 32-bit lane to p; going through a scalar + memcpy
// keeps the store safe for any destination alignment.
ALWAYS_INLINE static void store32(void* p, const GSVector2i& v)
{
s32 val = vget_lane_s32(v, 0);
std::memcpy(p, &val, sizeof(s32));
}

// Stores both 32-bit lanes to p.
// ARM32: vst1_s32 assumes element alignment, so the unaligned variant stores
// byte-granular via vst1_s8 instead. The leftover pre-commit one-line store()
// (a conflicting non-template overload from the scraped diff) is removed.
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector2i& v)
{
#ifdef CPU_ARCH_ARM32
  if constexpr (!aligned)
  {
    vst1_s8((int8_t*)p, vreinterpret_s8_s32(v.v2s));
    return;
  }
#endif

  vst1_s32((int32_t*)p, v.v2s);
}

ALWAYS_INLINE void operator&=(const GSVector2i& v)
{
Expand Down Expand Up @@ -903,9 +924,30 @@ class alignas(16) GSVector2

// Returns a vector with every bit set in both lanes (all-ones mask reinterpreted as float).
ALWAYS_INLINE static GSVector2 xffffffff() { return GSVector2(vreinterpret_f32_u32(vdup_n_u32(0xFFFFFFFFu))); }

// Loads two floats from p.
// On ARM32 the unaligned variant loads as bytes (vld1_s8 carries no element
// alignment assumption) and reinterprets to f32. The stale pre-commit
// non-template load() left behind by the scraped diff is removed.
template<bool aligned>
ALWAYS_INLINE static GSVector2 load(const void* p)
{
#ifdef CPU_ARCH_ARM32
  if constexpr (!aligned)
    return GSVector2(vreinterpret_f32_s8(vld1_s8((const int8_t*)p)));
#endif

  return GSVector2(vld1_f32(static_cast<const float*>(p)));
}

// Stores both float lanes to p.
// ARM32: vst1_f32 lets the compiler assume 4-byte alignment, so the unaligned
// path stores byte-granular via vst1_s8. The scraped diff embedded the removed
// pre-commit one-liner in the middle of this body (would not compile); it is
// dropped in this merged version.
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector2& v)
{
#ifdef CPU_ARCH_ARM32
  if constexpr (!aligned)
  {
    vst1_s8(static_cast<int8_t*>(p), vreinterpret_s8_f32(v.v2s));
    return;
  }
#endif

  vst1_f32(static_cast<float*>(p), v.v2s);
}

ALWAYS_INLINE GSVector2 operator-() const { return neg(); }

Expand Down Expand Up @@ -2134,13 +2176,25 @@ class alignas(16) GSVector4i

ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(vsetq_lane_s32(v, vdupq_n_s32(0), 0)); }

// Loads 8 bytes into the low half of the vector; the high half is zeroed.
// The unaligned ARM32 path reads byte-granular (no element-alignment
// assumption) and reinterprets back to s32 lanes.
template<bool aligned>
ALWAYS_INLINE static GSVector4i loadl(const void* p)
{
#ifdef CPU_ARCH_ARM32
  if constexpr (!aligned)
  {
    const int32x2_t low = vreinterpret_s32_s8(vld1_s8((int8_t*)p));
    return GSVector4i(vcombine_s32(low, vcreate_s32(0)));
  }
#endif

  const int32x2_t low = vld1_s32((const int32_t*)p);
  return GSVector4i(vcombine_s32(low, vcreate_s32(0)));
}

// Loads 8 bytes into the high half of the vector; the low half is zeroed.
// ARM32 unaligned variant combines at byte granularity to avoid the 64-bit
// element alignment assumption of vld1_s64.
template<bool aligned>
ALWAYS_INLINE static GSVector4i loadh(const void* p)
{
#ifdef CPU_ARCH_ARM32
  if constexpr (!aligned)
  {
    const int8x16_t combined = vcombine_s8(vdup_n_s8(0), vld1_s8((int8_t*)p));
    return GSVector4i(vreinterpretq_s32_s8(combined));
  }
#endif

  const int64x2_t combined = vcombine_s64(vdup_n_s64(0), vld1_s64((int64_t*)p));
  return GSVector4i(vreinterpretq_s32_s64(combined));
}

Expand All @@ -2149,6 +2203,11 @@ class alignas(16) GSVector4i
// Loads 16 bytes from p into all four lanes.
// ARM32 cannot assume alignment for element-sized loads, so the unaligned
// variant goes through a byte-granular vld1q_s8 before reinterpreting.
template<bool aligned>
ALWAYS_INLINE static GSVector4i load(const void* p)
{
#ifdef CPU_ARCH_ARM32
  if constexpr (!aligned)
  {
    const int8x16_t bytes = vld1q_s8((int8_t*)p);
    return GSVector4i(vreinterpretq_s32_s8(bytes));
  }
#endif

  const int64x2_t dwords = vld1q_s64((int64_t*)p);
  return GSVector4i(vreinterpretq_s32_s64(dwords));
}

Expand All @@ -2167,19 +2226,45 @@ class alignas(16) GSVector4i
std::memcpy(p, &val, sizeof(u32));
}

// Stores the low 8 bytes of the vector to p.
// The ARM32 unaligned path writes byte-granular so the compiler cannot
// assume 64-bit element alignment of the destination.
template<bool aligned>
ALWAYS_INLINE static void storel(void* p, const GSVector4i& v)
{
#ifdef CPU_ARCH_ARM32
  if constexpr (!aligned)
  {
    const int8x8_t low_bytes = vget_low_s8(vreinterpretq_s8_s32(v.v4s));
    vst1_s8((int8_t*)p, low_bytes);
    return;
  }
#endif

  const int64x1_t low_dword = vget_low_s64(vreinterpretq_s64_s32(v.v4s));
  vst1_s64((int64_t*)p, low_dword);
}

// Stores the high 8 bytes of the vector to p.
// Mirrors storel(): byte-granular on the ARM32 unaligned path, 64-bit
// element store otherwise.
template<bool aligned>
ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v)
{
#ifdef CPU_ARCH_ARM32
  if constexpr (!aligned)
  {
    const int8x8_t high_bytes = vget_high_s8(vreinterpretq_s8_s32(v.v4s));
    vst1_s8((int8_t*)p, high_bytes);
    return;
  }
#endif

  const int64x1_t high_dword = vget_high_s64(vreinterpretq_s64_s32(v.v4s));
  vst1_s64((int64_t*)p, high_dword);
}

// Stores all 16 bytes of the vector to p.
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector4i& v)
{
#ifdef CPU_ARCH_ARM32
  if constexpr (!aligned)
  {
    // Byte-wise store: no element alignment assumed by the compiler.
    const int8x16_t bytes = vreinterpretq_s8_s32(v.v4s);
    vst1q_s8((int8_t*)p, bytes);
    return;
  }
#endif

  // 64-bit element store on the alignment-guaranteed path.
  const int64x2_t dwords = vreinterpretq_s64_s32(v.v4s);
  vst1q_s64((int64_t*)p, dwords);
}

Expand Down Expand Up @@ -2652,8 +2737,14 @@ class alignas(16) GSVector4

// Returns a vector with every bit set in all four lanes (all-ones mask reinterpreted as float).
ALWAYS_INLINE static GSVector4 xffffffff() { return GSVector4(vreinterpretq_f32_u32(vdupq_n_u32(0xFFFFFFFFu))); }

// Loads two floats into the low half; the high half is zeroed.
// ARM32 unaligned variant loads as bytes and reinterprets to f32 so no
// element alignment is assumed.
template<bool aligned>
ALWAYS_INLINE static GSVector4 loadl(const void* p)
{
#ifdef CPU_ARCH_ARM32
  if constexpr (!aligned)
  {
    const float32x2_t low = vreinterpret_f32_s8(vld1_s8((int8_t*)p));
    return GSVector4(vcombine_f32(low, vcreate_f32(0)));
  }
#endif

  const float32x2_t low = vld1_f32((const float*)p);
  return GSVector4(vcombine_f32(low, vcreate_f32(0)));
}

Expand All @@ -2662,32 +2753,55 @@ class alignas(16) GSVector4
// Loads four floats from p.
// ARM32 unaligned variant reads 16 bytes at byte granularity and
// reinterprets, avoiding vld1q_f32's element-alignment assumption.
template<bool aligned>
ALWAYS_INLINE static GSVector4 load(const void* p)
{
#ifdef CPU_ARCH_ARM32
  if constexpr (!aligned)
  {
    const int8x16_t bytes = vld1q_s8((int8_t*)p);
    return GSVector4(vreinterpretq_f32_s8(bytes));
  }
#endif

  return GSVector4(vld1q_f32((const float*)p));
}

// Non-temporal store hint; implemented here as a regular vst1q_f32 store.
ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { vst1q_f32((float*)p, v.v4s); }

// Stores the low two floats to p.
// Merged post-commit body: the scraped diff left the removed pre-commit
// ARM64/#else branches (vst1_f64/vst1_s64) interleaved here, which would not
// compile. ARM32 unaligned path stores byte-granular; otherwise a plain
// vst1_f32 of the low half.
template<bool aligned>
ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
{
#ifdef CPU_ARCH_ARM32
  if constexpr (!aligned)
  {
    vst1_s8((int8_t*)p, vreinterpret_s8_f32(vget_low_f32(v.v4s)));
    return;
  }
#endif

  vst1_f32((float*)p, vget_low_f32(v.v4s));
}

// Stores the high two floats to p.
// Merged post-commit body: the stale pre-commit ARM64/#else branches left by
// the scraped diff are removed. ARM32 unaligned path stores byte-granular;
// otherwise a plain vst1_f32 of the high half.
template<bool aligned>
ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
{
#ifdef CPU_ARCH_ARM32
  if constexpr (!aligned)
  {
    vst1_s8((int8_t*)p, vreinterpret_s8_f32(vget_high_f32(v.v4s)));
    return;
  }
#endif

  vst1_f32((float*)p, vget_high_f32(v.v4s));
}

// Stores all four floats to p.
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector4& v)
{
#ifdef CPU_ARCH_ARM32
  if constexpr (!aligned)
  {
    // Byte-wise store so the compiler cannot assume element alignment.
    const int8x16_t bytes = vreinterpretq_s8_f32(v.v4s);
    vst1q_s8((int8_t*)p, bytes);
    return;
  }
#endif

  vst1q_f32((float*)p, v.v4s);
}

Expand Down
47 changes: 40 additions & 7 deletions src/common/gsvector_nosimd.h
Original file line number Diff line number Diff line change
Expand Up @@ -467,14 +467,19 @@ class alignas(16) GSVector2i

ALWAYS_INLINE static GSVector2i set32(s32 v) { return GSVector2i(v, 0); }

// Reads both 32-bit lanes from p. memcpy is alignment-agnostic, so the
// aligned template parameter exists only for interface parity with the
// SIMD backends.
template<bool aligned>
ALWAYS_INLINE static GSVector2i load(const void* p)
{
  GSVector2i result;
  std::memcpy(result.S32, p, sizeof(result.S32));
  return result;
}

// Writes both 32-bit lanes to p via alignment-safe memcpy; the aligned
// template parameter is accepted only for API parity with the SIMD
// implementations. The stale pre-commit non-template store() duplicated by
// the scraped diff is removed.
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector2i& v)
{
  std::memcpy(p, v.S32, sizeof(S32));
}

// Stores only lane 0 (x) to p; memcpy keeps it alignment-safe.
ALWAYS_INLINE static void store32(void* p, const GSVector2i& v) { std::memcpy(p, &v.x, sizeof(s32)); }

Expand Down Expand Up @@ -658,14 +663,19 @@ class alignas(16) GSVector2
return ret;
}

// Reads two floats from p. memcpy handles any alignment, so the aligned
// template parameter exists only for parity with the SIMD backends.
template<bool aligned>
ALWAYS_INLINE static GSVector2 load(const void* p)
{
  GSVector2 result;
  std::memcpy(result.F32, p, sizeof(result.F32));
  return result;
}

// Writes both float lanes to p via alignment-safe memcpy; aligned is ignored
// (kept for API parity with the SIMD backends). The stale pre-commit
// non-template store() duplicated by the scraped diff is removed.
template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector2& v)
{
  std::memcpy(p, &v.F32, sizeof(F32));
}

ALWAYS_INLINE GSVector2 operator-() const { return neg(); }

Expand Down Expand Up @@ -1530,6 +1540,7 @@ class alignas(16) GSVector4i

ALWAYS_INLINE static GSVector4i zext32(s32 v) { return GSVector4i(v, 0, 0, 0); }

template<bool aligned>
ALWAYS_INLINE static GSVector4i loadl(const void* p)
{
GSVector4i ret;
Expand All @@ -1538,6 +1549,7 @@ class alignas(16) GSVector4i
return ret;
}

template<bool aligned>
ALWAYS_INLINE static GSVector4i loadh(const void* p)
{
GSVector4i ret;
Expand All @@ -1546,7 +1558,11 @@ class alignas(16) GSVector4i
return ret;
}

// Loads the given GSVector2i into the high half of the result.
// GSVector2i is alignas(16), so its address is always aligned and the
// delegate call can use the aligned variant regardless of the template
// argument. The stale pre-commit non-template overload duplicated by the
// scraped diff is removed.
template<bool aligned>
ALWAYS_INLINE static GSVector4i loadh(const GSVector2i& v)
{
  return loadh<true>(&v);
}

template<bool aligned>
ALWAYS_INLINE static GSVector4i load(const void* p)
Expand All @@ -1558,9 +1574,17 @@ class alignas(16) GSVector4i

// Non-temporal store hint; implemented here as a plain memcpy of all lanes.
ALWAYS_INLINE static void storent(void* p, const GSVector4i& v) { std::memcpy(p, v.S32, sizeof(v.S32)); }

// Stores the low two 32-bit lanes to p (alignment-safe memcpy; the aligned
// template parameter is kept for API parity). The stale pre-commit
// non-template storel() duplicated by the scraped diff is removed.
template<bool aligned>
ALWAYS_INLINE static void storel(void* p, const GSVector4i& v)
{
  std::memcpy(p, &v.S32[0], sizeof(s32) * 2);
}

// Stores the high two 32-bit lanes to p (alignment-safe memcpy; the aligned
// template parameter is kept for API parity). The stale pre-commit
// non-template storeh() duplicated by the scraped diff is removed.
template<bool aligned>
ALWAYS_INLINE static void storeh(void* p, const GSVector4i& v)
{
  std::memcpy(p, &v.S32[2], sizeof(s32) * 2);
}

template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector4i& v)
Expand Down Expand Up @@ -1958,6 +1982,7 @@ class alignas(16) GSVector4
return ret;
}

template<bool aligned>
ALWAYS_INLINE static GSVector4 loadl(const void* p)
{
GSVector4 ret;
Expand All @@ -1977,9 +2002,17 @@ class alignas(16) GSVector4

// Non-temporal store hint; implemented here as a plain memcpy of the whole vector.
ALWAYS_INLINE static void storent(void* p, const GSVector4& v) { std::memcpy(p, &v, sizeof(v)); }

// Stores the low two floats (x, y) to p via alignment-safe memcpy; aligned
// is kept for API parity with the SIMD backends. The stale pre-commit
// non-template storel() duplicated by the scraped diff is removed.
template<bool aligned>
ALWAYS_INLINE static void storel(void* p, const GSVector4& v)
{
  std::memcpy(p, &v.x, sizeof(float) * 2);
}

// Stores the high two floats (z, w) to p via alignment-safe memcpy; aligned
// is kept for API parity with the SIMD backends. The stale pre-commit
// non-template storeh() duplicated by the scraped diff is removed.
template<bool aligned>
ALWAYS_INLINE static void storeh(void* p, const GSVector4& v)
{
  std::memcpy(p, &v.z, sizeof(float) * 2);
}

template<bool aligned>
ALWAYS_INLINE static void store(void* p, const GSVector4& v)
Expand Down
Loading

0 comments on commit 5c03e1d

Please sign in to comment.