Skip to content

Commit

Permalink
Merge pull request #20942 from trz42/20240629112950_new_pr_Highway104
Browse files Browse the repository at this point in the history
{lib}[GCCcore/12.3.0] Highway 1.0.4 fix for failing test on zen4
  • Loading branch information
boegel authored Jul 31, 2024
2 parents ae6c8d3 + 685f34d commit 524da37
Show file tree
Hide file tree
Showing 2 changed files with 66 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,13 @@ toolchain = {'name': 'GCCcore', 'version': '12.3.0'}

source_urls = ['https://github.com/google/highway/archive/refs/tags/']
sources = ['%(version)s.tar.gz']
checksums = ['faccd343935c9e98afd1016e9d20e0b8b89d908508d1af958496f8c2d3004ac2']
patches = ['Highway-1.0.4-zen4-fix-TruncateTo-bug.patch']

checksums = [
{'1.0.4.tar.gz': 'faccd343935c9e98afd1016e9d20e0b8b89d908508d1af958496f8c2d3004ac2'},
{'Highway-1.0.4-zen4-fix-TruncateTo-bug.patch':
'e571413c290076a729dbb1df105a4bfa106099238d1b438e74a9dfc9557eb4a2'},
]

builddependencies = [
('binutils', '2.40'),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
A single test failed when building on AMD Genoa (a.k.a. Zen4); the error message
is reported in https://github.com/google/highway/issues/1913

Building v1.0.5 passed all tests. Hence, this patch uses some of the changes made
in v1.0.5 to let the single failing test succeed.

Looking at the PRs added by v1.0.5, a promising candidate was identified
(https://github.com/google/highway/pull/1276), and all files changed in that PR
were studied in detail (assessed whether the changes could be related to the failing
test on Zen4). Luckily, the four changes provided in the patch below were
found to resolve the issue. It was also tested whether a subset of these four changes
would resolve the issue, but this did not succeed.

Author: Thomas Roeblitz (University of Bergen)

diff --git a/hwy/ops/x86_256-inl.h b/hwy/ops/x86_256-inl.h
index 4e2e83e8..2fbf99c7 100644
--- a/hwy/ops/x86_256-inl.h
+++ b/hwy/ops/x86_256-inl.h
@@ -4185,7 +4185,7 @@ HWY_INLINE Vec128<uint32_t> LookupAndConcatHalves(Vec256<T> v) {
#if HWY_TARGET <= HWY_AVX3_DL
alignas(32) static constexpr uint32_t kMap[8] = {
LO, HI, 0x10101010 + LO, 0x10101010 + HI, 0, 0, 0, 0};
- const auto result = _mm256_permutexvar_epi8(v.raw, Load(d32, kMap).raw);
+ const auto result = _mm256_permutexvar_epi8(Load(d32, kMap).raw, v.raw);
#else
alignas(32) static constexpr uint32_t kMap[8] = {LO, HI, ~0u, ~0u,
~0u, ~0u, LO, HI};
@@ -4208,7 +4208,7 @@ HWY_INLINE Vec128<uint32_t, 2> LookupAndConcatQuarters(Vec256<T> v) {
#if HWY_TARGET <= HWY_AVX3_DL
alignas(32) static constexpr uint16_t kMap[16] = {
LO, HI, 0x1010 + LO, 0x1010 + HI, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
- const auto result = _mm256_permutexvar_epi8(v.raw, Load(d16, kMap).raw);
+ const auto result = _mm256_permutexvar_epi8(Load(d16, kMap).raw, v.raw);
return LowerHalf(Vec128<uint32_t>{_mm256_castsi256_si128(result)});
#else
constexpr uint16_t ff = static_cast<uint16_t>(~0u);
@@ -4229,7 +4229,7 @@ HWY_API Vec32<uint8_t> TruncateTo(D /* tag */, Vec256<uint64_t> v) {
#if HWY_TARGET <= HWY_AVX3_DL
alignas(32) static constexpr uint32_t kMap[8] = {0x18100800u, 0, 0, 0,
0, 0, 0, 0};
- const auto result = _mm256_permutexvar_epi8(v.raw, Load(d32, kMap).raw);
+ const auto result = _mm256_permutexvar_epi8(Load(d32, kMap).raw, v.raw);
return LowerHalf(LowerHalf(LowerHalf(Vec256<uint8_t>{result})));
#else
alignas(32) static constexpr uint32_t kMap[8] = {0xFFFF0800u, ~0u, ~0u, ~0u,
diff --git a/hwy/ops/x86_512-inl.h b/hwy/ops/x86_512-inl.h
index 167922d8..83f2ee67 100644
--- a/hwy/ops/x86_512-inl.h
+++ b/hwy/ops/x86_512-inl.h
@@ -3497,7 +3497,7 @@ HWY_API Vec128<uint8_t> TruncateTo(D /* tag */, const Vec512<uint32_t> v) {
alignas(16) static constexpr uint8_t k8From32[16] = {
0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60};
const Vec512<uint8_t> bytes{
- _mm512_permutexvar_epi32(LoadDup128(d8, k8From32).raw, v.raw)};
+ _mm512_permutexvar_epi8(LoadDup128(d8, k8From32).raw, v.raw)};
#else
const Full512<uint32_t> d32;
// In each 128 bit block, gather the lower byte of 4 uint32_t lanes into the

0 comments on commit 524da37

Please sign in to comment.