
Commit

- add some missing files
christoph-hart committed Dec 30, 2023
1 parent a295c6c commit 7fbfda7
Showing 10 changed files with 30,489 additions and 0 deletions.
24,441 changes: 24,441 additions & 0 deletions hi_tools/hi_neural/RTNeural/modules/json/json.hpp

Large diffs are not rendered by default.

1,671 changes: 1,671 additions & 0 deletions hi_tools/hi_neural/RTNeural/modules/xsimd/arch/xsimd_avx.hpp

Large diffs are not rendered by default.

950 changes: 950 additions & 0 deletions hi_tools/hi_neural/RTNeural/modules/xsimd/arch/xsimd_avx2.hpp

Large diffs are not rendered by default.

627 changes: 627 additions & 0 deletions hi_tools/hi_neural/RTNeural/modules/xsimd/arch/xsimd_avx512bw.hpp

Large diffs are not rendered by default.

28 changes: 28 additions & 0 deletions hi_tools/hi_neural/RTNeural/modules/xsimd/arch/xsimd_avx512cd.hpp
@@ -0,0 +1,28 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_AVX512CD_HPP
#define XSIMD_AVX512CD_HPP

#include "../types/xsimd_avx512cd_register.hpp"

namespace xsimd
{

namespace kernel
{
// Nothing there yet.

}

}

#endif
212 changes: 212 additions & 0 deletions hi_tools/hi_neural/RTNeural/modules/xsimd/arch/xsimd_avx512dq.hpp
@@ -0,0 +1,212 @@
/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
* Martin Renou *
* Copyright (c) QuantStack *
* Copyright (c) Serge Guelton *
* *
* Distributed under the terms of the BSD 3-Clause License. *
* *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_AVX512_DQ_HPP
#define XSIMD_AVX512_DQ_HPP

#include "../types/xsimd_avx512dq_register.hpp"

namespace xsimd
{

namespace kernel
{
using namespace types;

// bitwise_and
template <class A>
inline batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
{
return _mm512_and_ps(self, other);
}
template <class A>
inline batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
{
return _mm512_and_pd(self, other);
}

// bitwise_andnot
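// Note: _mm512_andnot_ps(a, b) computes (~a) & b, so the operands are
// swapped below to yield self & ~other.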
template <class A>
inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
{
return _mm512_andnot_ps(other, self);
}
template <class A>
inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
{
return _mm512_andnot_pd(other, self);
}

// bitwise_not
template <class A>
inline batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx512dq>) noexcept
{
return _mm512_xor_ps(self, _mm512_castsi512_ps(_mm512_set1_epi32(-1)));
}
template <class A>
inline batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx512dq>) noexcept
{
return _mm512_xor_pd(self, _mm512_castsi512_pd(_mm512_set1_epi32(-1)));
}

// bitwise_or
template <class A>
inline batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
{
return _mm512_or_ps(self, other);
}
template <class A>
inline batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
{
return _mm512_or_pd(self, other);
}

template <class A, class T>
inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512dq>) noexcept
{
using register_type = typename batch_bool<T, A>::register_type;
return register_type(self.data | other.data);
}

// bitwise_xor
template <class A>
inline batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
{
return _mm512_xor_ps(self, other);
}
template <class A>
inline batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
{
return _mm512_xor_pd(self, other);
}

// haddp
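// Horizontal add across rows: returns a batch whose i-th lane is the sum
// of all lanes of row[i], for the 16 input rows.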
template <class A>
inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx512dq>) noexcept
{
// The following folds over the vector once:
// tmp1 = [a0..8, b0..8]
// tmp2 = [a8..f, b8..f]
#define XSIMD_AVX512_HADDP_STEP1(I, a, b) \
batch<float, avx512f> res##I; \
{ \
auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
res##I = _mm512_add_ps(tmp1, tmp2); \
}

XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]);
XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]);
XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]);
XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]);
XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]);
XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]);
XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]);
XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]);

#undef XSIMD_AVX512_HADDP_STEP1

// The following folds the code and shuffles so that hadd_ps produces the correct result
// tmp1 = [a0..4, a8..12, b0..4, b8..12] (same for tmp3)
// tmp2 = [a5..8, a12..16, b5..8, b12..16] (same for tmp4)
// tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ...
#define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d) \
batch<float, avx2> halfx##I; \
{ \
auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \
auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \
\
auto resx1 = _mm512_add_ps(tmp1, tmp2); \
\
auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \
auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \
\
auto resx2 = _mm512_add_ps(tmp3, tmp4); \
\
auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \
auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \
\
auto resx3 = _mm512_add_ps(tmp5, tmp6); \
\
halfx##I = _mm256_hadd_ps(_mm512_extractf32x8_ps(resx3, 0), \
_mm512_extractf32x8_ps(resx3, 1)); \
}

XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3);
XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7);

#undef XSIMD_AVX512_HADDP_STEP2

auto concat = _mm512_castps256_ps512(halfx0);
concat = _mm512_insertf32x8(concat, halfx1, 1);
return concat;
}

// ldexp
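// ldexp(x, n) computes x * 2^n; _mm512_scalef_pd does exactly that once
// the integer exponents have been converted to double.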
template <class A>
inline batch<double, A> ldexp(const batch<double, A>& self, const batch<as_integer_t<double>, A>& other, requires_arch<avx512dq>) noexcept
{
return _mm512_scalef_pd(self, _mm512_cvtepi64_pd(other));
}

// mul
template <class A>
inline batch<uint64_t, A> mul(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx512dq>) noexcept
{
return _mm512_mullo_epi64(self, other);
}

template <class A>
inline batch<int64_t, A> mul(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<avx512dq>) noexcept
{
return _mm512_mullo_epi64(self, other);
}

// nearbyint_as_int
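// _mm512_cvtpd_epi64 rounds according to the current rounding mode
// (round-to-nearest-even by default), matching nearbyint semantics.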
template <class A>
inline batch<int64_t, A> nearbyint_as_int(batch<double, A> const& self,
requires_arch<avx512dq>) noexcept
{
return _mm512_cvtpd_epi64(self);
}

// reduce_add
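// Split the 512-bit register into its two 256-bit halves, add them, and
// let the AVX2 kernel finish the horizontal reduction.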
template <class A>
inline float reduce_add(batch<float, A> const& rhs, requires_arch<avx512dq>) noexcept
{
__m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
__m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
__m256 res1 = _mm256_add_ps(tmp1, tmp2);
return reduce_add(batch<float, avx2>(res1), avx2 {});
}

// convert
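// Direct int64 <-> double vector conversions are AVX512DQ instructions,
// which is why these fast casts live in this header.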
namespace detail
{
template <class A>
inline batch<double, A> fast_cast(batch<int64_t, A> const& self, batch<double, A> const&, requires_arch<avx512dq>) noexcept
{
return _mm512_cvtepi64_pd(self);
}

template <class A>
inline batch<int64_t, A> fast_cast(batch<double, A> const& self, batch<int64_t, A> const&, requires_arch<avx512dq>) noexcept
{
return _mm512_cvttpd_epi64(self);
}

}

}

}

#endif
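
For context, these kernel overloads are not called directly: xsimd's front-end functions dispatch to the most specific overload through the requires_arch tag. Below is a minimal sketch of how the committed kernels are reached from user code, assuming AVX512DQ is enabled at compile time (e.g. -mavx512dq); the example is illustrative and not part of the commit.

#include "xsimd/xsimd.hpp"
#include <cstdio>
#include <cstdint>

int main()
{
    // Broadcast constructors: 8 double / 8 int64 lanes per 512-bit batch.
    xsimd::batch<double, xsimd::avx512dq> x(1.5);
    xsimd::batch<int64_t, xsimd::avx512dq> n(3);

    // Dispatches to the ldexp kernel above: 1.5 * 2^3 = 12.0 per lane.
    auto y = xsimd::ldexp(x, n);

    // Horizontal reduction: 12.0 * 8 lanes = 96.0. For float batches this
    // uses the halving reduce_add kernel added in this commit.
    std::printf("%f\n", xsimd::reduce_add(y));
}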