Skip to content

Commit

Permalink
Merge pull request #760 from Ka-zam/rcp_avx512
Browse files Browse the repository at this point in the history
New AVX512F implementation
  • Loading branch information
jdemel authored Mar 30, 2024
2 parents 5658541 + 89bfc55 commit 8a015bb
Show file tree
Hide file tree
Showing 2 changed files with 203 additions and 1 deletion.
201 changes: 201 additions & 0 deletions kernels/volk/volk_32f_reciprocal_32f.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
/* -*- c++ -*- */
/*
* Copyright 2024 Magnus Lundmark <magnuslundmark@gmail.com>
*
* This file is part of VOLK
*
* SPDX-License-Identifier: LGPL-3.0-or-later
*/

/*!
* \page volk_32f_reciprocal_32f
*
* \b Overview
*
* Computes the reciprocal of the input vector and stores the results
* in the output vector. For the AVX512F implementation the relative
* error is < 2**(-14) = 6.1e-05
*
* <b>Dispatcher Prototype</b>
* \code
* void volk_32f_reciprocal_32f(float* out, const float* in, unsigned int num_points)
* \endcode
*
* \b Inputs
* \li in: A pointer to the input vector of floats.
* \li num_points: The number of data points.
*
* \b Outputs
* \li bVector: A pointer to the output vector of floats.
*
* \b Example
* \code
int N = 10;
unsigned int alignment = volk_get_alignment();
float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
float* out = (float*)volk_malloc(sizeof(float)*N, alignment);
for(unsigned int ii = 1; ii < N; ++ii){
in[ii] = (float)(ii*ii);
}
volk_32f_reciprocal_32f(out, in, N);
for(unsigned int ii = 0; ii < N; ++ii){
printf("out(%i) = %f\n", ii, out[ii]);
}
volk_free(in);
volk_free(out);
* \endcode
*/

#ifndef INCLUDED_volk_32f_reciprocal_32f_a_H
#define INCLUDED_volk_32f_reciprocal_32f_a_H

#ifdef LV_HAVE_GENERIC
static inline void
volk_32f_reciprocal_32f_generic(float* out, const float* in, unsigned int num_points)
{
for (unsigned int i = 0; i < num_points; i++) {
out[i] = 1.f / in[i];
}
}
#endif /* LV_HAVE_GENERIC */

#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
static inline void
volk_32f_reciprocal_32f_a_sse(float* out, const float* in, unsigned int num_points)
{
const __m128 ONE = _mm_set_ps1(1.f);
const unsigned int quarter_points = num_points / 4;

for (unsigned int number = 0; number < quarter_points; number++) {
__m128 x = _mm_load_ps(in);
in += 4;
__m128 r = _mm_div_ps(ONE, x);
_mm_store_ps(out, r);
out += 4;
}

const unsigned int done = quarter_points * 4;

volk_32f_reciprocal_32f_generic(out, in, num_points - done);
}
#endif /* LV_HAVE_SSE */

#ifdef LV_HAVE_AVX
#include <immintrin.h>
static inline void
volk_32f_reciprocal_32f_a_avx(float* out, const float* in, unsigned int num_points)
{
const __m256 ONE = _mm256_set1_ps(1.f);
const unsigned int eighth_points = num_points / 8;

for (unsigned int number = 0; number < eighth_points; number++) {
__m256 x = _mm256_load_ps(in);
in += 8;
__m256 r = _mm256_div_ps(ONE, x);
_mm256_store_ps(out, r);
out += 8;
}

const unsigned int done = eighth_points * 8;

volk_32f_reciprocal_32f_generic(out, in, num_points - done);
}
#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
static inline void
volk_32f_reciprocal_32f_a_avx512(float* out, const float* in, unsigned int num_points)
{
const unsigned int sixteenth_points = num_points / 16;

for (unsigned int number = 0; number < sixteenth_points; number++) {
__m512 x = _mm512_load_ps(in);
in += 16;
__m512 r = _mm512_rcp14_ps(x);
_mm512_store_ps(out, r);
out += 16;
}

const unsigned int done = sixteenth_points * 16;

volk_32f_reciprocal_32f_generic(out, in, num_points - done);
}
#endif /* LV_HAVE_AVX512F */

#endif /* INCLUDED_volk_32f_reciprocal_32f_a_H */

#ifndef INCLUDED_volk_32f_reciprocal_32f_u_H
#define INCLUDED_volk_32f_reciprocal_32f_u_H

#ifdef LV_HAVE_SSE
#include <xmmintrin.h>
static inline void
volk_32f_reciprocal_32f_u_sse(float* out, const float* in, unsigned int num_points)
{
const __m128 ONE = _mm_set_ps1(1.f);
const unsigned int quarter_points = num_points / 4;

for (unsigned int number = 0; number < quarter_points; number++) {
__m128 x = _mm_loadu_ps(in);
in += 4;
__m128 r = _mm_div_ps(ONE, x);
_mm_storeu_ps(out, r);
out += 4;
}

const unsigned int done = quarter_points * 4;

volk_32f_reciprocal_32f_generic(out, in, num_points - done);
}
#endif /* LV_HAVE_SSE */

#ifdef LV_HAVE_AVX
#include <immintrin.h>
static inline void
volk_32f_reciprocal_32f_u_avx(float* out, const float* in, unsigned int num_points)
{
const __m256 ONE = _mm256_set1_ps(1.f);
const unsigned int eighth_points = num_points / 8;

for (unsigned int number = 0; number < eighth_points; number++) {
__m256 x = _mm256_loadu_ps(in);
in += 8;
__m256 r = _mm256_div_ps(ONE, x);
_mm256_storeu_ps(out, r);
out += 8;
}

const unsigned int done = eighth_points * 8;

volk_32f_reciprocal_32f_generic(out, in, num_points - done);
}
#endif /* LV_HAVE_AVX */

#ifdef LV_HAVE_AVX512F
#include <immintrin.h>
static inline void
volk_32f_reciprocal_32f_u_avx512(float* out, const float* in, unsigned int num_points)
{
const unsigned int sixteenth_points = num_points / 16;

for (unsigned int number = 0; number < sixteenth_points; number++) {
__m512 x = _mm512_loadu_ps(in);
in += 16;
__m512 r = _mm512_rcp14_ps(x);
_mm512_storeu_ps(out, r);
out += 16;
}

const unsigned int done = sixteenth_points * 16;

volk_32f_reciprocal_32f_generic(out, in, num_points - done);
}
#endif /* LV_HAVE_AVX512F */

#endif /* INCLUDED_volk_32f_reciprocal_32f_u_H */
3 changes: 2 additions & 1 deletion lib/kernel_tests.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
/* -*- c++ -*- */
/*
* Copyright 2014 - 2021 Free Software Foundation, Inc.
* Copyright 2023 Magnus Lundmark <magnuslundmark@gmail.com>
* Copyright 2023, 2024 Magnus Lundmark <magnuslundmark@gmail.com>
*
* This file is part of VOLK
*
Expand Down Expand Up @@ -141,6 +141,7 @@ std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
QA(VOLK_INIT_TEST(volk_32f_64f_add_64f, test_params))
QA(VOLK_INIT_TEST(volk_32f_s32f_normalize, test_params))
QA(VOLK_INIT_TEST(volk_32f_s32f_power_32f, test_params))
QA(VOLK_INIT_TEST(volk_32f_reciprocal_32f, test_params.make_tol(6.15e-5)))
QA(VOLK_INIT_TEST(volk_32f_sqrt_32f, test_params_inacc))
QA(VOLK_INIT_TEST(volk_32f_s32f_stddev_32f, test_params_inacc))
QA(VOLK_INIT_TEST(volk_32f_stddev_and_mean_32f_x2, test_params.make_absolute(1e-5)))
Expand Down

0 comments on commit 8a015bb

Please sign in to comment.