Skip to content

Commit

Permalink
Allow SIMD, Eigen, and libm functions in scattering analysis
Browse files Browse the repository at this point in the history
Different implementations exhibit very different performance (factor
of 5 observed) depending on the compiler and system environment.
Add unit and performance tests.
  • Loading branch information
rc83 committed Feb 7, 2020
1 parent 748b601 commit 801ffc2
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 32 deletions.
1 change: 1 addition & 0 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ set(tsts
${CMAKE_SOURCE_DIR}/src/molecule_test.h
${CMAKE_SOURCE_DIR}/src/particle_test.h
${CMAKE_SOURCE_DIR}/src/potentials_test.h
${CMAKE_SOURCE_DIR}/src/scatter_test.h
${CMAKE_SOURCE_DIR}/src/space_test.h
${CMAKE_SOURCE_DIR}/src/tensor_test.h
)
Expand Down
72 changes: 40 additions & 32 deletions src/scatter.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ namespace Faunus {
*/
namespace Scatter {

// Selects, via the `method` template parameter of StructureFactorPBC, how the sums of
// sin(q*r) and cos(q*r) over all positions are evaluated:
//   SIMD    - the q*r products are stored in a vector first, then sine and cosine are
//             summed in two separate loops
//   EIGEN   - the q*r products are formed through an Eigen::Map over the position data
//   GENERIC - a single loop over the positions calling sin and cos for each one
enum Algorithm { SIMD, EIGEN, GENERIC }; //!< Selections for math algorithms

/** @brief Form factor, `F(q)`, for a hard sphere of radius `R`.
*/
template <class T = float> class FormFactorSphere {
Expand Down Expand Up @@ -250,7 +252,8 @@ template <typename T> class SamplingPolicy {
*
* For more information, see @see http://doi.org/d8zgw5 and @see http://doi.org/10.1063/1.449987.
*/
template <typename T = float, typename TSamplingPolicy = SamplingPolicy<T>> class StructureFactorPBC : private TSamplingPolicy {
template <typename T = float, Algorithm method = SIMD, typename TSamplingPolicy = SamplingPolicy<T>>
class StructureFactorPBC : private TSamplingPolicy {
//! sample directions (h,k,l)
const std::vector<Point> directions = {
{1, 0, 0}, {0, 1, 0}, {0, 0, 1}, // 3 permutations
Expand All @@ -264,42 +267,48 @@ template <typename T = float, typename TSamplingPolicy = SamplingPolicy<T>> clas
public:
StructureFactorPBC(int q_multiplier) : p_max(q_multiplier){}

template <class Tpvec> void sample(const Tpvec &positions, const double boxlength) {
template <class Tpositions> void sample(const Tpositions &positions, const double boxlength) {
// https://gcc.gnu.org/gcc-9/porting_to.html#ompdatasharing
// #pragma omp parallel for collapse(2) default(none) shared(directions, p_max, boxlength) shared(positions)
#pragma omp parallel for collapse(2) default(shared)
for (int i = 0; i < directions.size(); ++i) {
for (int p = 1; p <= p_max; ++p) { // loop over multiples of q
const Point q = (2 * pc::pi * p / boxlength) * directions[i]; // scattering vector
#ifdef __GNUC__
// When sine and cosine is computed in separate loops, advanced sine and cosine implementation
// utilizing SIMD instructions may be used to get at least 4 times performance boost.
// As of January 2020, only GCC exploits this using libmvec library if --ffast-math is enabled.
std::vector<T> qr_products(positions.size());
std::transform(positions.begin(), positions.end(), qr_products.begin(),
[&q](auto &r) { return q.dot(r); });
// as of January 2020 the std::transform_reduce is not implemented in libc++
T sum_sin = 0;
for (auto &qr : qr_products) {
sum_sin += std::sin(qr);
}
// as of January 2020 the std::transform_reduce is not implemented in libc++
T sum_cos = 0;
for (auto &qr : qr_products) {
sum_cos += std::cos(qr);
}
#else
// TODO: Optimize also for other compilers than GCC by using a vector math library, e.g.,
// TODO: https://github.com/vectorclass/version2
T sum_sin = 0, sum_cos = 0;
for (auto &r : positions) { // loop over positions
T qr = _q.dot(r); // scalar product q*r
sum_sin += sin(qr);
sum_cos += cos(qr);
}
#endif
T sum_sin = 0.0;
T sum_cos = 0.0;
if constexpr (method == SIMD) {
// When sine and cosine are computed in separate loops, advanced sine and cosine implementations
// utilizing SIMD instructions may be used to get at least a 4-fold performance boost.
// As of January 2020, only GCC exploits this, using the libmvec library, if -ffast-math is enabled.
std::vector<T> qr_std(positions.size());
std::transform(positions.begin(), positions.end(), qr_std.begin(),
[&q](auto &r) { return q.dot(r); });
// as of January 2020 the std::transform_reduce is not implemented in libc++
for (auto &qr : qr_std) {
sum_sin += std::sin(qr);
}
// as of January 2020 the std::transform_reduce is not implemented in libc++
for (auto &qr : qr_std) {
sum_cos += std::cos(qr);
}
} else if constexpr (method == EIGEN) {
// Map is a Nx3 matrix facade into original std::vector. Eigen does not accept `float`,
// hence `double` must be used instead of the template parameter T here.
auto qr = Eigen::Map<Eigen::MatrixXd, 0, Eigen::Stride<1, 3>>((double *)positions.data(),
positions.size(), 3) * q;
sum_sin = qr.array().cast<T>().sin().sum();
sum_cos = qr.array().cast<T>().cos().sum();
} else if constexpr (method == GENERIC) {
// TODO: Optimize also for other compilers than GCC by using a vector math library, e.g.,
// TODO: https://github.com/vectorclass/version2
for (auto &r : positions) { // loop over positions
T qr = q.dot(r); // scalar product q*r
sum_sin += sin(qr);
sum_cos += cos(qr);
}
};
// collect average, `norm()` gives the scattering vector length
const T sf = (sum_sin * sum_sin + sum_cos * sum_cos) / (float)(positions.size());
const T sf = (sum_sin * sum_sin + sum_cos * sum_cos) / (T)(positions.size());
#pragma omp critical
// avoid race conditions when updating the map
addSampling(q.norm(), sf, 1.0);
Expand All @@ -314,7 +323,6 @@ template <typename T = float, typename TSamplingPolicy = SamplingPolicy<T>> clas
using TSamplingPolicy::getSampling;
};


/**
* @brief Calculate structure factor using explicit q averaging in isotropic periodic boundary conditions (IPBC).
*
Expand All @@ -332,7 +340,7 @@ template <typename T = float, typename TSamplingPolicy = SamplingPolicy<T>> clas
public:
StructureFactorIPBC(int q_multiplier) : p_max(q_multiplier){}

template <class Tpvec> void sample(const Tpvec &positions, const double boxlength) {
template <class Tpositions> void sample(const Tpositions &positions, const double boxlength) {
// https://gcc.gnu.org/gcc-9/porting_to.html#ompdatasharing
// #pragma omp parallel for collapse(2) default(none) shared(directions, p_max, positions, boxlength)
#pragma omp parallel for collapse(2) default(shared)
Expand Down
58 changes: 58 additions & 0 deletions src/scatter_test.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#include "analysis.h"
#include "io.h"

#define ANKERL_NANOBENCH_IMPLEMENT
#include "nanobench.h"

namespace Faunus {
namespace Scatter {

using doctest::Approx;

// Box length passed as `boxlength` to the `sample()` calls in the tests below.
// Declared `const` so that it has internal linkage: a mutable namespace-scope variable
// defined in a header would be defined once per including translation unit.
const double box = 80;

// Fixed set of nine arbitrary particle positions shared by the structure factor tests.
const std::vector<Point> positions = {
    {10, 20, 30}, {-32, 19, 1}, {34, -2, 23}, {0, 0, 1}, {25, 0, -12},
    {-6, -4, -29}, {-12, 23, -3}, {3, 1, -4}, {-31, 29, -20}};

// Runs the same check for the SIMD, EIGEN and GENERIC variants of StructureFactorPBC<float, ...>:
// samples the shared `positions` with a q multiplier of 2 and compares every (q, S) entry
// returned by getSampling() against the reference values.
TEST_CASE_TEMPLATE("[Faunus] StructureFactorPBC", T, StructureFactorPBC<float, SIMD>,
                   StructureFactorPBC<float, EIGEN>, StructureFactorPBC<float, GENERIC>) {
    // Reference data stored flat as q0, S0, q1, S1, ... in the iteration order of getSampling()
    const std::vector<float> result = {0.0785, 1.48621, 0.1111, 0.567279, 0.136, 1.39515,
                                       0.1571, 0.730579, 0.2221, 0.701547, 0.2721, 0.692064};
    size_t cnt = 0; // index of the next reference value to consume
    T scatter(2);
    scatter.sample(positions, box);
    for (auto [q, S] : scatter.getSampling()) {
        // `at()` throws std::out_of_range rather than reading past the end of `result`
        // if more samples are returned than there are reference values.
        CHECK(q == Approx(result.at(cnt++)));
        CHECK(S == Approx(result.at(cnt++)));
    }
    // every reference value must have been consumed, i.e. no sample is missing
    CHECK(cnt == result.size());
}

#ifdef ANKERL_NANOBENCH_H_INCLUDED
// Times StructureFactorPBC<double, ...>::sample for the SIMD, EIGEN and GENERIC variants
// on the same set of 1000 random positions; compiled only when nanobench has been included.
TEST_CASE("Benchmark") {
    constexpr int q_multiplier = 10;

    // positions drawn from Eigen's Random(), scaled by the box length
    std::vector<Point> random_positions(1000);
    for (auto &position : random_positions) {
        position = Eigen::Vector3d::Random() * box;
    }

    ankerl::nanobench::Config config;
    config.minEpochIterations(100);

    config.run("SIMD", [&] { StructureFactorPBC<double, SIMD>(q_multiplier).sample(random_positions, box); })
        .doNotOptimizeAway();
    config.run("EIGEN", [&] { StructureFactorPBC<double, EIGEN>(q_multiplier).sample(random_positions, box); })
        .doNotOptimizeAway();
    config.run("GENERIC", [&] { StructureFactorPBC<double, GENERIC>(q_multiplier).sample(random_positions, box); })
        .doNotOptimizeAway();
}
#endif

// Samples the shared `positions` with a default-parameterised StructureFactorIPBC (q multiplier 2)
// and compares every (q, S) entry returned by getSampling() against the reference values.
TEST_CASE("[Faunus] StructureFactorIPBC") {
    // Reference data stored flat as q0, S0, q1, S1, ... in the iteration order of getSampling()
    const std::vector<double> result = {0.0785, 0.384363, 0.1111, 1.51652, 0.136, 1.18027,
                                        0.1571, 1.40662, 0.2221, 2.06042, 0.2721, 1.53482};
    size_t cnt = 0; // index of the next reference value to consume
    StructureFactorIPBC scatter(2);
    scatter.sample(positions, box);
    for (auto [q, S] : scatter.getSampling()) {
        // `at()` throws std::out_of_range rather than reading past the end of `result`
        // if more samples are returned than there are reference values.
        CHECK(q == Approx(result.at(cnt++)));
        CHECK(S == Approx(result.at(cnt++)));
    }
    // every reference value must have been consumed, i.e. no sample is missing
    CHECK(cnt == result.size());
}

} // namespace Scatter
} // namespace Faunus
1 change: 1 addition & 0 deletions src/unittests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "space_test.h"
#include "tensor_test.h"
#include "externalpotential_test.h"
#include "scatter_test.h"

#include "mpicontroller.h"
#include "auxiliary.h"
Expand Down

0 comments on commit 801ffc2

Please sign in to comment.