Allow SIMD, Eigen, and libm functions in scattering analysis

Different implementations exhibit very different performance (factor of 5 observed) depending on the compiler and system environment. Add unit and performance tests.
mlund · Feb 7, 2020 · 801ffc2 · 801ffc2
1 parent 748b601
commit 801ffc2
Show file tree

Hide file tree

Showing 4 changed files with 100 additions and 32 deletions.
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -84,6 +84,7 @@ set(tsts
     ${CMAKE_SOURCE_DIR}/src/molecule_test.h
     ${CMAKE_SOURCE_DIR}/src/particle_test.h
     ${CMAKE_SOURCE_DIR}/src/potentials_test.h
+    ${CMAKE_SOURCE_DIR}/src/scatter_test.h
     ${CMAKE_SOURCE_DIR}/src/space_test.h
     ${CMAKE_SOURCE_DIR}/src/tensor_test.h
     )

diff --git a/src/scatter.h b/src/scatter.h
@@ -10,6 +10,8 @@ namespace Faunus {
  */
 namespace Scatter {
 
+enum Algorithm { SIMD, EIGEN, GENERIC }; //!< Selections for math algorithms
+
 /** @brief Form factor, `F(q)`, for a hard sphere of radius `R`.
  */
 template <class T = float> class FormFactorSphere {
@@ -250,7 +252,8 @@ template <typename T> class SamplingPolicy {
  *
  * For more information, see @see http://doi.org/d8zgw5 and @see http://doi.org/10.1063/1.449987.
  */
-template <typename T = float, typename TSamplingPolicy = SamplingPolicy<T>> class StructureFactorPBC : private TSamplingPolicy {
+template <typename T = float, Algorithm method = SIMD, typename TSamplingPolicy = SamplingPolicy<T>>
+class StructureFactorPBC : private TSamplingPolicy {
     //! sample directions (h,k,l)
     const std::vector<Point> directions = {
         {1, 0, 0}, {0, 1, 0},  {0, 0, 1},                                      // 3 permutations
@@ -264,42 +267,48 @@ template <typename T = float, typename TSamplingPolicy = SamplingPolicy<T>> clas
   public:
     StructureFactorPBC(int q_multiplier) : p_max(q_multiplier){}
 
-    template <class Tpvec> void sample(const Tpvec &positions, const double boxlength) {
+    template <class Tpositions> void sample(const Tpositions &positions, const double boxlength) {
         // https://gcc.gnu.org/gcc-9/porting_to.html#ompdatasharing
         // #pragma omp parallel for collapse(2) default(none) shared(directions, p_max, boxlength) shared(positions)
         #pragma omp parallel for collapse(2) default(shared)
         for (int i = 0; i < directions.size(); ++i) {
             for (int p = 1; p <= p_max; ++p) {                                // loop over multiples of q
                 const Point q = (2 * pc::pi * p / boxlength) * directions[i]; // scattering vector
-                #ifdef __GNUC__
-                // When sine and cosine is computed in separate loops, advanced sine and cosine implementation
-                // utilizing SIMD instructions may be used to get at least 4 times performance boost.
-                // As of January 2020, only GCC exploits this using libmvec library if --ffast-math is enabled.
-                std::vector<T> qr_products(positions.size());
-                std::transform(positions.begin(), positions.end(), qr_products.begin(),
-                               [&q](auto &r) { return q.dot(r); });
-                // as of January 2020 the std::transform_reduce is not implemented in libc++
-                T sum_sin = 0;
-                for (auto &qr : qr_products) {
-                    sum_sin += std::sin(qr);
-                }
-                // as of January 2020 the std::transform_reduce is not implemented in libc++
-                T sum_cos = 0;
-                for (auto &qr : qr_products) {
-                    sum_cos += std::cos(qr);
-                }
-                #else
-                // TODO: Optimize also for other compilers than GCC by using a vector math library, e.g.,
-                // TODO: https://github.com/vectorclass/version2
-                T sum_sin = 0, sum_cos = 0;
-                for (auto &r : positions) { // loop over positions
-                    T qr = _q.dot(r);       // scalar product q*r
-                    sum_sin += sin(qr);
-                    sum_cos += cos(qr);
-                }
-                #endif
+                T sum_sin = 0.0;
+                T sum_cos = 0.0;
+                if constexpr (method == SIMD) {
+                    // When sine and cosine is computed in separate loops, advanced sine and cosine implementation
+                    // utilizing SIMD instructions may be used to get at least 4 times performance boost.
+                    // As of January 2020, only GCC exploits this using libmvec library if --ffast-math is enabled.
+                    std::vector<T> qr_std(positions.size());
+                    std::transform(positions.begin(), positions.end(), qr_std.begin(),
+                                   [&q](auto &r) { return q.dot(r); });
+                    // as of January 2020 the std::transform_reduce is not implemented in libc++
+                    for (auto &qr : qr_std) {
+                        sum_sin += std::sin(qr);
+                    }
+                    // as of January 2020 the std::transform_reduce is not implemented in libc++
+                    for (auto &qr : qr_std) {
+                        sum_cos += std::cos(qr);
+                    }
+                } else if constexpr (method == EIGEN) {
+                    // Map is a Nx3 matrix facade into original std::vector. Eigen does not accept `float`,
+                    // hence `double` must be used instead of the template parameter T here.
+                    auto qr = Eigen::Map<Eigen::MatrixXd, 0, Eigen::Stride<1, 3>>((double *)positions.data(),
+                        positions.size(), 3) * q;
+                    sum_sin = qr.array().cast<T>().sin().sum();
+                    sum_cos = qr.array().cast<T>().cos().sum();
+                } else if constexpr (method == GENERIC) {
+                    // TODO: Optimize also for other compilers than GCC by using a vector math library, e.g.,
+                    // TODO: https://github.com/vectorclass/version2
+                    for (auto &r : positions) { // loop over positions
+                        T qr = q.dot(r);        // scalar product q*r
+                        sum_sin += sin(qr);
+                        sum_cos += cos(qr);
+                    }
+                };
                 // collect average, `norm()` gives the scattering vector length
-                const T sf = (sum_sin * sum_sin + sum_cos * sum_cos) / (float)(positions.size());
+                const T sf = (sum_sin * sum_sin + sum_cos * sum_cos) / (T)(positions.size());
                 #pragma omp critical
                 // avoid race conditions when updating the map
                 addSampling(q.norm(), sf, 1.0);
@@ -314,7 +323,6 @@ template <typename T = float, typename TSamplingPolicy = SamplingPolicy<T>> clas
     using TSamplingPolicy::getSampling;
 };
 
-
 /**
  * @brief Calculate structure factor using explicit q averaging in isotropic periodic boundary conditions (IPBC).
  *
@@ -332,7 +340,7 @@ template <typename T = float, typename TSamplingPolicy = SamplingPolicy<T>> clas
   public:
     StructureFactorIPBC(int q_multiplier) : p_max(q_multiplier){}
 
-    template <class Tpvec> void sample(const Tpvec &positions, const double boxlength) {
+    template <class Tpositions> void sample(const Tpositions &positions, const double boxlength) {
         // https://gcc.gnu.org/gcc-9/porting_to.html#ompdatasharing
         // #pragma omp parallel for collapse(2) default(none) shared(directions, p_max, positions, boxlength)
         #pragma omp parallel for collapse(2) default(shared)

diff --git a/src/scatter_test.h b/src/scatter_test.h
@@ -0,0 +1,58 @@
+#include "analysis.h"
+#include "io.h"
+
+#define ANKERL_NANOBENCH_IMPLEMENT
+#include "nanobench.h"
+
+namespace Faunus {
+namespace Scatter {
+
+using doctest::Approx;
+
+double box = 80;
+const std::vector<Point> positions = {
+    {10, 20, 30},  {-32, 19, 1},  {34, -2, 23}, {0, 0, 1},     {25, 0, -12},
+    {-6, -4, -29}, {-12, 23, -3}, {3, 1, -4},   {-31, 29, -20}}; // random position vector
+
+TEST_CASE_TEMPLATE("[Faunus] StructureFactorPBC", T, StructureFactorPBC<float, SIMD>,
+                   StructureFactorPBC<float, EIGEN>, StructureFactorPBC<float, GENERIC>) {
+    size_t cnt = 0;
+    std::vector<float> result = {0.0785, 1.48621,  0.1111, 0.567279, 0.136,  1.39515,
+                                 0.1571, 0.730579, 0.2221, 0.701547, 0.2721, 0.692064};
+    T scatter(2);
+    scatter.sample(positions, box);
+    for (auto [q, S] : scatter.getSampling()) {
+        CHECK(q == Approx(result[cnt++]));
+        CHECK(S == Approx(result[cnt++]));
+    }
+    CHECK(cnt == result.size());
+}
+
+#ifdef ANKERL_NANOBENCH_H_INCLUDED
+TEST_CASE("Benchmark") {
+    std::vector<Point> pos(1000);
+    for (auto &p : pos)
+        p = Eigen::Vector3d::Random() * box;
+    ankerl::nanobench::Config bench;
+    bench.minEpochIterations(100);
+    bench.run("SIMD", [&] { StructureFactorPBC<double, SIMD>(10).sample(pos, box); }).doNotOptimizeAway();
+    bench.run("EIGEN", [&] { StructureFactorPBC<double, EIGEN>(10).sample(pos, box); }).doNotOptimizeAway();
+    bench.run("GENERIC", [&] { StructureFactorPBC<double, GENERIC>(10).sample(pos, box); }).doNotOptimizeAway();
+}
+#endif
+
+TEST_CASE("[Faunus] StructureFactorIPBC") {
+    size_t cnt = 0;
+    std::vector<double> result = {0.0785, 0.384363, 0.1111, 1.51652, 0.136,  1.18027,
+                                  0.1571, 1.40662,  0.2221, 2.06042, 0.2721, 1.53482};
+    StructureFactorIPBC scatter(2);
+    scatter.sample(positions, box);
+    for (auto [q, S] : scatter.getSampling()) {
+        CHECK(q == Approx(result[cnt++]));
+        CHECK(S == Approx(result[cnt++]));
+    }
+    CHECK(cnt == result.size());
+}
+
+} // namespace Scatter
+} // namespace Faunus
diff --git a/src/unittests.cpp b/src/unittests.cpp
@@ -26,6 +26,7 @@
 #include "space_test.h"
 #include "tensor_test.h"
 #include "externalpotential_test.h"
+#include "scatter_test.h"
 
 #include "mpicontroller.h"
 #include "auxiliary.h"