From 499acbfe51b3bde1642fbb00faf0f6391be95734 Mon Sep 17 00:00:00 2001
From: Annie <annie@annielytical.com>
Date: Sun, 10 Apr 2022 11:40:56 -0700
Subject: [PATCH 01/58] update CL description

---
 examples/algorithms/mst/mst.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/algorithms/mst/mst.cu b/examples/algorithms/mst/mst.cu
index 17258047..06cb817c 100644
--- a/examples/algorithms/mst/mst.cu
+++ b/examples/algorithms/mst/mst.cu
@@ -19,7 +19,7 @@ struct parameters_t {
    * @param argv Command line arguments.
    */
   parameters_t(int argc, char** argv)
-      : options(argv[0], "Sparse Matrix-Vector Multiplication example") {
+      : options(argv[0], "Minimum Spanning Tree example") {
     // Add command line options
     options.add_options()("help", "Print help")                      // help
         ("validate", "CPU validation")                               // validate

From a323515f2e0b6de26cec08f8ee1d6a924dfbd024 Mon Sep 17 00:00:00 2001
From: Annie <annie@annielytical.com>
Date: Sat, 16 Apr 2022 16:42:32 -0700
Subject: [PATCH 02/58] typo fix

---
 include/gunrock/algorithms/mst.hxx | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/include/gunrock/algorithms/mst.hxx b/include/gunrock/algorithms/mst.hxx
index 40a76ea3..43699a40 100644
--- a/include/gunrock/algorithms/mst.hxx
+++ b/include/gunrock/algorithms/mst.hxx
@@ -151,7 +151,7 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
             ) -> void {
       // Find the minimum neighbor for each vertex. Use atomic min to break ties
       // between neighbors that have the same weight.
-      // Consistent ordering (using max here) will prevent loops.
+      // Consistent ordering (using min here) will prevent loops.
       // Edges with dest < source are flipped so that reverse edges are treated
       // as equivalent. Must check that the weight equals the min weight for
       // that vertex, because some edges can be added to the frontier that are
@@ -289,4 +289,4 @@ float run(graph_t& G,
 }
 
 }  // namespace mst
-}  // namespace gunrock
\ No newline at end of file
+}  // namespace gunrock

From f648f020a7109cd801c9c222c49f3dab37c4f082 Mon Sep 17 00:00:00 2001
From: Annie <annie@annielytical.com>
Date: Mon, 2 May 2022 14:50:23 -0700
Subject: [PATCH 03/58] benchmarking

---
 CMakeLists.txt                      |  6 +--
 benchmarks/CMakeLists.txt           |  1 +
 benchmarks/mst_bench.cu             | 45 ++++++++++++++++++
 include/gunrock/io/sample_large.hxx | 74 +++++++++++++++++++++++++++++
 4 files changed, 123 insertions(+), 3 deletions(-)
 create mode 100644 benchmarks/mst_bench.cu
 create mode 100644 include/gunrock/io/sample_large.hxx

diff --git a/CMakeLists.txt b/CMakeLists.txt
index b42d3f0b..4bdaf5a9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -71,7 +71,7 @@ set_target_properties(essentials
     CUDA_EXTENSIONS OFF
     CUDA_RESOLVE_DEVICE_SYMBOLS ON
     CUDA_SEPARABLE_COMPILATION ON
-    CUDA_ARCHITECTURES 61 # Set required architecture.
+    CUDA_ARCHITECTURES 86 # Set required architecture.
     # CUDA_PTX_COMPILATION ON # Can only be applied to OBJ.
 )
 
@@ -203,11 +203,11 @@ option(ESSENTIALS_BUILD_BENCHMARKS
   OFF)
 
 # Subdirectories for examples, testing and documentation
-if(ESSENTIALS_BUILD_BENCHMARKS)
+#if(ESSENTIALS_BUILD_BENCHMARKS)
   # ... see https://github.com/NVIDIA/nvbench/issues/66
   set(NVBench_ENABLE_NVML OFF)
   # ... set cuda architecture for nvbench.
   set(CMAKE_CUDA_ARCHITECTURES ${ESSENTIALS_ARCHITECTURES})
   include(${PROJECT_SOURCE_DIR}/cmake/FetchNVBench.cmake)
   add_subdirectory(benchmarks)
-endif(ESSENTIALS_BUILD_BENCHMARKS)
+#endif(ESSENTIALS_BUILD_BENCHMARKS)
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index afd6ec69..5969fff9 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(BENCHMARK_SOURCES
   for.cu
+  mst_bench.cu
 )
 
 foreach(SOURCE IN LISTS BENCHMARK_SOURCES)
diff --git a/benchmarks/mst_bench.cu b/benchmarks/mst_bench.cu
new file mode 100644
index 00000000..f2106b71
--- /dev/null
+++ b/benchmarks/mst_bench.cu
@@ -0,0 +1,45 @@
+#include <gunrock/error.hxx>
+#include <gunrock/graph/graph.hxx>
+#include <gunrock/formats/formats.hxx>
+#include <gunrock/cuda/cuda.hxx>
+#include <gunrock/framework/operators/for/for.hxx>
+#include <gunrock/io/sample_large.hxx>
+#include <nvbench/nvbench.cuh>
+#include <iostream>
+#include <gunrock/algorithms/algorithms.hxx>
+#include <gunrock/algorithms/mst.hxx>
+
+char** args;
+
+namespace gunrock {
+namespace benchmark {
+void parallel_for(nvbench::state& state) {
+  // Build a graph using a sample csr.
+  auto csr = io::sample_large::csr();
+  auto G =
+      graph::build::from_csr<memory_space_t::device, graph::view_t::csr>(csr);
+
+  // Initialize the context.
+  cuda::device_id_t device = 0;
+  cuda::multi_context_t context(device);
+
+  thrust::device_vector<weight_t> mst_weight(1);
+
+  // --
+  // GPU Run
+  
+  std::cout << G.get_number_of_edges() << "\n";
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    gunrock::mst::run(G, mst_weight.data().get());
+  });
+}
+
+void parse_arg(int argc, char** argv) {
+  std::cout << argv[0] << "\n";
+}
+
+NVBENCH_BENCH(parallel_for).set_name("parallel_for");
+
+}  // namespace benchmark
+}  // namespace gunrock
diff --git a/include/gunrock/io/sample_large.hxx b/include/gunrock/io/sample_large.hxx
new file mode 100644
index 00000000..c8222a21
--- /dev/null
+++ b/include/gunrock/io/sample_large.hxx
@@ -0,0 +1,74 @@
+/**
+ * @file sample_large.hxx
+ * @author Muhammad Osama (mosama@ucdavis.edu)
+ * @brief
+ * @version 0.1
+ * @date 2021-12-23
+ *
+ * @copyright Copyright (c) 2021
+ *
+ */
+#pragma once
+
+#include <gunrock/formats/formats.hxx>
+#include <gunrock/graph/graph.hxx>
+
+namespace gunrock {
+namespace io {
+namespace sample_large {
+
+using namespace memory;
+
+/**
+ * @brief Returns a large sample CSR matrix of size 100000 x 100000.
+ *
+ * @par Overview
+ *
+ * @tparam space Memory space of the CSR matrix.
+ * @tparam vertex_t Type of vertex.
+ * @tparam edge_t Type of edge.
+ * @tparam weight_t Type of weight.
+ * @return format::csr_t<space, vertex_t, edge_t, weight_t> CSR matrix.
+ */
+template <memory_space_t space = memory_space_t::device,
+          typename vertex_t = int,
+          typename edge_t = int,
+          typename weight_t = float>
+format::csr_t<space, vertex_t, edge_t, weight_t> csr() {
+  using csr_t = format::csr_t<memory_space_t::host, vertex_t, edge_t, weight_t>;
+
+  int dim = 10;
+  csr_t matrix(dim, dim, dim * dim);
+
+  // Row Offsets
+  thrust::host_vector<int> hv(dim * dim);
+  thrust::host_vector<int> mod(dim * dim);
+  thrust::sequence(hv.begin(), hv.end());
+  thrust::fill(mod.begin(), mod.end(), dim);
+  thrust::transform(hv.begin(), hv.end(), mod.begin(),
+                    matrix.row_offsets.begin(), thrust::multiplies<int>());
+  for(int i = 0; i < matrix.row_offsets.size(); i++)
+        std::cout << "H[" << i << "] = " << matrix.row_offsets[i] << std::endl;
+
+  // Column Indices
+  thrust::transform(hv.begin(), hv.end(), mod.begin(),
+                    matrix.column_indices.begin(), thrust::modulus<int>());
+  for(int i = 0; i < matrix.column_indices.size(); i++)
+        std::cout << "H[" << i << "] = " << matrix.column_indices[i] << std::endl;
+  
+  // Non-zero values
+  thrust::fill(matrix.nonzero_values.begin(), matrix.nonzero_values.end(), 1);
+
+  if (space == memory_space_t::host) {
+    return matrix;
+  } else {
+    using d_csr_t =
+        format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
+    d_csr_t d_matrix(matrix);
+    return d_matrix;
+  }
+}
+
+}  // namespace sample_large
+}  // namespace io
+}  // namespace gunrock
\ No newline at end of file

From 2a93db0bf22dc13ec7db6faceac6380789f2fd25 Mon Sep 17 00:00:00 2001
From: Annie <annie@annielytical.com>
Date: Mon, 9 May 2022 21:47:45 -0700
Subject: [PATCH 04/58] benchmarking fixes

---
 CMakeLists.txt                      |  6 +++---
 benchmarks/mst_bench.cu             | 10 ++--------
 include/gunrock/io/sample_large.hxx | 29 ++++++++++++++---------------
 3 files changed, 19 insertions(+), 26 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6020f699..368acf29 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -73,7 +73,7 @@ set_target_properties(essentials
     CUDA_EXTENSIONS OFF
     CUDA_RESOLVE_DEVICE_SYMBOLS ON
     CUDA_SEPARABLE_COMPILATION ON
-    CUDA_ARCHITECTURES 86 # Set required architecture.
+    CUDA_ARCHITECTURES 61 # Set required architecture.
     # CUDA_PTX_COMPILATION ON # Can only be applied to OBJ.
 )
 
@@ -211,11 +211,11 @@ option(ESSENTIALS_BUILD_BENCHMARKS
   OFF)
 
 # Subdirectories for examples, testing and documentation
-#if(ESSENTIALS_BUILD_BENCHMARKS)
+if(ESSENTIALS_BUILD_BENCHMARKS)
   # ... see https://github.com/NVIDIA/nvbench/issues/66
   set(NVBench_ENABLE_NVML OFF)
   # ... set cuda architecture for nvbench.
   set(CMAKE_CUDA_ARCHITECTURES ${ESSENTIALS_ARCHITECTURES})
   include(${PROJECT_SOURCE_DIR}/cmake/FetchNVBench.cmake)
   add_subdirectory(benchmarks)
-#endif(ESSENTIALS_BUILD_BENCHMARKS)
+endif(ESSENTIALS_BUILD_BENCHMARKS)
diff --git a/benchmarks/mst_bench.cu b/benchmarks/mst_bench.cu
index f2106b71..a2a78db4 100644
--- a/benchmarks/mst_bench.cu
+++ b/benchmarks/mst_bench.cu
@@ -13,7 +13,7 @@ char** args;
 
 namespace gunrock {
 namespace benchmark {
-void parallel_for(nvbench::state& state) {
+void mst_bench(nvbench::state& state) {
   // Build a graph using a sample csr.
   auto csr = io::sample_large::csr();
   auto G =
@@ -28,18 +28,12 @@ void parallel_for(nvbench::state& state) {
   // --
   // GPU Run
   
-  std::cout << G.get_number_of_edges() << "\n";
-
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
     gunrock::mst::run(G, mst_weight.data().get());
   });
 }
 
-void parse_arg(int argc, char** argv) {
-  std::cout << argv[0] << "\n";
-}
-
-NVBENCH_BENCH(parallel_for).set_name("parallel_for");
+NVBENCH_BENCH(mst_bench).set_name("mst_bench");
 
 }  // namespace benchmark
 }  // namespace gunrock
diff --git a/include/gunrock/io/sample_large.hxx b/include/gunrock/io/sample_large.hxx
index c8222a21..89107d2c 100644
--- a/include/gunrock/io/sample_large.hxx
+++ b/include/gunrock/io/sample_large.hxx
@@ -20,9 +20,8 @@ namespace sample_large {
 using namespace memory;
 
 /**
- * @brief Returns a large sample CSR matrix of size 100000 x 100000.
- *
- * @par Overview
+ * @brief Returns a large sample CSR matrix of size 10000 x 10000,
+ * filled with ones.
  *
  * @tparam space Memory space of the CSR matrix.
  * @tparam vertex_t Type of vertex.
@@ -37,25 +36,25 @@ template <memory_space_t space = memory_space_t::device,
 format::csr_t<space, vertex_t, edge_t, weight_t> csr() {
   using csr_t = format::csr_t<memory_space_t::host, vertex_t, edge_t, weight_t>;
 
-  int dim = 10;
+  int dim = 10000;
   csr_t matrix(dim, dim, dim * dim);
 
   // Row Offsets
-  thrust::host_vector<int> hv(dim * dim);
-  thrust::host_vector<int> mod(dim * dim);
-  thrust::sequence(hv.begin(), hv.end());
-  thrust::fill(mod.begin(), mod.end(), dim);
-  thrust::transform(hv.begin(), hv.end(), mod.begin(),
+  thrust::host_vector<int> rowSeq(dim + 1);
+  thrust::host_vector<int> multVector(dim + 1);
+  thrust::sequence(rowSeq.begin(), rowSeq.end());
+  thrust::fill(multVector.begin(), multVector.end(), dim);
+  thrust::transform(rowSeq.begin(), rowSeq.end(), multVector.begin(),
                     matrix.row_offsets.begin(), thrust::multiplies<int>());
-  for(int i = 0; i < matrix.row_offsets.size(); i++)
-        std::cout << "H[" << i << "] = " << matrix.row_offsets[i] << std::endl;
 
   // Column Indices
-  thrust::transform(hv.begin(), hv.end(), mod.begin(),
+  thrust::host_vector<int> colSeq(dim * dim);
+  thrust::host_vector<int> modVector(dim * dim);
+  thrust::sequence(colSeq.begin(), colSeq.end());
+  thrust::fill(modVector.begin(), modVector.end(), dim);
+  thrust::transform(colSeq.begin(), colSeq.end(), modVector.begin(),
                     matrix.column_indices.begin(), thrust::modulus<int>());
-  for(int i = 0; i < matrix.column_indices.size(); i++)
-        std::cout << "H[" << i << "] = " << matrix.column_indices[i] << std::endl;
-  
+
   // Non-zero values
   thrust::fill(matrix.nonzero_values.begin(), matrix.nonzero_values.end(), 1);
 

From 0ec56de1250d133cb327930ff1066e6b47a61e62 Mon Sep 17 00:00:00 2001
From: Annie <annie@annielytical.com>
Date: Mon, 9 May 2022 21:51:07 -0700
Subject: [PATCH 05/58] name updates

---
 include/gunrock/io/sample_large.hxx | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/include/gunrock/io/sample_large.hxx b/include/gunrock/io/sample_large.hxx
index 89107d2c..2fda592c 100644
--- a/include/gunrock/io/sample_large.hxx
+++ b/include/gunrock/io/sample_large.hxx
@@ -41,18 +41,18 @@ format::csr_t<space, vertex_t, edge_t, weight_t> csr() {
 
   // Row Offsets
   thrust::host_vector<int> rowSeq(dim + 1);
-  thrust::host_vector<int> multVector(dim + 1);
+  thrust::host_vector<int> multVect(dim + 1);
   thrust::sequence(rowSeq.begin(), rowSeq.end());
-  thrust::fill(multVector.begin(), multVector.end(), dim);
-  thrust::transform(rowSeq.begin(), rowSeq.end(), multVector.begin(),
+  thrust::fill(multVect.begin(), multVect.end(), dim);
+  thrust::transform(rowSeq.begin(), rowSeq.end(), multVect.begin(),
                     matrix.row_offsets.begin(), thrust::multiplies<int>());
 
   // Column Indices
   thrust::host_vector<int> colSeq(dim * dim);
-  thrust::host_vector<int> modVector(dim * dim);
+  thrust::host_vector<int> modVect(dim * dim);
   thrust::sequence(colSeq.begin(), colSeq.end());
-  thrust::fill(modVector.begin(), modVector.end(), dim);
-  thrust::transform(colSeq.begin(), colSeq.end(), modVector.begin(),
+  thrust::fill(modVect.begin(), modVect.end(), dim);
+  thrust::transform(colSeq.begin(), colSeq.end(), modVect.begin(),
                     matrix.column_indices.begin(), thrust::modulus<int>());
 
   // Non-zero values

From 2c6f0397110d36614278b4d41a41cdf14ef91ceb Mon Sep 17 00:00:00 2001
From: Annie <annie@annielytical.com>
Date: Mon, 9 May 2022 22:22:13 -0700
Subject: [PATCH 06/58] more benchmarks

---
 benchmarks/CMakeLists.txt             |  2 +-
 benchmarks/{mst_bench.cu => bench.cu} | 35 ++++++++++++++++++++++++---
 2 files changed, 32 insertions(+), 5 deletions(-)
 rename benchmarks/{mst_bench.cu => bench.cu} (51%)

diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 5969fff9..40f0a599 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(BENCHMARK_SOURCES
   for.cu
-  mst_bench.cu
+  bench.cu
 )
 
 foreach(SOURCE IN LISTS BENCHMARK_SOURCES)
diff --git a/benchmarks/mst_bench.cu b/benchmarks/bench.cu
similarity index 51%
rename from benchmarks/mst_bench.cu
rename to benchmarks/bench.cu
index a2a78db4..660b7d97 100644
--- a/benchmarks/mst_bench.cu
+++ b/benchmarks/bench.cu
@@ -8,11 +8,11 @@
 #include <iostream>
 #include <gunrock/algorithms/algorithms.hxx>
 #include <gunrock/algorithms/mst.hxx>
-
-char** args;
+#include <gunrock/algorithms/bfs.hxx>
 
 namespace gunrock {
 namespace benchmark {
+
 void mst_bench(nvbench::state& state) {
   // Build a graph using a sample csr.
   auto csr = io::sample_large::csr();
@@ -23,17 +23,44 @@ void mst_bench(nvbench::state& state) {
   cuda::device_id_t device = 0;
   cuda::multi_context_t context(device);
 
+  // --
+  // Params and memory allocation
   thrust::device_vector<weight_t> mst_weight(1);
 
   // --
   // GPU Run
-  
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
     gunrock::mst::run(G, mst_weight.data().get());
   });
 }
 
-NVBENCH_BENCH(mst_bench).set_name("mst_bench");
+void bfs_bench(nvbench::state& state) {
+  // Build a graph using a sample csr.
+  auto csr = io::sample_large::csr();
+  auto G =
+      graph::build::from_csr<memory_space_t::device, graph::view_t::csr>(csr);
+
+  // Initialize the context.
+  cuda::device_id_t device = 0;
+  cuda::multi_context_t context(device);
+
+  // --
+  // Params and memory allocation
+  vertex_t single_source = 0;
+  vertex_t n_vertices = G.get_number_of_vertices();
+  thrust::device_vector<vertex_t> distances(n_vertices);
+  thrust::device_vector<vertex_t> predecessors(n_vertices);
+
+  // --
+  // GPU Run
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    gunrock::bfs::run(G, single_source, distances.data().get(),
+                      predecessors.data().get());
+  });
+}
+
+NVBENCH_BENCH(mst_bench);
+NVBENCH_BENCH(bfs_bench);
 
 }  // namespace benchmark
 }  // namespace gunrock

From 0899edcff4c685457c71a9a188b8eeaf8cbf680b Mon Sep 17 00:00:00 2001
From: Jonathan Wapman <jdwapman@ucdavis.edu>
Date: Wed, 18 May 2022 11:32:56 -0700
Subject: [PATCH 07/58] Fix cuda namespace collision

---
 benchmarks/for.cu                             |  4 +-
 include/gunrock/algorithms/bc.hxx             | 20 +++----
 include/gunrock/algorithms/bfs.hxx            | 14 ++---
 include/gunrock/algorithms/color.hxx          | 14 ++---
 .../algorithms/experimental/async/bfs.hxx     |  8 +--
 include/gunrock/algorithms/geo.hxx            | 14 ++---
 include/gunrock/algorithms/hits.hxx           | 14 ++---
 include/gunrock/algorithms/kcore.hxx          | 16 +++---
 include/gunrock/algorithms/mst.hxx            | 16 +++---
 include/gunrock/algorithms/ppr.hxx            | 24 ++++----
 include/gunrock/algorithms/pr.hxx             | 14 ++---
 .../gunrock/algorithms/sort/radix_sort.hxx    |  2 +-
 include/gunrock/algorithms/spgemm.hxx         | 14 ++---
 include/gunrock/algorithms/spmv.hxx           | 18 +++---
 include/gunrock/algorithms/sssp.hxx           | 14 ++---
 include/gunrock/cuda/atomic_functions.hxx     |  4 +-
 include/gunrock/cuda/context.hxx              | 56 +++++++++----------
 include/gunrock/cuda/cuda.hxx                 |  2 +-
 include/gunrock/cuda/detail/launch_box.hxx    |  4 +-
 .../gunrock/cuda/detail/launch_kernels.hxx    | 24 ++++----
 include/gunrock/cuda/device.hxx               |  6 +-
 include/gunrock/cuda/device_properties.hxx    |  4 +-
 include/gunrock/cuda/event_management.hxx     |  4 +-
 include/gunrock/cuda/function.hxx             |  4 +-
 include/gunrock/cuda/global.hxx               |  4 +-
 include/gunrock/cuda/intrinsics.hxx           |  2 +-
 include/gunrock/cuda/launch_box.hxx           | 16 +++---
 include/gunrock/cuda/sm.hxx                   |  4 +-
 include/gunrock/cuda/stream_management.hxx    |  4 +-
 include/gunrock/framework/enactor.hxx         | 22 ++++----
 .../framework/experimental/async/enactor.hxx  |  9 +--
 .../experimental/boolmap_frontier.hxx         | 10 ++--
 .../gunrock/framework/frontier/frontier.hxx   |  2 +-
 .../framework/frontier/vector_frontier.hxx    | 10 ++--
 .../framework/operators/advance/advance.hxx   |  4 +-
 .../operators/advance/block_mapped.hxx        | 12 ++--
 .../framework/operators/advance/bucketing.hxx |  2 +-
 .../framework/operators/advance/helpers.hxx   |  4 +-
 .../operators/advance/merge_path.hxx          |  2 +-
 .../operators/advance/merge_path_v2.hxx       |  2 +-
 .../operators/advance/thread_mapped.hxx       |  4 +-
 .../framework/operators/filter/bypass.hxx     |  4 +-
 .../framework/operators/filter/compact.hxx    |  2 +-
 .../framework/operators/filter/filter.hxx     |  8 +--
 .../framework/operators/filter/predicated.hxx |  2 +-
 .../framework/operators/filter/remove.hxx     |  2 +-
 .../gunrock/framework/operators/for/for.hxx   |  8 +--
 .../neighborreduce/neighborreduce.hxx         |  4 +-
 .../framework/operators/uniquify/unique.hxx   |  2 +-
 .../operators/uniquify/unique_copy.hxx        |  2 +-
 .../framework/operators/uniquify/uniquify.hxx |  4 +-
 include/gunrock/framework/problem.hxx         |  6 +-
 include/gunrock/graph/graph.hxx               |  2 +-
 include/gunrock/util/math.hxx                 |  7 +--
 unittests/cuda/context.cuh                    |  4 +-
 unittests/cuda/device_properties.cuh          | 11 ++--
 unittests/cuda/launch_box.cuh                 |  2 +-
 unittests/framework/operators/for.cuh         |  4 +-
 unittests/graph/src_vertex.cuh                |  2 +-
 59 files changed, 252 insertions(+), 251 deletions(-)

diff --git a/benchmarks/for.cu b/benchmarks/for.cu
index 16e3ab74..b9fc22d3 100644
--- a/benchmarks/for.cu
+++ b/benchmarks/for.cu
@@ -16,8 +16,8 @@ void parallel_for(nvbench::state& state) {
       graph::build::from_csr<memory_space_t::device, graph::view_t::csr>(csr);
 
   // Initialize the context.
-  cuda::device_id_t device = 0;
-  cuda::multi_context_t context(device);
+  gcuda::device_id_t device = 0;
+  gcuda::multi_context_t context(device);
 
   vector_t<int> vertices(G.get_number_of_vertices());
   auto d_vertices = vertices.data().get();
diff --git a/include/gunrock/algorithms/bc.hxx b/include/gunrock/algorithms/bc.hxx
index 5d226d6c..c56f8ec2 100644
--- a/include/gunrock/algorithms/bc.hxx
+++ b/include/gunrock/algorithms/bc.hxx
@@ -35,7 +35,7 @@ struct problem_t : gunrock::problem_t<graph_t> {
   problem_t(graph_t& G,
             param_type& _param,
             result_type& _result,
-            std::shared_ptr<cuda::multi_context_t> _context)
+            std::shared_ptr<gcuda::multi_context_t> _context)
       : gunrock::problem_t<graph_t>(G, _context),
         param(_param),
         result(_result) {}
@@ -82,7 +82,7 @@ struct problem_t : gunrock::problem_t<graph_t> {
 template <typename problem_t>
 struct enactor_t : gunrock::enactor_t<problem_t> {
   enactor_t(problem_t* _problem,
-            std::shared_ptr<cuda::multi_context_t> _context,
+            std::shared_ptr<gcuda::multi_context_t> _context,
             enactor_properties_t _properties)
       : gunrock::enactor_t<problem_t>(_problem, _context, _properties) {}
 
@@ -96,12 +96,12 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
   std::size_t depth = 0;
 
   void prepare_frontier(frontier_t* f,
-                        cuda::multi_context_t& context) override {
+                        gcuda::multi_context_t& context) override {
     auto P = this->get_problem();
     this->frontiers[0].push_back(P->param.single_source);
   }
 
-  void loop(cuda::multi_context_t& context) override {
+  void loop(gcuda::multi_context_t& context) override {
     auto E = this->get_enactor();
     auto P = this->get_problem();
     auto G = P->get_graph();
@@ -188,7 +188,7 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
     }
   }
 
-  bool is_forward_converged(cuda::multi_context_t& context) {
+  bool is_forward_converged(gcuda::multi_context_t& context) {
     auto P = this->get_problem();
     auto out_frontier = &(this->frontiers[this->depth]);
     bool forward_converged = out_frontier->is_empty();
@@ -199,7 +199,7 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
     return false;
   }
 
-  bool is_backward_converged(cuda::multi_context_t& context) {
+  bool is_backward_converged(gcuda::multi_context_t& context) {
     if (depth == 0) {
       backward = false;
       return true;
@@ -208,7 +208,7 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
     return false;
   }
 
-  virtual bool is_converged(cuda::multi_context_t& context) {
+  virtual bool is_converged(gcuda::multi_context_t& context) {
     return (!forward && !backward) ? true : false;
   }
 };  // struct enactor_t
@@ -217,9 +217,9 @@ template <typename graph_t>
 float run(graph_t& G,
           typename graph_t::vertex_type single_source,
           typename graph_t::weight_type* bc_values,
-          std::shared_ptr<cuda::multi_context_t> context =
-              std::shared_ptr<cuda::multi_context_t>(
-                  new cuda::multi_context_t(0))  // Context
+          std::shared_ptr<gcuda::multi_context_t> context =
+              std::shared_ptr<gcuda::multi_context_t>(
+                  new gcuda::multi_context_t(0))  // Context
 ) {
   // <user-defined>
   using vertex_t = typename graph_t::vertex_type;
diff --git a/include/gunrock/algorithms/bfs.hxx b/include/gunrock/algorithms/bfs.hxx
index 15e67465..78bdb9b0 100644
--- a/include/gunrock/algorithms/bfs.hxx
+++ b/include/gunrock/algorithms/bfs.hxx
@@ -37,7 +37,7 @@ struct problem_t : gunrock::problem_t<graph_t> {
   problem_t(graph_t& G,
             param_type& _param,
             result_type& _result,
-            std::shared_ptr<cuda::multi_context_t> _context)
+            std::shared_ptr<gcuda::multi_context_t> _context)
       : gunrock::problem_t<graph_t>(G, _context),
         param(_param),
         result(_result) {}
@@ -63,7 +63,7 @@ struct problem_t : gunrock::problem_t<graph_t> {
 template <typename problem_t>
 struct enactor_t : gunrock::enactor_t<problem_t> {
   enactor_t(problem_t* _problem,
-            std::shared_ptr<cuda::multi_context_t> _context)
+            std::shared_ptr<gcuda::multi_context_t> _context)
       : gunrock::enactor_t<problem_t>(_problem, _context) {}
 
   using vertex_t = typename problem_t::vertex_t;
@@ -72,12 +72,12 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
   using frontier_t = typename enactor_t<problem_t>::frontier_t;
 
   void prepare_frontier(frontier_t* f,
-                        cuda::multi_context_t& context) override {
+                        gcuda::multi_context_t& context) override {
     auto P = this->get_problem();
     f->push_back(P->param.single_source);
   }
 
-  void loop(cuda::multi_context_t& context) override {
+  void loop(gcuda::multi_context_t& context) override {
     // Data slice
     auto E = this->get_enactor();
     auto P = this->get_problem();
@@ -147,9 +147,9 @@ float run(graph_t& G,
           typename graph_t::vertex_type& single_source,  // Parameter
           typename graph_t::vertex_type* distances,      // Output
           typename graph_t::vertex_type* predecessors,   // Output
-          std::shared_ptr<cuda::multi_context_t> context =
-              std::shared_ptr<cuda::multi_context_t>(
-                  new cuda::multi_context_t(0))  // Context
+          std::shared_ptr<gcuda::multi_context_t> context =
+              std::shared_ptr<gcuda::multi_context_t>(
+                  new gcuda::multi_context_t(0))  // Context
 ) {
   using vertex_t = typename graph_t::vertex_type;
   using param_type = param_t<vertex_t>;
diff --git a/include/gunrock/algorithms/color.hxx b/include/gunrock/algorithms/color.hxx
index 3f7e469f..049499cf 100644
--- a/include/gunrock/algorithms/color.hxx
+++ b/include/gunrock/algorithms/color.hxx
@@ -35,7 +35,7 @@ struct problem_t : gunrock::problem_t<graph_t> {
   problem_t(graph_t& G,
             param_type& _param,
             result_type& _result,
-            std::shared_ptr<cuda::multi_context_t> _context)
+            std::shared_ptr<gcuda::multi_context_t> _context)
       : gunrock::problem_t<graph_t>(G, _context),
         param(_param),
         result(_result) {}
@@ -69,7 +69,7 @@ struct problem_t : gunrock::problem_t<graph_t> {
 template <typename problem_t>
 struct enactor_t : gunrock::enactor_t<problem_t> {
   enactor_t(problem_t* _problem,
-            std::shared_ptr<cuda::multi_context_t> _context)
+            std::shared_ptr<gcuda::multi_context_t> _context)
       : gunrock::enactor_t<problem_t>(_problem, _context) {}
 
   using vertex_t = typename problem_t::vertex_t;
@@ -78,7 +78,7 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
   using frontier_t = typename enactor_t<problem_t>::frontier_t;
 
   void prepare_frontier(frontier_t* f,
-                        cuda::multi_context_t& context) override {
+                        gcuda::multi_context_t& context) override {
     auto P = this->get_problem();
     auto n_vertices = P->get_graph().get_number_of_vertices();
 
@@ -86,7 +86,7 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
     f->sequence((vertex_t)0, n_vertices, context.get_context(0)->stream());
   }
 
-  void loop(cuda::multi_context_t& context) override {
+  void loop(gcuda::multi_context_t& context) override {
     // Data slice
     auto E = this->get_enactor();
     auto P = this->get_problem();
@@ -145,9 +145,9 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
 template <typename graph_t>
 float run(graph_t& G,
           typename graph_t::vertex_type* colors,  // Output
-          std::shared_ptr<cuda::multi_context_t> context =
-              std::shared_ptr<cuda::multi_context_t>(
-                  new cuda::multi_context_t(0))  // Context
+          std::shared_ptr<gcuda::multi_context_t> context =
+              std::shared_ptr<gcuda::multi_context_t>(
+                  new gcuda::multi_context_t(0))  // Context
 ) {
   using vertex_t = typename graph_t::vertex_type;
 
diff --git a/include/gunrock/algorithms/experimental/async/bfs.hxx b/include/gunrock/algorithms/experimental/async/bfs.hxx
index 4969a283..a048f7d3 100644
--- a/include/gunrock/algorithms/experimental/async/bfs.hxx
+++ b/include/gunrock/algorithms/experimental/async/bfs.hxx
@@ -35,7 +35,7 @@ struct problem_t : gunrock::problem_t<graph_t> {
   problem_t(graph_t& G,
             param_type& _param,
             result_type& _result,
-            std::shared_ptr<cuda::multi_context_t> _context)
+            std::shared_ptr<gcuda::multi_context_t> _context)
       : gunrock::problem_t<graph_t>(G, _context),
         param(_param),
         result(_result) {}
@@ -79,7 +79,7 @@ struct enactor_t : async::enactor_t<problem_t> {
   using queue_t = typename async::enactor_t<problem_t>::queue_t;
 
   // !! Breaks w/ standard essentials (mildly...)
-  void prepare_frontier(queue_t& q, cuda::multi_context_t& context) {
+  void prepare_frontier(queue_t& q, gcuda::multi_context_t& context) {
     auto P = this->get_problem();
 
     // !! Queues creates it's own streams.  But I think we should at least
@@ -88,7 +88,7 @@ struct enactor_t : async::enactor_t<problem_t> {
   }
 
   // !! Breaks w/ standard essentials (mildly...)
-  void loop(cuda::multi_context_t& context) {
+  void loop(gcuda::multi_context_t& context) {
     auto P = this->get_problem();
     auto G = P->get_graph();
     auto q = this->q;
@@ -130,7 +130,7 @@ float run(graph_t& G,
 
   // <boiler-plate>
   auto multi_context =
-      std::shared_ptr<cuda::multi_context_t>(new cuda::multi_context_t(0));
+      std::shared_ptr<gcuda::multi_context_t>(new gcuda::multi_context_t(0));
 
   using problem_type = problem_t<graph_t, param_type, result_type>;
   using enactor_type = enactor_t<problem_type>;
diff --git a/include/gunrock/algorithms/geo.hxx b/include/gunrock/algorithms/geo.hxx
index aad49ee1..73cc4024 100644
--- a/include/gunrock/algorithms/geo.hxx
+++ b/include/gunrock/algorithms/geo.hxx
@@ -266,7 +266,7 @@ struct problem_t : gunrock::problem_t<graph_t> {
   problem_t(graph_t& G,
             param_type& _param,
             result_type& _result,
-            std::shared_ptr<cuda::multi_context_t> _context)
+            std::shared_ptr<gcuda::multi_context_t> _context)
       : gunrock::problem_t<graph_t>(G, _context),
         param(_param),
         result(_result) {}
@@ -285,7 +285,7 @@ struct problem_t : gunrock::problem_t<graph_t> {
 template <typename problem_t>
 struct enactor_t : gunrock::enactor_t<problem_t> {
   enactor_t(problem_t* _problem,
-            std::shared_ptr<cuda::multi_context_t> _context,
+            std::shared_ptr<gcuda::multi_context_t> _context,
             enactor_properties_t _properties)
       : gunrock::enactor_t<problem_t>(_problem, _context, _properties) {}
 
@@ -293,7 +293,7 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
   using edge_t = typename problem_t::edge_t;
   using weight_t = typename problem_t::weight_t;
 
-  void loop(cuda::multi_context_t& context) override {
+  void loop(gcuda::multi_context_t& context) override {
     // Data slice
     auto E = this->get_enactor();
     auto P = this->get_problem();
@@ -385,7 +385,7 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
     );
   }
 
-  bool is_converged(cuda::multi_context_t& context) override {
+  bool is_converged(gcuda::multi_context_t& context) override {
     auto E = this->get_enactor();
     auto P = this->get_problem();
     auto iteration = E->iteration;
@@ -404,9 +404,9 @@ float run(graph_t& G,
           coordinates_t* coordinates,                    // Input/Output
           const unsigned int total_iterations,           // Parameter
           const unsigned int spatial_iterations = 1000,  // Parameter
-          std::shared_ptr<cuda::multi_context_t> context =
-              std::shared_ptr<cuda::multi_context_t>(
-                  new cuda::multi_context_t(0))  // Context
+          std::shared_ptr<gcuda::multi_context_t> context =
+              std::shared_ptr<gcuda::multi_context_t>(
+                  new gcuda::multi_context_t(0))  // Context
 ) {
   // <user-defined>
   using param_type = param_t;
diff --git a/include/gunrock/algorithms/hits.hxx b/include/gunrock/algorithms/hits.hxx
index 44252bf4..db64b271 100644
--- a/include/gunrock/algorithms/hits.hxx
+++ b/include/gunrock/algorithms/hits.hxx
@@ -140,7 +140,7 @@ struct problem_t : gunrock::problem_t<graph_t> {
   };
 
   problem_t(graph_t& G,
-            std::shared_ptr<cuda::multi_context_t> _context,
+            std::shared_ptr<gcuda::multi_context_t> _context,
             int max_iterations)
       : gunrock::problem_t<graph_t>(G, _context),
         max_iterations(max_iterations) {
@@ -234,14 +234,14 @@ struct problem_t : gunrock::problem_t<graph_t> {
 template <typename problem_t>
 struct enactor_t : gunrock::enactor_t<problem_t> {
   enactor_t(problem_t* _problem,
-            std::shared_ptr<cuda::multi_context_t> _context)
+            std::shared_ptr<gcuda::multi_context_t> _context)
       : gunrock::enactor_t<problem_t>(_problem, _context) {}
 
   using vertex_t = typename problem_t::vertex_t;
   using edge_t = typename problem_t::edge_t;
   using weight_t = typename problem_t::weight_t;
 
-  void loop(cuda::multi_context_t& context) override {
+  void loop(gcuda::multi_context_t& context) override {
     // Data slice qqq
     auto E = this->get_enactor();
     auto P = this->get_problem();
@@ -275,7 +275,7 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
 
   }  // end of loop
 
-  bool is_converged(cuda::multi_context_t& context) override {
+  bool is_converged(gcuda::multi_context_t& context) override {
     auto P = this->get_problem();
     return P->is_converged();
   }
@@ -296,9 +296,9 @@ template <typename graph_t, typename param_t, typename result_t>
 float run(graph_t& G,
           param_t& param,
           result_t& result,
-          std::shared_ptr<cuda::multi_context_t> context =
-              std::shared_ptr<cuda::multi_context_t>(
-                  new cuda::multi_context_t(0))  // Context
+          std::shared_ptr<gcuda::multi_context_t> context =
+              std::shared_ptr<gcuda::multi_context_t>(
+                  new gcuda::multi_context_t(0))  // Context
 ) {
   using vertex_t = typename graph_t::vertex_type;
   using weight_t = typename graph_t::weight_type;
diff --git a/include/gunrock/algorithms/kcore.hxx b/include/gunrock/algorithms/kcore.hxx
index 168da06b..47bb0227 100644
--- a/include/gunrock/algorithms/kcore.hxx
+++ b/include/gunrock/algorithms/kcore.hxx
@@ -29,7 +29,7 @@ struct problem_t : gunrock::problem_t<graph_t> {
 
   problem_t(graph_t& G,
             result_type& _result,
-            std::shared_ptr<cuda::multi_context_t> _context)
+            std::shared_ptr<gcuda::multi_context_t> _context)
       : gunrock::problem_t<graph_t>(G, _context), result(_result) {}
 
   using vertex_t = typename graph_t::vertex_type;
@@ -90,7 +90,7 @@ struct problem_t : gunrock::problem_t<graph_t> {
 template <typename problem_t>
 struct enactor_t : gunrock::enactor_t<problem_t> {
   enactor_t(problem_t* _problem,
-            std::shared_ptr<cuda::multi_context_t> _context)
+            std::shared_ptr<gcuda::multi_context_t> _context)
       : gunrock::enactor_t<problem_t>(_problem, _context) {}
 
   using vertex_t = typename problem_t::vertex_t;
@@ -99,7 +99,7 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
   using frontier_t = typename enactor_t<problem_t>::frontier_t;
 
   void prepare_frontier(frontier_t* f,
-                        cuda::multi_context_t& context) override {
+                        gcuda::multi_context_t& context) override {
     // get pointer to the problem
     auto P = this->get_problem();
     auto n_vertices = P->get_graph().get_number_of_vertices();
@@ -109,7 +109,7 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
   }
 
   // One iteration of the application
-  void loop(cuda::multi_context_t& context) override {
+  void loop(gcuda::multi_context_t& context) override {
     auto E = this->get_enactor();
     auto P = this->get_problem();
     auto G = P->get_graph();
@@ -177,7 +177,7 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
     }
   }
 
-  virtual bool is_converged(cuda::multi_context_t& context) {
+  virtual bool is_converged(gcuda::multi_context_t& context) {
     auto P = this->get_problem();
     auto G = P->get_graph();
     auto n_vertices = G.get_number_of_vertices();
@@ -202,9 +202,9 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
 template <typename graph_t>
 float run(graph_t& G,
           int* k_cores,  // Output
-          std::shared_ptr<cuda::multi_context_t> context =
-              std::shared_ptr<cuda::multi_context_t>(
-                  new cuda::multi_context_t(0))  // Context
+          std::shared_ptr<gcuda::multi_context_t> context =
+              std::shared_ptr<gcuda::multi_context_t>(
+                  new gcuda::multi_context_t(0))  // Context
 ) {
   using vertex_t = typename graph_t::vertex_type;
   using weight_t = typename graph_t::weight_type;
diff --git a/include/gunrock/algorithms/mst.hxx b/include/gunrock/algorithms/mst.hxx
index 40a76ea3..6dba4f0b 100644
--- a/include/gunrock/algorithms/mst.hxx
+++ b/include/gunrock/algorithms/mst.hxx
@@ -36,7 +36,7 @@ struct problem_t : gunrock::problem_t<graph_t> {
   problem_t(graph_t& G,
             param_type& _param,
             result_type& _result,
-            std::shared_ptr<cuda::multi_context_t> _context)
+            std::shared_ptr<gcuda::multi_context_t> _context)
       : gunrock::problem_t<graph_t>(G, _context),
         param(_param),
         result(_result) {}
@@ -84,7 +84,7 @@ struct problem_t : gunrock::problem_t<graph_t> {
 template <typename problem_t>
 struct enactor_t : gunrock::enactor_t<problem_t> {
   enactor_t(problem_t* _problem,
-            std::shared_ptr<cuda::multi_context_t> _context)
+            std::shared_ptr<gcuda::multi_context_t> _context)
       : gunrock::enactor_t<problem_t>(_problem, _context) {}
 
   using vertex_t = typename problem_t::vertex_t;
@@ -93,7 +93,7 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
   using frontier_t = typename enactor_t<problem_t>::frontier_t;
 
   void prepare_frontier(frontier_t* f,
-                        cuda::multi_context_t& context) override {
+                        gcuda::multi_context_t& context) override {
     auto P = this->get_problem();
     auto n_vertices = P->get_graph().get_number_of_vertices();
     auto n_edges = P->get_graph().get_number_of_edges();
@@ -102,7 +102,7 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
     f->sequence((edge_t)0, n_edges, context.get_context(0)->stream());
   }
 
-  void loop(cuda::multi_context_t& context) override {
+  void loop(gcuda::multi_context_t& context) override {
     auto E = this->get_enactor();
     auto P = this->get_problem();
     auto G = P->get_graph();
@@ -255,7 +255,7 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
     thrust::copy_n(policy, new_roots, P->n_vertices, roots);
   }
 
-  virtual bool is_converged(cuda::multi_context_t& context) {
+  virtual bool is_converged(gcuda::multi_context_t& context) {
     auto P = this->get_problem();
     return (P->super_vertices[0] == 1);
   }
@@ -264,9 +264,9 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
 template <typename graph_t>
 float run(graph_t& G,
           typename graph_t::weight_type* mst_weight,  // Output
-          std::shared_ptr<cuda::multi_context_t> context =
-              std::shared_ptr<cuda::multi_context_t>(
-                  new cuda::multi_context_t(0))  // Context
+          std::shared_ptr<gcuda::multi_context_t> context =
+              std::shared_ptr<gcuda::multi_context_t>(
+                  new gcuda::multi_context_t(0))  // Context
 ) {
   using vertex_t = typename graph_t::vertex_type;
   using weight_t = typename graph_t::weight_type;
diff --git a/include/gunrock/algorithms/ppr.hxx b/include/gunrock/algorithms/ppr.hxx
index 79a2b747..cb6fe57a 100644
--- a/include/gunrock/algorithms/ppr.hxx
+++ b/include/gunrock/algorithms/ppr.hxx
@@ -38,7 +38,7 @@ struct problem_t : gunrock::problem_t<graph_t> {
   problem_t(graph_t& G,
             param_type& _param,
             result_type& _result,
-            std::shared_ptr<cuda::multi_context_t> _context)
+            std::shared_ptr<gcuda::multi_context_t> _context)
       : gunrock::problem_t<graph_t>(G, _context),
         param(_param),
         result(_result) {}
@@ -88,7 +88,7 @@ struct problem_t : gunrock::problem_t<graph_t> {
 template <typename problem_t>
 struct enactor_t : gunrock::enactor_t<problem_t> {
   enactor_t(problem_t* _problem,
-            std::shared_ptr<cuda::multi_context_t> _context)
+            std::shared_ptr<gcuda::multi_context_t> _context)
       : gunrock::enactor_t<problem_t>(_problem, _context) {}
 
   using vertex_t = typename problem_t::vertex_t;
@@ -97,12 +97,12 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
   using frontier_t = typename enactor_t<problem_t>::frontier_t;
 
   void prepare_frontier(frontier_t* f,
-                        cuda::multi_context_t& context) override {
+                        gcuda::multi_context_t& context) override {
     auto P = this->get_problem();
     f->push_back(P->param.seed);
   }
 
-  void loop(cuda::multi_context_t& context) override {
+  void loop(gcuda::multi_context_t& context) override {
     // Data slice
     auto E = this->get_enactor();
     auto P = this->get_problem();
@@ -148,14 +148,14 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
 };  // struct enactor_t
 
 template <typename graph_t>
-float run(
-    graph_t& G,
-    typename graph_t::vertex_type& seed,
-    typename graph_t::weight_type* p,
-    typename graph_t::weight_type& alpha,
-    typename graph_t::weight_type& epsilon,
-    std::shared_ptr<cuda::multi_context_t> context =
-        std::shared_ptr<cuda::multi_context_t>(new cuda::multi_context_t(0))) {
+float run(graph_t& G,
+          typename graph_t::vertex_type& seed,
+          typename graph_t::weight_type* p,
+          typename graph_t::weight_type& alpha,
+          typename graph_t::weight_type& epsilon,
+          std::shared_ptr<gcuda::multi_context_t> context =
+              std::shared_ptr<gcuda::multi_context_t>(
+                  new gcuda::multi_context_t(0))) {
   // <user-defined>
   using vertex_t = typename graph_t::vertex_type;
   using weight_t = typename graph_t::weight_type;
diff --git a/include/gunrock/algorithms/pr.hxx b/include/gunrock/algorithms/pr.hxx
index 049e4d58..cb1e616b 100644
--- a/include/gunrock/algorithms/pr.hxx
+++ b/include/gunrock/algorithms/pr.hxx
@@ -39,7 +39,7 @@ struct problem_t : gunrock::problem_t<graph_t> {
   problem_t(graph_t& G,
             param_type& _param,
             result_type& _result,
-            std::shared_ptr<cuda::multi_context_t> _context)
+            std::shared_ptr<gcuda::multi_context_t> _context)
       : gunrock::problem_t<graph_t>(G, _context),
         param(_param),
         result(_result) {}
@@ -95,7 +95,7 @@ struct problem_t : gunrock::problem_t<graph_t> {
 template <typename problem_t>
 struct enactor_t : gunrock::enactor_t<problem_t> {
   enactor_t(problem_t* _problem,
-            std::shared_ptr<cuda::multi_context_t> _context,
+            std::shared_ptr<gcuda::multi_context_t> _context,
             enactor_properties_t _properties)
       : gunrock::enactor_t<problem_t>(_problem, _context, _properties) {}
 
@@ -103,7 +103,7 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
   using edge_t = typename problem_t::edge_t;
   using weight_t = typename problem_t::weight_t;
 
-  void loop(cuda::multi_context_t& context) override {
+  void loop(gcuda::multi_context_t& context) override {
     // Data slice
     auto E = this->get_enactor();
     auto P = this->get_problem();
@@ -152,7 +152,7 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
         G, E, spread_op, context);
   }
 
-  virtual bool is_converged(cuda::multi_context_t& context) {
+  virtual bool is_converged(gcuda::multi_context_t& context) {
     if (this->iteration == 0)
       return false;
 
@@ -184,9 +184,9 @@ float run(graph_t& G,
           typename graph_t::weight_type alpha,
           typename graph_t::weight_type tol,
           typename graph_t::weight_type* p,  // Output
-          std::shared_ptr<cuda::multi_context_t> context =
-              std::shared_ptr<cuda::multi_context_t>(
-                  new cuda::multi_context_t(0))  // Context
+          std::shared_ptr<gcuda::multi_context_t> context =
+              std::shared_ptr<gcuda::multi_context_t>(
+                  new gcuda::multi_context_t(0))  // Context
 ) {
   // <user-defined>
   using vertex_t = typename graph_t::vertex_type;
diff --git a/include/gunrock/algorithms/sort/radix_sort.hxx b/include/gunrock/algorithms/sort/radix_sort.hxx
index 4cddf0c3..91753a4f 100644
--- a/include/gunrock/algorithms/sort/radix_sort.hxx
+++ b/include/gunrock/algorithms/sort/radix_sort.hxx
@@ -41,7 +41,7 @@ template <typename type_t>
 void sort_keys(type_t* keys,
                std::size_t num_items,
                order_t order = order_t::ascending,
-               cuda::stream_t stream = 0) {
+               gcuda::stream_t stream = 0) {
   if (order == order_t::ascending)
     thrust::sort(thrust::cuda::par.on(stream), keys, keys + num_items,
                  thrust::less<type_t>());
diff --git a/include/gunrock/algorithms/spgemm.hxx b/include/gunrock/algorithms/spgemm.hxx
index f2972e28..da50a07c 100644
--- a/include/gunrock/algorithms/spgemm.hxx
+++ b/include/gunrock/algorithms/spgemm.hxx
@@ -45,7 +45,7 @@ struct problem_t : gunrock::problem_t<graph_t> {
   problem_t(graph_t& A,
             param_type& _param,
             result_type& _result,
-            std::shared_ptr<cuda::multi_context_t> _context)
+            std::shared_ptr<gcuda::multi_context_t> _context)
       : gunrock::problem_t<graph_t>(A, _context),
         param(_param),
         result(_result) {}
@@ -80,7 +80,7 @@ struct problem_t : gunrock::problem_t<graph_t> {
 template <typename problem_t>
 struct enactor_t : gunrock::enactor_t<problem_t> {
   enactor_t(problem_t* _problem,
-            std::shared_ptr<cuda::multi_context_t> _context,
+            std::shared_ptr<gcuda::multi_context_t> _context,
             enactor_properties_t _properties = enactor_properties_t())
       : gunrock::enactor_t<problem_t>(_problem, _context, _properties) {}
 
@@ -88,7 +88,7 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
   using edge_t = typename problem_t::edge_t;
   using weight_t = typename problem_t::weight_t;
 
-  void loop(cuda::multi_context_t& context) override {
+  void loop(gcuda::multi_context_t& context) override {
     auto E = this->get_enactor();
     auto P = this->get_problem();
 
@@ -251,7 +251,7 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
    * @param context The context of the execution (unused).
    * @return true returns true after one iteration.
    */
-  virtual bool is_converged(cuda::multi_context_t& context) {
+  virtual bool is_converged(gcuda::multi_context_t& context) {
     return this->iteration == 0 ? false : true;
   }
 };  // struct enactor_t
@@ -260,9 +260,9 @@ template <typename graph_t, typename csr_t>
 float run(graph_t& A,
           graph_t& B,
           csr_t& C,
-          std::shared_ptr<cuda::multi_context_t> context =
-              std::shared_ptr<cuda::multi_context_t>(
-                  new cuda::multi_context_t(0))  // Context
+          std::shared_ptr<gcuda::multi_context_t> context =
+              std::shared_ptr<gcuda::multi_context_t>(
+                  new gcuda::multi_context_t(0))  // Context
 ) {
   using param_type = param_t<graph_t>;
   using result_type = result_t<csr_t>;
diff --git a/include/gunrock/algorithms/spmv.hxx b/include/gunrock/algorithms/spmv.hxx
index 09c656b6..bf3dcc75 100644
--- a/include/gunrock/algorithms/spmv.hxx
+++ b/include/gunrock/algorithms/spmv.hxx
@@ -37,7 +37,7 @@ struct problem_t : gunrock::problem_t<graph_t> {
   problem_t(graph_t& G,
             param_type& _param,
             result_type& _result,
-            std::shared_ptr<cuda::multi_context_t> _context)
+            std::shared_ptr<gcuda::multi_context_t> _context)
       : gunrock::problem_t<graph_t>(G, _context),
         param(_param),
         result(_result) {}
@@ -61,7 +61,7 @@ struct problem_t : gunrock::problem_t<graph_t> {
 template <typename problem_t>
 struct enactor_t : gunrock::enactor_t<problem_t> {
   enactor_t(problem_t* _problem,
-            std::shared_ptr<cuda::multi_context_t> _context,
+            std::shared_ptr<gcuda::multi_context_t> _context,
             enactor_properties_t _properties = enactor_properties_t())
       : gunrock::enactor_t<problem_t>(_problem, _context, _properties) {}
 
@@ -69,13 +69,13 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
   using edge_t = typename problem_t::edge_t;
   using weight_t = typename problem_t::weight_t;
 
-  void loop(cuda::multi_context_t& context) override {
+  void loop(gcuda::multi_context_t& context) override {
     // TODO: Use a parameter (enum) to select between the two:
     // Maybe use the existing advance_direction_t enum.
     pull(context);
   }
 
-  void push(cuda::multi_context_t& context) {
+  void push(gcuda::multi_context_t& context) {
     auto E = this->get_enactor();
     auto P = this->get_problem();
     auto G = P->get_graph();
@@ -104,7 +104,7 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
         G, E, spmv, context);
   }
 
-  void pull(cuda::multi_context_t& context) {
+  void pull(gcuda::multi_context_t& context) {
     auto E = this->get_enactor();
     auto P = this->get_problem();
     auto G = P->get_graph();
@@ -126,7 +126,7 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
                                        context);
   }
 
-  virtual bool is_converged(cuda::multi_context_t& context) {
+  virtual bool is_converged(gcuda::multi_context_t& context) {
     return this->iteration == 0 ? false : true;
   }
 };  // struct enactor_t
@@ -135,9 +135,9 @@ template <typename graph_t>
 float run(graph_t& G,
           typename graph_t::weight_type* x,  // Input vector
           typename graph_t::weight_type* y,  // Output vector
-          std::shared_ptr<cuda::multi_context_t> context =
-              std::shared_ptr<cuda::multi_context_t>(
-                  new cuda::multi_context_t(0))  // Context
+          std::shared_ptr<gcuda::multi_context_t> context =
+              std::shared_ptr<gcuda::multi_context_t>(
+                  new gcuda::multi_context_t(0))  // Context
 ) {
   // <user-defined>
   using weight_t = typename graph_t::weight_type;
diff --git a/include/gunrock/algorithms/sssp.hxx b/include/gunrock/algorithms/sssp.hxx
index 3ecdcdb7..0dac7e54 100644
--- a/include/gunrock/algorithms/sssp.hxx
+++ b/include/gunrock/algorithms/sssp.hxx
@@ -37,7 +37,7 @@ struct problem_t : gunrock::problem_t<graph_t> {
   problem_t(graph_t& G,
             param_type& _param,
             result_type& _result,
-            std::shared_ptr<cuda::multi_context_t> _context)
+            std::shared_ptr<gcuda::multi_context_t> _context)
       : gunrock::problem_t<graph_t>(G, _context),
         param(_param),
         result(_result) {}
@@ -81,7 +81,7 @@ struct problem_t : gunrock::problem_t<graph_t> {
 template <typename problem_t>
 struct enactor_t : gunrock::enactor_t<problem_t> {
   enactor_t(problem_t* _problem,
-            std::shared_ptr<cuda::multi_context_t> _context)
+            std::shared_ptr<gcuda::multi_context_t> _context)
       : gunrock::enactor_t<problem_t>(_problem, _context) {}
 
   using vertex_t = typename problem_t::vertex_t;
@@ -90,12 +90,12 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
   using frontier_t = typename enactor_t<problem_t>::frontier_t;
 
   void prepare_frontier(frontier_t* f,
-                        cuda::multi_context_t& context) override {
+                        gcuda::multi_context_t& context) override {
     auto P = this->get_problem();
     f->push_back(P->param.single_source);
   }
 
-  void loop(cuda::multi_context_t& context) override {
+  void loop(gcuda::multi_context_t& context) override {
     // Data slice
     auto E = this->get_enactor();
     auto P = this->get_problem();
@@ -157,9 +157,9 @@ float run(graph_t& G,
           typename graph_t::vertex_type& single_source,  // Parameter
           typename graph_t::weight_type* distances,      // Output
           typename graph_t::vertex_type* predecessors,   // Output
-          std::shared_ptr<cuda::multi_context_t> context =
-              std::shared_ptr<cuda::multi_context_t>(
-                  new cuda::multi_context_t(0))  // Context
+          std::shared_ptr<gcuda::multi_context_t> context =
+              std::shared_ptr<gcuda::multi_context_t>(
+                  new gcuda::multi_context_t(0))  // Context
 ) {
   // <user-defined>
   using vertex_t = typename graph_t::vertex_type;
diff --git a/include/gunrock/cuda/atomic_functions.hxx b/include/gunrock/cuda/atomic_functions.hxx
index 272b791d..ef084c79 100644
--- a/include/gunrock/cuda/atomic_functions.hxx
+++ b/include/gunrock/cuda/atomic_functions.hxx
@@ -10,7 +10,7 @@
  */
 #pragma once
 namespace gunrock {
-namespace cuda {
+namespace gcuda {
 
 /**
  * @brief Wrapper around CUDA's natively supported atomicMin types.
@@ -122,5 +122,5 @@ __device__ static double atomicMax(double* address, double value) {
   return __longlong_as_double(old);
 }
 
-}  // namespace cuda
+}  // namespace gcuda
 }  // namespace gunrock
\ No newline at end of file
diff --git a/include/gunrock/cuda/context.hxx b/include/gunrock/cuda/context.hxx
index ec6cdd62..ee76740b 100644
--- a/include/gunrock/cuda/context.hxx
+++ b/include/gunrock/cuda/context.hxx
@@ -26,7 +26,7 @@
 #include <thrust/execution_policy.h>
 
 namespace gunrock {
-namespace cuda {
+namespace gcuda {
 
 template <int dummy_arg>
 __global__ void dummy_k() {}
@@ -39,26 +39,26 @@ struct context_t {
   context_t(const context_t& rhs) = delete;
   context_t& operator=(const context_t& rhs) = delete;
 
-  virtual const cuda::device_properties_t& props() const = 0;
+  virtual const gcuda::device_properties_t& props() const = 0;
   virtual void print_properties() = 0;
-  virtual cuda::compute_capability_t ptx_version() const = 0;
-  virtual cuda::stream_t stream() = 0;
+  virtual gcuda::compute_capability_t ptx_version() const = 0;
+  virtual gcuda::stream_t stream() = 0;
   virtual mgpu::standard_context_t* mgpu() = 0;
 
   // cudaStreamSynchronize or cudaDeviceSynchronize for stream 0.
   virtual void synchronize() = 0;
-  virtual cuda::event_t event() = 0;
+  virtual gcuda::event_t event() = 0;
   virtual util::timer_t& timer() = 0;
 };  // struct context_t
 
 class standard_context_t : public context_t {
  protected:
-  cuda::device_properties_t _props;
-  cuda::compute_capability_t _ptx_version;
+  gcuda::device_properties_t _props;
+  gcuda::compute_capability_t _ptx_version;
 
-  cuda::device_id_t _ordinal;
-  cuda::stream_t _stream;
-  cuda::event_t _event;
+  gcuda::device_id_t _ordinal;
+  gcuda::stream_t _stream;
+  gcuda::event_t _event;
 
   /**
    * @todo Find out how to use a shared_ptr<> without printing the GPU debug
@@ -73,10 +73,10 @@ class standard_context_t : public context_t {
   // of dummy_k for each translation unit.
   template <int dummy_arg = 0>
   void init() {
-    cuda::function_attributes_t attr;
+    gcuda::function_attributes_t attr;
     error::error_t status = cudaFuncGetAttributes(&attr, dummy_k<0>);
     error::throw_if_exception(status);
-    _ptx_version = cuda::make_compute_capability(attr.ptxVersion);
+    _ptx_version = gcuda::make_compute_capability(attr.ptxVersion);
 
     cudaSetDevice(_ordinal);
     cudaStreamCreateWithFlags(&_stream, cudaStreamNonBlocking);
@@ -87,32 +87,32 @@ class standard_context_t : public context_t {
   }
 
  public:
-  standard_context_t(cuda::device_id_t device = 0)
+  standard_context_t(gcuda::device_id_t device = 0)
       : context_t(), _ordinal(device), _mgpu_context(nullptr) {
     init();
   }
 
-  standard_context_t(cudaStream_t stream, cuda::device_id_t device = 0)
+  standard_context_t(cudaStream_t stream, gcuda::device_id_t device = 0)
       : context_t(), _ordinal(device), _mgpu_context(nullptr), _stream(stream) {
     init();
   }
 
   ~standard_context_t() { cudaEventDestroy(_event); }
 
-  virtual const cuda::device_properties_t& props() const override {
+  virtual const gcuda::device_properties_t& props() const override {
     return _props;
   }
 
   virtual void print_properties() override {
-    cuda::device::set(_ordinal);
-    cuda::properties::print(_props);
+    gcuda::device::set(_ordinal);
+    gcuda::properties::print(_props);
   }
 
-  virtual cuda::compute_capability_t ptx_version() const override {
+  virtual gcuda::compute_capability_t ptx_version() const override {
     return _ptx_version;
   }
 
-  virtual cuda::stream_t stream() override { return _stream; }
+  virtual gcuda::stream_t stream() override { return _stream; }
   virtual mgpu::standard_context_t* mgpu() override { return _mgpu_context; }
 
   virtual void synchronize() override {
@@ -121,11 +121,11 @@ class standard_context_t : public context_t {
     error::throw_if_exception(status);
   }
 
-  virtual cuda::event_t event() override { return _event; }
+  virtual gcuda::event_t event() override { return _event; }
 
   virtual util::timer_t& timer() override { return _timer; }
 
-  virtual cuda::device_id_t ordinal() { return _ordinal; }
+  virtual gcuda::device_id_t ordinal() { return _ordinal; }
 
   auto execution_policy() {
     return thrust::cuda::par_nosync.on(this->stream());
@@ -136,11 +136,11 @@ class standard_context_t : public context_t {
 class multi_context_t {
  public:
   thrust::host_vector<standard_context_t*> contexts;
-  thrust::host_vector<cuda::device_id_t> devices;
+  thrust::host_vector<gcuda::device_id_t> devices;
   static constexpr std::size_t MAX_NUMBER_OF_GPUS = 1024;
 
   // Multiple devices.
-  multi_context_t(thrust::host_vector<cuda::device_id_t> _devices)
+  multi_context_t(thrust::host_vector<gcuda::device_id_t> _devices)
       : devices(_devices) {
     for (auto& device : devices) {
       standard_context_t* device_context = new standard_context_t(device);
@@ -149,7 +149,7 @@ class multi_context_t {
   }
 
   // Multiple devices with a user-provided stream
-  multi_context_t(thrust::host_vector<cuda::device_id_t> _devices,
+  multi_context_t(thrust::host_vector<gcuda::device_id_t> _devices,
                   cudaStream_t _stream)
       : devices(_devices) {
     for (auto& device : devices) {
@@ -160,7 +160,7 @@ class multi_context_t {
   }
 
   // Single device.
-  multi_context_t(cuda::device_id_t _device) : devices(1, _device) {
+  multi_context_t(gcuda::device_id_t _device) : devices(1, _device) {
     for (auto& device : devices) {
       standard_context_t* device_context = new standard_context_t(device);
       contexts.push_back(device_context);
@@ -168,7 +168,7 @@ class multi_context_t {
   }
 
   // Single device with a user-provided stream
-  multi_context_t(cuda::device_id_t _device, cudaStream_t _stream)
+  multi_context_t(gcuda::device_id_t _device, cudaStream_t _stream)
       : devices(1, _device) {
     for (auto& device : devices) {
       standard_context_t* device_context =
@@ -178,7 +178,7 @@ class multi_context_t {
   }
   ~multi_context_t() {}
 
-  auto get_context(cuda::device_id_t device) {
+  auto get_context(gcuda::device_id_t device) {
     auto contexts_ptr = contexts.data();
     return contexts_ptr[device];
   }
@@ -205,5 +205,5 @@ class multi_context_t {
   }
 };  // class multi_context_t
 
-}  // namespace cuda
+}  // namespace gcuda
 }  // namespace gunrock
diff --git a/include/gunrock/cuda/cuda.hxx b/include/gunrock/cuda/cuda.hxx
index fea52014..f7fa0308 100644
--- a/include/gunrock/cuda/cuda.hxx
+++ b/include/gunrock/cuda/cuda.hxx
@@ -12,7 +12,7 @@
 #pragma once
 
 namespace gunrock {
-namespace cuda {}  // namespace cuda
+namespace gcuda {}  // namespace gcuda
 }  // namespace gunrock
 
 #include <gunrock/cuda/global.hxx>
diff --git a/include/gunrock/cuda/detail/launch_box.hxx b/include/gunrock/cuda/detail/launch_box.hxx
index 834d553b..36753c77 100644
--- a/include/gunrock/cuda/detail/launch_box.hxx
+++ b/include/gunrock/cuda/detail/launch_box.hxx
@@ -13,7 +13,7 @@
 #include <gunrock/cuda/sm.hxx>
 
 namespace gunrock {
-namespace cuda {
+namespace gcuda {
 namespace launch_box {
 namespace detail {
 
@@ -97,5 +97,5 @@ inline void for_each_argument_address(void** collected_addresses,
 
 }  // namespace detail
 }  // namespace launch_box
-}  // namespace cuda
+}  // namespace gcuda
 }  // namespace gunrock
diff --git a/include/gunrock/cuda/detail/launch_kernels.hxx b/include/gunrock/cuda/detail/launch_kernels.hxx
index 70d07a60..514aa1a0 100644
--- a/include/gunrock/cuda/detail/launch_kernels.hxx
+++ b/include/gunrock/cuda/detail/launch_kernels.hxx
@@ -14,7 +14,7 @@
 #include <gunrock/cuda/global.hxx>
 
 namespace gunrock {
-namespace cuda {
+namespace gcuda {
 namespace kernels {
 namespace detail {
 template <unsigned int threads_per_block,
@@ -24,16 +24,16 @@ template <unsigned int threads_per_block,
 __global__ __launch_bounds__(threads_per_block,
                              items_per_thread)  // strict launch bounds
     void blocked_kernel(func_t f, const std::size_t bound, args_t... args) {
-  const int stride = cuda::block::size::x() * cuda::grid::size::x();
-  for (int i = cuda::thread::global::id::x();  // global id
-       i < bound;                              // bound check
-       i += (stride * items_per_thread)        // offset
+  const int stride = gcuda::block::size::x() * gcuda::grid::size::x();
+  for (int i = gcuda::thread::global::id::x();  // global id
+       i < bound;                               // bound check
+       i += (stride * items_per_thread)         // offset
   ) {
 #pragma unroll items_per_thread
     for (int j = 0; j < items_per_thread; ++j) {
       // Simple blocking per thread (unrolled items_per_thread_t)
       if ((i + (stride * j)) < bound)
-        f(i + (stride * j), cuda::block::id::x(), args...);
+        f(i + (stride * j), gcuda::block::id::x(), args...);
     }
   }
 }
@@ -41,15 +41,15 @@ __global__ __launch_bounds__(threads_per_block,
 template <unsigned int threads_per_block, typename func_t, typename... args_t>
 __global__ __launch_bounds__(threads_per_block, 1)  // strict launch bounds
     void strided_kernel(func_t f, const std::size_t bound, args_t... args) {
-  const int stride = cuda::block::size::x() * cuda::grid::size::x();
-  for (int i = cuda::thread::global::id::x();  // global id
-       i < bound;                              // bound check
-       i += stride                             // offset
+  const int stride = gcuda::block::size::x() * gcuda::grid::size::x();
+  for (int i = gcuda::thread::global::id::x();  // global id
+       i < bound;                               // bound check
+       i += stride                              // offset
   ) {
-    f(i, cuda::block::id::x(), args...);
+    f(i, gcuda::block::id::x(), args...);
   }
 }
 }  // namespace detail
 }  // namespace kernels
-}  // namespace cuda
+}  // namespace gcuda
 }  // namespace gunrock
\ No newline at end of file
diff --git a/include/gunrock/cuda/device.hxx b/include/gunrock/cuda/device.hxx
index 28b3e929..a27eec1b 100644
--- a/include/gunrock/cuda/device.hxx
+++ b/include/gunrock/cuda/device.hxx
@@ -11,16 +11,16 @@
 
 #pragma once
 namespace gunrock {
-namespace cuda {
+namespace gcuda {
 
 typedef int device_id_t;
 
 namespace device {
 
-void set(cuda::device_id_t device) {
+void set(gcuda::device_id_t device) {
   cudaSetDevice(device);
 }
 
 }  // namespace device
-}  // namespace cuda
+}  // namespace gcuda
 }  // namespace gunrock
\ No newline at end of file
diff --git a/include/gunrock/cuda/device_properties.hxx b/include/gunrock/cuda/device_properties.hxx
index fd62eb9d..969b4a82 100644
--- a/include/gunrock/cuda/device_properties.hxx
+++ b/include/gunrock/cuda/device_properties.hxx
@@ -14,7 +14,7 @@
 #include <gunrock/error.hxx>
 
 namespace gunrock {
-namespace cuda {
+namespace gcuda {
 
 typedef cudaDeviceProp device_properties_t;
 
@@ -241,5 +241,5 @@ void print(device_properties_t& prop) {
 
 }  // namespace properties
 
-}  // namespace cuda
+}  // namespace gcuda
 }  // namespace gunrock
\ No newline at end of file
diff --git a/include/gunrock/cuda/event_management.hxx b/include/gunrock/cuda/event_management.hxx
index 0bd9acc6..4c79650b 100644
--- a/include/gunrock/cuda/event_management.hxx
+++ b/include/gunrock/cuda/event_management.hxx
@@ -12,9 +12,9 @@
 #pragma once
 
 namespace gunrock {
-namespace cuda {
+namespace gcuda {
 
 typedef cudaEvent_t event_t;
 
-}  // namespace cuda
+}  // namespace gcuda
 }  // namespace gunrock
\ No newline at end of file
diff --git a/include/gunrock/cuda/function.hxx b/include/gunrock/cuda/function.hxx
index 4f942e2c..5429c232 100644
--- a/include/gunrock/cuda/function.hxx
+++ b/include/gunrock/cuda/function.hxx
@@ -11,9 +11,9 @@
 #pragma once
 
 namespace gunrock {
-namespace cuda {
+namespace gcuda {
 
 typedef cudaFuncAttributes function_attributes_t;
 
-}  // namespace cuda
+}  // namespace gcuda
 }  // namespace gunrock
\ No newline at end of file
diff --git a/include/gunrock/cuda/global.hxx b/include/gunrock/cuda/global.hxx
index 04d19f9a..fee1b181 100644
--- a/include/gunrock/cuda/global.hxx
+++ b/include/gunrock/cuda/global.hxx
@@ -11,7 +11,7 @@
 
 #pragma once
 namespace gunrock {
-namespace cuda {
+namespace gcuda {
 
 typedef int thread_idx_t;
 
@@ -103,5 +103,5 @@ __device__ __forceinline__ int total() {
 }  // namespace size
 }  // namespace grid
 
-}  // namespace cuda
+}  // namespace gcuda
 }  // namespace gunrock
\ No newline at end of file
diff --git a/include/gunrock/cuda/intrinsics.hxx b/include/gunrock/cuda/intrinsics.hxx
index e2ca918b..c80def11 100644
--- a/include/gunrock/cuda/intrinsics.hxx
+++ b/include/gunrock/cuda/intrinsics.hxx
@@ -11,5 +11,5 @@
 
 #pragma once
 namespace gunrock {
-namespace cuda {}  // namespace cuda
+namespace gcuda {}  // namespace gcuda
 }  // namespace gunrock
\ No newline at end of file
diff --git a/include/gunrock/cuda/launch_box.hxx b/include/gunrock/cuda/launch_box.hxx
index 44d653b6..82f45096 100644
--- a/include/gunrock/cuda/launch_box.hxx
+++ b/include/gunrock/cuda/launch_box.hxx
@@ -27,7 +27,7 @@
 #endif
 
 namespace gunrock {
-namespace cuda {
+namespace gcuda {
 namespace launch_box {
 
 struct dimensions_t {
@@ -212,7 +212,7 @@ struct launch_box_t : public select_launch_params_t<lp_v...> {
    * auto f = [=] __device__(int const& tid, int const& bid) {
    *  // Do something
    * };
-   * using namespace cuda::launch_box;
+   * using namespace gcuda::launch_box;
    * using launch_t =
    *  launch_box_t<launch_params_dynamic_grid_t<fallback, dim3_t<128>>>;
    *
@@ -230,7 +230,7 @@ struct launch_box_t : public select_launch_params_t<lp_v...> {
    * @param args arguments to be passed to the function.
    */
   template <typename func_t, typename... args_t>
-  void launch_strided(cuda::standard_context_t& context,
+  void launch_strided(gcuda::standard_context_t& context,
                       func_t& f,
                       const std::size_t num_elements,
                       args_t&&... args) {
@@ -257,7 +257,7 @@ struct launch_box_t : public select_launch_params_t<lp_v...> {
    * auto f = [=] __device__(int const& tid, int const& bid) {
    *  // Do something
    * };
-   * using namespace cuda::launch_box;
+   * using namespace gcuda::launch_box;
    * using launch_t =
    *  launch_box_t<launch_params_dynamic_grid_t<fallback, dim3_t<128>>>;
    *
@@ -275,7 +275,7 @@ struct launch_box_t : public select_launch_params_t<lp_v...> {
    * @param args arguments to be passed to the function.
    */
   template <typename func_t, typename... args_t>
-  void launch_blocked(cuda::standard_context_t& context,
+  void launch_blocked(gcuda::standard_context_t& context,
                       func_t& f,
                       const std::size_t num_elements,
                       args_t&&... args) {
@@ -289,7 +289,7 @@ struct launch_box_t : public select_launch_params_t<lp_v...> {
   }
 
   template <typename func_t, typename... args_t>
-  void launch_cooperative(cuda::standard_context_t& context,
+  void launch_cooperative(gcuda::standard_context_t& context,
                           const func_t& f,
                           const std::size_t num_elements,
                           args_t&&... args) {
@@ -325,7 +325,7 @@ struct launch_box_t : public select_launch_params_t<lp_v...> {
    * \return void
    */
   template <typename func_t, typename... args_t>
-  void launch(cuda::standard_context_t& context,
+  void launch(gcuda::standard_context_t& context,
               const func_t& f,
               args_t&&... args) {
     f<<<params_t::grid_dimensions, params_t::block_dimensions,
@@ -360,5 +360,5 @@ inline float occupancy(func_t kernel) {
 }
 
 }  // namespace launch_box
-}  // namespace cuda
+}  // namespace gcuda
 }  // namespace gunrock
diff --git a/include/gunrock/cuda/sm.hxx b/include/gunrock/cuda/sm.hxx
index a47142e1..fba212af 100644
--- a/include/gunrock/cuda/sm.hxx
+++ b/include/gunrock/cuda/sm.hxx
@@ -11,7 +11,7 @@
 #pragma once
 
 namespace gunrock {
-namespace cuda {
+namespace gcuda {
 
 namespace launch_box {
 
@@ -64,5 +64,5 @@ constexpr sm_flag_t operator&(sm_flag_t lhs, sm_flag_t rhs) {
 
 }  // namespace launch_box
 
-}  // namespace cuda
+}  // namespace gcuda
 }  // namespace gunrock
diff --git a/include/gunrock/cuda/stream_management.hxx b/include/gunrock/cuda/stream_management.hxx
index ed21326d..aa077ad0 100644
--- a/include/gunrock/cuda/stream_management.hxx
+++ b/include/gunrock/cuda/stream_management.hxx
@@ -11,9 +11,9 @@
 #pragma once
 
 namespace gunrock {
-namespace cuda {
+namespace gcuda {
 
 typedef cudaStream_t stream_t;
 
-}  // namespace cuda
+}  // namespace gcuda
 }  // namespace gunrock
\ No newline at end of file
diff --git a/include/gunrock/framework/enactor.hxx b/include/gunrock/framework/enactor.hxx
index 089e0dac..ede978d5 100644
--- a/include/gunrock/framework/enactor.hxx
+++ b/include/gunrock/framework/enactor.hxx
@@ -93,9 +93,9 @@ struct enactor_t {
 
   /*!
    * A shared_ptr to a multi-gpu context.
-   * @see `gunrock::cuda::multi_context_t`
+   * @see `gunrock::gcuda::multi_context_t`
    */
-  std::shared_ptr<cuda::multi_context_t> context;
+  std::shared_ptr<gcuda::multi_context_t> context;
 
   /*!
    * Algorithm's problem structure.
@@ -151,7 +151,7 @@ struct enactor_t {
    * @brief Construct a new enactor t object.
    *
    * @param _problem algorithm's problem data structure.
-   * @param _context shared pointer to the cuda::multi_context_t context that
+   * @param _context shared pointer to the gcuda::multi_context_t context that
    * stores information about multiple GPUs (such as streams, device ids,
    * events, etc.)
    * @param _properties `gunrock::enactor_properties_t`, includes
@@ -159,7 +159,7 @@ struct enactor_t {
    * frontier buffers to create for the enactor.
    */
   enactor_t(algorithm_problem_t* _problem,
-            std::shared_ptr<cuda::multi_context_t> _context,
+            std::shared_ptr<gcuda::multi_context_t> _context,
             enactor_properties_t _properties = enactor_properties_t())
       : problem(_problem),
         properties(_properties),
@@ -265,17 +265,17 @@ struct enactor_t {
    * function if they are not part of the algorithm's core, or running API calls
    * that are incredibly slow (such as `printfs` or debug statements).
    *
-   * @param context `gunrock::cuda::multi_context_t`.
+   * @param context `gunrock::gcuda::multi_context_t`.
    */
-  virtual void loop(cuda::multi_context_t& context) = 0;
+  virtual void loop(gcuda::multi_context_t& context) = 0;
 
   /**
    * @brief Prepare the initial frontier.
    *
-   * @param context `gunrock::cuda::multi_context_t`.
+   * @param context `gunrock::gcuda::multi_context_t`.
    */
   virtual void prepare_frontier(frontier_t* f,
-                                cuda::multi_context_t& context){};
+                                gcuda::multi_context_t& context){};
 
   /**
    * @brief Algorithm is converged if true is returned, keep on iterating if
@@ -291,7 +291,7 @@ struct enactor_t {
    * @return true converged!
    * @return false not converged, keep looping!
    */
-  virtual bool is_converged(cuda::multi_context_t& context) {
+  virtual bool is_converged(gcuda::multi_context_t& context) {
     return active_frontier->is_empty();
   }
 
@@ -303,9 +303,9 @@ struct enactor_t {
    * one final wrap-up of the application. Users are not required to implement
    * this function.
    *
-   * @param context `gunrock::cuda::multi_context_t`.
+   * @param context `gunrock::gcuda::multi_context_t`.
    */
-  virtual void finalize(cuda::multi_context_t& context) {}
+  virtual void finalize(gcuda::multi_context_t& context) {}
 
 };  // struct enactor_t
 
diff --git a/include/gunrock/framework/experimental/async/enactor.hxx b/include/gunrock/framework/experimental/async/enactor.hxx
index 5530ab48..803a7c6b 100644
--- a/include/gunrock/framework/experimental/async/enactor.hxx
+++ b/include/gunrock/framework/experimental/async/enactor.hxx
@@ -15,7 +15,7 @@ struct enactor_t {
   queue_t q;
 
   algorithm_problem_t* problem;
-  std::shared_ptr<cuda::multi_context_t> context;
+  std::shared_ptr<gcuda::multi_context_t> context;
 
   enactor_t(const enactor_t& rhs) = delete;
   enactor_t& operator=(const enactor_t& rhs) = delete;
@@ -31,7 +31,7 @@ struct enactor_t {
   float sizing_factor = 1.5;
 
   enactor_t(algorithm_problem_t* _problem,
-            std::shared_ptr<cuda::multi_context_t> _context,
+            std::shared_ptr<gcuda::multi_context_t> _context,
             enactor_properties_t _properties = enactor_properties_t())
       : problem(_problem), context(_context) {
     auto n_vertices = problem->get_graph().get_number_of_vertices();
@@ -47,8 +47,9 @@ struct enactor_t {
   enactor_t* get_enactor() { return this; }
   algorithm_problem_t* get_problem() { return problem; }
 
-  virtual void loop(cuda::multi_context_t& context) = 0;
-  virtual void prepare_frontier(queue_t& q, cuda::multi_context_t& context) = 0;
+  virtual void loop(gcuda::multi_context_t& context) = 0;
+  virtual void prepare_frontier(queue_t& q,
+                                gcuda::multi_context_t& context) = 0;
 
   float enact() {
     auto single_context = context->get_context(0);
diff --git a/include/gunrock/framework/frontier/experimental/boolmap_frontier.hxx b/include/gunrock/framework/frontier/experimental/boolmap_frontier.hxx
index 8267d6e0..2b19b1bb 100644
--- a/include/gunrock/framework/frontier/experimental/boolmap_frontier.hxx
+++ b/include/gunrock/framework/frontier/experimental/boolmap_frontier.hxx
@@ -53,7 +53,7 @@ class boolmap_frontier_t {
    * @return std::size_t
    */
   __host__ __device__ __forceinline__ std::size_t get_number_of_elements(
-      cuda::stream_t stream = 0) {
+      gcuda::stream_t stream = 0) {
     // Compute number of elements using a reduction.
 #ifdef __CUDA_ARCH__
     num_elements = thrust::reduce(thrust::seq, this->begin(), this->end(), 0);
@@ -99,7 +99,7 @@ class boolmap_frontier_t {
   __device__ __forceinline__ constexpr void set_element_at(
       type_t const& element,
       std::size_t const& idx = 0  // Ignore idx for boolmap.
-      ) const noexcept {          // XXX: This should not be const
+  ) const noexcept {              // XXX: This should not be const
     thread::store(this->get() + element, 1);
   }
 
@@ -143,7 +143,7 @@ class boolmap_frontier_t {
    * @param value
    * @param stream
    */
-  void fill(type_t const value, cuda::stream_t stream = 0) {
+  void fill(type_t const value, gcuda::stream_t stream = 0) {
     if (value != 0 || value != 1)
       error::throw_if_exception(cudaErrorUnknown,
                                 "Boolmap only supports 1 or 0 as fill value.");
@@ -180,10 +180,10 @@ class boolmap_frontier_t {
    * @brief Parallel sort the frontier.
    *
    * @param order see sort::order_t
-   * @param stream see cuda::stream
+   * @param stream see gcuda::stream
    */
   void sort(sort::order_t order = sort::order_t::ascending,
-            cuda::stream_t stream = 0) {
+            gcuda::stream_t stream = 0) {
     // Bool-map frontier is always sorted.
   }
 
diff --git a/include/gunrock/framework/frontier/frontier.hxx b/include/gunrock/framework/frontier/frontier.hxx
index 93dc78ac..98a375bb 100644
--- a/include/gunrock/framework/frontier/frontier.hxx
+++ b/include/gunrock/framework/frontier/frontier.hxx
@@ -118,7 +118,7 @@ class frontier_t : public frontier::vector_frontier_t<vertex_t, edge_t, _kind> {
    * @return std::size_t
    */
   __host__ __device__ __forceinline__ std::size_t get_number_of_elements(
-      cuda::stream_t stream = 0) {
+      gcuda::stream_t stream = 0) {
     return underlying_view_t::get_number_of_elements(stream);
   }
 
diff --git a/include/gunrock/framework/frontier/vector_frontier.hxx b/include/gunrock/framework/frontier/vector_frontier.hxx
index de2f7153..bb6f0ae6 100644
--- a/include/gunrock/framework/frontier/vector_frontier.hxx
+++ b/include/gunrock/framework/frontier/vector_frontier.hxx
@@ -78,7 +78,7 @@ class vector_frontier_t {
    * @return std::size_t
    */
   __host__ __device__ __forceinline__ std::size_t get_number_of_elements(
-      cuda::stream_t stream = 0) const {
+      gcuda::stream_t stream = 0) const {
     return num_elements;
   }
 
@@ -173,7 +173,7 @@ class vector_frontier_t {
    * @param value
    * @param stream
    */
-  void fill(type_t const value, cuda::stream_t stream = 0) {
+  void fill(type_t const value, gcuda::stream_t stream = 0) {
     thrust::fill(thrust::cuda::par.on(stream), this->begin(), this->end(),
                  value);
   }
@@ -190,7 +190,7 @@ class vector_frontier_t {
    */
   void sequence(type_t const initial_value,
                 std::size_t const& size,
-                cuda::stream_t stream = 0) {
+                gcuda::stream_t stream = 0) {
     // Resize if needed.
     if (this->get_capacity() < size)
       this->reserve(size);
@@ -232,10 +232,10 @@ class vector_frontier_t {
    * @brief Parallel sort the frontier.
    *
    * @param order see sort::order_t
-   * @param stream see cuda::stream
+   * @param stream see gcuda::stream
    */
   void sort(sort::order_t order = sort::order_t::ascending,
-            cuda::stream_t stream = 0) {
+            gcuda::stream_t stream = 0) {
     sort::radix::sort_keys(p_storage.get()->data().get(),
                            this->get_number_of_elements(), order, stream);
   }
diff --git a/include/gunrock/framework/operators/advance/advance.hxx b/include/gunrock/framework/operators/advance/advance.hxx
index daab838f..43ee0611 100644
--- a/include/gunrock/framework/operators/advance/advance.hxx
+++ b/include/gunrock/framework/operators/advance/advance.hxx
@@ -98,7 +98,7 @@ void execute(graph_t& G,
              frontier_t* input,
              frontier_t* output,
              work_tiles_t& segments,
-             cuda::multi_context_t& context) {
+             gcuda::multi_context_t& context) {
   if (context.size() == 1) {
     auto context0 = context.get_context(0);
 
@@ -193,7 +193,7 @@ template <load_balance_t lb = load_balance_t::merge_path,
 void execute(graph_t& G,
              enactor_type* E,
              operator_type op,
-             cuda::multi_context_t& context,
+             gcuda::multi_context_t& context,
              bool swap_buffers = true) {
   execute<lb, direction, input_type, output_type>(
       G,                         // graph
diff --git a/include/gunrock/framework/operators/advance/block_mapped.hxx b/include/gunrock/framework/operators/advance/block_mapped.hxx
index 80d290a1..72add815 100644
--- a/include/gunrock/framework/operators/advance/block_mapped.hxx
+++ b/include/gunrock/framework/operators/advance/block_mapped.hxx
@@ -51,8 +51,8 @@ __global__ void __launch_bounds__(THREADS_PER_BLOCK, 2)
   // Specialize Block Scan for 1D block of THREADS_PER_BLOCK.
   using block_scan_t = cub::BlockScan<edge_t, THREADS_PER_BLOCK>;
 
-  auto global_idx = cuda::thread::global::id::x();
-  auto local_idx = cuda::thread::local::id::x();
+  auto global_idx = gcuda::thread::global::id::x();
+  auto local_idx = gcuda::thread::local::id::x();
 
   thrust::counting_iterator<type_t> all_vertices(0);
   __shared__ typename block_scan_t::TempStorage scan;
@@ -102,7 +102,7 @@ __global__ void __launch_bounds__(THREADS_PER_BLOCK, 2)
     __syncthreads();
   }
 
-  auto length = global_idx - local_idx + cuda::block::size::x();
+  auto length = global_idx - local_idx + gcuda::block::size::x();
 
   if (input_size < length)
     length = input_size;
@@ -115,7 +115,7 @@ __global__ void __launch_bounds__(THREADS_PER_BLOCK, 2)
   /// resultant neighbor or invalid vertex is written to the output frontier.
   for (edge_t i = local_idx;            // threadIdx.x
        i < aggregate_degree_per_block;  // total degree to process
-       i += cuda::block::size::x()      // increment by blockDim.x
+       i += gcuda::block::size::x()     // increment by blockDim.x
   ) {
     // Binary search to find which vertex id to work on.
     int id = search::binary::rightmost(degrees, i, length);
@@ -156,7 +156,7 @@ void execute(graph_t& G,
              operator_t op,
              frontier_t& input,
              frontier_t& output,
-             cuda::standard_context_t& context) {
+             gcuda::standard_context_t& context) {
   if constexpr (output_type != advance_io_type_t::none) {
     auto size_of_output = compute_output_length(G, input, context);
 
@@ -178,7 +178,7 @@ void execute(graph_t& G,
                                  : input.get_number_of_elements();
 
   // Set-up and launch block-mapped advance.
-  using namespace cuda::launch_box;
+  using namespace gcuda::launch_box;
   using launch_t =
       launch_box_t<launch_params_dynamic_grid_t<fallback, dim3_t<256>>>;
 
diff --git a/include/gunrock/framework/operators/advance/bucketing.hxx b/include/gunrock/framework/operators/advance/bucketing.hxx
index 87710b38..928f5acc 100644
--- a/include/gunrock/framework/operators/advance/bucketing.hxx
+++ b/include/gunrock/framework/operators/advance/bucketing.hxx
@@ -33,7 +33,7 @@ void execute(graph_t& G,
              frontier_t* input,
              frontier_t* output,
              work_tiles_t& segments,
-             cuda::standard_context_t& context) {}
+             gcuda::standard_context_t& context) {}
 }  // namespace bucketing
 
 }  // namespace advance
diff --git a/include/gunrock/framework/operators/advance/helpers.hxx b/include/gunrock/framework/operators/advance/helpers.hxx
index 92b19f2e..b1c5b048 100644
--- a/include/gunrock/framework/operators/advance/helpers.hxx
+++ b/include/gunrock/framework/operators/advance/helpers.hxx
@@ -39,7 +39,7 @@ template <typename graph_t, typename frontier_t, typename work_tiles_t>
 std::size_t compute_output_offsets(graph_t& G,
                                    frontier_t* input,
                                    work_tiles_t& segments,
-                                   cuda::standard_context_t& context,
+                                   gcuda::standard_context_t& context,
                                    bool graph_as_frontier = false) {
   using vertex_t = typename graph_t::vertex_type;
   using edge_t = typename graph_t::edge_type;
@@ -112,7 +112,7 @@ std::size_t compute_output_offsets(graph_t& G,
 template <typename graph_t, typename frontier_t>
 std::size_t compute_output_length(graph_t& G,
                                   frontier_t& input,
-                                  cuda::standard_context_t& context,
+                                  gcuda::standard_context_t& context,
                                   bool graph_as_frontier = false) {
   using vertex_t = typename graph_t::vertex_type;
   using edge_t = typename graph_t::edge_type;
diff --git a/include/gunrock/framework/operators/advance/merge_path.hxx b/include/gunrock/framework/operators/advance/merge_path.hxx
index 92966c91..f42ed895 100644
--- a/include/gunrock/framework/operators/advance/merge_path.hxx
+++ b/include/gunrock/framework/operators/advance/merge_path.hxx
@@ -37,7 +37,7 @@ void execute(graph_t& G,
              frontier_t* input,
              frontier_t* output,
              work_tiles_t& segments,
-             cuda::standard_context_t& context) {
+             gcuda::standard_context_t& context) {
   if constexpr (direction == advance_direction_t::optimized) {
     error::throw_if_exception(cudaErrorUnknown,
                               "Direction-optimized not yet implemented.");
diff --git a/include/gunrock/framework/operators/advance/merge_path_v2.hxx b/include/gunrock/framework/operators/advance/merge_path_v2.hxx
index 2258e4c6..6c2b6ea1 100644
--- a/include/gunrock/framework/operators/advance/merge_path_v2.hxx
+++ b/include/gunrock/framework/operators/advance/merge_path_v2.hxx
@@ -196,7 +196,7 @@ void execute(graph_t& G,
              frontier_t& input,
              frontier_t& output,
              work_tiles_t& segments,
-             cuda::standard_context_t& context) {
+             gcuda::standard_context_t& context) {
   auto size_of_output = compute_output_offsets(
       G, &input, segments, context,
       (input_type == advance_io_type_t::graph) ? true : false);
diff --git a/include/gunrock/framework/operators/advance/thread_mapped.hxx b/include/gunrock/framework/operators/advance/thread_mapped.hxx
index ff5f9883..6e2af4e4 100644
--- a/include/gunrock/framework/operators/advance/thread_mapped.hxx
+++ b/include/gunrock/framework/operators/advance/thread_mapped.hxx
@@ -34,7 +34,7 @@ void execute(graph_t& G,
              frontier_t& input,
              frontier_t& output,
              work_tiles_t& segments,
-             cuda::standard_context_t& context) {
+             gcuda::standard_context_t& context) {
   using type_t = typename frontier_t::type_t;
 
   if (output_type != advance_io_type_t::none) {
@@ -87,7 +87,7 @@ void execute(graph_t& G,
                                  : input.get_number_of_elements();
 
   // Set-up and launch thread-mapped advance.
-  using namespace cuda::launch_box;
+  using namespace gcuda::launch_box;
   using launch_t =
       launch_box_t<launch_params_dynamic_grid_t<fallback, dim3_t<256>, 3>>;
 
diff --git a/include/gunrock/framework/operators/filter/bypass.hxx b/include/gunrock/framework/operators/filter/bypass.hxx
index d2c935df..81d28e95 100644
--- a/include/gunrock/framework/operators/filter/bypass.hxx
+++ b/include/gunrock/framework/operators/filter/bypass.hxx
@@ -12,7 +12,7 @@ void execute(graph_t& G,
              operator_t op,
              frontier_t* input,
              frontier_t* output,
-             cuda::standard_context_t& context) {
+             gcuda::standard_context_t& context) {
   using type_t = typename frontier_t::type_t;
 
   // ... resize as needed.
@@ -49,7 +49,7 @@ template <typename graph_t, typename operator_t, typename frontier_t>
 void execute(graph_t& G,
              operator_t op,
              frontier_t* input,
-             cuda::standard_context_t& context) {
+             gcuda::standard_context_t& context) {
   // in-place bypass filter (doesn't require an output frontier.)
   execute(G, op, input, input, context);
 }
diff --git a/include/gunrock/framework/operators/filter/compact.hxx b/include/gunrock/framework/operators/filter/compact.hxx
index e30672dd..6d4c0d92 100644
--- a/include/gunrock/framework/operators/filter/compact.hxx
+++ b/include/gunrock/framework/operators/filter/compact.hxx
@@ -13,7 +13,7 @@ void execute(graph_t& G,
              operator_t op,
              frontier_t* input,
              frontier_t* output,
-             cuda::standard_context_t& context) {
+             gcuda::standard_context_t& context) {
   using vertex_t = typename graph_t::vertex_type;
   using size_type = decltype(input->get_number_of_elements());
 
diff --git a/include/gunrock/framework/operators/filter/filter.hxx b/include/gunrock/framework/operators/filter/filter.hxx
index 08a1da9b..5d7e29be 100644
--- a/include/gunrock/framework/operators/filter/filter.hxx
+++ b/include/gunrock/framework/operators/filter/filter.hxx
@@ -53,7 +53,7 @@ namespace filter {
  * @param input Input frontier.
  * @param output Output frontier (some algorithms may not use this, and allow
  * for in-place filter operation).
- * @param context a `cuda::multi_context_t` that contains GPU contexts for the
+ * @param context a `gcuda::multi_context_t` that contains GPU contexts for the
  * available CUDA devices. Used to launch the filter kernels.
  */
 template <filter_algorithm_t alg_type,
@@ -64,7 +64,7 @@ void execute(graph_t& G,
              operator_t op,
              frontier_t* input,
              frontier_t* output,
-             cuda::multi_context_t& context) {
+             gcuda::multi_context_t& context) {
   if (context.size() == 1) {
     auto single_context = context.get_context(0);
 
@@ -122,7 +122,7 @@ void execute(graph_t& G,
  * @param G Input graph used.
  * @param op Predicate function, can be defined using a C++ lambda function.
  * @param E Enactor struct containing input and output frontiers.
- * @param context a `cuda::multi_context_t` that contains GPU contexts for the
+ * @param context a `gcuda::multi_context_t` that contains GPU contexts for the
  * available CUDA devices. Used to launch the filter kernels.
  */
 template <filter_algorithm_t alg_type,
@@ -132,7 +132,7 @@ template <filter_algorithm_t alg_type,
 void execute(graph_t& G,
              enactor_type* E,
              operator_t op,
-             cuda::multi_context_t& context,
+             gcuda::multi_context_t& context,
              bool swap_buffers = true) {
   execute<alg_type>(G,                         // graph
                     op,                        // operator_t
diff --git a/include/gunrock/framework/operators/filter/predicated.hxx b/include/gunrock/framework/operators/filter/predicated.hxx
index b341b9f6..c9e06378 100644
--- a/include/gunrock/framework/operators/filter/predicated.hxx
+++ b/include/gunrock/framework/operators/filter/predicated.hxx
@@ -13,7 +13,7 @@ void execute(graph_t& G,
              operator_t op,
              frontier_t* input,
              frontier_t* output,
-             cuda::standard_context_t& context) {
+             gcuda::standard_context_t& context) {
   using type_t = typename frontier_t::type_t;
 
   // Allocate output size if necessary.
diff --git a/include/gunrock/framework/operators/filter/remove.hxx b/include/gunrock/framework/operators/filter/remove.hxx
index d51de33d..bb9403dd 100644
--- a/include/gunrock/framework/operators/filter/remove.hxx
+++ b/include/gunrock/framework/operators/filter/remove.hxx
@@ -12,7 +12,7 @@ void execute(graph_t& G,
              operator_t op,
              frontier_t* input,
              frontier_t* output,
-             cuda::standard_context_t& context) {
+             gcuda::standard_context_t& context) {
   using type_t = typename frontier_t::type_t;
 
   // Allocate output size if necessary.
diff --git a/include/gunrock/framework/operators/for/for.hxx b/include/gunrock/framework/operators/for/for.hxx
index e973d44e..2c1cdbf3 100644
--- a/include/gunrock/framework/operators/for/for.hxx
+++ b/include/gunrock/framework/operators/for/for.hxx
@@ -20,14 +20,14 @@ namespace parallel_for {
  * @tparam frontier_t Frontier type.
  * @param f Frontiers to apply user-defined function to.
  * @param op User-defined function.
- * @param context Device context (@see cuda::multi_context_t).
+ * @param context Device context (@see gcuda::multi_context_t).
  * @return bool ignore the output, limitation of `__device__` lambda functions
  * require a template parameter to be named (see
  * https://github.com/neoblizz/enable_if_bug).
  */
 template <parallel_for_each_t type, typename func_t, typename frontier_t>
 std::enable_if_t<type == parallel_for_each_t::element>
-execute(frontier_t& f, func_t op, cuda::multi_context_t& context) {
+execute(frontier_t& f, func_t op, gcuda::multi_context_t& context) {
   static_assert(type == parallel_for_each_t::element);
   using type_t = typename frontier_t::type_t;
   auto single_context = context.get_context(0);
@@ -52,14 +52,14 @@ execute(frontier_t& f, func_t op, cuda::multi_context_t& context) {
  * @tparam graph_t Graph type.
  * @param G Graph to apply user-defined function to.
  * @param op User-defined function.
- * @param context Device context (@see cuda::multi_context_t).
+ * @param context Device context (@see gcuda::multi_context_t).
  * @return bool ignore the output, limitation of `__device__` lambda functions
  * require a template parameter to be named (see
  * https://github.com/neoblizz/enable_if_bug).
  */
 template <parallel_for_each_t type, typename func_t, typename graph_t>
 std::enable_if_t<type != parallel_for_each_t::element>
-execute(graph_t& G, func_t op, cuda::multi_context_t& context) {
+execute(graph_t& G, func_t op, gcuda::multi_context_t& context) {
   static_assert((type == parallel_for_each_t::weight) ||
                 (type == parallel_for_each_t::edge) ||
                 (type == parallel_for_each_t::vertex));
diff --git a/include/gunrock/framework/operators/neighborreduce/neighborreduce.hxx b/include/gunrock/framework/operators/neighborreduce/neighborreduce.hxx
index a30a755b..785b9fc4 100644
--- a/include/gunrock/framework/operators/neighborreduce/neighborreduce.hxx
+++ b/include/gunrock/framework/operators/neighborreduce/neighborreduce.hxx
@@ -50,7 +50,7 @@ namespace neighborreduce {
  * @param op user-defined lambda function.
  * @param arithmetic_op arithmetic operator (binary).
  * @param init_value initial value for the reduction.
- * @param context cuda context (@see cuda::multi_context_t).
+ * @param context cuda context (@see gcuda::multi_context_t).
  */
 template <advance_io_type_t input_t = advance_io_type_t::graph,
           typename graph_t,
@@ -64,7 +64,7 @@ void execute(graph_t& G,
              operator_t op,
              arithmetic_t arithmetic_op,
              output_t init_value,
-             cuda::multi_context_t& context) {
+             gcuda::multi_context_t& context) {
   if (context.size() == 1) {
     auto context0 = context.get_context(0);
 
diff --git a/include/gunrock/framework/operators/uniquify/unique.hxx b/include/gunrock/framework/operators/uniquify/unique.hxx
index 0bce6d0d..1ee79245 100644
--- a/include/gunrock/framework/operators/uniquify/unique.hxx
+++ b/include/gunrock/framework/operators/uniquify/unique.hxx
@@ -22,7 +22,7 @@ namespace unique {
 template <typename frontier_t>
 void execute(frontier_t* input,
              frontier_t* output,
-             cuda::standard_context_t& context) {
+             gcuda::standard_context_t& context) {
   auto new_end = thrust::unique(
       thrust::cuda::par.on(context.stream()),  // execution policy
       input->begin(),                          // input iterator: begin
diff --git a/include/gunrock/framework/operators/uniquify/unique_copy.hxx b/include/gunrock/framework/operators/uniquify/unique_copy.hxx
index 4a74ada6..a4103270 100644
--- a/include/gunrock/framework/operators/uniquify/unique_copy.hxx
+++ b/include/gunrock/framework/operators/uniquify/unique_copy.hxx
@@ -22,7 +22,7 @@ namespace unique_copy {
 template <typename frontier_t>
 void execute(frontier_t* input,
              frontier_t* output,
-             cuda::standard_context_t& context) {
+             gcuda::standard_context_t& context) {
   // Make sure output frontier has enough space.
   if (output->get_capacity() < input->get_number_of_elements())
     output->reserve(input->get_number_of_elements());
diff --git a/include/gunrock/framework/operators/uniquify/uniquify.hxx b/include/gunrock/framework/operators/uniquify/uniquify.hxx
index 61bfd4eb..80e9b1aa 100644
--- a/include/gunrock/framework/operators/uniquify/uniquify.hxx
+++ b/include/gunrock/framework/operators/uniquify/uniquify.hxx
@@ -15,7 +15,7 @@ namespace uniquify {
 template <uniquify_algorithm_t type, typename frontier_t>
 void execute(frontier_t* input,
              frontier_t* output,
-             cuda::multi_context_t& context,
+             gcuda::multi_context_t& context,
              bool best_effort_uniquification = false,
              const float uniquification_percent = 100) {
   if (context.size() == 1) {
@@ -44,7 +44,7 @@ void execute(frontier_t* input,
 template <uniquify_algorithm_t type = uniquify_algorithm_t::unique,
           typename enactor_type>
 void execute(enactor_type* E,
-             cuda::multi_context_t& context,
+             gcuda::multi_context_t& context,
              bool best_effort_uniquification = false,
              const float uniquification_percent = 100,
              bool swap_buffers = true) {
diff --git a/include/gunrock/framework/problem.hxx b/include/gunrock/framework/problem.hxx
index 468eb2b3..40fd2064 100644
--- a/include/gunrock/framework/problem.hxx
+++ b/include/gunrock/framework/problem.hxx
@@ -33,16 +33,16 @@ struct problem_t {
   using weight_t = typename graph_t::weight_type;
 
   graph_t graph_slice;
-  std::shared_ptr<cuda::multi_context_t> context;
+  std::shared_ptr<gcuda::multi_context_t> context;
 
   problem_t() : graph_slice(nullptr) {}
 
-  problem_t(graph_t& G, std::shared_ptr<cuda::multi_context_t> _context)
+  problem_t(graph_t& G, std::shared_ptr<gcuda::multi_context_t> _context)
       : graph_slice(G), context(_context) {}
 
   auto get_graph() { return graph_slice; }
   auto get_multi_context() { return context; }
-  auto get_single_context(cuda::device_id_t device = 0) {
+  auto get_single_context(gcuda::device_id_t device = 0) {
     return context->get_context(device);
   }
 
diff --git a/include/gunrock/graph/graph.hxx b/include/gunrock/graph/graph.hxx
index 2ea38124..7a39e55e 100644
--- a/include/gunrock/graph/graph.hxx
+++ b/include/gunrock/graph/graph.hxx
@@ -377,7 +377,7 @@ __host__ __device__ double get_degree_standard_deviation(graph_type const& G) {
 template <typename graph_type, typename histogram_t>
 void build_degree_histogram(graph_type const& G,
                             histogram_t* histogram,
-                            cuda::stream_t stream = 0) {
+                            gcuda::stream_t stream = 0) {
   using vertex_t = typename graph_type::vertex_type;
   auto length = sizeof(vertex_t) * 8 + 1;
 
diff --git a/include/gunrock/util/math.hxx b/include/gunrock/util/math.hxx
index e0e12d86..b3f15e50 100644
--- a/include/gunrock/util/math.hxx
+++ b/include/gunrock/util/math.hxx
@@ -89,7 +89,7 @@ __host__ __device__ __forceinline__ type_t add(type_t* address, type_t value) {
 template <typename type_t>
 __host__ __device__ __forceinline__ type_t min(type_t* address, type_t value) {
 #ifdef __CUDA_ARCH__
-  return cuda::atomicMin(address, value);
+  return gcuda::atomicMin(address, value);
 #else
   return std::min<type_t>(*address, value);  // use std::atomic;
 #endif
@@ -98,7 +98,7 @@ __host__ __device__ __forceinline__ type_t min(type_t* address, type_t value) {
 template <typename type_t>
 __host__ __device__ __forceinline__ type_t max(type_t* address, type_t value) {
 #ifdef __CUDA_ARCH__
-  return cuda::atomicMax(address, value);
+  return gcuda::atomicMax(address, value);
 #else
   return std::max<type_t>(*address, value);  // use std::atomic;
 #endif
@@ -118,8 +118,7 @@ __host__ __device__ __forceinline__ type_t cas(type_t* address,
 }
 
 template <typename type_t>
-__host__ __device__ __forceinline__ type_t exch(type_t* address,
-                                               type_t value) {
+__host__ __device__ __forceinline__ type_t exch(type_t* address, type_t value) {
 #ifdef __CUDA_ARCH__
   return atomicExch(address, value);
 #else
diff --git a/unittests/cuda/context.cuh b/unittests/cuda/context.cuh
index 87779289..a5025525 100644
--- a/unittests/cuda/context.cuh
+++ b/unittests/cuda/context.cuh
@@ -5,14 +5,14 @@ void test_context() {
   using namespace gunrock;
 
   // List of devices we care about
-  std::vector<cuda::device_id_t> devices;
+  std::vector<gcuda::device_id_t> devices;
 
   // Initialize
   devices.push_back(0);
   // devices.push_back(1);
 
   // Create contexts for all the devices
-  cuda::multi_context_t multi_context(devices);
+  gcuda::multi_context_t multi_context(devices);
 
   auto context_device_0 = multi_context.get_context(0);
   // auto context_device_1 = multi_context.get_context(0);
diff --git a/unittests/cuda/device_properties.cuh b/unittests/cuda/device_properties.cuh
index 218296e2..0f156a7b 100644
--- a/unittests/cuda/device_properties.cuh
+++ b/unittests/cuda/device_properties.cuh
@@ -2,21 +2,21 @@
 #include <string>
 #include <gunrock/cuda/device_properties.hxx>
 
-using namespace gunrock::cuda;
-using namespace gunrock::cuda::properties;
+using namespace gunrock::gcuda;
+using namespace gunrock::gcuda::properties;
 
 // Making sure the CUDA API enums are known at compile time
 compute_capability_t sm30 = make_compute_capability(30);
 size_t smem_size = sm_max_shared_memory_bytes<cudaFuncCachePreferEqual>(sm30);
 size_t smem_bank_stride =
-  shared_memory_bank_stride<cudaSharedMemBankSizeEightByte>();
+    shared_memory_bank_stride<cudaSharedMemBankSizeEightByte>();
 
 int main(int argc, char** argv) {
   using namespace std;
 
   int cc_ver = (argc > 1) ? stoi(argv[1]) : 30;
   compute_capability_t cc = make_compute_capability(cc_ver);
-  const char *arch = arch_name(cc);
+  const char* arch = arch_name(cc);
 
   cout << "Compute Capability Version Major: " << cc.major << endl;
   cout << "Compute Capability Version Minor: " << cc.minor << endl;
@@ -30,7 +30,8 @@ int main(int argc, char** argv) {
   cout << "sm_max_ctas:                " << sm_max_ctas(cc) << endl;
   cout << "sm_max_threads:             " << sm_max_threads(cc) << endl;
   cout << "sm_registers:               " << sm_registers(cc) << endl;
-  cout << "sm_max_shared_memory_bytes: " << sm_max_shared_memory_bytes(cc) << endl;
+  cout << "sm_max_shared_memory_bytes: " << sm_max_shared_memory_bytes(cc)
+       << endl;
   cout << "shared_memory_banks:        " << shared_memory_banks() << endl;
   cout << "shared_memory_bank_stride:  " << shared_memory_bank_stride() << endl;
 }
diff --git a/unittests/cuda/launch_box.cuh b/unittests/cuda/launch_box.cuh
index 0f611b6e..8ca7a1f4 100644
--- a/unittests/cuda/launch_box.cuh
+++ b/unittests/cuda/launch_box.cuh
@@ -3,7 +3,7 @@
 
 #include <gunrock/cuda/launch_box.hxx>
 
-using namespace gunrock::cuda::launch_box;
+using namespace gunrock::gcuda::launch_box;
 
 typedef launch_box_t<
     launch_params_t<sm_86 | sm_80, dim3_t<16, 2, 2>, dim3_t<64, 1, 4>, 2>,
diff --git a/unittests/framework/operators/for.cuh b/unittests/framework/operators/for.cuh
index 69779514..99db1011 100644
--- a/unittests/framework/operators/for.cuh
+++ b/unittests/framework/operators/for.cuh
@@ -29,8 +29,8 @@ TEST(operators, prallel_for) {
                                            gunrock::graph::view_t::csr>(csr);
 
   // Initialize the devicecontext.
-  gunrock::cuda::device_id_t device = 0;
-  gunrock::cuda::multi_context_t context(device);
+  gunrock::gcuda::device_id_t device = 0;
+  gunrock::gcuda::multi_context_t context(device);
 
   // Launch for using a separate function.
   gunrock::operators::parallel_for::execute<
diff --git a/unittests/graph/src_vertex.cuh b/unittests/graph/src_vertex.cuh
index ce7227a9..c34001ff 100644
--- a/unittests/graph/src_vertex.cuh
+++ b/unittests/graph/src_vertex.cuh
@@ -9,7 +9,7 @@ template <typename graph_t>
 void test_get_source_vertex(graph_t& G) {
   using edge_t = typename graph_t::edge_type;
   auto context =
-      std::shared_ptr<cuda::multi_context_t>(new cuda::multi_context_t(0));
+      std::shared_ptr<gcuda::multi_context_t>(new gcuda::multi_context_t(0));
 
   auto log_edge = [=] __device__(edge_t const& e) -> void {
     auto src = G.get_source_vertex(e);

From 9a959910a2d152bcec3d7494d6dfbcc602dfa773 Mon Sep 17 00:00:00 2001
From: Annie <annie@annielytical.com>
Date: Sun, 22 May 2022 23:07:35 -0700
Subject: [PATCH 08/58] pass file to nvbench (still has free error)

---
 benchmarks/CMakeLists.txt           |  2 +-
 benchmarks/bench.cu                 | 82 +++++++++++++++++------------
 include/gunrock/io/sample_large.hxx | 73 -------------------------
 3 files changed, 50 insertions(+), 107 deletions(-)
 delete mode 100644 include/gunrock/io/sample_large.hxx

diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 40f0a599..530b2216 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -8,7 +8,7 @@ foreach(SOURCE IN LISTS BENCHMARK_SOURCES)
   add_executable(${BENCHMARK_NAME} ${SOURCE})
   target_link_libraries(${BENCHMARK_NAME} 
     PRIVATE essentials
-    PRIVATE nvbench::main
+    PRIVATE nvbench::nvbench
   )
   get_target_property(ESSENTIALS_ARCHITECTURES 
     essentials CUDA_ARCHITECTURES
diff --git a/benchmarks/bench.cu b/benchmarks/bench.cu
index 660b7d97..b5e4a6ac 100644
--- a/benchmarks/bench.cu
+++ b/benchmarks/bench.cu
@@ -3,42 +3,45 @@
 #include <gunrock/formats/formats.hxx>
 #include <gunrock/cuda/cuda.hxx>
 #include <gunrock/framework/operators/for/for.hxx>
-#include <gunrock/io/sample_large.hxx>
 #include <nvbench/nvbench.cuh>
 #include <iostream>
 #include <gunrock/algorithms/algorithms.hxx>
 #include <gunrock/algorithms/mst.hxx>
 #include <gunrock/algorithms/bfs.hxx>
+#include <cxxopts.hpp>
 
-namespace gunrock {
-namespace benchmark {
+using namespace gunrock;
+using namespace memory;
 
-void mst_bench(nvbench::state& state) {
-  // Build a graph using a sample csr.
-  auto csr = io::sample_large::csr();
-  auto G =
-      graph::build::from_csr<memory_space_t::device, graph::view_t::csr>(csr);
+using csr_t = format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
+csr_t csr;
 
-  // Initialize the context.
-  cuda::device_id_t device = 0;
-  cuda::multi_context_t context(device);
+void mst_bench(nvbench::state& state) {
+  state.collect_dram_throughput();
+  state.collect_l1_hit_rates();
+  state.collect_l2_hit_rates();
+  state.collect_loads_efficiency();
+  state.collect_stores_efficiency();
 
-  // --
-  // Params and memory allocation
-  thrust::device_vector<weight_t> mst_weight(1);
+  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
+  thrust::device_vector<vertex_t> column_indices(csr.number_of_nonzeros);
+  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
 
   // --
-  // GPU Run
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    gunrock::mst::run(G, mst_weight.data().get());
-  });
-}
+  // Build graph + metadata
 
-void bfs_bench(nvbench::state& state) {
-  // Build a graph using a sample csr.
-  auto csr = io::sample_large::csr();
   auto G =
-      graph::build::from_csr<memory_space_t::device, graph::view_t::csr>(csr);
+      graph::build::from_csr<memory_space_t::device,
+                             graph::view_t::csr /* | graph::view_t::csc */>(
+          csr.number_of_rows,               // rows
+          csr.number_of_columns,            // columns
+          csr.number_of_nonzeros,           // nonzeros
+          csr.row_offsets.data().get(),     // row_offsets
+          csr.column_indices.data().get(),  // column_indices
+          csr.nonzero_values.data().get(),  // values
+          row_indices.data().get(),         // row_indices
+          column_offsets.data().get()       // column_offsets
+      );
 
   // Initialize the context.
   cuda::device_id_t device = 0;
@@ -46,21 +49,34 @@ void bfs_bench(nvbench::state& state) {
 
   // --
   // Params and memory allocation
-  vertex_t single_source = 0;
-  vertex_t n_vertices = G.get_number_of_vertices();
-  thrust::device_vector<vertex_t> distances(n_vertices);
-  thrust::device_vector<vertex_t> predecessors(n_vertices);
+  thrust::device_vector<weight_t> mst_weight(1);
 
   // --
   // GPU Run
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    gunrock::bfs::run(G, single_source, distances.data().get(),
-                      predecessors.data().get());
+    gunrock::mst::run(G, mst_weight.data().get());
   });
 }
 
-NVBENCH_BENCH(mst_bench);
-NVBENCH_BENCH(bfs_bench);
+int main(int argc, char** argv) {
+  std::string filename = argv[1];
+
+  if (util::is_market(filename)) {
+    io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
+    csr.from_coo(mm.load(filename));
+  } else if (util::is_binary_csr(filename)) {
+    csr.read_binary(filename);
+  } else {
+    std::cerr << "Unknown file format: " << filename << std::endl;
+    exit(1);
+  }
+
+  char* args[argc - 1];
+  args[0] = argv[0];
+  for (int i = 1; i < argc + 1; i++) {
+    args[i] = argv[i + 1];
+  }
 
-}  // namespace benchmark
-}  // namespace gunrock
+  NVBENCH_BENCH(mst_bench);
+  NVBENCH_MAIN_BODY(argc - 1, args);
+}
\ No newline at end of file
diff --git a/include/gunrock/io/sample_large.hxx b/include/gunrock/io/sample_large.hxx
deleted file mode 100644
index 2fda592c..00000000
--- a/include/gunrock/io/sample_large.hxx
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * @file sample.hxx
- * @author Muhammad Osama (mosama@ucdavis.edu)
- * @brief
- * @version 0.1
- * @date 2021-12-23
- *
- * @copyright Copyright (c) 2021
- *
- */
-#pragma once
-
-#include <gunrock/formats/formats.hxx>
-#include <gunrock/graph/graph.hxx>
-
-namespace gunrock {
-namespace io {
-namespace sample_large {
-
-using namespace memory;
-
-/**
- * @brief Returns a large sample CSR matrix of size 10000 x 10000,
- * filled with ones.
- *
- * @tparam space Memory space of the CSR matrix.
- * @tparam vertex_t Type of vertex.
- * @tparam edge_t Type of edge.
- * @tparam weight_t Type of weight.
- * @return format::csr_t<space, vertex_t, edge_t, weight_t> CSR matrix.
- */
-template <memory_space_t space = memory_space_t::device,
-          typename vertex_t = int,
-          typename edge_t = int,
-          typename weight_t = float>
-format::csr_t<space, vertex_t, edge_t, weight_t> csr() {
-  using csr_t = format::csr_t<memory_space_t::host, vertex_t, edge_t, weight_t>;
-
-  int dim = 10000;
-  csr_t matrix(dim, dim, dim * dim);
-
-  // Row Offsets
-  thrust::host_vector<int> rowSeq(dim + 1);
-  thrust::host_vector<int> multVect(dim + 1);
-  thrust::sequence(rowSeq.begin(), rowSeq.end());
-  thrust::fill(multVect.begin(), multVect.end(), dim);
-  thrust::transform(rowSeq.begin(), rowSeq.end(), multVect.begin(),
-                    matrix.row_offsets.begin(), thrust::multiplies<int>());
-
-  // Column Indices
-  thrust::host_vector<int> colSeq(dim * dim);
-  thrust::host_vector<int> modVect(dim * dim);
-  thrust::sequence(colSeq.begin(), colSeq.end());
-  thrust::fill(modVect.begin(), modVect.end(), dim);
-  thrust::transform(colSeq.begin(), colSeq.end(), modVect.begin(),
-                    matrix.column_indices.begin(), thrust::modulus<int>());
-
-  // Non-zero values
-  thrust::fill(matrix.nonzero_values.begin(), matrix.nonzero_values.end(), 1);
-
-  if (space == memory_space_t::host) {
-    return matrix;
-  } else {
-    using d_csr_t =
-        format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
-    d_csr_t d_matrix(matrix);
-    return d_matrix;
-  }
-}
-
-}  // namespace sample_large
-}  // namespace io
-}  // namespace gunrock
\ No newline at end of file

From 778517a0815398a199ad3ea87f82e8144f3158c5 Mon Sep 17 00:00:00 2001
From: Annie <annie@annielytical.com>
Date: Sun, 22 May 2022 23:19:26 -0700
Subject: [PATCH 09/58] minor tweaks

---
 benchmarks/CMakeLists.txt             |  2 +-
 benchmarks/{bench.cu => mst_bench.cu} | 25 +++++++++++++------------
 2 files changed, 14 insertions(+), 13 deletions(-)
 rename benchmarks/{bench.cu => mst_bench.cu} (97%)

diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 530b2216..a2e06fc9 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -1,6 +1,6 @@
 set(BENCHMARK_SOURCES
   for.cu
-  bench.cu
+  mst_bench.cu
 )
 
 foreach(SOURCE IN LISTS BENCHMARK_SOURCES)
diff --git a/benchmarks/bench.cu b/benchmarks/mst_bench.cu
similarity index 97%
rename from benchmarks/bench.cu
rename to benchmarks/mst_bench.cu
index b5e4a6ac..2c338f5c 100644
--- a/benchmarks/bench.cu
+++ b/benchmarks/mst_bench.cu
@@ -14,7 +14,7 @@ using namespace gunrock;
 using namespace memory;
 
 using csr_t = format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
-csr_t csr;
+std::string filename;
 
 void mst_bench(nvbench::state& state) {
   state.collect_dram_throughput();
@@ -23,6 +23,17 @@ void mst_bench(nvbench::state& state) {
   state.collect_loads_efficiency();
   state.collect_stores_efficiency();
 
+  csr_t csr;
+  if (util::is_market(filename)) {
+    io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
+    csr.from_coo(mm.load(filename));
+  } else if (util::is_binary_csr(filename)) {
+    csr.read_binary(filename);
+  } else {
+    std::cerr << "Unknown file format: " << filename << std::endl;
+    exit(1);
+  }
+
   thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
   thrust::device_vector<vertex_t> column_indices(csr.number_of_nonzeros);
   thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
@@ -59,17 +70,7 @@ void mst_bench(nvbench::state& state) {
 }
 
 int main(int argc, char** argv) {
-  std::string filename = argv[1];
-
-  if (util::is_market(filename)) {
-    io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
-    csr.from_coo(mm.load(filename));
-  } else if (util::is_binary_csr(filename)) {
-    csr.read_binary(filename);
-  } else {
-    std::cerr << "Unknown file format: " << filename << std::endl;
-    exit(1);
-  }
+  filename = argv[1];
 
   char* args[argc - 1];
   args[0] = argv[0];

From 540d1593a45d8a1bf8a541478f5deeb78a695d82 Mon Sep 17 00:00:00 2001
From: Annie <annie@annielytical.com>
Date: Mon, 23 May 2022 14:38:52 -0700
Subject: [PATCH 10/58] multiple benchmarks

---
 benchmarks/CMakeLists.txt |  1 +
 benchmarks/bfs_bench.cu   | 93 +++++++++++++++++++++++++++++++++++++++
 benchmarks/mst_bench.cu   | 19 +++++---
 3 files changed, 106 insertions(+), 7 deletions(-)
 create mode 100644 benchmarks/bfs_bench.cu

diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index a2e06fc9..5060e3be 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -1,6 +1,7 @@
 set(BENCHMARK_SOURCES
   for.cu
   mst_bench.cu
+  bfs_bench.cu
 )
 
 foreach(SOURCE IN LISTS BENCHMARK_SOURCES)
diff --git a/benchmarks/bfs_bench.cu b/benchmarks/bfs_bench.cu
new file mode 100644
index 00000000..42cff9ce
--- /dev/null
+++ b/benchmarks/bfs_bench.cu
@@ -0,0 +1,93 @@
+#include <gunrock/error.hxx>
+#include <gunrock/graph/graph.hxx>
+#include <gunrock/formats/formats.hxx>
+#include <gunrock/cuda/cuda.hxx>
+#include <gunrock/framework/operators/for/for.hxx>
+#include <nvbench/nvbench.cuh>
+#include <iostream>
+#include <gunrock/algorithms/algorithms.hxx>
+#include <gunrock/algorithms/mst.hxx>
+#include <gunrock/algorithms/bfs.hxx>
+#include <cxxopts.hpp>
+
+using namespace gunrock;
+using namespace memory;
+
+std::string filename;  // Global
+
+void bfs_bench(nvbench::state& state) {
+  // Add metrics.
+  state.collect_dram_throughput();
+  state.collect_l1_hit_rates();
+  state.collect_l2_hit_rates();
+  state.collect_loads_efficiency();
+  state.collect_stores_efficiency();
+
+  // --
+  // Define types
+  using csr_t =
+      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
+
+  // --
+  // IO
+  csr_t csr;
+
+  if (util::is_market(filename)) {
+    io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
+    csr.from_coo(mm.load(filename));
+  } else if (util::is_binary_csr(filename)) {
+    csr.read_binary(filename);
+  } else {
+    std::cerr << "Unknown file format: " << filename << std::endl;
+    exit(1);
+  }
+
+  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
+  thrust::device_vector<vertex_t> column_indices(csr.number_of_nonzeros);
+  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
+
+  // --
+  // Build graph + metadata
+  auto G =
+      graph::build::from_csr<memory_space_t::device,
+                             graph::view_t::csr /* | graph::view_t::csc */>(
+          csr.number_of_rows,               // rows
+          csr.number_of_columns,            // columns
+          csr.number_of_nonzeros,           // nonzeros
+          csr.row_offsets.data().get(),     // row_offsets
+          csr.column_indices.data().get(),  // column_indices
+          csr.nonzero_values.data().get(),  // values
+          row_indices.data().get(),         // row_indices
+          column_offsets.data().get()       // column_offsets
+      );
+
+  // --
+  // Params and memory allocation
+
+  vertex_t single_source = 0;
+
+  vertex_t n_vertices = G.get_number_of_vertices();
+  thrust::device_vector<vertex_t> distances(n_vertices);
+  thrust::device_vector<vertex_t> predecessors(n_vertices);
+
+  // --
+  // Run BFS with NVBench
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    gunrock::bfs::run(G, single_source, distances.data().get(),
+                      predecessors.data().get());
+  });
+}
+
+int main(int argc, char** argv) {
+  filename = argv[1];
+
+  // Create a new argument array without filename to pass to NVBench.
+  char* args[argc - 1];
+  args[0] = argv[0];
+  for (int i = 1; i < argc; i++) {
+    args[i] = argv[i + 1];
+  }
+
+  NVBENCH_BENCH(bfs_bench);
+  NVBENCH_MAIN_BODY(argc - 1, args);
+}
\ No newline at end of file
diff --git a/benchmarks/mst_bench.cu b/benchmarks/mst_bench.cu
index 2c338f5c..5dab267e 100644
--- a/benchmarks/mst_bench.cu
+++ b/benchmarks/mst_bench.cu
@@ -13,16 +13,23 @@
 using namespace gunrock;
 using namespace memory;
 
-using csr_t = format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
-std::string filename;
+std::string filename;  // Global
 
 void mst_bench(nvbench::state& state) {
+  // Add metrics.
   state.collect_dram_throughput();
   state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
   state.collect_loads_efficiency();
   state.collect_stores_efficiency();
 
+  // --
+  // Define types
+  using csr_t =
+      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
+
+  // --
+  // Build graph + metadata
   csr_t csr;
   if (util::is_market(filename)) {
     io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
@@ -38,9 +45,6 @@ void mst_bench(nvbench::state& state) {
   thrust::device_vector<vertex_t> column_indices(csr.number_of_nonzeros);
   thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
 
-  // --
-  // Build graph + metadata
-
   auto G =
       graph::build::from_csr<memory_space_t::device,
                              graph::view_t::csr /* | graph::view_t::csc */>(
@@ -63,7 +67,7 @@ void mst_bench(nvbench::state& state) {
   thrust::device_vector<weight_t> mst_weight(1);
 
   // --
-  // GPU Run
+  // Run MST with NVBench
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
     gunrock::mst::run(G, mst_weight.data().get());
   });
@@ -72,9 +76,10 @@ void mst_bench(nvbench::state& state) {
 int main(int argc, char** argv) {
   filename = argv[1];
 
+  // Create a new argument array without filename to pass to NVBench.
   char* args[argc - 1];
   args[0] = argv[0];
-  for (int i = 1; i < argc + 1; i++) {
+  for (int i = 1; i < argc; i++) {
     args[i] = argv[i + 1];
   }
 

From 9002360293b548b6c7c847c6eef679dee39118a8 Mon Sep 17 00:00:00 2001
From: Jonathan Wapman <jdwapman@ucdavis.edu>
Date: Fri, 27 May 2022 19:37:58 -0700
Subject: [PATCH 11/58] Support reading smtx files

---
 ...0.attn.proj_swin_tiny_unstructured_50.smtx |  19 ++
 include/gunrock/algorithms/algorithms.hxx     |   1 +
 .../gunrock/algorithms/generate/random.hxx    |  18 ++
 include/gunrock/io/smtx.hxx                   | 175 ++++++++++++++++++
 unittests/io/smtx.cuh                         |  34 ++++
 unittests/unittests.hxx                       |   1 +
 6 files changed, 248 insertions(+)
 create mode 100644 datasets/layers.0.blocks.0.attn.proj_swin_tiny_unstructured_50.smtx
 create mode 100644 include/gunrock/io/smtx.hxx
 create mode 100644 unittests/io/smtx.cuh

diff --git a/datasets/layers.0.blocks.0.attn.proj_swin_tiny_unstructured_50.smtx b/datasets/layers.0.blocks.0.attn.proj_swin_tiny_unstructured_50.smtx
new file mode 100644
index 00000000..09a90c9f
--- /dev/null
+++ b/datasets/layers.0.blocks.0.attn.proj_swin_tiny_unstructured_50.smtx
@@ -0,0 +1,19 @@
+% Sparse matrix file format .smtx
+% ------------------------------------------------------------------------------
+% model:          swin_tiny
+% layer:          layers.0.blocks.0.attn.proj
+% model sparsity: 0.5
+% 
+% Format Description: 
+% This format is similar to the CSR format, storing the row offsets and column 
+% indices of the non-zeros, but does not store the actual non-zero values. Note 
+% that there are no non-zero values stored, just the sparse structure. 
+% 
+% The line-by-line representation is as follows: 
+%     {M} {K} {NNZ} 
+%     row offsets 
+%     column indices 
+% ------------------------------------------------------------------------------
+96 96 4608
+0 62 88 134 190 244 292 333 394 452 495 549 600 656 692 751 786 842 873 920 966 1012 1054 1108 1167 1223 1250 1307 1357 1394 1458 1502 1562 1592 1663 1719 1754 1804 1859 1911 1962 2021 2068 2099 2133 2184 2234 2289 2316 2377 2430 2462 2510 2566 2604 2663 2712 2762 2809 2858 2911 2959 3000 3052 3095 3141 3181 3242 3283 3334 3392 3458 3521 3561 3610 3654 3700 3752 3780 3820 3841 3893 3940 3997 4036 4086 4124 4176 4222 4281 4339 4389 4416 4467 4512 4566 4608
+0 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 23 24 26 27 28 29 30 31 32 34 35 42 45 48 49 50 53 62 64 65 67 68 69 72 74 76 77 78 79 80 83 84 85 87 88 90 91 92 93 94 95 0 1 2 3 6 7 10 13 16 22 23 24 29 42 50 51 62 64 70 71 76 86 87 91 92 94 0 4 8 10 11 12 13 17 18 20 23 24 25 26 27 28 29 30 31 42 51 52 55 64 66 67 69 70 71 72 73 75 76 77 79 82 83 84 86 87 88 90 91 92 94 95 0 2 3 4 5 6 7 8 10 12 13 15 17 18 19 20 22 23 24 25 26 28 31 34 37 40 41 47 50 51 55 58 62 64 65 66 67 68 69 70 71 72 77 79 80 81 82 83 84 85 86 88 89 91 92 93 0 2 3 7 8 9 10 12 13 15 16 19 20 21 23 26 27 29 30 39 44 45 51 53 57 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 80 81 82 85 86 87 88 89 91 92 93 94 95 0 1 2 6 7 10 11 12 16 19 22 23 24 25 27 28 29 30 33 36 37 42 43 47 49 50 51 52 57 62 63 65 66 69 71 73 74 75 77 80 83 84 87 89 90 91 93 95 0 2 3 4 6 7 8 13 14 18 20 23 25 26 28 29 44 45 46 47 50 51 52 54 56 60 62 69 70 71 72 74 76 79 82 83 84 85 88 93 94 1 2 3 6 7 8 10 11 12 13 14 15 16 17 18 19 20 22 23 24 26 27 28 30 34 36 37 39 42 45 47 48 49 50 51 53 56 60 63 65 67 68 69 71 72 73 74 75 78 79 80 81 82 86 87 89 90 91 92 94 95 1 4 5 6 7 8 10 11 12 13 15 19 20 21 22 23 24 25 26 28 29 31 32 33 34 36 39 40 42 47 50 51 54 56 57 59 62 64 66 69 70 71 72 73 75 79 81 82 83 84 86 88 89 90 91 92 93 94 0 3 4 5 6 8 9 10 11 17 18 19 20 23 24 25 26 27 29 30 48 49 50 51 62 64 66 69 70 72 73 74 77 78 79 80 82 83 86 87 89 93 95 2 3 4 5 6 7 8 9 10 11 12 14 16 18 20 21 22 23 25 26 27 28 29 30 42 45 48 49 59 62 64 67 68 69 70 71 73 74 76 77 78 79 80 81 83 84 85 88 89 90 91 92 94 95 1 2 3 4 6 8 10 11 13 14 15 16 17 18 19 22 23 26 27 28 29 30 31 35 42 46 49 50 52 54 59 60 66 67 68 70 71 74 75 76 79 82 83 84 85 86 89 91 92 93 95 0 1 3 4 6 7 8 13 14 15 16 17 18 19 22 23 25 26 27 28 29 31 34 36 39 40 41 42 47 49 50 51 55 57 59 60 62 66 68 69 70 72 74 76 77 79 80 81 83 84 85 89 90 92 93 95 1 2 4 6 13 14 15 16 17 18 19 27 30 31 32 42 50 51 64 66 67 69 70 73 75 76 77 82 83 86 87 89 90 92 93 94 0 1 3 4 5 
6 7 8 9 10 11 12 14 16 17 20 21 22 23 25 29 31 33 37 41 43 45 48 49 50 51 54 58 60 62 64 65 66 67 68 69 70 71 73 75 76 78 81 82 83 84 85 86 87 89 90 91 94 95 8 9 17 18 19 22 24 25 27 29 30 42 51 62 64 66 67 68 69 70 71 76 77 80 81 82 83 84 85 87 88 91 92 94 95 0 1 2 3 4 5 6 7 11 13 14 16 17 18 19 20 21 22 24 25 27 28 29 30 31 33 36 38 40 42 48 50 51 57 59 62 65 66 67 68 69 70 71 72 74 77 79 80 81 82 83 85 89 90 94 95 4 5 10 13 18 19 22 26 27 42 49 50 51 57 67 69 70 71 74 78 81 82 84 85 86 88 91 92 93 94 95 0 1 2 4 5 9 11 12 15 17 18 19 21 22 23 24 25 27 32 39 40 42 47 48 51 55 64 65 67 69 70 71 72 73 75 76 77 79 82 85 86 87 89 91 92 93 95 0 3 4 5 6 8 11 12 13 14 16 17 19 20 23 24 26 29 31 36 37 40 42 45 46 53 58 59 60 63 65 66 68 69 74 76 78 79 80 81 85 86 89 90 94 95 3 5 6 7 8 11 12 13 14 16 17 19 20 22 23 24 25 26 28 35 40 50 51 59 60 62 66 67 68 70 71 73 74 75 76 78 79 80 81 82 83 87 89 90 91 93 1 3 4 5 7 10 11 18 19 24 25 29 31 35 36 37 39 40 45 47 48 49 50 51 59 62 64 66 67 69 71 72 73 74 76 77 83 86 87 88 89 90 0 1 2 3 4 5 6 10 12 13 14 15 16 17 18 19 21 25 26 27 30 31 33 35 39 40 46 47 48 49 50 51 52 53 55 59 63 65 66 68 70 71 72 75 76 79 80 82 87 88 89 91 93 95 0 1 2 3 4 5 6 7 8 11 12 13 14 15 16 17 20 21 22 23 24 25 26 27 28 29 30 31 33 44 46 47 49 50 52 54 55 57 62 66 67 69 72 73 74 76 77 78 81 82 83 84 86 88 89 90 91 92 95 0 1 2 3 5 6 7 8 9 11 12 13 15 16 17 20 21 23 24 26 30 38 40 41 42 49 52 55 56 57 61 64 65 66 67 68 69 70 71 72 73 74 75 76 78 79 82 83 84 86 87 88 92 93 94 95 1 3 12 13 18 19 21 22 24 28 42 51 62 64 65 71 73 75 79 80 81 83 85 87 88 91 92 0 2 3 4 5 7 8 9 10 11 12 13 14 15 16 17 19 21 22 24 25 26 28 29 32 35 37 39 41 42 44 47 49 50 52 54 58 64 66 67 68 71 73 77 78 79 80 81 82 83 84 86 87 90 91 92 93 0 1 2 3 4 5 11 12 13 14 15 16 17 18 20 23 24 25 26 27 28 29 30 33 38 43 48 51 52 53 54 57 59 64 65 66 67 69 72 73 75 76 79 82 83 84 85 89 92 94 0 1 2 3 4 6 8 9 11 12 13 17 18 22 25 26 27 28 31 42 45 50 51 63 66 72 74 75 76 77 81 83 85 86 87 
94 95 0 3 4 5 7 9 10 11 12 13 14 15 16 18 19 20 21 22 23 24 26 29 30 33 34 35 36 39 42 45 47 48 50 51 55 59 61 62 63 64 65 66 67 69 72 73 75 76 77 78 79 80 81 82 84 85 86 87 88 90 92 93 94 95 2 4 5 7 10 11 12 15 16 17 18 19 21 22 23 24 26 27 28 30 39 47 49 51 53 55 64 68 69 70 71 72 75 77 78 81 84 87 88 89 91 92 94 95 1 2 3 4 5 6 7 8 9 11 14 15 18 19 20 21 23 24 26 27 29 30 31 33 36 38 39 40 44 48 49 50 51 53 54 57 60 62 64 65 66 67 68 69 70 71 72 74 75 76 78 79 80 82 83 86 88 89 91 94 0 1 2 6 7 8 16 17 20 24 26 27 29 30 42 48 66 70 71 72 74 76 77 86 89 90 91 92 93 95 0 1 3 4 5 6 9 10 11 13 14 16 17 18 19 20 21 22 23 24 25 26 27 28 33 34 35 36 37 38 41 42 43 44 47 49 51 55 57 59 60 61 62 63 64 65 66 67 68 69 70 73 74 75 77 79 80 81 82 83 84 85 86 88 89 90 91 92 93 94 95 0 1 3 4 6 7 9 11 12 13 15 17 19 20 21 22 23 24 26 27 28 29 30 31 34 39 42 44 47 48 50 51 52 59 64 65 67 68 69 70 71 72 73 77 78 80 81 84 85 86 87 88 90 91 92 95 1 2 9 10 11 14 16 20 21 23 24 25 26 28 29 30 33 42 64 66 70 74 75 77 78 79 81 82 84 86 87 88 90 93 95 0 1 2 3 4 5 6 7 8 10 11 14 16 17 19 20 24 25 26 29 30 33 43 45 48 49 51 52 53 59 66 67 68 69 70 72 73 75 78 79 82 83 84 85 86 88 89 90 93 95 0 1 3 5 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 24 27 29 30 35 42 43 50 51 53 54 61 62 64 68 69 71 72 74 75 76 77 78 81 82 83 84 86 87 88 89 90 91 92 94 95 0 1 2 3 4 6 7 8 11 12 14 15 17 18 20 22 24 25 26 27 28 29 32 39 41 42 45 46 48 49 51 55 57 59 61 64 66 68 70 71 74 75 80 81 82 84 87 88 89 90 92 94 2 3 4 12 13 14 15 16 20 21 22 23 24 25 26 29 30 31 32 34 40 41 50 51 62 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 87 88 89 91 92 95 1 2 3 6 8 9 11 12 13 15 16 17 18 19 21 22 24 25 26 27 28 29 30 31 32 35 36 38 40 41 49 50 54 56 57 58 62 67 68 70 71 72 73 74 75 76 78 79 80 82 83 84 87 88 89 90 93 94 95 0 1 3 4 5 6 8 10 12 13 17 18 19 21 22 26 27 29 33 34 36 42 43 45 48 51 55 57 58 64 65 68 69 72 73 76 77 79 82 84 86 87 88 89 91 93 94 1 2 5 7 9 13 14 16 18 20 24 27 30 40 45 48 49 50 51 
63 64 67 69 79 84 85 87 88 92 94 95 3 18 19 21 22 23 27 30 40 41 42 49 51 52 57 62 66 68 69 71 72 73 74 75 76 78 81 83 84 86 87 88 89 94 0 1 2 3 4 5 6 9 13 14 17 18 20 22 23 24 25 27 29 30 31 37 42 50 55 57 62 65 67 68 70 71 73 75 76 77 78 79 80 81 83 84 85 86 87 88 89 90 91 92 95 0 1 3 4 5 7 9 10 15 16 17 18 19 20 21 22 23 24 26 30 31 33 35 42 47 49 50 51 56 64 65 66 73 74 75 76 77 78 79 81 82 83 84 85 87 88 89 90 92 95 1 2 3 4 5 7 8 10 11 12 13 15 17 18 19 22 23 25 26 27 28 29 31 33 47 48 50 51 56 61 64 65 68 69 70 71 72 74 75 76 77 78 80 81 82 84 85 86 87 88 89 91 92 93 95 0 1 2 7 8 10 11 13 19 20 22 28 36 42 49 64 68 69 74 75 77 80 81 86 88 90 95 0 1 2 3 4 7 9 10 11 12 13 14 16 18 19 20 21 22 23 24 25 26 29 30 32 35 38 39 41 42 46 47 48 51 56 57 59 64 65 66 67 70 71 72 73 74 75 76 77 78 79 82 83 84 86 87 88 90 91 92 93 0 1 2 3 4 6 7 11 13 14 16 19 21 24 26 27 28 29 30 34 40 41 42 45 48 51 52 53 55 56 57 58 59 62 63 65 66 67 68 71 72 76 78 79 80 81 87 88 89 91 93 94 95 1 4 5 6 7 8 9 10 11 19 27 28 29 38 42 46 50 51 58 65 66 67 69 72 74 76 78 81 83 85 92 94 0 1 3 4 5 6 7 8 10 12 16 17 18 19 20 21 22 23 24 25 26 28 31 36 42 49 51 55 59 62 64 66 69 70 72 74 75 77 78 79 80 82 83 85 90 91 92 95 0 1 2 4 5 6 7 8 11 12 15 16 17 19 21 22 23 24 25 26 27 30 31 33 34 40 41 45 47 48 50 51 52 57 62 64 67 69 70 71 72 73 74 75 76 77 79 80 84 85 86 87 88 90 91 93 1 2 3 4 5 6 9 11 12 14 19 24 25 26 31 42 49 50 57 64 66 67 68 69 73 77 78 79 80 81 83 84 86 88 92 93 94 95 0 1 2 3 4 5 6 7 8 9 10 12 13 15 16 17 18 19 20 21 22 23 26 28 29 30 31 36 40 41 42 46 51 59 62 63 64 65 67 68 69 71 72 73 74 75 76 78 81 83 84 85 86 87 89 90 91 93 94 0 2 3 4 5 6 9 10 11 12 13 17 19 22 23 25 26 28 30 31 36 42 43 44 50 51 57 62 64 65 66 68 69 72 73 74 75 76 78 79 80 82 83 85 87 90 91 93 95 0 2 3 4 5 7 8 9 11 12 15 17 18 19 20 22 23 24 25 26 27 28 30 34 37 40 42 50 62 64 66 71 73 74 75 77 78 79 81 82 83 84 87 88 89 90 91 92 93 95 1 2 7 9 10 11 13 14 17 19 22 24 26 28 30 31 33 37 42 43 49 50 51 62 64 
65 66 68 69 72 73 77 78 80 82 83 84 85 86 87 88 89 90 91 92 93 95 0 1 2 4 6 7 8 9 11 12 13 14 16 21 23 24 25 26 27 28 29 30 31 33 39 40 42 48 49 50 51 63 66 67 70 72 73 75 76 78 79 82 84 85 87 91 92 93 95 0 2 3 4 5 8 10 11 12 14 15 17 18 19 20 22 23 24 25 26 27 28 29 30 31 42 45 49 55 62 64 65 66 67 68 69 72 73 74 75 76 77 78 79 80 81 83 86 88 91 92 93 94 0 5 8 9 10 11 12 13 14 15 16 17 18 19 23 24 28 29 30 31 43 49 51 52 56 62 64 66 67 69 70 71 72 73 74 75 76 77 78 80 81 83 86 87 91 92 94 95 0 1 2 3 4 5 9 11 12 13 17 18 20 21 24 26 27 28 29 31 42 49 50 51 62 64 73 74 75 77 78 80 81 82 83 85 87 90 91 92 95 0 1 2 4 5 8 9 10 11 12 13 15 16 19 20 21 23 24 25 26 27 28 29 30 31 32 42 45 50 51 55 62 63 66 67 69 70 75 76 77 79 82 83 84 85 86 87 90 91 92 93 94 4 6 7 8 12 14 15 16 17 18 20 21 23 24 26 27 28 29 30 32 46 49 50 55 57 59 65 66 67 70 71 73 74 75 76 77 82 83 84 88 89 90 95 0 1 2 7 9 12 16 17 18 20 22 23 24 26 27 28 29 30 36 37 42 48 49 54 55 56 64 65 67 68 70 71 72 73 76 78 80 83 85 86 88 89 91 93 94 95 1 5 7 9 10 11 12 13 15 16 20 28 29 34 36 39 42 49 50 51 59 60 62 63 66 71 72 73 75 76 80 81 82 83 84 85 86 87 90 95 1 2 3 5 6 8 9 10 11 12 14 15 16 18 19 22 23 25 26 27 29 30 31 32 36 38 42 46 48 49 50 51 52 53 56 63 64 65 66 67 68 69 70 72 73 75 76 77 78 80 81 83 84 85 86 88 89 90 91 92 95 3 4 5 11 13 14 17 20 25 27 29 33 37 41 42 45 49 50 51 56 58 60 62 64 65 66 67 70 71 73 74 77 78 79 84 85 87 90 91 92 93 1 2 3 5 6 7 11 12 13 14 16 18 19 20 21 22 23 26 27 28 30 36 38 42 47 49 50 51 58 62 64 65 69 70 71 72 73 76 77 79 80 82 86 87 88 89 90 91 92 93 94 0 2 4 5 6 7 8 11 12 14 16 18 19 20 22 23 24 26 28 29 30 31 33 36 38 41 42 43 44 48 49 51 52 53 57 59 61 63 66 67 69 70 71 72 73 75 76 82 83 85 86 87 88 89 91 92 94 95 0 1 2 3 4 6 9 10 12 14 15 16 17 18 19 20 21 22 23 24 25 26 28 29 31 32 36 37 39 40 42 43 45 51 52 53 55 56 57 58 62 63 64 66 67 68 70 71 72 74 75 77 78 79 80 82 83 84 86 87 88 91 92 93 94 95 0 1 2 3 4 6 8 9 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 
26 28 29 30 31 32 33 37 38 42 44 45 47 49 51 52 54 61 63 65 67 68 69 71 73 74 79 80 81 82 83 84 85 86 87 88 90 91 93 95 0 2 3 5 8 11 13 14 15 17 18 22 23 24 27 28 30 31 36 41 46 48 49 50 51 53 57 58 62 63 67 68 73 74 75 78 79 82 88 95 0 2 4 5 8 9 10 11 12 13 17 19 20 21 22 23 25 26 27 30 31 40 41 43 49 50 51 53 64 65 66 67 68 70 72 73 74 75 77 78 79 84 87 88 89 92 93 94 95 0 2 3 4 5 11 12 13 14 15 16 17 19 20 23 25 28 29 30 36 38 47 49 50 52 56 61 64 65 67 68 69 71 72 73 74 76 78 82 85 89 90 94 95 3 4 5 6 7 9 11 12 13 15 16 17 18 19 21 24 26 28 29 30 31 36 42 45 49 50 55 59 60 64 66 67 73 75 76 77 80 81 83 86 87 90 91 92 93 94 0 2 3 4 5 6 7 8 11 12 16 17 20 21 23 28 29 30 31 32 41 42 47 55 62 64 65 67 68 69 71 73 74 75 76 77 78 80 81 82 83 84 85 86 87 88 89 90 91 92 93 95 11 13 17 18 19 25 26 28 35 37 40 51 60 62 69 71 73 74 75 78 80 81 83 88 90 91 92 93 1 2 7 8 9 14 16 18 22 23 24 26 27 28 29 30 31 36 45 48 51 52 61 64 66 67 69 70 71 75 77 82 83 85 86 87 89 90 92 93 2 5 8 10 13 14 24 27 31 42 50 52 55 66 67 70 73 79 82 87 89 0 1 2 3 4 5 6 12 13 14 16 17 19 20 22 25 26 29 33 37 38 41 42 43 45 46 47 48 49 50 51 59 62 65 66 68 69 70 74 75 77 78 79 81 82 84 85 86 90 91 94 95 0 1 3 5 6 7 8 12 13 14 15 16 17 19 20 26 27 28 29 30 33 37 39 40 41 42 45 48 49 50 56 57 59 68 69 71 72 78 79 82 83 84 86 87 90 91 93 1 2 4 5 6 7 8 9 10 11 12 13 14 17 20 21 22 23 24 25 26 27 28 29 30 31 35 38 40 44 50 51 54 61 62 64 66 67 70 71 73 75 76 78 82 83 84 85 87 88 89 90 91 92 93 94 95 1 2 3 4 6 8 9 11 12 13 15 17 19 24 25 26 27 36 44 48 50 57 67 68 72 74 75 77 78 81 83 84 85 87 90 91 92 93 95 1 2 5 8 9 11 12 13 15 17 18 19 21 22 23 26 27 30 36 43 45 47 48 50 51 52 57 62 63 64 66 67 71 73 74 75 77 78 79 80 81 82 83 84 85 86 88 91 93 95 5 7 10 11 12 13 14 16 17 18 26 27 30 35 37 42 47 48 50 52 57 59 63 64 69 71 72 74 75 76 77 78 81 82 85 89 90 95 0 5 6 7 8 9 10 11 12 13 14 15 16 17 19 20 21 23 25 26 27 28 29 30 31 34 44 49 50 51 58 65 66 67 68 69 70 71 72 75 76 80 81 84 85 86 87 89 90 92 
93 94 0 1 3 4 5 8 12 14 15 17 18 19 20 21 24 25 26 27 28 29 37 38 40 45 47 48 49 50 51 60 62 66 68 70 78 81 82 86 87 88 89 90 91 93 94 95 0 1 2 3 5 6 10 11 12 15 16 17 18 19 20 21 23 25 26 27 29 30 34 44 45 46 48 49 50 51 53 54 60 61 62 64 65 66 68 69 71 72 75 77 78 79 80 81 83 84 85 86 87 88 89 90 92 93 94 2 3 5 6 7 8 9 10 11 13 17 18 19 21 22 24 25 26 27 28 29 30 31 33 37 42 45 48 49 50 51 57 59 61 62 64 65 66 68 69 71 72 73 74 75 77 79 80 81 82 83 86 89 91 92 93 94 95 0 1 3 4 6 7 8 10 11 12 13 14 15 17 18 20 21 22 24 25 26 27 28 29 30 31 39 46 49 59 64 65 66 69 71 72 73 74 75 76 79 80 86 87 89 90 92 93 94 95 6 13 15 19 20 26 30 31 41 42 49 50 57 62 66 69 73 74 75 81 84 85 86 87 89 90 95 2 3 7 8 10 11 12 13 14 16 17 18 19 22 23 24 25 26 27 29 30 34 36 39 42 50 55 56 57 62 64 65 67 69 70 71 72 73 78 79 81 82 84 85 86 87 89 92 93 94 95 2 3 4 6 7 8 10 11 12 15 16 18 20 21 24 25 26 27 29 30 31 36 42 48 50 51 54 57 58 62 64 66 68 70 76 78 80 82 83 84 87 89 92 94 95 0 2 3 4 5 7 8 11 12 14 17 18 19 21 22 24 25 27 28 30 31 32 36 38 39 42 44 47 51 52 55 57 59 64 65 67 71 72 73 74 75 76 78 80 81 83 84 86 90 91 92 93 94 95 0 2 3 5 7 11 14 17 18 21 22 24 25 26 27 28 31 36 45 48 49 50 51 52 59 61 62 63 65 66 67 75 76 77 78 79 80 83 84 92 93 94
diff --git a/include/gunrock/algorithms/algorithms.hxx b/include/gunrock/algorithms/algorithms.hxx
index 972e08c8..2a66a211 100644
--- a/include/gunrock/algorithms/algorithms.hxx
+++ b/include/gunrock/algorithms/algorithms.hxx
@@ -30,6 +30,7 @@ namespace gunrock {}  // namespace gunrock
 
 // I/O includes
 #include <gunrock/io/matrix_market.hxx>
+#include <gunrock/io/smtx.hxx>
 #include <gunrock/io/sample.hxx>
 
 // Graph includes
diff --git a/include/gunrock/algorithms/generate/random.hxx b/include/gunrock/algorithms/generate/random.hxx
index a8efde2d..de3283f7 100644
--- a/include/gunrock/algorithms/generate/random.hxx
+++ b/include/gunrock/algorithms/generate/random.hxx
@@ -1,5 +1,7 @@
 #pragma once
 
+#include <numeric>
+#include <random>
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/random.h>
 
@@ -30,6 +32,22 @@ void uniform_distribution(vector_t& input,
                     input.begin(), generate_random);
 }
 
+/**
+ * @brief Get a random floating point value
+ * 
+ * @tparam rand_t type of value (default = float)
+ * @param begin low random value
+ * @param end high random value
+ * @return rand_t random value in the range [begin, end]
+ */
+template <typename rand_t = float>
+rand_t get_random(rand_t begin = 0.0f, rand_t end = 1.0f) {
+  std::random_device rd;
+  std::mt19937 gen(rd());
+  std::uniform_real_distribution<> dis(begin, end);
+  return (rand_t)dis(gen);
+}
+
 }  // namespace random
 }  // namespace generate
 }  // namespace gunrock
\ No newline at end of file
diff --git a/include/gunrock/io/smtx.hxx b/include/gunrock/io/smtx.hxx
new file mode 100644
index 00000000..6917dde0
--- /dev/null
+++ b/include/gunrock/io/smtx.hxx
@@ -0,0 +1,175 @@
+/**
+ * @file smtx.hxx
+ * @author Jonathan Wapman (jdwapman@ucdavis.edu)
+ * @brief
+ * @version 0.1
+ * @date 2022-05-27
+ *
+ * @copyright Copyright (c) 2022
+ *
+ */
+
+#pragma once
+
+#include <string>
+#include <limits>
+#include <filesystem>
+#include <fstream>
+#include <random>
+#include <sstream>
+#include <stdexcept>
+#include <vector>
+
+#include <gunrock/io/detail/mmio.hxx>
+
+#include <gunrock/util/filepath.hxx>
+#include <gunrock/formats/formats.hxx>
+#include <gunrock/memory.hxx>
+#include <gunrock/error.hxx>
+#include <gunrock/algorithms/generate/random.hxx>
+
+namespace gunrock {
+namespace io {
+
+using namespace memory;
+
+std::string leading_trim(std::string s) {
+    size_t start = s.find_first_not_of(" ");
+    return (start == std::string::npos) ? "" : s.substr(start);
+}
+
+/**
+ * @brief Reads a smtx graph from an input-stream
+ * into a specified sparse format
+ *
+ * Here is an example of the smtx format
+ * +----------------------------------------------+
+ * |% Sparse matrix file format .smtx             | <--- header line
+ * |%                                             | <--+
+ * |% comments                                    |    |-- 0 or more comments
+ * |%                                             | <--+
+ * |  M K NNZ                                     | <--- rows, columns, entries
+ * |  row_offsets                                 | <--+
+ * |  column_indices                              | <--+-- 2 lines
+ * |                                              |
+ * +----------------------------------------------+
+ *
+ */
+template <typename vertex_t, typename edge_t, typename weight_t>
+struct smtx_t {
+  std::string filename;
+  std::string dataset;
+
+  smtx_t() {}
+  ~smtx_t() {}
+
+  /**
+   * @brief Loads the given .smtx file into a csr format.
+   *
+   * @param _filename input file name (.smtx)
+   * @return csr sparse format
+   */
+  auto load(std::string _filename, bool first_line_csv = false) {
+    filename = _filename;
+    dataset = util::extract_dataset(util::extract_filename(filename));
+
+    std::ifstream smtx_file(filename);
+    unsigned int row_ptrs_buf;
+    vertex_t col_idxs_buf;
+
+    // smtx is written in CSR format
+    format::csr_t<memory_space_t::host, vertex_t, edge_t, weight_t> csr(
+        (vertex_t)0, (vertex_t)0, (edge_t)0);
+    csr.row_offsets.resize(0);
+    csr.column_indices.resize(0);
+    csr.nonzero_values.resize(0);
+
+    if (smtx_file.is_open()) {
+      std::size_t num_rows, num_columns, num_nonzeros;
+
+      std::string line;  // Buffer for storing file lines
+
+      for (int line_num = 0; line_num < 3; line_num++) {
+        // Skip over comment lines
+        do {
+          std::getline(smtx_file, line);
+        } while (line[0] == '%');
+
+        std::istringstream line_stream(line);
+
+        if (line_num == 0) {  // First Line has dimensions and nnz
+          if (first_line_csv) {
+            std::string buf;
+            std::getline(line_stream, buf, ',');
+            leading_trim(buf);
+            num_rows = std::stoi(buf);
+            std::getline(line_stream, buf, ',');
+            leading_trim(buf);
+            num_columns = std::stoi(buf);
+            std::getline(line_stream, buf, ',');
+            leading_trim(buf);
+            num_nonzeros = std::stoi(buf);
+          } else {
+            line_stream >> num_rows;
+            line_stream >> num_columns;
+            line_stream >> num_nonzeros;
+          }
+
+          error::throw_if_exception(
+              num_rows >= std::numeric_limits<vertex_t>::max() ||
+                  num_columns >= std::numeric_limits<vertex_t>::max(),
+              "vertex_t overflow");
+          error::throw_if_exception(
+              num_nonzeros >= std::numeric_limits<edge_t>::max(),
+              "edge_t overflow");
+
+          csr.number_of_rows = num_rows;
+          csr.number_of_columns = num_columns;
+          csr.number_of_nonzeros = num_nonzeros;
+
+          csr.row_offsets.reserve(csr.number_of_rows + 1);
+          csr.column_indices.reserve(csr.number_of_nonzeros);
+          csr.nonzero_values.reserve(csr.number_of_nonzeros);
+        } else if (line_num == 1) {  // Second line has row pointers
+          int count = 0;
+          while (line_stream >> row_ptrs_buf) {
+            csr.row_offsets.push_back(row_ptrs_buf);
+            count++;
+          }
+        } else if (line_num == 2) {  // Third line has column indices
+          while (line_stream >> col_idxs_buf) {
+            csr.column_indices.push_back(col_idxs_buf);
+            csr.nonzero_values.push_back(
+                gunrock::generate::random::get_random<weight_t>(1.0f, 10.0f));
+          }
+        }
+      }
+
+      smtx_file.close();
+    }
+    else {
+      throw(std::runtime_error("Unable to open file"));
+    }
+
+    if (csr.row_offsets.size() - 1 != csr.number_of_rows) {
+      std::ostringstream ss;
+      ss << "Number of rows in " << filename << " ("
+         << csr.row_offsets.size() - 1
+         << ") does not match the count in the first line (" << csr.number_of_rows
+         << ")";
+      throw(std::invalid_argument(ss.str()));
+    }
+
+    if (csr.nonzero_values.size() != csr.number_of_nonzeros) {
+      std::ostringstream ss;
+      ss << "Number of non-zeros in " << filename << " (" << csr.nonzero_values.size()
+         << ") does not match the count in the first line (" << csr.number_of_nonzeros << ")";
+      throw(std::invalid_argument(ss.str()));
+    }
+
+    return csr;
+  }
+};
+
+}  // namespace io
+}  // namespace gunrock
\ No newline at end of file
diff --git a/unittests/io/smtx.cuh b/unittests/io/smtx.cuh
new file mode 100644
index 00000000..a331367a
--- /dev/null
+++ b/unittests/io/smtx.cuh
@@ -0,0 +1,34 @@
+/**
+ * @file smtx.cuh
+ * @author Jonathan Wapman (jdwapman@ucdavis.edu)
+ * @brief Unit test for smtx loading.
+ * @version 0.1
+ * @date 2022-05-27
+ *
+ * @copyright Copyright (c) 2022
+ *
+ */
+
+#include <gunrock/error.hxx>    // error checking
+#include <gunrock/io/smtx.hxx>  // smtx support
+
+#include <gtest/gtest.h>
+
+TEST(io, smtx) {
+  using namespace gunrock;
+
+  // Load the smtx matrix
+  using row_t = int;
+  using edge_t = int;
+  using nonzero_t = float;
+  using csr_t = format::csr_t<memory_space_t::device, row_t, edge_t, nonzero_t>;
+
+  io::smtx_t<row_t, edge_t, nonzero_t> smtx;
+
+  csr_t csr = smtx.load(
+      "datasets/layers.0.blocks.0.attn.proj_swin_tiny_unstructured_50.smtx");
+
+  EXPECT_EQ(csr.number_of_rows, 96);
+  EXPECT_EQ(csr.number_of_columns, 96);
+  EXPECT_EQ(csr.number_of_nonzeros, 4608);
+}
\ No newline at end of file
diff --git a/unittests/unittests.hxx b/unittests/unittests.hxx
index 69bd0270..4bfba366 100644
--- a/unittests/unittests.hxx
+++ b/unittests/unittests.hxx
@@ -39,4 +39,5 @@
 #include "containers/array.cuh"
 
 // #include "io/matrix_market.cuh"
+#include "io/smtx.cuh"
 // #include "io/mtxbin.cuh"

From de8d03c9dc6e6daa8db117d72eafda106f40acf3 Mon Sep 17 00:00:00 2001
From: Annie <annie@annielytical.com>
Date: Sat, 28 May 2022 17:23:57 -0700
Subject: [PATCH 12/58] multiple benchmarks

---
 benchmarks/CMakeLists.txt | 18 +++++---
 benchmarks/bench.cu       | 86 +++++++++++++++++++++++++++++++++++++++
 benchmarks/bfs_bench.cu   | 16 --------
 benchmarks/mst_bench.cu   | 16 --------
 4 files changed, 98 insertions(+), 38 deletions(-)
 create mode 100644 benchmarks/bench.cu

diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 5060e3be..9d260ca9 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -1,16 +1,22 @@
 set(BENCHMARK_SOURCES
   for.cu
-  mst_bench.cu
-  bfs_bench.cu
+  bench.cu
 )
 
 foreach(SOURCE IN LISTS BENCHMARK_SOURCES)
   get_filename_component(BENCHMARK_NAME ${SOURCE} NAME_WLE)
   add_executable(${BENCHMARK_NAME} ${SOURCE})
-  target_link_libraries(${BENCHMARK_NAME} 
-    PRIVATE essentials
-    PRIVATE nvbench::nvbench
-  )
+  if(SOURCE MATCHES "bench.cu")
+    target_link_libraries(${BENCHMARK_NAME} 
+      PRIVATE essentials
+      PRIVATE nvbench::nvbench
+    )
+  else()
+    target_link_libraries(${BENCHMARK_NAME} 
+      PRIVATE essentials
+      PRIVATE nvbench::main
+    )
+  endif()  
   get_target_property(ESSENTIALS_ARCHITECTURES 
     essentials CUDA_ARCHITECTURES
   )
diff --git a/benchmarks/bench.cu b/benchmarks/bench.cu
new file mode 100644
index 00000000..e50d5568
--- /dev/null
+++ b/benchmarks/bench.cu
@@ -0,0 +1,86 @@
+std::string filename;
+
+#include "mst_bench.cu"
+#include "bfs_bench.cu"
+#include <cxxopts.hpp>
+
+std::vector<std::string> benchmarks;
+struct parameters_t {
+  std::string filename;
+  std::string benchmark;
+  cxxopts::Options options;
+
+  /**
+   * @brief Construct a new parameters object and parse command line arguments.
+   *
+   * @param argc Number of command line arguments.
+   * @param argv Command line arguments.
+   */
+  parameters_t(int argc, char** argv)
+      : options(argv[0], "Algorithm Benchmarks") {
+    options.allow_unrecognised_options();
+    // Add command line options
+    options.add_options()("h,help", "Print help")  // help
+        ("m,market", "Matrix file (required)",
+         cxxopts::value<std::string>())  // mtx
+        ("b,benchmark", "Benchmark name (optional)",
+         cxxopts::value<std::string>());  // benchmark
+
+    // Parse command line arguments
+    auto result = options.parse(argc, argv);
+
+    if (result.count("help")) {
+      std::cout << options.help({""});
+      std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+    }
+
+    if (result.count("market") == 1) {
+      filename = result["market"].as<std::string>();
+      if (util::is_market(filename)) {
+      } else {
+        std::cout << options.help({""});
+        std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+        std::exit(0);
+      }
+    } else {
+      std::cout << options.help({""});
+      std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+      std::exit(0);
+    }
+
+    if (result.count("benchmark") == 1) {
+      benchmark = result["benchmark"].as<std::string>();
+      if (std::find(benchmarks.begin(), benchmarks.end(), benchmark) ==
+          benchmarks.end()) {
+        std::cout << "Error: invalid benchmark" << std::endl;
+        std::exit(0);
+      }
+    } else {
+      benchmark = "all";
+    }
+  }
+};
+
+int main(int argc, char** argv) {
+  benchmarks = {"mst_bench", "bfs_bench"};
+
+  parameters_t params(argc, argv);
+  filename = params.filename;
+  std::string benchmark = params.benchmark;
+
+  // Create a new argument array without filename to pass to NVBench.
+  char* args[argc - 2];
+  int j = 0;
+  for (int i = 0; i < argc; i++) {
+    if (strcmp(argv[i], "--market") == 0 || strcmp(argv[i], "-m") == 0) {
+      i++;
+      continue;
+    }
+    args[j] = argv[i];
+    j++;
+  }
+
+  NVBENCH_BENCH(mst_bench);
+  NVBENCH_BENCH(bfs_bench);
+  NVBENCH_MAIN_BODY(argc - 2, args);
+}
\ No newline at end of file
diff --git a/benchmarks/bfs_bench.cu b/benchmarks/bfs_bench.cu
index 42cff9ce..f36c11a8 100644
--- a/benchmarks/bfs_bench.cu
+++ b/benchmarks/bfs_bench.cu
@@ -13,8 +13,6 @@
 using namespace gunrock;
 using namespace memory;
 
-std::string filename;  // Global
-
 void bfs_bench(nvbench::state& state) {
   // Add metrics.
   state.collect_dram_throughput();
@@ -77,17 +75,3 @@ void bfs_bench(nvbench::state& state) {
                       predecessors.data().get());
   });
 }
-
-int main(int argc, char** argv) {
-  filename = argv[1];
-
-  // Create a new argument array without filename to pass to NVBench.
-  char* args[argc - 1];
-  args[0] = argv[0];
-  for (int i = 1; i < argc; i++) {
-    args[i] = argv[i + 1];
-  }
-
-  NVBENCH_BENCH(bfs_bench);
-  NVBENCH_MAIN_BODY(argc - 1, args);
-}
\ No newline at end of file
diff --git a/benchmarks/mst_bench.cu b/benchmarks/mst_bench.cu
index 5dab267e..b5d0b1f3 100644
--- a/benchmarks/mst_bench.cu
+++ b/benchmarks/mst_bench.cu
@@ -13,8 +13,6 @@
 using namespace gunrock;
 using namespace memory;
 
-std::string filename;  // Global
-
 void mst_bench(nvbench::state& state) {
   // Add metrics.
   state.collect_dram_throughput();
@@ -72,17 +70,3 @@ void mst_bench(nvbench::state& state) {
     gunrock::mst::run(G, mst_weight.data().get());
   });
 }
-
-int main(int argc, char** argv) {
-  filename = argv[1];
-
-  // Create a new argument array without filename to pass to NVBench.
-  char* args[argc - 1];
-  args[0] = argv[0];
-  for (int i = 1; i < argc; i++) {
-    args[i] = argv[i + 1];
-  }
-
-  NVBENCH_BENCH(mst_bench);
-  NVBENCH_MAIN_BODY(argc - 1, args);
-}
\ No newline at end of file

From 187f5db7c7041be810e73dad7d98edb89bcb6b02 Mon Sep 17 00:00:00 2001
From: Annie <annie@annielytical.com>
Date: Sat, 28 May 2022 17:47:07 -0700
Subject: [PATCH 13/58] param fixes

---
 benchmarks/bench.cu | 70 ++++++++++++++++++++++++++-------------------
 1 file changed, 40 insertions(+), 30 deletions(-)

diff --git a/benchmarks/bench.cu b/benchmarks/bench.cu
index e50d5568..519f7cfb 100644
--- a/benchmarks/bench.cu
+++ b/benchmarks/bench.cu
@@ -8,6 +8,7 @@ std::vector<std::string> benchmarks;
 struct parameters_t {
   std::string filename;
   std::string benchmark;
+  bool help = false;
   cxxopts::Options options;
 
   /**
@@ -30,33 +31,34 @@ struct parameters_t {
     auto result = options.parse(argc, argv);
 
     if (result.count("help")) {
+      help = true;
       std::cout << options.help({""});
       std::cout << "  [optional nvbench args]" << std::endl << std::endl;
-    }
-
-    if (result.count("market") == 1) {
-      filename = result["market"].as<std::string>();
-      if (util::is_market(filename)) {
+    } else {
+      if (result.count("market") == 1) {
+        filename = result["market"].as<std::string>();
+        if (util::is_market(filename)) {
+        } else {
+          std::cout << options.help({""});
+          std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+          std::exit(0);
+        }
       } else {
         std::cout << options.help({""});
         std::cout << "  [optional nvbench args]" << std::endl << std::endl;
         std::exit(0);
       }
-    } else {
-      std::cout << options.help({""});
-      std::cout << "  [optional nvbench args]" << std::endl << std::endl;
-      std::exit(0);
-    }
 
-    if (result.count("benchmark") == 1) {
-      benchmark = result["benchmark"].as<std::string>();
-      if (std::find(benchmarks.begin(), benchmarks.end(), benchmark) ==
-          benchmarks.end()) {
-        std::cout << "Error: invalid benchmark" << std::endl;
-        std::exit(0);
+      if (result.count("benchmark") == 1) {
+        benchmark = result["benchmark"].as<std::string>();
+        if (std::find(benchmarks.begin(), benchmarks.end(), benchmark) ==
+            benchmarks.end()) {
+          std::cout << "Error: invalid benchmark" << std::endl;
+          std::exit(0);
+        }
+      } else {
+        benchmark = "all";
       }
-    } else {
-      benchmark = "all";
     }
   }
 };
@@ -69,18 +71,26 @@ int main(int argc, char** argv) {
   std::string benchmark = params.benchmark;
 
   // Create a new argument array without filename to pass to NVBench.
-  char* args[argc - 2];
-  int j = 0;
-  for (int i = 0; i < argc; i++) {
-    if (strcmp(argv[i], "--market") == 0 || strcmp(argv[i], "-m") == 0) {
-      i++;
-      continue;
+  if (params.help) {
+    const char* args[1] = {"-h"};
+    NVBENCH_BENCH(mst_bench);
+    NVBENCH_BENCH(bfs_bench);
+    NVBENCH_MAIN_BODY(1, args);
+
+  } else {
+    char* args[argc - 2];
+    int j = 0;
+    for (int i = 0; i < argc; i++) {
+      if (strcmp(argv[i], "--market") == 0 || strcmp(argv[i], "-m") == 0) {
+        i++;
+        continue;
+      }
+      args[j] = argv[i];
+      j++;
     }
-    args[j] = argv[i];
-    j++;
-  }
 
-  NVBENCH_BENCH(mst_bench);
-  NVBENCH_BENCH(bfs_bench);
-  NVBENCH_MAIN_BODY(argc - 2, args);
+    NVBENCH_BENCH(mst_bench);
+    NVBENCH_BENCH(bfs_bench);
+    NVBENCH_MAIN_BODY(argc - 2, args);
+  }
 }
\ No newline at end of file

From 22d2f8e0905009c6dc3ff558c4a4921e6f74ed2a Mon Sep 17 00:00:00 2001
From: Annie <annie@annielytical.com>
Date: Sun, 29 May 2022 16:17:57 -0700
Subject: [PATCH 14/58] add more algorithm benchmarks

---
 benchmarks/algorithms/bc_bench.cu        | 69 +++++++++++++++++++++
 benchmarks/{ => algorithms}/bfs_bench.cu | 10 ++--
 benchmarks/algorithms/color_bench.cu     | 69 +++++++++++++++++++++
 benchmarks/algorithms/kcore_bench.cu     | 69 +++++++++++++++++++++
 benchmarks/{ => algorithms}/mst_bench.cu | 14 ++---
 benchmarks/algorithms/ppr_bench.cu       | 76 ++++++++++++++++++++++++
 benchmarks/algorithms/pr_bench.cu        | 74 +++++++++++++++++++++++
 benchmarks/algorithms/spmv_bench.cu      | 72 ++++++++++++++++++++++
 benchmarks/algorithms/sssp_bench.cu      | 75 +++++++++++++++++++++++
 benchmarks/bench.cu                      | 28 ++++++---
 10 files changed, 534 insertions(+), 22 deletions(-)
 create mode 100644 benchmarks/algorithms/bc_bench.cu
 rename benchmarks/{ => algorithms}/bfs_bench.cu (94%)
 create mode 100644 benchmarks/algorithms/color_bench.cu
 create mode 100644 benchmarks/algorithms/kcore_bench.cu
 rename benchmarks/{ => algorithms}/mst_bench.cu (89%)
 create mode 100644 benchmarks/algorithms/ppr_bench.cu
 create mode 100644 benchmarks/algorithms/pr_bench.cu
 create mode 100644 benchmarks/algorithms/spmv_bench.cu
 create mode 100644 benchmarks/algorithms/sssp_bench.cu

diff --git a/benchmarks/algorithms/bc_bench.cu b/benchmarks/algorithms/bc_bench.cu
new file mode 100644
index 00000000..32f620f0
--- /dev/null
+++ b/benchmarks/algorithms/bc_bench.cu
@@ -0,0 +1,69 @@
+#ifndef BENCH_INCLUDES
+#define BENCH_INCLUDES
+#include <gunrock/graph/graph.hxx>
+#include <gunrock/formats/formats.hxx>
+#include <gunrock/cuda/cuda.hxx>
+#include <nvbench/nvbench.cuh>
+#include <gunrock/algorithms/algorithms.hxx>
+#include <cxxopts.hpp>
+#endif
+
+#include <gunrock/algorithms/bc.hxx>
+
+using namespace gunrock;
+using namespace memory;
+
+void bc_bench(nvbench::state& state) {
+  // Add metrics.
+  state.collect_dram_throughput();
+  state.collect_l1_hit_rates();
+  state.collect_l2_hit_rates();
+  state.collect_loads_efficiency();
+  state.collect_stores_efficiency();
+
+  // --
+  // Define types
+  using csr_t =
+      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
+
+  // --
+  // Build graph + metadata
+  csr_t csr;
+  if (util::is_market(filename)) {
+    io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
+    csr.from_coo(mm.load(filename));
+  } else if (util::is_binary_csr(filename)) {
+    csr.read_binary(filename);
+  } else {
+    std::cerr << "Unknown file format: " << filename << std::endl;
+    exit(1);
+  }
+
+  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
+  thrust::device_vector<vertex_t> column_indices(csr.number_of_nonzeros);
+  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
+
+  auto G =
+      graph::build::from_csr<memory_space_t::device,
+                             graph::view_t::csr /* | graph::view_t::csc */>(
+          csr.number_of_rows,               // rows
+          csr.number_of_columns,            // columns
+          csr.number_of_nonzeros,           // nonzeros
+          csr.row_offsets.data().get(),     // row_offsets
+          csr.column_indices.data().get(),  // column_indices
+          csr.nonzero_values.data().get(),  // values
+          row_indices.data().get(),         // row_indices
+          column_offsets.data().get()       // column_offsets
+      );
+
+  // --
+  // Params and memory allocation
+  vertex_t n_vertices = G.get_number_of_vertices();
+  thrust::device_vector<weight_t> bc_values(n_vertices);
+
+  // --
+  // Run BC with NVBench
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    gunrock::bc::run(G, bc_values.data().get());
+  });
+}
diff --git a/benchmarks/bfs_bench.cu b/benchmarks/algorithms/bfs_bench.cu
similarity index 94%
rename from benchmarks/bfs_bench.cu
rename to benchmarks/algorithms/bfs_bench.cu
index f36c11a8..0281f318 100644
--- a/benchmarks/bfs_bench.cu
+++ b/benchmarks/algorithms/bfs_bench.cu
@@ -1,14 +1,14 @@
-#include <gunrock/error.hxx>
+#ifndef BENCH_INCLUDES
+#define BENCH_INCLUDES
 #include <gunrock/graph/graph.hxx>
 #include <gunrock/formats/formats.hxx>
 #include <gunrock/cuda/cuda.hxx>
-#include <gunrock/framework/operators/for/for.hxx>
 #include <nvbench/nvbench.cuh>
-#include <iostream>
 #include <gunrock/algorithms/algorithms.hxx>
-#include <gunrock/algorithms/mst.hxx>
-#include <gunrock/algorithms/bfs.hxx>
 #include <cxxopts.hpp>
+#endif
+
+#include <gunrock/algorithms/bfs.hxx>
 
 using namespace gunrock;
 using namespace memory;
diff --git a/benchmarks/algorithms/color_bench.cu b/benchmarks/algorithms/color_bench.cu
new file mode 100644
index 00000000..89ed975e
--- /dev/null
+++ b/benchmarks/algorithms/color_bench.cu
@@ -0,0 +1,69 @@
+#ifndef BENCH_INCLUDES
+#define BENCH_INCLUDES
+#include <gunrock/graph/graph.hxx>
+#include <gunrock/formats/formats.hxx>
+#include <gunrock/cuda/cuda.hxx>
+#include <nvbench/nvbench.cuh>
+#include <gunrock/algorithms/algorithms.hxx>
+#include <cxxopts.hpp>
+#endif
+
+#include <gunrock/algorithms/color.hxx>
+
+using namespace gunrock;
+using namespace memory;
+
+void color_bench(nvbench::state& state) {
+  // Add metrics.
+  state.collect_dram_throughput();
+  state.collect_l1_hit_rates();
+  state.collect_l2_hit_rates();
+  state.collect_loads_efficiency();
+  state.collect_stores_efficiency();
+
+  // --
+  // Define types
+  using csr_t =
+      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
+
+  // --
+  // Build graph + metadata
+  csr_t csr;
+  if (util::is_market(filename)) {
+    io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
+    csr.from_coo(mm.load(filename));
+  } else if (util::is_binary_csr(filename)) {
+    csr.read_binary(filename);
+  } else {
+    std::cerr << "Unknown file format: " << filename << std::endl;
+    exit(1);
+  }
+
+  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
+  thrust::device_vector<vertex_t> column_indices(csr.number_of_nonzeros);
+  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
+
+  auto G =
+      graph::build::from_csr<memory_space_t::device,
+                             graph::view_t::csr /* | graph::view_t::csc */>(
+          csr.number_of_rows,               // rows
+          csr.number_of_columns,            // columns
+          csr.number_of_nonzeros,           // nonzeros
+          csr.row_offsets.data().get(),     // row_offsets
+          csr.column_indices.data().get(),  // column_indices
+          csr.nonzero_values.data().get(),  // values
+          row_indices.data().get(),         // row_indices
+          column_offsets.data().get()       // column_offsets
+      );
+
+  // --
+  // Params and memory allocation
+  vertex_t n_vertices = G.get_number_of_vertices();
+  thrust::device_vector<vertex_t> colors(n_vertices);
+
+  // --
+  // Run Graph Coloring with NVBench
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    gunrock::color::run(G, colors.data().get());
+  });
+}
diff --git a/benchmarks/algorithms/kcore_bench.cu b/benchmarks/algorithms/kcore_bench.cu
new file mode 100644
index 00000000..70883b5c
--- /dev/null
+++ b/benchmarks/algorithms/kcore_bench.cu
@@ -0,0 +1,69 @@
+#ifndef BENCH_INCLUDES
+#define BENCH_INCLUDES
+#include <gunrock/graph/graph.hxx>
+#include <gunrock/formats/formats.hxx>
+#include <gunrock/cuda/cuda.hxx>
+#include <nvbench/nvbench.cuh>
+#include <gunrock/algorithms/algorithms.hxx>
+#include <cxxopts.hpp>
+#endif
+
+#include <gunrock/algorithms/kcore.hxx>
+
+using namespace gunrock;
+using namespace memory;
+
+void kcore_bench(nvbench::state& state) {
+  // Add metrics.
+  state.collect_dram_throughput();
+  state.collect_l1_hit_rates();
+  state.collect_l2_hit_rates();
+  state.collect_loads_efficiency();
+  state.collect_stores_efficiency();
+
+  // --
+  // Define types
+  using csr_t =
+      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
+
+  // --
+  // Build graph + metadata
+  csr_t csr;
+  if (util::is_market(filename)) {
+    io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
+    csr.from_coo(mm.load(filename));
+  } else if (util::is_binary_csr(filename)) {
+    csr.read_binary(filename);
+  } else {
+    std::cerr << "Unknown file format: " << filename << std::endl;
+    exit(1);
+  }
+
+  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
+  thrust::device_vector<vertex_t> column_indices(csr.number_of_nonzeros);
+  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
+
+  auto G =
+      graph::build::from_csr<memory_space_t::device,
+                             graph::view_t::csr /* | graph::view_t::csc */>(
+          csr.number_of_rows,               // rows
+          csr.number_of_columns,            // columns
+          csr.number_of_nonzeros,           // nonzeros
+          csr.row_offsets.data().get(),     // row_offsets
+          csr.column_indices.data().get(),  // column_indices
+          csr.nonzero_values.data().get(),  // values
+          row_indices.data().get(),         // row_indices
+          column_offsets.data().get()       // column_offsets
+      );
+
+  // --
+  // Params and memory allocation
+  vertex_t n_vertices = G.get_number_of_vertices();
+  thrust::device_vector<int> k_cores(n_vertices);
+
+  // --
+  // Run K-Core Decomposition with NVBench
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    gunrock::kcore::run(G, k_cores.data().get());
+  });
+}
diff --git a/benchmarks/mst_bench.cu b/benchmarks/algorithms/mst_bench.cu
similarity index 89%
rename from benchmarks/mst_bench.cu
rename to benchmarks/algorithms/mst_bench.cu
index b5d0b1f3..b2d99f46 100644
--- a/benchmarks/mst_bench.cu
+++ b/benchmarks/algorithms/mst_bench.cu
@@ -1,14 +1,14 @@
-#include <gunrock/error.hxx>
+#ifndef BENCH_INCLUDES
+#define BENCH_INCLUDES
 #include <gunrock/graph/graph.hxx>
 #include <gunrock/formats/formats.hxx>
 #include <gunrock/cuda/cuda.hxx>
-#include <gunrock/framework/operators/for/for.hxx>
 #include <nvbench/nvbench.cuh>
-#include <iostream>
 #include <gunrock/algorithms/algorithms.hxx>
-#include <gunrock/algorithms/mst.hxx>
-#include <gunrock/algorithms/bfs.hxx>
 #include <cxxopts.hpp>
+#endif
+
+#include <gunrock/algorithms/mst.hxx>
 
 using namespace gunrock;
 using namespace memory;
@@ -56,10 +56,6 @@ void mst_bench(nvbench::state& state) {
           column_offsets.data().get()       // column_offsets
       );
 
-  // Initialize the context.
-  cuda::device_id_t device = 0;
-  cuda::multi_context_t context(device);
-
   // --
   // Params and memory allocation
   thrust::device_vector<weight_t> mst_weight(1);
diff --git a/benchmarks/algorithms/ppr_bench.cu b/benchmarks/algorithms/ppr_bench.cu
new file mode 100644
index 00000000..64aa0390
--- /dev/null
+++ b/benchmarks/algorithms/ppr_bench.cu
@@ -0,0 +1,76 @@
+#ifndef BENCH_INCLUDES
+#define BENCH_INCLUDES
+#include <gunrock/graph/graph.hxx>
+#include <gunrock/formats/formats.hxx>
+#include <gunrock/cuda/cuda.hxx>
+#include <nvbench/nvbench.cuh>
+#include <gunrock/algorithms/algorithms.hxx>
+#include <cxxopts.hpp>
+#endif
+
+#include <gunrock/algorithms/ppr.hxx>
+
+using namespace gunrock;
+using namespace memory;
+
+void ppr_bench(nvbench::state& state) {
+  // --
+  // Add metrics.
+  state.collect_dram_throughput();
+  state.collect_l1_hit_rates();
+  state.collect_l2_hit_rates();
+  state.collect_loads_efficiency();
+  state.collect_stores_efficiency();
+  
+  // --
+  // IO
+  weight_t alpha = 0.15;
+  weight_t epsilon = 1e-6;
+  vertex_t n_seeds = 10;
+
+  // --
+  // Define types
+  using csr_t =
+      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
+
+  // --
+  // Build graph + metadata
+  csr_t csr;
+  if (util::is_market(filename)) {
+    io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
+    csr.from_coo(mm.load(filename));
+  } else if (util::is_binary_csr(filename)) {
+    csr.read_binary(filename);
+  } else {
+    std::cerr << "Unknown file format: " << filename << std::endl;
+    exit(1);
+  }
+
+  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
+  thrust::device_vector<vertex_t> column_indices(csr.number_of_nonzeros);
+  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
+
+  auto G =
+      graph::build::from_csr<memory_space_t::device,
+                             graph::view_t::csr /* | graph::view_t::csc */>(
+          csr.number_of_rows,               // rows
+          csr.number_of_columns,            // columns
+          csr.number_of_nonzeros,           // nonzeros
+          csr.row_offsets.data().get(),     // row_offsets
+          csr.column_indices.data().get(),  // column_indices
+          csr.nonzero_values.data().get(),  // values
+          row_indices.data().get(),         // row_indices
+          column_offsets.data().get()       // column_offsets
+      );
+
+  // --
+  // Params and memory allocation
+  vertex_t n_vertices = G.get_number_of_vertices();
+  thrust::device_vector<weight_t> p(n_seeds * n_vertices);
+
+  // --
+  // Run Personalized PR with NVBench
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      gunrock::ppr::run_batch(G, n_seeds, p.data().get(), alpha, epsilon);
+  });
+}
diff --git a/benchmarks/algorithms/pr_bench.cu b/benchmarks/algorithms/pr_bench.cu
new file mode 100644
index 00000000..2f3b16e5
--- /dev/null
+++ b/benchmarks/algorithms/pr_bench.cu
@@ -0,0 +1,74 @@
+#ifndef BENCH_INCLUDES
+#define BENCH_INCLUDES
+#include <gunrock/graph/graph.hxx>
+#include <gunrock/formats/formats.hxx>
+#include <gunrock/cuda/cuda.hxx>
+#include <nvbench/nvbench.cuh>
+#include <gunrock/algorithms/algorithms.hxx>
+#include <cxxopts.hpp>
+#endif
+
+#include <gunrock/algorithms/pr.hxx>
+
+using namespace gunrock;
+using namespace memory;
+
+void pr_bench(nvbench::state& state) {
+  // Add metrics.
+  state.collect_dram_throughput();
+  state.collect_l1_hit_rates();
+  state.collect_l2_hit_rates();
+  state.collect_loads_efficiency();
+  state.collect_stores_efficiency();
+
+  // --
+  // Define types
+  using csr_t =
+      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
+
+  // --
+  // Build graph + metadata
+  csr_t csr;
+  if (util::is_market(filename)) {
+    io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
+    csr.from_coo(mm.load(filename));
+  } else if (util::is_binary_csr(filename)) {
+    csr.read_binary(filename);
+  } else {
+    std::cerr << "Unknown file format: " << filename << std::endl;
+    exit(1);
+  }
+
+  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
+  thrust::device_vector<vertex_t> column_indices(csr.number_of_nonzeros);
+  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
+
+  auto G =
+      graph::build::from_csr<memory_space_t::device,
+                             graph::view_t::csr /* | graph::view_t::csc */>(
+          csr.number_of_rows,               // rows
+          csr.number_of_columns,            // columns
+          csr.number_of_nonzeros,           // nonzeros
+          csr.row_offsets.data().get(),     // row_offsets
+          csr.column_indices.data().get(),  // column_indices
+          csr.nonzero_values.data().get(),  // values
+          row_indices.data().get(),         // row_indices
+          column_offsets.data().get()       // column_offsets
+      );
+
+  // --
+  // Params and memory allocation
+  srand(time(NULL));
+
+  weight_t alpha = 0.85;
+  weight_t tol = 1e-6;
+
+  vertex_t n_vertices = G.get_number_of_vertices();
+  thrust::device_vector<weight_t> p(n_vertices);
+
+  // --
+  // Run PR with NVBench
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    gunrock::pr::run(G, alpha, tol, p.data().get());
+  });
+}
diff --git a/benchmarks/algorithms/spmv_bench.cu b/benchmarks/algorithms/spmv_bench.cu
new file mode 100644
index 00000000..d79f8470
--- /dev/null
+++ b/benchmarks/algorithms/spmv_bench.cu
@@ -0,0 +1,72 @@
+#ifndef BENCH_INCLUDES
+#define BENCH_INCLUDES
+#include <gunrock/graph/graph.hxx>
+#include <gunrock/formats/formats.hxx>
+#include <gunrock/cuda/cuda.hxx>
+#include <nvbench/nvbench.cuh>
+#include <gunrock/algorithms/algorithms.hxx>
+#include <cxxopts.hpp>
+#endif
+
+#include <gunrock/algorithms/spmv.hxx>
+
+using namespace gunrock;
+using namespace memory;
+
+void spmv_bench(nvbench::state& state) {
+  // Add metrics.
+  state.collect_dram_throughput();
+  state.collect_l1_hit_rates();
+  state.collect_l2_hit_rates();
+  state.collect_loads_efficiency();
+  state.collect_stores_efficiency();
+
+  // --
+  // Define types
+  using csr_t =
+      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
+
+  // --
+  // Build graph + metadata
+  csr_t csr;
+  if (util::is_market(filename)) {
+    io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
+    csr.from_coo(mm.load(filename));
+  } else if (util::is_binary_csr(filename)) {
+    csr.read_binary(filename);
+  } else {
+    std::cerr << "Unknown file format: " << filename << std::endl;
+    exit(1);
+  }
+
+  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
+  thrust::device_vector<vertex_t> column_indices(csr.number_of_nonzeros);
+  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
+
+  auto G =
+      graph::build::from_csr<memory_space_t::device,
+                             graph::view_t::csr /* | graph::view_t::csc */>(
+          csr.number_of_rows,               // rows
+          csr.number_of_columns,            // columns
+          csr.number_of_nonzeros,           // nonzeros
+          csr.row_offsets.data().get(),     // row_offsets
+          csr.column_indices.data().get(),  // column_indices
+          csr.nonzero_values.data().get(),  // values
+          row_indices.data().get(),         // row_indices
+          column_offsets.data().get()       // column_offsets
+      );
+
+  // --
+  // Params and memory allocation
+  vertex_t n_vertices = G.get_number_of_vertices();
+  thrust::device_vector<weight_t> x(n_vertices);
+  thrust::device_vector<weight_t> y(n_vertices);
+
+  gunrock::generate::random::uniform_distribution(x);
+
+  // --
+  // Run SPMV with NVBench
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    gunrock::spmv::run(G, x.data().get(), y.data().get());
+  });
+}
diff --git a/benchmarks/algorithms/sssp_bench.cu b/benchmarks/algorithms/sssp_bench.cu
new file mode 100644
index 00000000..9b4d4344
--- /dev/null
+++ b/benchmarks/algorithms/sssp_bench.cu
@@ -0,0 +1,75 @@
+#ifndef BENCH_INCLUDES
+#define BENCH_INCLUDES
+#include <gunrock/graph/graph.hxx>
+#include <gunrock/formats/formats.hxx>
+#include <gunrock/cuda/cuda.hxx>
+#include <nvbench/nvbench.cuh>
+#include <gunrock/algorithms/algorithms.hxx>
+#include <cxxopts.hpp>
+#endif
+
+#include <gunrock/algorithms/sssp.hxx>
+
+using namespace gunrock;
+using namespace memory;
+
+void sssp_bench(nvbench::state& state) {
+  // Add metrics.
+  state.collect_dram_throughput();
+  state.collect_l1_hit_rates();
+  state.collect_l2_hit_rates();
+  state.collect_loads_efficiency();
+  state.collect_stores_efficiency();
+
+  // --
+  // Define types
+  using csr_t =
+      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
+
+  // --
+  // Build graph + metadata
+  csr_t csr;
+  if (util::is_market(filename)) {
+    io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
+    csr.from_coo(mm.load(filename));
+  } else if (util::is_binary_csr(filename)) {
+    csr.read_binary(filename);
+  } else {
+    std::cerr << "Unknown file format: " << filename << std::endl;
+    exit(1);
+  }
+
+  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
+  thrust::device_vector<vertex_t> column_indices(csr.number_of_nonzeros);
+  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
+
+  auto G =
+      graph::build::from_csr<memory_space_t::device,
+                             graph::view_t::csr /* | graph::view_t::csc */>(
+          csr.number_of_rows,               // rows
+          csr.number_of_columns,            // columns
+          csr.number_of_nonzeros,           // nonzeros
+          csr.row_offsets.data().get(),     // row_offsets
+          csr.column_indices.data().get(),  // column_indices
+          csr.nonzero_values.data().get(),  // values
+          row_indices.data().get(),         // row_indices
+          column_offsets.data().get()       // column_offsets
+      );
+
+  // --
+  // Params and memory allocation
+  srand(time(NULL));
+
+  vertex_t n_vertices = G.get_number_of_vertices();
+  vertex_t single_source = 0;  // rand() % n_vertices;
+
+  thrust::device_vector<weight_t> distances(n_vertices);
+  thrust::device_vector<vertex_t> predecessors(n_vertices);
+
+  // --
+  // Run SSSP with NVBench
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    gunrock::sssp::run(G, single_source, distances.data().get(),
+                       predecessors.data().get());
+  });
+}
diff --git a/benchmarks/bench.cu b/benchmarks/bench.cu
index 519f7cfb..d6063345 100644
--- a/benchmarks/bench.cu
+++ b/benchmarks/bench.cu
@@ -1,8 +1,14 @@
 std::string filename;
 
-#include "mst_bench.cu"
-#include "bfs_bench.cu"
-#include <cxxopts.hpp>
+#include "algorithms/mst_bench.cu"
+#include "algorithms/bfs_bench.cu"
+#include "algorithms/bc_bench.cu"
+#include "algorithms/color_bench.cu"
+#include "algorithms/kcore_bench.cu"
+#include "algorithms/ppr_bench.cu"
+#include "algorithms/pr_bench.cu"
+#include "algorithms/spmv_bench.cu"
+#include "algorithms/sssp_bench.cu"
 
 std::vector<std::string> benchmarks;
 struct parameters_t {
@@ -64,20 +70,19 @@ struct parameters_t {
 };
 
 int main(int argc, char** argv) {
-  benchmarks = {"mst_bench", "bfs_bench"};
+  benchmarks = {"mst_bench",   "bfs_bench",   "bc_bench",
+                "color_bench", "kcore_bench", "ppr_bench",
+                "pr_bench",    "spmv_bench",  "sssp_bench"};
 
   parameters_t params(argc, argv);
   filename = params.filename;
   std::string benchmark = params.benchmark;
 
-  // Create a new argument array without filename to pass to NVBench.
   if (params.help) {
     const char* args[1] = {"-h"};
-    NVBENCH_BENCH(mst_bench);
-    NVBENCH_BENCH(bfs_bench);
     NVBENCH_MAIN_BODY(1, args);
-
   } else {
+    // Create a new argument array without matrix filename to pass to NVBench.
     char* args[argc - 2];
     int j = 0;
     for (int i = 0; i < argc; i++) {
@@ -91,6 +96,13 @@ int main(int argc, char** argv) {
 
     NVBENCH_BENCH(mst_bench);
     NVBENCH_BENCH(bfs_bench);
+    NVBENCH_BENCH(bc_bench);
+    NVBENCH_BENCH(color_bench);
+    NVBENCH_BENCH(kcore_bench);
+    NVBENCH_BENCH(ppr_bench);
+    NVBENCH_BENCH(pr_bench);
+    NVBENCH_BENCH(spmv_bench);
+    NVBENCH_BENCH(sssp_bench);
     NVBENCH_MAIN_BODY(argc - 2, args);
   }
 }
\ No newline at end of file

From 7d527c73a61e5d8c4d0083cd582ce08e43793e7b Mon Sep 17 00:00:00 2001
From: Annie <annie@annielytical.com>
Date: Tue, 31 May 2022 21:11:46 -0700
Subject: [PATCH 15/58] split benchmarks; add geo / hits

---
 benchmarks/CMakeLists.txt            |  18 +-
 benchmarks/algorithms/bc_bench.cu    |  69 --------
 benchmarks/algorithms/bfs_bench.cu   |  77 --------
 benchmarks/algorithms/color_bench.cu |  69 --------
 benchmarks/algorithms/kcore_bench.cu |  69 --------
 benchmarks/algorithms/mst_bench.cu   |  68 -------
 benchmarks/algorithms/ppr_bench.cu   |  76 --------
 benchmarks/algorithms/pr_bench.cu    |  74 --------
 benchmarks/algorithms/spmv_bench.cu  |  72 --------
 benchmarks/algorithms/sssp_bench.cu  |  75 --------
 benchmarks/bc_bench.cu               | 129 ++++++++++++++
 benchmarks/bench.cu                  | 108 ------------
 benchmarks/bfs_bench.cu              | 137 ++++++++++++++
 benchmarks/color_bench.cu            | 130 ++++++++++++++
 benchmarks/geo_bench.cu              | 255 +++++++++++++++++++++++++++
 benchmarks/hits_bench.cu             | 129 ++++++++++++++
 benchmarks/kcore_bench.cu            | 130 ++++++++++++++
 benchmarks/mst_bench.cu              | 124 +++++++++++++
 benchmarks/ppr_bench.cu              | 137 ++++++++++++++
 benchmarks/pr_bench.cu               | 134 ++++++++++++++
 benchmarks/spmv_bench.cu             | 133 ++++++++++++++
 benchmarks/sssp_bench.cu             | 135 ++++++++++++++
 22 files changed, 1587 insertions(+), 761 deletions(-)
 delete mode 100644 benchmarks/algorithms/bc_bench.cu
 delete mode 100644 benchmarks/algorithms/bfs_bench.cu
 delete mode 100644 benchmarks/algorithms/color_bench.cu
 delete mode 100644 benchmarks/algorithms/kcore_bench.cu
 delete mode 100644 benchmarks/algorithms/mst_bench.cu
 delete mode 100644 benchmarks/algorithms/ppr_bench.cu
 delete mode 100644 benchmarks/algorithms/pr_bench.cu
 delete mode 100644 benchmarks/algorithms/spmv_bench.cu
 delete mode 100644 benchmarks/algorithms/sssp_bench.cu
 create mode 100644 benchmarks/bc_bench.cu
 delete mode 100644 benchmarks/bench.cu
 create mode 100644 benchmarks/bfs_bench.cu
 create mode 100644 benchmarks/color_bench.cu
 create mode 100644 benchmarks/geo_bench.cu
 create mode 100644 benchmarks/hits_bench.cu
 create mode 100644 benchmarks/kcore_bench.cu
 create mode 100644 benchmarks/mst_bench.cu
 create mode 100644 benchmarks/ppr_bench.cu
 create mode 100644 benchmarks/pr_bench.cu
 create mode 100644 benchmarks/spmv_bench.cu
 create mode 100644 benchmarks/sssp_bench.cu

diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 9d260ca9..7a327387 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -1,20 +1,30 @@
 set(BENCHMARK_SOURCES
   for.cu
-  bench.cu
+  bc_bench.cu
+  bfs_bench.cu
+  color_bench.cu
+  geo_bench.cu
+  hits_bench.cu
+  kcore_bench.cu
+  mst_bench.cu
+  ppr_bench.cu
+  pr_bench.cu
+  spmv_bench.cu
+  sssp_bench.cu
 )
 
 foreach(SOURCE IN LISTS BENCHMARK_SOURCES)
   get_filename_component(BENCHMARK_NAME ${SOURCE} NAME_WLE)
   add_executable(${BENCHMARK_NAME} ${SOURCE})
-  if(SOURCE MATCHES "bench.cu")
+  if(SOURCE MATCHES "for.cu")
     target_link_libraries(${BENCHMARK_NAME} 
       PRIVATE essentials
-      PRIVATE nvbench::nvbench
+      PRIVATE nvbench::main
     )
   else()
     target_link_libraries(${BENCHMARK_NAME} 
       PRIVATE essentials
-      PRIVATE nvbench::main
+      PRIVATE nvbench::nvbench
     )
   endif()  
   get_target_property(ESSENTIALS_ARCHITECTURES 
diff --git a/benchmarks/algorithms/bc_bench.cu b/benchmarks/algorithms/bc_bench.cu
deleted file mode 100644
index 32f620f0..00000000
--- a/benchmarks/algorithms/bc_bench.cu
+++ /dev/null
@@ -1,69 +0,0 @@
-#ifndef BENCH_INCLUDES
-#define BENCH_INCLUDES
-#include <gunrock/graph/graph.hxx>
-#include <gunrock/formats/formats.hxx>
-#include <gunrock/cuda/cuda.hxx>
-#include <nvbench/nvbench.cuh>
-#include <gunrock/algorithms/algorithms.hxx>
-#include <cxxopts.hpp>
-#endif
-
-#include <gunrock/algorithms/bc.hxx>
-
-using namespace gunrock;
-using namespace memory;
-
-void bc_bench(nvbench::state& state) {
-  // Add metrics.
-  state.collect_dram_throughput();
-  state.collect_l1_hit_rates();
-  state.collect_l2_hit_rates();
-  state.collect_loads_efficiency();
-  state.collect_stores_efficiency();
-
-  // --
-  // Define types
-  using csr_t =
-      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
-
-  // --
-  // Build graph + metadata
-  csr_t csr;
-  if (util::is_market(filename)) {
-    io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
-    csr.from_coo(mm.load(filename));
-  } else if (util::is_binary_csr(filename)) {
-    csr.read_binary(filename);
-  } else {
-    std::cerr << "Unknown file format: " << filename << std::endl;
-    exit(1);
-  }
-
-  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
-  thrust::device_vector<vertex_t> column_indices(csr.number_of_nonzeros);
-  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
-
-  auto G =
-      graph::build::from_csr<memory_space_t::device,
-                             graph::view_t::csr /* | graph::view_t::csc */>(
-          csr.number_of_rows,               // rows
-          csr.number_of_columns,            // columns
-          csr.number_of_nonzeros,           // nonzeros
-          csr.row_offsets.data().get(),     // row_offsets
-          csr.column_indices.data().get(),  // column_indices
-          csr.nonzero_values.data().get(),  // values
-          row_indices.data().get(),         // row_indices
-          column_offsets.data().get()       // column_offsets
-      );
-
-  // --
-  // Params and memory allocation
-  vertex_t n_vertices = G.get_number_of_vertices();
-  thrust::device_vector<weight_t> bc_values(n_vertices);
-
-  // --
-  // Run BC with NVBench
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    gunrock::bc::run(G, bc_values.data().get());
-  });
-}
diff --git a/benchmarks/algorithms/bfs_bench.cu b/benchmarks/algorithms/bfs_bench.cu
deleted file mode 100644
index 0281f318..00000000
--- a/benchmarks/algorithms/bfs_bench.cu
+++ /dev/null
@@ -1,77 +0,0 @@
-#ifndef BENCH_INCLUDES
-#define BENCH_INCLUDES
-#include <gunrock/graph/graph.hxx>
-#include <gunrock/formats/formats.hxx>
-#include <gunrock/cuda/cuda.hxx>
-#include <nvbench/nvbench.cuh>
-#include <gunrock/algorithms/algorithms.hxx>
-#include <cxxopts.hpp>
-#endif
-
-#include <gunrock/algorithms/bfs.hxx>
-
-using namespace gunrock;
-using namespace memory;
-
-void bfs_bench(nvbench::state& state) {
-  // Add metrics.
-  state.collect_dram_throughput();
-  state.collect_l1_hit_rates();
-  state.collect_l2_hit_rates();
-  state.collect_loads_efficiency();
-  state.collect_stores_efficiency();
-
-  // --
-  // Define types
-  using csr_t =
-      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
-
-  // --
-  // IO
-  csr_t csr;
-
-  if (util::is_market(filename)) {
-    io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
-    csr.from_coo(mm.load(filename));
-  } else if (util::is_binary_csr(filename)) {
-    csr.read_binary(filename);
-  } else {
-    std::cerr << "Unknown file format: " << filename << std::endl;
-    exit(1);
-  }
-
-  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
-  thrust::device_vector<vertex_t> column_indices(csr.number_of_nonzeros);
-  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
-
-  // --
-  // Build graph + metadata
-  auto G =
-      graph::build::from_csr<memory_space_t::device,
-                             graph::view_t::csr /* | graph::view_t::csc */>(
-          csr.number_of_rows,               // rows
-          csr.number_of_columns,            // columns
-          csr.number_of_nonzeros,           // nonzeros
-          csr.row_offsets.data().get(),     // row_offsets
-          csr.column_indices.data().get(),  // column_indices
-          csr.nonzero_values.data().get(),  // values
-          row_indices.data().get(),         // row_indices
-          column_offsets.data().get()       // column_offsets
-      );
-
-  // --
-  // Params and memory allocation
-
-  vertex_t single_source = 0;
-
-  vertex_t n_vertices = G.get_number_of_vertices();
-  thrust::device_vector<vertex_t> distances(n_vertices);
-  thrust::device_vector<vertex_t> predecessors(n_vertices);
-
-  // --
-  // Run BFS with NVBench
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    gunrock::bfs::run(G, single_source, distances.data().get(),
-                      predecessors.data().get());
-  });
-}
diff --git a/benchmarks/algorithms/color_bench.cu b/benchmarks/algorithms/color_bench.cu
deleted file mode 100644
index 89ed975e..00000000
--- a/benchmarks/algorithms/color_bench.cu
+++ /dev/null
@@ -1,69 +0,0 @@
-#ifndef BENCH_INCLUDES
-#define BENCH_INCLUDES
-#include <gunrock/graph/graph.hxx>
-#include <gunrock/formats/formats.hxx>
-#include <gunrock/cuda/cuda.hxx>
-#include <nvbench/nvbench.cuh>
-#include <gunrock/algorithms/algorithms.hxx>
-#include <cxxopts.hpp>
-#endif
-
-#include <gunrock/algorithms/color.hxx>
-
-using namespace gunrock;
-using namespace memory;
-
-void color_bench(nvbench::state& state) {
-  // Add metrics.
-  state.collect_dram_throughput();
-  state.collect_l1_hit_rates();
-  state.collect_l2_hit_rates();
-  state.collect_loads_efficiency();
-  state.collect_stores_efficiency();
-
-  // --
-  // Define types
-  using csr_t =
-      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
-
-  // --
-  // Build graph + metadata
-  csr_t csr;
-  if (util::is_market(filename)) {
-    io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
-    csr.from_coo(mm.load(filename));
-  } else if (util::is_binary_csr(filename)) {
-    csr.read_binary(filename);
-  } else {
-    std::cerr << "Unknown file format: " << filename << std::endl;
-    exit(1);
-  }
-
-  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
-  thrust::device_vector<vertex_t> column_indices(csr.number_of_nonzeros);
-  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
-
-  auto G =
-      graph::build::from_csr<memory_space_t::device,
-                             graph::view_t::csr /* | graph::view_t::csc */>(
-          csr.number_of_rows,               // rows
-          csr.number_of_columns,            // columns
-          csr.number_of_nonzeros,           // nonzeros
-          csr.row_offsets.data().get(),     // row_offsets
-          csr.column_indices.data().get(),  // column_indices
-          csr.nonzero_values.data().get(),  // values
-          row_indices.data().get(),         // row_indices
-          column_offsets.data().get()       // column_offsets
-      );
-
-  // --
-  // Params and memory allocation
-  vertex_t n_vertices = G.get_number_of_vertices();
-  thrust::device_vector<vertex_t> colors(n_vertices);
-
-  // --
-  // Run Graph Coloring with NVBench
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    gunrock::color::run(G, colors.data().get());
-  });
-}
diff --git a/benchmarks/algorithms/kcore_bench.cu b/benchmarks/algorithms/kcore_bench.cu
deleted file mode 100644
index 70883b5c..00000000
--- a/benchmarks/algorithms/kcore_bench.cu
+++ /dev/null
@@ -1,69 +0,0 @@
-#ifndef BENCH_INCLUDES
-#define BENCH_INCLUDES
-#include <gunrock/graph/graph.hxx>
-#include <gunrock/formats/formats.hxx>
-#include <gunrock/cuda/cuda.hxx>
-#include <nvbench/nvbench.cuh>
-#include <gunrock/algorithms/algorithms.hxx>
-#include <cxxopts.hpp>
-#endif
-
-#include <gunrock/algorithms/kcore.hxx>
-
-using namespace gunrock;
-using namespace memory;
-
-void kcore_bench(nvbench::state& state) {
-  // Add metrics.
-  state.collect_dram_throughput();
-  state.collect_l1_hit_rates();
-  state.collect_l2_hit_rates();
-  state.collect_loads_efficiency();
-  state.collect_stores_efficiency();
-
-  // --
-  // Define types
-  using csr_t =
-      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
-
-  // --
-  // Build graph + metadata
-  csr_t csr;
-  if (util::is_market(filename)) {
-    io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
-    csr.from_coo(mm.load(filename));
-  } else if (util::is_binary_csr(filename)) {
-    csr.read_binary(filename);
-  } else {
-    std::cerr << "Unknown file format: " << filename << std::endl;
-    exit(1);
-  }
-
-  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
-  thrust::device_vector<vertex_t> column_indices(csr.number_of_nonzeros);
-  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
-
-  auto G =
-      graph::build::from_csr<memory_space_t::device,
-                             graph::view_t::csr /* | graph::view_t::csc */>(
-          csr.number_of_rows,               // rows
-          csr.number_of_columns,            // columns
-          csr.number_of_nonzeros,           // nonzeros
-          csr.row_offsets.data().get(),     // row_offsets
-          csr.column_indices.data().get(),  // column_indices
-          csr.nonzero_values.data().get(),  // values
-          row_indices.data().get(),         // row_indices
-          column_offsets.data().get()       // column_offsets
-      );
-
-  // --
-  // Params and memory allocation
-  vertex_t n_vertices = G.get_number_of_vertices();
-  thrust::device_vector<int> k_cores(n_vertices);
-
-  // --
-  // Run K-Core Decomposition with NVBench
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    gunrock::kcore::run(G, k_cores.data().get());
-  });
-}
diff --git a/benchmarks/algorithms/mst_bench.cu b/benchmarks/algorithms/mst_bench.cu
deleted file mode 100644
index b2d99f46..00000000
--- a/benchmarks/algorithms/mst_bench.cu
+++ /dev/null
@@ -1,68 +0,0 @@
-#ifndef BENCH_INCLUDES
-#define BENCH_INCLUDES
-#include <gunrock/graph/graph.hxx>
-#include <gunrock/formats/formats.hxx>
-#include <gunrock/cuda/cuda.hxx>
-#include <nvbench/nvbench.cuh>
-#include <gunrock/algorithms/algorithms.hxx>
-#include <cxxopts.hpp>
-#endif
-
-#include <gunrock/algorithms/mst.hxx>
-
-using namespace gunrock;
-using namespace memory;
-
-void mst_bench(nvbench::state& state) {
-  // Add metrics.
-  state.collect_dram_throughput();
-  state.collect_l1_hit_rates();
-  state.collect_l2_hit_rates();
-  state.collect_loads_efficiency();
-  state.collect_stores_efficiency();
-
-  // --
-  // Define types
-  using csr_t =
-      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
-
-  // --
-  // Build graph + metadata
-  csr_t csr;
-  if (util::is_market(filename)) {
-    io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
-    csr.from_coo(mm.load(filename));
-  } else if (util::is_binary_csr(filename)) {
-    csr.read_binary(filename);
-  } else {
-    std::cerr << "Unknown file format: " << filename << std::endl;
-    exit(1);
-  }
-
-  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
-  thrust::device_vector<vertex_t> column_indices(csr.number_of_nonzeros);
-  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
-
-  auto G =
-      graph::build::from_csr<memory_space_t::device,
-                             graph::view_t::csr /* | graph::view_t::csc */>(
-          csr.number_of_rows,               // rows
-          csr.number_of_columns,            // columns
-          csr.number_of_nonzeros,           // nonzeros
-          csr.row_offsets.data().get(),     // row_offsets
-          csr.column_indices.data().get(),  // column_indices
-          csr.nonzero_values.data().get(),  // values
-          row_indices.data().get(),         // row_indices
-          column_offsets.data().get()       // column_offsets
-      );
-
-  // --
-  // Params and memory allocation
-  thrust::device_vector<weight_t> mst_weight(1);
-
-  // --
-  // Run MST with NVBench
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    gunrock::mst::run(G, mst_weight.data().get());
-  });
-}
diff --git a/benchmarks/algorithms/ppr_bench.cu b/benchmarks/algorithms/ppr_bench.cu
deleted file mode 100644
index 64aa0390..00000000
--- a/benchmarks/algorithms/ppr_bench.cu
+++ /dev/null
@@ -1,76 +0,0 @@
-#ifndef BENCH_INCLUDES
-#define BENCH_INCLUDES
-#include <gunrock/graph/graph.hxx>
-#include <gunrock/formats/formats.hxx>
-#include <gunrock/cuda/cuda.hxx>
-#include <nvbench/nvbench.cuh>
-#include <gunrock/algorithms/algorithms.hxx>
-#include <cxxopts.hpp>
-#endif
-
-#include <gunrock/algorithms/ppr.hxx>
-
-using namespace gunrock;
-using namespace memory;
-
-void ppr_bench(nvbench::state& state) {
-  // --
-  // Add metrics.
-  state.collect_dram_throughput();
-  state.collect_l1_hit_rates();
-  state.collect_l2_hit_rates();
-  state.collect_loads_efficiency();
-  state.collect_stores_efficiency();
-  
-  // --
-  // IO
-  weight_t alpha = 0.15;
-  weight_t epsilon = 1e-6;
-  vertex_t n_seeds = 10;
-
-  // --
-  // Define types
-  using csr_t =
-      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
-
-  // --
-  // Build graph + metadata
-  csr_t csr;
-  if (util::is_market(filename)) {
-    io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
-    csr.from_coo(mm.load(filename));
-  } else if (util::is_binary_csr(filename)) {
-    csr.read_binary(filename);
-  } else {
-    std::cerr << "Unknown file format: " << filename << std::endl;
-    exit(1);
-  }
-
-  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
-  thrust::device_vector<vertex_t> column_indices(csr.number_of_nonzeros);
-  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
-
-  auto G =
-      graph::build::from_csr<memory_space_t::device,
-                             graph::view_t::csr /* | graph::view_t::csc */>(
-          csr.number_of_rows,               // rows
-          csr.number_of_columns,            // columns
-          csr.number_of_nonzeros,           // nonzeros
-          csr.row_offsets.data().get(),     // row_offsets
-          csr.column_indices.data().get(),  // column_indices
-          csr.nonzero_values.data().get(),  // values
-          row_indices.data().get(),         // row_indices
-          column_offsets.data().get()       // column_offsets
-      );
-
-  // --
-  // Params and memory allocation
-  vertex_t n_vertices = G.get_number_of_vertices();
-  thrust::device_vector<weight_t> p(n_seeds * n_vertices);
-
-  // --
-  // Run Personalized PR with NVBench
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-      gunrock::ppr::run_batch(G, n_seeds, p.data().get(), alpha, epsilon);
-  });
-}
diff --git a/benchmarks/algorithms/pr_bench.cu b/benchmarks/algorithms/pr_bench.cu
deleted file mode 100644
index 2f3b16e5..00000000
--- a/benchmarks/algorithms/pr_bench.cu
+++ /dev/null
@@ -1,74 +0,0 @@
-#ifndef BENCH_INCLUDES
-#define BENCH_INCLUDES
-#include <gunrock/graph/graph.hxx>
-#include <gunrock/formats/formats.hxx>
-#include <gunrock/cuda/cuda.hxx>
-#include <nvbench/nvbench.cuh>
-#include <gunrock/algorithms/algorithms.hxx>
-#include <cxxopts.hpp>
-#endif
-
-#include <gunrock/algorithms/pr.hxx>
-
-using namespace gunrock;
-using namespace memory;
-
-void pr_bench(nvbench::state& state) {
-  // Add metrics.
-  state.collect_dram_throughput();
-  state.collect_l1_hit_rates();
-  state.collect_l2_hit_rates();
-  state.collect_loads_efficiency();
-  state.collect_stores_efficiency();
-
-  // --
-  // Define types
-  using csr_t =
-      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
-
-  // --
-  // Build graph + metadata
-  csr_t csr;
-  if (util::is_market(filename)) {
-    io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
-    csr.from_coo(mm.load(filename));
-  } else if (util::is_binary_csr(filename)) {
-    csr.read_binary(filename);
-  } else {
-    std::cerr << "Unknown file format: " << filename << std::endl;
-    exit(1);
-  }
-
-  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
-  thrust::device_vector<vertex_t> column_indices(csr.number_of_nonzeros);
-  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
-
-  auto G =
-      graph::build::from_csr<memory_space_t::device,
-                             graph::view_t::csr /* | graph::view_t::csc */>(
-          csr.number_of_rows,               // rows
-          csr.number_of_columns,            // columns
-          csr.number_of_nonzeros,           // nonzeros
-          csr.row_offsets.data().get(),     // row_offsets
-          csr.column_indices.data().get(),  // column_indices
-          csr.nonzero_values.data().get(),  // values
-          row_indices.data().get(),         // row_indices
-          column_offsets.data().get()       // column_offsets
-      );
-
-  // --
-  // Params and memory allocation
-  srand(time(NULL));
-
-  weight_t alpha = 0.85;
-  weight_t tol = 1e-6;
-
-  vertex_t n_vertices = G.get_number_of_vertices();
-  thrust::device_vector<weight_t> p(n_vertices);
-
-  // --
-  // Run PR with NVBench
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    gunrock::pr::run(G, alpha, tol, p.data().get());
-  });
-}
diff --git a/benchmarks/algorithms/spmv_bench.cu b/benchmarks/algorithms/spmv_bench.cu
deleted file mode 100644
index d79f8470..00000000
--- a/benchmarks/algorithms/spmv_bench.cu
+++ /dev/null
@@ -1,72 +0,0 @@
-#ifndef BENCH_INCLUDES
-#define BENCH_INCLUDES
-#include <gunrock/graph/graph.hxx>
-#include <gunrock/formats/formats.hxx>
-#include <gunrock/cuda/cuda.hxx>
-#include <nvbench/nvbench.cuh>
-#include <gunrock/algorithms/algorithms.hxx>
-#include <cxxopts.hpp>
-#endif
-
-#include <gunrock/algorithms/spmv.hxx>
-
-using namespace gunrock;
-using namespace memory;
-
-void spmv_bench(nvbench::state& state) {
-  // Add metrics.
-  state.collect_dram_throughput();
-  state.collect_l1_hit_rates();
-  state.collect_l2_hit_rates();
-  state.collect_loads_efficiency();
-  state.collect_stores_efficiency();
-
-  // --
-  // Define types
-  using csr_t =
-      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
-
-  // --
-  // Build graph + metadata
-  csr_t csr;
-  if (util::is_market(filename)) {
-    io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
-    csr.from_coo(mm.load(filename));
-  } else if (util::is_binary_csr(filename)) {
-    csr.read_binary(filename);
-  } else {
-    std::cerr << "Unknown file format: " << filename << std::endl;
-    exit(1);
-  }
-
-  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
-  thrust::device_vector<vertex_t> column_indices(csr.number_of_nonzeros);
-  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
-
-  auto G =
-      graph::build::from_csr<memory_space_t::device,
-                             graph::view_t::csr /* | graph::view_t::csc */>(
-          csr.number_of_rows,               // rows
-          csr.number_of_columns,            // columns
-          csr.number_of_nonzeros,           // nonzeros
-          csr.row_offsets.data().get(),     // row_offsets
-          csr.column_indices.data().get(),  // column_indices
-          csr.nonzero_values.data().get(),  // values
-          row_indices.data().get(),         // row_indices
-          column_offsets.data().get()       // column_offsets
-      );
-
-  // --
-  // Params and memory allocation
-  vertex_t n_vertices = G.get_number_of_vertices();
-  thrust::device_vector<weight_t> x(n_vertices);
-  thrust::device_vector<weight_t> y(n_vertices);
-
-  gunrock::generate::random::uniform_distribution(x);
-
-  // --
-  // Run SPMV with NVBench
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    gunrock::spmv::run(G, x.data().get(), y.data().get());
-  });
-}
diff --git a/benchmarks/algorithms/sssp_bench.cu b/benchmarks/algorithms/sssp_bench.cu
deleted file mode 100644
index 9b4d4344..00000000
--- a/benchmarks/algorithms/sssp_bench.cu
+++ /dev/null
@@ -1,75 +0,0 @@
-#ifndef BENCH_INCLUDES
-#define BENCH_INCLUDES
-#include <gunrock/graph/graph.hxx>
-#include <gunrock/formats/formats.hxx>
-#include <gunrock/cuda/cuda.hxx>
-#include <nvbench/nvbench.cuh>
-#include <gunrock/algorithms/algorithms.hxx>
-#include <cxxopts.hpp>
-#endif
-
-#include <gunrock/algorithms/sssp.hxx>
-
-using namespace gunrock;
-using namespace memory;
-
-void sssp_bench(nvbench::state& state) {
-  // Add metrics.
-  state.collect_dram_throughput();
-  state.collect_l1_hit_rates();
-  state.collect_l2_hit_rates();
-  state.collect_loads_efficiency();
-  state.collect_stores_efficiency();
-
-  // --
-  // Define types
-  using csr_t =
-      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
-
-  // --
-  // Build graph + metadata
-  csr_t csr;
-  if (util::is_market(filename)) {
-    io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
-    csr.from_coo(mm.load(filename));
-  } else if (util::is_binary_csr(filename)) {
-    csr.read_binary(filename);
-  } else {
-    std::cerr << "Unknown file format: " << filename << std::endl;
-    exit(1);
-  }
-
-  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
-  thrust::device_vector<vertex_t> column_indices(csr.number_of_nonzeros);
-  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
-
-  auto G =
-      graph::build::from_csr<memory_space_t::device,
-                             graph::view_t::csr /* | graph::view_t::csc */>(
-          csr.number_of_rows,               // rows
-          csr.number_of_columns,            // columns
-          csr.number_of_nonzeros,           // nonzeros
-          csr.row_offsets.data().get(),     // row_offsets
-          csr.column_indices.data().get(),  // column_indices
-          csr.nonzero_values.data().get(),  // values
-          row_indices.data().get(),         // row_indices
-          column_offsets.data().get()       // column_offsets
-      );
-
-  // --
-  // Params and memory allocation
-  srand(time(NULL));
-
-  vertex_t n_vertices = G.get_number_of_vertices();
-  vertex_t single_source = 0;  // rand() % n_vertices;
-
-  thrust::device_vector<weight_t> distances(n_vertices);
-  thrust::device_vector<vertex_t> predecessors(n_vertices);
-
-  // --
-  // Run SSSP with NVBench
-  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    gunrock::sssp::run(G, single_source, distances.data().get(),
-                       predecessors.data().get());
-  });
-}
diff --git a/benchmarks/bc_bench.cu b/benchmarks/bc_bench.cu
new file mode 100644
index 00000000..6091db48
--- /dev/null
+++ b/benchmarks/bc_bench.cu
@@ -0,0 +1,129 @@
+#include <nvbench/nvbench.cuh>
+#include <cxxopts.hpp>
+#include <gunrock/algorithms/algorithms.hxx>
+#include <gunrock/algorithms/bc.hxx>
+
+using namespace gunrock;
+using namespace memory;
+
+using vertex_t = int;
+using edge_t = int;
+using weight_t = float;
+
+std::string filename;
+
+struct parameters_t {
+  std::string filename;  // Value of -m/--market; left empty when help is set.
+  bool help = false;     // True when -h/--help was given.
+  cxxopts::Options options;
+
+  /**
+   * @brief Construct a new parameters object and parse command line arguments.
+   * Without --help, a missing or rejected --market prints usage and exits.
+   * @param argc Number of command line arguments.
+   * @param argv Command line arguments.
+   */
+  parameters_t(int argc, char** argv) : options(argv[0], "BC Benchmarking") {
+    options.allow_unrecognised_options();  // main() forwards the rest.
+    // Add command line options
+    options.add_options()("h,help", "Print help")  // help
+        ("m,market", "Matrix file",
+         cxxopts::value<std::string>());  // mtx
+
+    // Parse command line arguments
+    auto result = options.parse(argc, argv);
+
+    if (result.count("help")) {
+      help = true;
+      std::cout << options.help({""});
+      std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+      // Do not exit so we also print NVBench help.
+    } else {
+      if (result.count("market") == 1) {  // Given exactly once.
+        filename = result["market"].as<std::string>();
+        if (!util::is_market(filename)) {
+          std::cout << options.help({""});
+          std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+          std::exit(0);  // NOTE(review): rejected input still exits with 0.
+        }
+      } else {
+        std::cout << options.help({""});
+        std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+        std::exit(0);  // --market absent or given more than once.
+      }
+    }
+  }
+};
+
+void bc_bench(nvbench::state& state) {
+  // NVBench body for BC. First enable the additional metric collectors.
+  state.collect_dram_throughput();
+  state.collect_l1_hit_rates();
+  state.collect_l2_hit_rates();
+  state.collect_loads_efficiency();
+  state.collect_stores_efficiency();
+
+  // --
+  // Define types
+  using csr_t =
+      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
+
+  // --
+  // Build graph + metadata: load the file named by the global `filename`.
+  csr_t csr;
+  io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
+  csr.from_coo(mm.load(filename));
+  // NOTE(review): column_indices below is not used anywhere in this function.
+  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
+  thrust::device_vector<vertex_t> column_indices(csr.number_of_nonzeros);
+  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
+
+  auto G = graph::build::from_csr<memory_space_t::device,
+                                  graph::view_t::csr>(
+      csr.number_of_rows,               // rows
+      csr.number_of_columns,            // columns
+      csr.number_of_nonzeros,           // nonzeros
+      csr.row_offsets.data().get(),     // row_offsets
+      csr.column_indices.data().get(),  // column_indices
+      csr.nonzero_values.data().get(),  // values
+      row_indices.data().get(),         // row_indices
+      column_offsets.data().get()       // column_offsets
+  );
+
+  // --
+  // Params and memory allocation: one output value per vertex.
+  vertex_t n_vertices = G.get_number_of_vertices();
+  thrust::device_vector<weight_t> bc_values(n_vertices);
+
+  // --
+  // Run BC with NVBench; all setup above stays outside the exec callback.
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    gunrock::bc::run(G, bc_values.data().get());
+  });
+}
+
+// Entry point: parse our own options, then hand the remaining args to NVBench.
+int main(int argc, char** argv) {
+  parameters_t params(argc, argv);
+  filename = params.filename;
+
+  if (params.help) {
+    // Print NVBench help.
+    const char* args[1] = {"-h"};
+    NVBENCH_MAIN_BODY(1, args);
+  } else {
+    // Copy argv minus the matrix option; sized by argc, j is the real count.
+    char* args[argc];
+    int j = 0;
+    for (int i = 0; i < argc; i++) {
+      if (strcmp(argv[i], "--market") == 0 || strcmp(argv[i], "-m") == 0) {
+        i++;  // Also skip the option's value.
+        continue;
+      }
+      args[j++] = argv[i];
+    }
+
+    NVBENCH_BENCH(bc_bench);
+    NVBENCH_MAIN_BODY(j, args);
+  }
+}
diff --git a/benchmarks/bench.cu b/benchmarks/bench.cu
deleted file mode 100644
index d6063345..00000000
--- a/benchmarks/bench.cu
+++ /dev/null
@@ -1,108 +0,0 @@
-std::string filename;
-
-#include "algorithms/mst_bench.cu"
-#include "algorithms/bfs_bench.cu"
-#include "algorithms/bc_bench.cu"
-#include "algorithms/color_bench.cu"
-#include "algorithms/kcore_bench.cu"
-#include "algorithms/ppr_bench.cu"
-#include "algorithms/pr_bench.cu"
-#include "algorithms/spmv_bench.cu"
-#include "algorithms/sssp_bench.cu"
-
-std::vector<std::string> benchmarks;
-struct parameters_t {
-  std::string filename;
-  std::string benchmark;
-  bool help = false;
-  cxxopts::Options options;
-
-  /**
-   * @brief Construct a new parameters object and parse command line arguments.
-   *
-   * @param argc Number of command line arguments.
-   * @param argv Command line arguments.
-   */
-  parameters_t(int argc, char** argv)
-      : options(argv[0], "Algorithm Benchmarks") {
-    options.allow_unrecognised_options();
-    // Add command line options
-    options.add_options()("h,help", "Print help")  // help
-        ("m,market", "Matrix file (required)",
-         cxxopts::value<std::string>())  // mtx
-        ("b,benchmark", "Benchmark name (optional)",
-         cxxopts::value<std::string>());  // benchmark
-
-    // Parse command line arguments
-    auto result = options.parse(argc, argv);
-
-    if (result.count("help")) {
-      help = true;
-      std::cout << options.help({""});
-      std::cout << "  [optional nvbench args]" << std::endl << std::endl;
-    } else {
-      if (result.count("market") == 1) {
-        filename = result["market"].as<std::string>();
-        if (util::is_market(filename)) {
-        } else {
-          std::cout << options.help({""});
-          std::cout << "  [optional nvbench args]" << std::endl << std::endl;
-          std::exit(0);
-        }
-      } else {
-        std::cout << options.help({""});
-        std::cout << "  [optional nvbench args]" << std::endl << std::endl;
-        std::exit(0);
-      }
-
-      if (result.count("benchmark") == 1) {
-        benchmark = result["benchmark"].as<std::string>();
-        if (std::find(benchmarks.begin(), benchmarks.end(), benchmark) ==
-            benchmarks.end()) {
-          std::cout << "Error: invalid benchmark" << std::endl;
-          std::exit(0);
-        }
-      } else {
-        benchmark = "all";
-      }
-    }
-  }
-};
-
-int main(int argc, char** argv) {
-  benchmarks = {"mst_bench",   "bfs_bench",   "bc_bench",
-                "color_bench", "kcore_bench", "ppr_bench",
-                "pr_bench",    "spmv_bench",  "sssp_bench"};
-
-  parameters_t params(argc, argv);
-  filename = params.filename;
-  std::string benchmark = params.benchmark;
-
-  if (params.help) {
-    const char* args[1] = {"-h"};
-    NVBENCH_MAIN_BODY(1, args);
-  } else {
-    // Create a new argument array without matrix filename to pass to NVBench.
-    char* args[argc - 2];
-    int j = 0;
-    for (int i = 0; i < argc; i++) {
-      if (strcmp(argv[i], "--market") == 0 || strcmp(argv[i], "-m") == 0) {
-        i++;
-        continue;
-      }
-      args[j] = argv[i];
-      j++;
-    }
-
-    NVBENCH_BENCH(mst_bench);
-    NVBENCH_BENCH(bfs_bench);
-    NVBENCH_BENCH(bc_bench);
-    NVBENCH_BENCH(color_bench);
-    NVBENCH_BENCH(kcore_bench);
-    NVBENCH_BENCH(ppr_bench);
-    NVBENCH_BENCH(pr_bench);
-    NVBENCH_BENCH(spmv_bench);
-    NVBENCH_BENCH(sssp_bench);
-    NVBENCH_MAIN_BODY(argc - 2, args);
-  }
-}
\ No newline at end of file
diff --git a/benchmarks/bfs_bench.cu b/benchmarks/bfs_bench.cu
new file mode 100644
index 00000000..8296fa69
--- /dev/null
+++ b/benchmarks/bfs_bench.cu
@@ -0,0 +1,137 @@
+#include <nvbench/nvbench.cuh>
+#include <cxxopts.hpp>
+#include <gunrock/algorithms/algorithms.hxx>
+#include <gunrock/algorithms/bfs.hxx>
+
+using namespace gunrock;
+using namespace memory;
+
+using vertex_t = int;
+using edge_t = int;
+using weight_t = float;
+
+std::string filename;
+
+struct parameters_t {
+  std::string filename;  // Value of -m/--market; left empty when help is set.
+  bool help = false;     // True when -h/--help was given.
+  cxxopts::Options options;
+
+  /**
+   * @brief Construct a new parameters object and parse command line arguments.
+   * Without --help, a missing or rejected --market prints usage and exits.
+   * @param argc Number of command line arguments.
+   * @param argv Command line arguments.
+   */
+  parameters_t(int argc, char** argv) : options(argv[0], "BFS Benchmarking") {
+    options.allow_unrecognised_options();  // main() forwards the rest.
+    // Add command line options
+    options.add_options()("h,help", "Print help")  // help
+        ("m,market", "Matrix file",
+         cxxopts::value<std::string>());  // mtx
+
+    // Parse command line arguments
+    auto result = options.parse(argc, argv);
+
+    if (result.count("help")) {
+      help = true;
+      std::cout << options.help({""});
+      std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+      // Do not exit so we also print NVBench help.
+    } else {
+      if (result.count("market") == 1) {  // Given exactly once.
+        filename = result["market"].as<std::string>();
+        if (!util::is_market(filename)) {
+          std::cout << options.help({""});
+          std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+          std::exit(0);  // NOTE(review): rejected input still exits with 0.
+        }
+      } else {
+        std::cout << options.help({""});
+        std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+        std::exit(0);  // --market absent or given more than once.
+      }
+    }
+  }
+};
+
+void bfs_bench(nvbench::state& state) {
+  // NVBench body for BFS. First enable the additional metric collectors.
+  state.collect_dram_throughput();
+  state.collect_l1_hit_rates();
+  state.collect_l2_hit_rates();
+  state.collect_loads_efficiency();
+  state.collect_stores_efficiency();
+
+  // --
+  // Define types
+  using csr_t =
+      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
+
+  // --
+  // IO: load the file named by the global `filename` and convert it to CSR.
+  csr_t csr;
+
+  io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
+  csr.from_coo(mm.load(filename));
+  // NOTE(review): column_indices below is not used anywhere in this function.
+  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
+  thrust::device_vector<vertex_t> column_indices(csr.number_of_nonzeros);
+  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
+
+  // --
+  // Build graph + metadata
+  auto G = graph::build::from_csr<memory_space_t::device,
+                                  graph::view_t::csr>(
+      csr.number_of_rows,               // rows
+      csr.number_of_columns,            // columns
+      csr.number_of_nonzeros,           // nonzeros
+      csr.row_offsets.data().get(),     // row_offsets
+      csr.column_indices.data().get(),  // column_indices
+      csr.nonzero_values.data().get(),  // values
+      row_indices.data().get(),         // row_indices
+      column_offsets.data().get()       // column_offsets
+  );
+
+  // --
+  // Params and memory allocation
+  // The source vertex is fixed at 0; both outputs hold one entry per vertex.
+  vertex_t single_source = 0;
+
+  vertex_t n_vertices = G.get_number_of_vertices();
+  thrust::device_vector<vertex_t> distances(n_vertices);
+  thrust::device_vector<vertex_t> predecessors(n_vertices);
+
+  // --
+  // Run BFS with NVBench; all setup above stays outside the exec callback.
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    gunrock::bfs::run(G, single_source, distances.data().get(),
+                      predecessors.data().get());
+  });
+}
+
+// Entry point: parse our own options, then hand the remaining args to NVBench.
+int main(int argc, char** argv) {
+  parameters_t params(argc, argv);
+  filename = params.filename;
+
+  if (params.help) {
+    // Print NVBench help.
+    const char* args[1] = {"-h"};
+    NVBENCH_MAIN_BODY(1, args);
+  } else {
+    // Copy argv minus the matrix option; sized by argc, j is the real count.
+    char* args[argc];
+    int j = 0;
+    for (int i = 0; i < argc; i++) {
+      if (strcmp(argv[i], "--market") == 0 || strcmp(argv[i], "-m") == 0) {
+        i++;  // Also skip the option's value.
+        continue;
+      }
+      args[j++] = argv[i];
+    }
+
+    NVBENCH_BENCH(bfs_bench);
+    NVBENCH_MAIN_BODY(j, args);
+  }
+}
diff --git a/benchmarks/color_bench.cu b/benchmarks/color_bench.cu
new file mode 100644
index 00000000..5177c369
--- /dev/null
+++ b/benchmarks/color_bench.cu
@@ -0,0 +1,130 @@
+#include <nvbench/nvbench.cuh>
+#include <cxxopts.hpp>
+#include <gunrock/algorithms/algorithms.hxx>
+#include <gunrock/algorithms/color.hxx>
+
+using namespace gunrock;
+using namespace memory;
+
+using vertex_t = int;
+using edge_t = int;
+using weight_t = float;
+
+std::string filename;
+
+struct parameters_t {
+  std::string filename;  // Value of -m/--market; left empty when help is set.
+  bool help = false;     // True when -h/--help was given.
+  cxxopts::Options options;
+
+  /**
+   * @brief Construct a new parameters object and parse command line arguments.
+   * Without --help, a missing or rejected --market prints usage and exits.
+   * @param argc Number of command line arguments.
+   * @param argv Command line arguments.
+   */
+  parameters_t(int argc, char** argv)
+      : options(argv[0], "Graph Coloring Benchmarking") {
+    options.allow_unrecognised_options();  // main() forwards the rest.
+    // Add command line options
+    options.add_options()("h,help", "Print help")  // help
+        ("m,market", "Matrix file",
+         cxxopts::value<std::string>());  // mtx
+
+    // Parse command line arguments
+    auto result = options.parse(argc, argv);
+
+    if (result.count("help")) {
+      help = true;
+      std::cout << options.help({""});
+      std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+      // Do not exit so we also print NVBench help.
+    } else {
+      if (result.count("market") == 1) {  // Given exactly once.
+        filename = result["market"].as<std::string>();
+        if (!util::is_market(filename)) {
+          std::cout << options.help({""});
+          std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+          std::exit(0);  // NOTE(review): rejected input still exits with 0.
+        }
+      } else {
+        std::cout << options.help({""});
+        std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+        std::exit(0);  // --market absent or given more than once.
+      }
+    }
+  }
+};
+
+void color_bench(nvbench::state& state) {
+  // NVBench body for graph coloring. First enable the metric collectors.
+  state.collect_dram_throughput();
+  state.collect_l1_hit_rates();
+  state.collect_l2_hit_rates();
+  state.collect_loads_efficiency();
+  state.collect_stores_efficiency();
+
+  // --
+  // Define types
+  using csr_t =
+      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
+
+  // --
+  // Build graph + metadata: load the file named by the global `filename`.
+  csr_t csr;
+  io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
+  csr.from_coo(mm.load(filename));
+  // NOTE(review): column_indices below is not used anywhere in this function.
+  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
+  thrust::device_vector<vertex_t> column_indices(csr.number_of_nonzeros);
+  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
+
+  auto G = graph::build::from_csr<memory_space_t::device,
+                                  graph::view_t::csr>(
+      csr.number_of_rows,               // rows
+      csr.number_of_columns,            // columns
+      csr.number_of_nonzeros,           // nonzeros
+      csr.row_offsets.data().get(),     // row_offsets
+      csr.column_indices.data().get(),  // column_indices
+      csr.nonzero_values.data().get(),  // values
+      row_indices.data().get(),         // row_indices
+      column_offsets.data().get()       // column_offsets
+  );
+
+  // --
+  // Params and memory allocation: one color slot per vertex.
+  vertex_t n_vertices = G.get_number_of_vertices();
+  thrust::device_vector<vertex_t> colors(n_vertices);
+
+  // --
+  // Run Graph Coloring with NVBench; setup stays outside the exec callback.
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    gunrock::color::run(G, colors.data().get());
+  });
+}
+
+// Entry point: parse our own options, then hand the remaining args to NVBench.
+int main(int argc, char** argv) {
+  parameters_t params(argc, argv);
+  filename = params.filename;
+
+  if (params.help) {
+    // Print NVBench help.
+    const char* args[1] = {"-h"};
+    NVBENCH_MAIN_BODY(1, args);
+  } else {
+    // Copy argv minus the matrix option; sized by argc, j is the real count.
+    char* args[argc];
+    int j = 0;
+    for (int i = 0; i < argc; i++) {
+      if (strcmp(argv[i], "--market") == 0 || strcmp(argv[i], "-m") == 0) {
+        i++;  // Also skip the option's value.
+        continue;
+      }
+      args[j++] = argv[i];
+    }
+
+    NVBENCH_BENCH(color_bench);
+    NVBENCH_MAIN_BODY(j, args);
+  }
+}
diff --git a/benchmarks/geo_bench.cu b/benchmarks/geo_bench.cu
new file mode 100644
index 00000000..23b556e7
--- /dev/null
+++ b/benchmarks/geo_bench.cu
@@ -0,0 +1,255 @@
+#include <nvbench/nvbench.cuh>
+#include <cxxopts.hpp>
+#include <gunrock/algorithms/algorithms.hxx>
+#include <gunrock/algorithms/geo.hxx>
+
+using namespace gunrock;
+using namespace memory;
+
+using vertex_t = int;
+using edge_t = int;
+using weight_t = float;
+
+std::string matrix_filename;
+std::string coordinates_filename;
+
+struct parameters_t {
+  std::string matrix_filename;       // Value of -m/--market.
+  std::string coordinates_filename;  // Value of -c/--coordinates.
+  bool help = false;                 // True when -h/--help was given.
+  cxxopts::Options options;
+
+  /**
+   * @brief Construct a new parameters object and parse command line arguments.
+   * Without --help, both --market and --coordinates must be given once each.
+   * @param argc Number of command line arguments.
+   * @param argv Command line arguments.
+   */
+  parameters_t(int argc, char** argv) : options(argv[0], "Geo Benchmarking") {
+    options.allow_unrecognised_options();
+    // Add command line options
+    options.add_options()("h,help", "Print help")  // help
+        ("m,market", "Matrix file",
+         cxxopts::value<std::string>())  // mtx
+        ("c,coordinates", "Coordinates file",
+         cxxopts::value<std::string>());  // coords
+
+    // Parse command line arguments
+    auto result = options.parse(argc, argv);
+
+    if (result.count("help")) {
+      help = true;
+      std::cout << options.help({""});
+      std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+      // Do not exit so we also print NVBench help.
+    } else {
+      if (result.count("market") == 1) {
+        matrix_filename = result["market"].as<std::string>();
+        if (!util::is_market(matrix_filename)) {
+          std::cout << options.help({""});
+          std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+          std::exit(0);
+        }
+      } else {
+        std::cout << options.help({""});
+        std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+        std::exit(0);
+      }
+      if (result.count("coordinates") == 1) {
+        coordinates_filename = result["coordinates"].as<std::string>();
+      } else {
+        std::cout << options.help({""});
+        std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+        std::exit(0);
+      }
+    }
+  }
+};
+
+/**
+ * @brief Reads a coordinates file into a dense array of coordinates.
+ *
+ * Here is an example of the labels file
+ * +-------------------------+
+ * |%%Labels Formatted File  | <-- header
+ * |%                        | <-+
+ * |% comments               |   |-- comments
+ * |%                        | <-+
+ * |  N L L                  | <-- num_nodes, num_labels, num_labels
+ * |  I0 L1A L1B             | <-- node id, latitude, longitude
+ * |  I4                     | <-- coordinates missing, populated as invalids
+ * |  I5 L5A L5B             |
+ * |  . . .                  |
+ * |  IN LNA LNB             |
+ * +-------------------------+
+ *
+ * @note Node ID (first column) must be 0-based.
+ * @note If a Node ID is present but coordinates are missing,
+ *       the coordinates are filled as invalids.
+ * @note Node IDs that do not appear in the file are left untouched, so
+ *       the caller should pre-fill the array with invalids.
+ * @note Exits the process if the file cannot be opened, a line is
+ *       malformed, or no valid coordinates are read.
+ * @note Lines longer than 1023 characters are split and parsed as
+ *       several lines.
+ *
+ * @param filename Path of the coordinates file to read.
+ * @param coordinates Output array indexed by node id; the caller must size
+ *        it to hold every node id that appears in the file.
+ */
+void read_coordinates_file(std::string filename,
+                           geo::coordinates_t* coordinates) {
+  FILE* f_in = fopen(filename.c_str(), "r");
+  if (f_in == nullptr) {
+    // Reading from a null stream would crash; report the path instead.
+    std::cerr << "Error: Unable to open coordinates file: " << filename
+              << std::endl;
+    exit(1);
+  }
+
+  // Invalid until the problem-description line is seen, then a counter.
+  int labels_read = gunrock::numeric_limits<int>::invalid();
+  char line[1024];
+
+  // The field width keeps an over-long line from overflowing `line`.
+  while (fscanf(f_in, "%1023[^\n]\n", line) > 0) {
+    if (line[0] == '%') {
+      // Comment ("%") or header ("%%") line: nothing to extract.
+      continue;
+    }
+
+    if (!gunrock::util::limits::is_valid(labels_read)) {
+      // Problem description -> first line with nodes and labels info.
+      // Its values are not used; it only marks the start of the data.
+      labels_read = 0;
+      continue;
+    }
+
+    // Now we can start storing labels.
+    long long ll_node;  // Active node
+
+    // These stay invalid unless sscanf assigns them.
+    float lf_label_a = gunrock::numeric_limits<float>::invalid();
+    float lf_label_b = gunrock::numeric_limits<float>::invalid();
+
+    int num_input =
+        sscanf(line, "%lld %f %f", &ll_node, &lf_label_a, &lf_label_b);
+
+    // Accept "id" or "id latitude longitude"; anything else, or a negative
+    // id (which would index before the array), is an error.
+    if ((num_input != 1 && num_input != 3) || ll_node < 0) {
+      std::cerr << "Invalid coordinates file format." << std::endl;
+      exit(1);
+    }
+
+    if (num_input == 3) {
+      // Only complete lines count as valid coordinates.
+      labels_read++;
+    }
+
+    // XXX: Make sure these are 0-based;
+    coordinates[ll_node].latitude = lf_label_a;
+    coordinates[ll_node].longitude = lf_label_b;
+  }  // -> while
+
+  fclose(f_in);
+
+  // Check validity first so the "no problem line seen" sentinel is not
+  // reported as a count.
+  if (gunrock::util::limits::is_valid(labels_read) && labels_read > 0) {
+    std::cout << "Valid coordinates read: " << labels_read << std::endl;
+  } else {
+    std::cerr << "Error: No coordinates read." << std::endl;
+    exit(1);
+  }
+}
+
+void geo_bench(nvbench::state& state) {
+  // NVBench body for Geo. First enable the additional metric collectors.
+  state.collect_dram_throughput();
+  state.collect_l1_hit_rates();
+  state.collect_l2_hit_rates();
+  state.collect_loads_efficiency();
+  state.collect_stores_efficiency();
+
+  // --
+  // Define types
+  using csr_t =
+      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
+
+  // --
+  // Build graph + metadata: load the global `matrix_filename`.
+  csr_t csr;
+  io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
+  csr.from_coo(mm.load(matrix_filename));
+  // NOTE(review): column_indices below is not used anywhere in this function.
+  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
+  thrust::device_vector<vertex_t> column_indices(csr.number_of_nonzeros);
+  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
+
+  auto G = graph::build::from_csr<memory_space_t::device,
+                                  graph::view_t::csr>(
+      csr.number_of_rows,               // rows
+      csr.number_of_columns,            // columns
+      csr.number_of_nonzeros,           // nonzeros
+      csr.row_offsets.data().get(),     // row_offsets
+      csr.column_indices.data().get(),  // column_indices
+      csr.nonzero_values.data().get(),  // values
+      row_indices.data().get(),         // row_indices
+      column_offsets.data().get()       // column_offsets
+  );
+
+  // --
+  // Params and memory allocation
+  // Iteration counts are fixed here and passed to geo::run below.
+  unsigned int spatial_iterations = 1000;
+  unsigned int total_iterations = 10;
+
+  vertex_t n_vertices = G.get_number_of_vertices();
+
+  // Coordinates: Latitude/Longitude, both starting out as invalid.
+  geo::coordinates_t default_invalid;
+  default_invalid.latitude = gunrock::numeric_limits<float>::invalid();
+  default_invalid.longitude = gunrock::numeric_limits<float>::invalid();
+  // Pre-fill every vertex with invalids, read the file, then copy to device.
+  thrust::host_vector<geo::coordinates_t> load_coordinates(n_vertices,
+                                                           default_invalid);
+  read_coordinates_file(coordinates_filename, load_coordinates.data());
+  thrust::device_vector<geo::coordinates_t> coordinates(load_coordinates);
+
+  // --
+  // Run Geo with NVBench; all setup above stays outside the exec callback.
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    gunrock::geo::run(G, coordinates.data().get(), total_iterations,
+                      spatial_iterations);
+  });
+}
+
+// Entry point: parse our own options, then hand the remaining args to NVBench.
+int main(int argc, char** argv) {
+  parameters_t params(argc, argv);
+  matrix_filename = params.matrix_filename;
+  coordinates_filename = params.coordinates_filename;
+
+  if (params.help) {
+    // Print NVBench help.
+    const char* args[1] = {"-h"};
+    NVBENCH_MAIN_BODY(1, args);
+  } else {
+    // Create a new argument array without matrix and coordinate filenames to
+    // pass to NVBench. Sized by argc (an upper bound); j is the real count.
+    char* args[argc];
+    int j = 0;
+    for (int i = 0; i < argc; i++) {
+      if (strcmp(argv[i], "--market") == 0 || strcmp(argv[i], "-m") == 0 ||
+          strcmp(argv[i], "--coordinates") == 0 || strcmp(argv[i], "-c") == 0) {
+        i++;  // Also skip the option's value.
+        continue;
+      }
+      args[j++] = argv[i];
+    }
+
+    NVBENCH_BENCH(geo_bench);
+    NVBENCH_MAIN_BODY(j, args);
+  }
+}
diff --git a/benchmarks/hits_bench.cu b/benchmarks/hits_bench.cu
new file mode 100644
index 00000000..06696703
--- /dev/null
+++ b/benchmarks/hits_bench.cu
@@ -0,0 +1,129 @@
+#include <nvbench/nvbench.cuh>
+#include <cxxopts.hpp>
+#include <gunrock/algorithms/algorithms.hxx>
+#include <gunrock/algorithms/hits.hxx>
+
+using namespace gunrock;
+using namespace memory;
+
+using vertex_t = int;
+using edge_t = int;
+using weight_t = float;
+
+std::string filename;
+
+struct parameters_t {
+  std::string filename;
+  bool help = false;
+  cxxopts::Options options;
+
+  /**
+   * @brief Parse the command line. `--help` sets `help`; otherwise exactly
+   * one `--market` value accepted by util::is_market is kept in `filename`.
+   *
+   * @param argc Number of command line arguments.
+   * @param argv Command line arguments.
+   */
+  parameters_t(int argc, char** argv) : options(argv[0], "HITS Benchmarking") {
+    // Unknown options are tolerated; main() hands them on to NVBench.
+    options.allow_unrecognised_options();
+    options.add_options()("h,help", "Print help")  // help
+        ("m,market", "Matrix file",
+         cxxopts::value<std::string>());  // mtx
+
+    const auto parsed = options.parse(argc, argv);
+
+    // One usage banner shared by the help request and both failure paths.
+    const auto print_usage = [this]() {
+      std::cout << options.help({""});
+      std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+    };
+
+    if (parsed.count("help")) {
+      // No exit here, so that main() can print the NVBench help as well.
+      help = true;
+      print_usage();
+      return;
+    }
+    if (parsed.count("market") == 1)
+      filename = parsed["market"].as<std::string>();
+    // Missing/repeated option or rejected filename: print usage and quit.
+    if (parsed.count("market") != 1 || !util::is_market(filename)) {
+      print_usage();
+      std::exit(0);
+    }
+  }
+};
+
+void hits_bench(nvbench::state& state) {
+  // Loads the matrix named by `filename`, builds a device CSR graph from it
+  // and passes hits::run to state.exec. Metric collection is enabled first.
+  state.collect_dram_throughput();
+  state.collect_l1_hit_rates();
+  state.collect_l2_hit_rates();
+  state.collect_loads_efficiency();
+  state.collect_stores_efficiency();
+
+  // --
+  // Define types
+  using csr_t =
+      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
+
+  // --
+  // Build graph + metadata
+  csr_t csr;
+  io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
+  csr.from_coo(mm.load(filename));
+
+  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
+  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
+
+  auto G = graph::build::from_csr<memory_space_t::device,
+                                  graph::view_t::csr>(
+      csr.number_of_rows,               // rows
+      csr.number_of_columns,            // columns
+      csr.number_of_nonzeros,           // nonzeros
+      csr.row_offsets.data().get(),     // row_offsets
+      csr.column_indices.data().get(),  // column_indices
+      csr.nonzero_values.data().get(),  // values
+      row_indices.data().get(),         // row_indices
+      column_offsets.data().get()       // column_offsets
+  );
+
+  // --
+  // Params and memory allocation
+  hits::param_c param{20};
+  hits::result_c result{G};
+
+  // --
+  // Run HITS with NVBench
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    gunrock::hits::run(G, param, result);
+  });
+}
+
+int main(int argc, char** argv) {
+  parameters_t params(argc, argv);
+  filename = params.filename;
+
+  if (params.help) {
+    // Print NVBench help.
+    const char* args[1] = {"-h"};
+    NVBENCH_MAIN_BODY(1, args);
+  } else {
+    // Forward everything except the matrix option and its value to NVBench.
+    char* args[argc];  // argc slots, so no option spelling can overrun it
+    int j = 0;
+    for (int i = 0; i < argc; i++) {
+      if (strcmp(argv[i], "--market") == 0 || strcmp(argv[i], "-m") == 0) {
+        i++;  // the next argument is the filename
+        continue;
+      }
+      if (strncmp(argv[i], "--market=", 9) == 0) continue;  // --market=<file>
+      args[j++] = argv[i];
+    }
+
+    NVBENCH_BENCH(hits_bench);
+    NVBENCH_MAIN_BODY(j, args);  // j = number of arguments actually kept
+  }
+}
diff --git a/benchmarks/kcore_bench.cu b/benchmarks/kcore_bench.cu
new file mode 100644
index 00000000..7fe57fa8
--- /dev/null
+++ b/benchmarks/kcore_bench.cu
@@ -0,0 +1,130 @@
+#include <nvbench/nvbench.cuh>
+#include <cxxopts.hpp>
+#include <gunrock/algorithms/algorithms.hxx>
+#include <gunrock/algorithms/kcore.hxx>
+
+using namespace gunrock;
+using namespace memory;
+
+using vertex_t = int;
+using edge_t = int;
+using weight_t = float;
+
+std::string filename;
+
+struct parameters_t {
+  std::string filename;
+  bool help = false;
+  cxxopts::Options options;
+
+  /**
+   * @brief Parse the command line. `--help` sets `help`; otherwise exactly
+   * one `--market` value accepted by util::is_market is kept in `filename`.
+   *
+   * @param argc Number of command line arguments.
+   * @param argv Command line arguments.
+   */
+  parameters_t(int argc, char** argv)
+      : options(argv[0], "K-Core Decomposition Benchmarking") {
+    // Unknown options are tolerated; main() hands them on to NVBench.
+    options.allow_unrecognised_options();
+    options.add_options()("h,help", "Print help")  // help
+        ("m,market", "Matrix file",
+         cxxopts::value<std::string>());  // mtx
+
+    const auto parsed = options.parse(argc, argv);
+
+    // One usage banner shared by the help request and both failure paths.
+    const auto print_usage = [this]() {
+      std::cout << options.help({""});
+      std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+    };
+
+    if (parsed.count("help")) {
+      // No exit here, so that main() can print the NVBench help as well.
+      help = true;
+      print_usage();
+      return;
+    }
+    if (parsed.count("market") == 1)
+      filename = parsed["market"].as<std::string>();
+    // Missing/repeated option or rejected filename: print usage and quit.
+    if (parsed.count("market") != 1 || !util::is_market(filename)) {
+      print_usage();
+      std::exit(0);
+    }
+  }
+};
+
+void kcore_bench(nvbench::state& state) {
+  // Loads the matrix named by `filename`, builds a device CSR graph from it
+  // and passes kcore::run to state.exec. Metric collection is enabled first.
+  state.collect_dram_throughput();
+  state.collect_l1_hit_rates();
+  state.collect_l2_hit_rates();
+  state.collect_loads_efficiency();
+  state.collect_stores_efficiency();
+
+  // --
+  // Define types
+  using csr_t =
+      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
+
+  // --
+  // Build graph + metadata
+  csr_t csr;
+  io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
+  csr.from_coo(mm.load(filename));
+
+  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
+  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
+
+  auto G = graph::build::from_csr<memory_space_t::device,
+                                  graph::view_t::csr>(
+      csr.number_of_rows,               // rows
+      csr.number_of_columns,            // columns
+      csr.number_of_nonzeros,           // nonzeros
+      csr.row_offsets.data().get(),     // row_offsets
+      csr.column_indices.data().get(),  // column_indices
+      csr.nonzero_values.data().get(),  // values
+      row_indices.data().get(),         // row_indices
+      column_offsets.data().get()       // column_offsets
+  );
+
+  // --
+  // Params and memory allocation
+  vertex_t n_vertices = G.get_number_of_vertices();
+  thrust::device_vector<int> k_cores(n_vertices);
+
+  // --
+  // Run K-Core Decomposition with NVBench
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    gunrock::kcore::run(G, k_cores.data().get());
+  });
+}
+
+int main(int argc, char** argv) {
+  parameters_t params(argc, argv);
+  filename = params.filename;
+
+  if (params.help) {
+    // Print NVBench help.
+    const char* args[1] = {"-h"};
+    NVBENCH_MAIN_BODY(1, args);
+  } else {
+    // Forward everything except the matrix option and its value to NVBench.
+    char* args[argc];  // argc slots, so no option spelling can overrun it
+    int j = 0;
+    for (int i = 0; i < argc; i++) {
+      if (strcmp(argv[i], "--market") == 0 || strcmp(argv[i], "-m") == 0) {
+        i++;  // the next argument is the filename
+        continue;
+      }
+      if (strncmp(argv[i], "--market=", 9) == 0) continue;  // --market=<file>
+      args[j++] = argv[i];
+    }
+
+    NVBENCH_BENCH(kcore_bench);
+    NVBENCH_MAIN_BODY(j, args);  // j = number of arguments actually kept
+  }
+}
diff --git a/benchmarks/mst_bench.cu b/benchmarks/mst_bench.cu
new file mode 100644
index 00000000..b4a3a18e
--- /dev/null
+++ b/benchmarks/mst_bench.cu
@@ -0,0 +1,124 @@
+#include <nvbench/nvbench.cuh>
+#include <cxxopts.hpp>
+#include <gunrock/algorithms/algorithms.hxx>
+#include <gunrock/algorithms/mst.hxx>
+
+using namespace gunrock;
+using namespace memory;
+
+std::string filename;
+
+struct parameters_t {
+  std::string filename;
+  bool help = false;
+  cxxopts::Options options;
+
+  /**
+   * @brief Parse the command line. `--help` sets `help`; otherwise exactly
+   * one `--market` value accepted by util::is_market is kept in `filename`.
+   *
+   * @param argc Number of command line arguments.
+   * @param argv Command line arguments.
+   */
+  parameters_t(int argc, char** argv) : options(argv[0], "MST Benchmarking") {
+    // Unknown options are tolerated; main() hands them on to NVBench.
+    options.allow_unrecognised_options();
+    options.add_options()("h,help", "Print help")  // help
+        ("m,market", "Matrix file",
+         cxxopts::value<std::string>());  // mtx
+
+    const auto parsed = options.parse(argc, argv);
+
+    // One usage banner shared by the help request and both failure paths.
+    const auto print_usage = [this]() {
+      std::cout << options.help({""});
+      std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+    };
+
+    if (parsed.count("help")) {
+      // No exit here, so that main() can print the NVBench help as well.
+      help = true;
+      print_usage();
+      return;
+    }
+    if (parsed.count("market") == 1)
+      filename = parsed["market"].as<std::string>();
+    // Missing/repeated option or rejected filename: print usage and quit.
+    if (parsed.count("market") != 1 || !util::is_market(filename)) {
+      print_usage();
+      std::exit(0);
+    }
+  }
+};
+
+void mst_bench(nvbench::state& state) {
+  // Loads the matrix named by `filename`, builds a device CSR graph from it
+  // and passes mst::run to state.exec. Metric collection is enabled first.
+  state.collect_dram_throughput();
+  state.collect_l1_hit_rates();
+  state.collect_l2_hit_rates();
+  state.collect_loads_efficiency();
+  state.collect_stores_efficiency();
+
+  // NOTE(review): vertex_t/edge_t/weight_t are not declared in this file;
+  // confirm that one of the included headers provides them.
+  using csr_t =
+      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
+
+  // --
+  // Build graph + metadata
+  csr_t csr;
+  io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
+  csr.from_coo(mm.load(filename));
+
+  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
+  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
+
+  auto G = graph::build::from_csr<memory_space_t::device,
+                                  graph::view_t::csr>(
+      csr.number_of_rows,               // rows
+      csr.number_of_columns,            // columns
+      csr.number_of_nonzeros,           // nonzeros
+      csr.row_offsets.data().get(),     // row_offsets
+      csr.column_indices.data().get(),  // column_indices
+      csr.nonzero_values.data().get(),  // values
+      row_indices.data().get(),         // row_indices
+      column_offsets.data().get()       // column_offsets
+  );
+
+  // --
+  // Params and memory allocation
+  thrust::device_vector<weight_t> mst_weight(1);
+
+  // --
+  // Run MST with NVBench
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    gunrock::mst::run(G, mst_weight.data().get());
+  });
+}
+
+int main(int argc, char** argv) {
+  parameters_t params(argc, argv);
+  filename = params.filename;
+
+  if (params.help) {
+    // Print NVBench help.
+    const char* args[1] = {"-h"};
+    NVBENCH_MAIN_BODY(1, args);
+  } else {
+    // Forward everything except the matrix option and its value to NVBench.
+    char* args[argc];  // argc slots, so no option spelling can overrun it
+    int j = 0;
+    for (int i = 0; i < argc; i++) {
+      if (strcmp(argv[i], "--market") == 0 || strcmp(argv[i], "-m") == 0) {
+        i++;  // the next argument is the filename
+        continue;
+      }
+      if (strncmp(argv[i], "--market=", 9) == 0) continue;  // --market=<file>
+      args[j++] = argv[i];
+    }
+
+    NVBENCH_BENCH(mst_bench);
+    NVBENCH_MAIN_BODY(j, args);  // j = number of arguments actually kept
+  }
+}
diff --git a/benchmarks/ppr_bench.cu b/benchmarks/ppr_bench.cu
new file mode 100644
index 00000000..963c8f57
--- /dev/null
+++ b/benchmarks/ppr_bench.cu
@@ -0,0 +1,137 @@
+#include <nvbench/nvbench.cuh>
+#include <cxxopts.hpp>
+#include <gunrock/algorithms/algorithms.hxx>
+#include <gunrock/algorithms/ppr.hxx>
+
+using namespace gunrock;
+using namespace memory;
+
+using vertex_t = int;
+using edge_t = int;
+using weight_t = float;
+
+std::string filename;
+
+struct parameters_t {
+  std::string filename;
+  bool help = false;
+  cxxopts::Options options;
+
+  /**
+   * @brief Parse the command line. `--help` sets `help`; otherwise exactly
+   * one `--market` value accepted by util::is_market is kept in `filename`.
+   *
+   * @param argc Number of command line arguments.
+   * @param argv Command line arguments.
+   */
+  parameters_t(int argc, char** argv)
+      : options(argv[0], "Personalized PR Benchmarking") {
+    // Unknown options are tolerated; main() hands them on to NVBench.
+    options.allow_unrecognised_options();
+    options.add_options()("h,help", "Print help")  // help
+        ("m,market", "Matrix file",
+         cxxopts::value<std::string>());  // mtx
+
+    const auto parsed = options.parse(argc, argv);
+
+    // One usage banner shared by the help request and both failure paths.
+    const auto print_usage = [this]() {
+      std::cout << options.help({""});
+      std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+    };
+
+    if (parsed.count("help")) {
+      // No exit here, so that main() can print the NVBench help as well.
+      help = true;
+      print_usage();
+      return;
+    }
+    if (parsed.count("market") == 1)
+      filename = parsed["market"].as<std::string>();
+    // Missing/repeated option or rejected filename: print usage and quit.
+    if (parsed.count("market") != 1 || !util::is_market(filename)) {
+      print_usage();
+      std::exit(0);
+    }
+  }
+};
+
+void ppr_bench(nvbench::state& state) {
+  // Loads the matrix named by `filename`, builds a device CSR graph from it
+  // and passes ppr::run_batch to state.exec. Metrics are enabled first.
+  state.collect_dram_throughput();
+  state.collect_l1_hit_rates();
+  state.collect_l2_hit_rates();
+  state.collect_loads_efficiency();
+  state.collect_stores_efficiency();
+
+  // --
+  // Algorithm parameters (fixed for the benchmark)
+  weight_t alpha = 0.15;
+  weight_t epsilon = 1e-6;
+  vertex_t n_seeds = 10;
+
+  // --
+  // Define types
+  using csr_t =
+      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
+
+  // --
+  // Build graph + metadata
+  csr_t csr;
+  io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
+  csr.from_coo(mm.load(filename));
+
+  // Extra buffers that from_csr takes alongside the CSR arrays.
+  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
+  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
+
+  auto G = graph::build::from_csr<memory_space_t::device,
+                                  graph::view_t::csr>(
+      csr.number_of_rows,               // rows
+      csr.number_of_columns,            // columns
+      csr.number_of_nonzeros,           // nonzeros
+      csr.row_offsets.data().get(),     // row_offsets
+      csr.column_indices.data().get(),  // column_indices
+      csr.nonzero_values.data().get(),  // values
+      row_indices.data().get(),         // row_indices
+      column_offsets.data().get()       // column_offsets
+  );
+
+  // --
+  // Params and memory allocation
+  vertex_t n_vertices = G.get_number_of_vertices();
+  thrust::device_vector<weight_t> p(n_seeds * n_vertices);
+
+  // --
+  // Run Personalized PR with NVBench
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    gunrock::ppr::run_batch(G, n_seeds, p.data().get(), alpha, epsilon);
+  });
+}
+
+int main(int argc, char** argv) {
+  parameters_t params(argc, argv);
+  filename = params.filename;
+
+  if (params.help) {
+    // Print NVBench help.
+    const char* args[1] = {"-h"};
+    NVBENCH_MAIN_BODY(1, args);
+  } else {
+    // Forward everything except the matrix option and its value to NVBench.
+    char* args[argc];  // argc slots, so no option spelling can overrun it
+    int j = 0;
+    for (int i = 0; i < argc; i++) {
+      if (strcmp(argv[i], "--market") == 0 || strcmp(argv[i], "-m") == 0) {
+        i++;  // the next argument is the filename
+        continue;
+      }
+      if (strncmp(argv[i], "--market=", 9) == 0) continue;  // --market=<file>
+      args[j++] = argv[i];
+    }
+
+    NVBENCH_BENCH(ppr_bench);
+    NVBENCH_MAIN_BODY(j, args);  // j = number of arguments actually kept
+  }
+}
diff --git a/benchmarks/pr_bench.cu b/benchmarks/pr_bench.cu
new file mode 100644
index 00000000..ac60b06f
--- /dev/null
+++ b/benchmarks/pr_bench.cu
@@ -0,0 +1,134 @@
+#include <nvbench/nvbench.cuh>
+#include <cxxopts.hpp>
+#include <gunrock/algorithms/algorithms.hxx>
+#include <gunrock/algorithms/pr.hxx>
+
+using namespace gunrock;
+using namespace memory;
+
+using vertex_t = int;
+using edge_t = int;
+using weight_t = float;
+
+std::string filename;
+
+struct parameters_t {
+  std::string filename;
+  bool help = false;
+  cxxopts::Options options;
+
+  /**
+   * @brief Parse the command line. `--help` sets `help`; otherwise exactly
+   * one `--market` value accepted by util::is_market is kept in `filename`.
+   *
+   * @param argc Number of command line arguments.
+   * @param argv Command line arguments.
+   */
+  parameters_t(int argc, char** argv) : options(argv[0], "PR Benchmarking") {
+    // Unknown options are tolerated; main() hands them on to NVBench.
+    options.allow_unrecognised_options();
+    options.add_options()("h,help", "Print help")  // help
+        ("m,market", "Matrix file",
+         cxxopts::value<std::string>());  // mtx
+
+    const auto parsed = options.parse(argc, argv);
+
+    // One usage banner shared by the help request and both failure paths.
+    const auto print_usage = [this]() {
+      std::cout << options.help({""});
+      std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+    };
+
+    if (parsed.count("help")) {
+      // No exit here, so that main() can print the NVBench help as well.
+      help = true;
+      print_usage();
+      return;
+    }
+    if (parsed.count("market") == 1)
+      filename = parsed["market"].as<std::string>();
+    // Missing/repeated option or rejected filename: print usage and quit.
+    if (parsed.count("market") != 1 || !util::is_market(filename)) {
+      print_usage();
+      std::exit(0);
+    }
+  }
+};
+
+void pr_bench(nvbench::state& state) {
+  // Loads the matrix named by `filename`, builds a device CSR graph from it
+  // and passes pr::run to state.exec. Metric collection is enabled first.
+  state.collect_dram_throughput();
+  state.collect_l1_hit_rates();
+  state.collect_l2_hit_rates();
+  state.collect_loads_efficiency();
+  state.collect_stores_efficiency();
+
+  // --
+  // Define types
+  using csr_t =
+      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
+
+  // --
+  // Build graph + metadata
+  csr_t csr;
+  io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
+  csr.from_coo(mm.load(filename));
+
+  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
+  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
+
+  auto G = graph::build::from_csr<memory_space_t::device,
+                                  graph::view_t::csr>(
+      csr.number_of_rows,               // rows
+      csr.number_of_columns,            // columns
+      csr.number_of_nonzeros,           // nonzeros
+      csr.row_offsets.data().get(),     // row_offsets
+      csr.column_indices.data().get(),  // column_indices
+      csr.nonzero_values.data().get(),  // values
+      row_indices.data().get(),         // row_indices
+      column_offsets.data().get()       // column_offsets
+  );
+
+  // --
+  // Params and memory allocation
+  srand(time(NULL));
+
+  weight_t alpha = 0.85;
+  weight_t tol = 1e-6;
+
+  vertex_t n_vertices = G.get_number_of_vertices();
+  thrust::device_vector<weight_t> p(n_vertices);
+
+  // --
+  // Run PR with NVBench
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    gunrock::pr::run(G, alpha, tol, p.data().get());
+  });
+}
+
+int main(int argc, char** argv) {
+  parameters_t params(argc, argv);
+  filename = params.filename;
+
+  if (params.help) {
+    // Print NVBench help.
+    const char* args[1] = {"-h"};
+    NVBENCH_MAIN_BODY(1, args);
+  } else {
+    // Forward everything except the matrix option and its value to NVBench.
+    char* args[argc];  // argc slots, so no option spelling can overrun it
+    int j = 0;
+    for (int i = 0; i < argc; i++) {
+      if (strcmp(argv[i], "--market") == 0 || strcmp(argv[i], "-m") == 0) {
+        i++;  // the next argument is the filename
+        continue;
+      }
+      if (strncmp(argv[i], "--market=", 9) == 0) continue;  // --market=<file>
+      args[j++] = argv[i];
+    }
+
+    NVBENCH_BENCH(pr_bench);
+    NVBENCH_MAIN_BODY(j, args);  // j = number of arguments actually kept
+  }
+}
diff --git a/benchmarks/spmv_bench.cu b/benchmarks/spmv_bench.cu
new file mode 100644
index 00000000..19d0c520
--- /dev/null
+++ b/benchmarks/spmv_bench.cu
@@ -0,0 +1,133 @@
+#include <nvbench/nvbench.cuh>
+#include <cxxopts.hpp>
+#include <gunrock/algorithms/algorithms.hxx>
+#include <gunrock/algorithms/generate/random.hxx>
+#include <gunrock/algorithms/spmv.hxx>
+
+using namespace gunrock;
+using namespace memory;
+
+using vertex_t = int;
+using edge_t = int;
+using weight_t = float;
+
+std::string filename;
+
+struct parameters_t {
+  std::string filename;
+  bool help = false;
+  cxxopts::Options options;
+
+  /**
+   * @brief Parse the command line. `--help` sets `help`; otherwise exactly
+   * one `--market` value accepted by util::is_market is kept in `filename`.
+   *
+   * @param argc Number of command line arguments.
+   * @param argv Command line arguments.
+   */
+  parameters_t(int argc, char** argv) : options(argv[0], "SPMV Benchmarking") {
+    // Unknown options are tolerated; main() hands them on to NVBench.
+    options.allow_unrecognised_options();
+    options.add_options()("h,help", "Print help")  // help
+        ("m,market", "Matrix file",
+         cxxopts::value<std::string>());  // mtx
+
+    const auto parsed = options.parse(argc, argv);
+
+    // One usage banner shared by the help request and both failure paths.
+    const auto print_usage = [this]() {
+      std::cout << options.help({""});
+      std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+    };
+
+    if (parsed.count("help")) {
+      // No exit here, so that main() can print the NVBench help as well.
+      help = true;
+      print_usage();
+      return;
+    }
+    if (parsed.count("market") == 1)
+      filename = parsed["market"].as<std::string>();
+    // Missing/repeated option or rejected filename: print usage and quit.
+    if (parsed.count("market") != 1 || !util::is_market(filename)) {
+      print_usage();
+      std::exit(0);
+    }
+  }
+};
+
+void spmv_bench(nvbench::state& state) {
+  // Loads the matrix named by `filename`, builds a device CSR graph from it
+  // and passes spmv::run to state.exec. Metric collection is enabled first.
+  state.collect_dram_throughput();
+  state.collect_l1_hit_rates();
+  state.collect_l2_hit_rates();
+  state.collect_loads_efficiency();
+  state.collect_stores_efficiency();
+
+  // --
+  // Define types
+  using csr_t =
+      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
+
+  // --
+  // Build graph + metadata
+  csr_t csr;
+  io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
+  csr.from_coo(mm.load(filename));
+
+  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
+  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
+
+  auto G = graph::build::from_csr<memory_space_t::device,
+                                  graph::view_t::csr>(
+      csr.number_of_rows,               // rows
+      csr.number_of_columns,            // columns
+      csr.number_of_nonzeros,           // nonzeros
+      csr.row_offsets.data().get(),     // row_offsets
+      csr.column_indices.data().get(),  // column_indices
+      csr.nonzero_values.data().get(),  // values
+      row_indices.data().get(),         // row_indices
+      column_offsets.data().get()       // column_offsets
+  );
+
+  // --
+  // Params and memory allocation
+  vertex_t n_vertices = G.get_number_of_vertices();
+  thrust::device_vector<weight_t> x(n_vertices);
+  thrust::device_vector<weight_t> y(n_vertices);
+
+  gunrock::generate::random::uniform_distribution(x);
+
+  // --
+  // Run SPMV with NVBench
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    gunrock::spmv::run(G, x.data().get(), y.data().get());
+  });
+}
+
+int main(int argc, char** argv) {
+  parameters_t params(argc, argv);
+  filename = params.filename;
+
+  if (params.help) {
+    // Print NVBench help.
+    const char* args[1] = {"-h"};
+    NVBENCH_MAIN_BODY(1, args);
+  } else {
+    // Forward everything except the matrix option and its value to NVBench.
+    char* args[argc];  // argc slots, so no option spelling can overrun it
+    int j = 0;
+    for (int i = 0; i < argc; i++) {
+      if (strcmp(argv[i], "--market") == 0 || strcmp(argv[i], "-m") == 0) {
+        i++;  // the next argument is the filename
+        continue;
+      }
+      if (strncmp(argv[i], "--market=", 9) == 0) continue;  // --market=<file>
+      args[j++] = argv[i];
+    }
+
+    NVBENCH_BENCH(spmv_bench);
+    NVBENCH_MAIN_BODY(j, args);  // j = number of arguments actually kept
+  }
+}
diff --git a/benchmarks/sssp_bench.cu b/benchmarks/sssp_bench.cu
new file mode 100644
index 00000000..be386dbd
--- /dev/null
+++ b/benchmarks/sssp_bench.cu
@@ -0,0 +1,135 @@
+#include <nvbench/nvbench.cuh>
+#include <cxxopts.hpp>
+#include <gunrock/algorithms/algorithms.hxx>
+#include <gunrock/algorithms/sssp.hxx>
+
+using namespace gunrock;
+using namespace memory;
+
+using vertex_t = int;
+using edge_t = int;
+using weight_t = float;
+
+std::string filename;
+
+struct parameters_t {
+  std::string filename;
+  bool help = false;
+  cxxopts::Options options;
+
+  /**
+   * @brief Parse the command line. `--help` sets `help`; otherwise exactly
+   * one `--market` value accepted by util::is_market is kept in `filename`.
+   *
+   * @param argc Number of command line arguments.
+   * @param argv Command line arguments.
+   */
+  parameters_t(int argc, char** argv) : options(argv[0], "SSSP Benchmarking") {
+    // Unknown options are tolerated; main() hands them on to NVBench.
+    options.allow_unrecognised_options();
+    options.add_options()("h,help", "Print help")  // help
+        ("m,market", "Matrix file",
+         cxxopts::value<std::string>());  // mtx
+
+    const auto parsed = options.parse(argc, argv);
+
+    // One usage banner shared by the help request and both failure paths.
+    const auto print_usage = [this]() {
+      std::cout << options.help({""});
+      std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+    };
+
+    if (parsed.count("help")) {
+      // No exit here, so that main() can print the NVBench help as well.
+      help = true;
+      print_usage();
+      return;
+    }
+    if (parsed.count("market") == 1)
+      filename = parsed["market"].as<std::string>();
+    // Missing/repeated option or rejected filename: print usage and quit.
+    if (parsed.count("market") != 1 || !util::is_market(filename)) {
+      print_usage();
+      std::exit(0);
+    }
+  }
+};
+
+void sssp_bench(nvbench::state& state) {
+  // Loads the matrix named by `filename`, builds a device CSR graph from it
+  // and passes sssp::run to state.exec. Metric collection is enabled first.
+  state.collect_dram_throughput();
+  state.collect_l1_hit_rates();
+  state.collect_l2_hit_rates();
+  state.collect_loads_efficiency();
+  state.collect_stores_efficiency();
+
+  // --
+  // Define types
+  using csr_t =
+      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
+
+  // --
+  // Build graph + metadata
+  csr_t csr;
+  io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
+  csr.from_coo(mm.load(filename));
+
+  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
+  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
+
+  auto G = graph::build::from_csr<memory_space_t::device,
+                                  graph::view_t::csr>(
+      csr.number_of_rows,               // rows
+      csr.number_of_columns,            // columns
+      csr.number_of_nonzeros,           // nonzeros
+      csr.row_offsets.data().get(),     // row_offsets
+      csr.column_indices.data().get(),  // column_indices
+      csr.nonzero_values.data().get(),  // values
+      row_indices.data().get(),         // row_indices
+      column_offsets.data().get()       // column_offsets
+  );
+
+  // --
+  // Params and memory allocation
+  srand(time(NULL));
+
+  vertex_t n_vertices = G.get_number_of_vertices();
+  vertex_t single_source = 0;  // rand() % n_vertices;
+
+  thrust::device_vector<weight_t> distances(n_vertices);
+  thrust::device_vector<vertex_t> predecessors(n_vertices);
+
+  // --
+  // Run SSSP with NVBench
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    gunrock::sssp::run(G, single_source, distances.data().get(),
+                       predecessors.data().get());
+  });
+}
+
+int main(int argc, char** argv) {
+  parameters_t params(argc, argv);
+  filename = params.filename;
+
+  if (params.help) {
+    // Print NVBench help.
+    const char* args[1] = {"-h"};
+    NVBENCH_MAIN_BODY(1, args);
+  } else {
+    // Forward everything except the matrix option and its value to NVBench.
+    char* args[argc];  // argc slots, so no option spelling can overrun it
+    int j = 0;
+    for (int i = 0; i < argc; i++) {
+      if (strcmp(argv[i], "--market") == 0 || strcmp(argv[i], "-m") == 0) {
+        i++;  // the next argument is the filename
+        continue;
+      }
+      if (strncmp(argv[i], "--market=", 9) == 0) continue;  // --market=<file>
+      args[j++] = argv[i];
+    }
+
+    NVBENCH_BENCH(sssp_bench);
+    NVBENCH_MAIN_BODY(j, args);  // j = number of arguments actually kept
+  }
+}

From 4e2b3995554b1b76f7fb9d8e5abe4ab5ef7ecc44 Mon Sep 17 00:00:00 2001
From: Annie <annie@annielytical.com>
Date: Wed, 1 Jun 2022 16:57:32 -0700
Subject: [PATCH 16/58] add spgemm

---
 benchmarks/CMakeLists.txt  |   1 +
 benchmarks/spgemm_bench.cu | 141 +++++++++++++++++++++++++++++++++++++
 2 files changed, 142 insertions(+)
 create mode 100644 benchmarks/spgemm_bench.cu

diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 7a327387..74b0e450 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -9,6 +9,7 @@ set(BENCHMARK_SOURCES
   mst_bench.cu
   ppr_bench.cu
   pr_bench.cu
+  spgemm_bench.cu
   spmv_bench.cu
   sssp_bench.cu
 )
diff --git a/benchmarks/spgemm_bench.cu b/benchmarks/spgemm_bench.cu
new file mode 100644
index 00000000..a42e73b3
--- /dev/null
+++ b/benchmarks/spgemm_bench.cu
@@ -0,0 +1,141 @@
+#include <nvbench/nvbench.cuh>
+#include <cxxopts.hpp>
+#include <gunrock/algorithms/algorithms.hxx>
+#include <gunrock/algorithms/spgemm.hxx>
+
+using namespace gunrock;
+using namespace memory;
+
+using vertex_t = int;
+using edge_t = int;
+using weight_t = float;
+
+std::string filename_a;
+std::string filename_b;
+
+struct parameters_t {
+  std::string filename_a;  // value of -a/--amatrix
+  std::string filename_b;  // value of -b/--bmatrix
+  bool help = false;  // set when -h/--help was given
+  cxxopts::Options options;
+
+  /**
+   * @brief Construct a new parameters object and parse command line arguments.
+   *        Without --help, prints usage and exits (status 0) on a bad -a or -b.
+   * @param argc Number of command line arguments.
+   * @param argv Command line arguments.
+   */
+  parameters_t(int argc, char** argv)
+      : options(argv[0], "SPGEMM Benchmarking") {
+    options.allow_unrecognised_options();  // presumably so NVBench flags pass
+    // Add command line options
+    options.add_options()("h,help", "Print help")  // help
+        ("a,amatrix", "Matrix A file",
+         cxxopts::value<std::string>())  // mtx A
+        ("b,bmatrix", "Matrix B file",
+         cxxopts::value<std::string>());  // mtx B
+
+    // Parse command line arguments
+    auto result = options.parse(argc, argv);
+
+    if (result.count("help")) {
+      help = true;
+      std::cout << options.help({""});
+      std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+      // Do not exit so we also print NVBench help.
+    } else {
+      if (result.count("amatrix") == 1) {  // must appear exactly once
+        filename_a = result["amatrix"].as<std::string>();
+        if (!util::is_market(filename_a)) {
+          std::cout << options.help({""});
+          std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+          std::exit(0);  // NOTE(review): usage error still exits with status 0
+        }
+      } else {
+        std::cout << options.help({""});
+        std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+        std::exit(0);
+      }
+      if (result.count("bmatrix") == 1) {  // same checks as for matrix A
+        filename_b = result["bmatrix"].as<std::string>();
+        if (!util::is_market(filename_b)) {
+          std::cout << options.help({""});
+          std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+          std::exit(0);
+        }
+      } else {
+        std::cout << options.help({""});
+        std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+        std::exit(0);
+      }
+    }
+  }
+};
+
+void spgemm_bench(nvbench::state& state) {
+  // Add metrics.
+  state.collect_dram_throughput();
+  state.collect_l1_hit_rates();
+  state.collect_l2_hit_rates();
+  state.collect_loads_efficiency();
+  state.collect_stores_efficiency();
+
+  // --
+  // Define types: CSR matrices in the device memory space
+  using csr_t =
+      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
+
+  // --
+  // Load A and B from the file-scope filename_a / filename_b and build graphs
+  io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
+  csr_t a_csr;
+  a_csr.from_coo(mm.load(filename_a));
+
+  auto A = graph::build::from_csr<memory_space_t::device, graph::view_t::csr>(
+      a_csr.number_of_rows, a_csr.number_of_columns, a_csr.number_of_nonzeros,
+      a_csr.row_offsets.data().get(), a_csr.column_indices.data().get(),
+      a_csr.nonzero_values.data().get());
+
+  csr_t b_csr;
+  b_csr.from_coo(mm.load(filename_b));  // the same loader object is reused
+
+  auto B = graph::build::from_csr<memory_space_t::device, graph::view_t::csr>(
+      b_csr.number_of_rows, b_csr.number_of_columns, b_csr.number_of_nonzeros,
+      b_csr.row_offsets.data().get(), b_csr.column_indices.data().get(),
+      b_csr.nonzero_values.data().get());
+
+  csr_t C;  // result matrix handed to spgemm::run
+
+  // --
+  // Run SPGEMM with NVBench (the lambda's launch argument is unused)
+  state.exec(nvbench::exec_tag::sync,
+             [&](nvbench::launch& launch) { gunrock::spgemm::run(A, B, C); });
+}
+
+int main(int argc, char** argv) {
+  parameters_t params(argc, argv);
+  filename_a = params.filename_a;  // copy into the file-scope strings that
+  filename_b = params.filename_b;  // spgemm_bench loads the matrices from
+
+  if (params.help) {
+    // Print NVBench help.
+    const char* args[1] = {"-h"};
+    NVBENCH_MAIN_BODY(1, args);
+  } else {
+    // Create a new argument array without matrix filename to pass to NVBench.
+    char* args[argc - 4];
+    int j = 0;
+    for (int i = 0; i < argc; i++) {
+      if (strcmp(argv[i], "--amatrix") == 0 || strcmp(argv[i], "-a") == 0 ||
+          strcmp(argv[i], "--bmatrix") == 0 || strcmp(argv[i], "-b") == 0) {
+        i++;  // also skip the filename that follows the flag
+        continue;
+      }
+      args[j] = argv[i];  // NOTE(review): unchecked write; args has argc - 4
+      j++;                // slots, too few if a flag is "--amatrix=<file>"
+    }
+    // NOTE(review): the count below is assumed (argc - 4), not the copied j.
+    NVBENCH_BENCH(spgemm_bench);
+    NVBENCH_MAIN_BODY(argc - 4, args);
+  }
+}

From 5def27994eae57ae8055ed4e1821f3ff6054f213 Mon Sep 17 00:00:00 2001
From: Annie <annie@annielytical.com>
Date: Wed, 1 Jun 2022 17:32:32 -0700
Subject: [PATCH 17/58] geo fixes

---
 benchmarks/geo_bench.cu    | 8 ++++----
 benchmarks/spgemm_bench.cu | 2 +-
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/benchmarks/geo_bench.cu b/benchmarks/geo_bench.cu
index 23b556e7..8b1f039b 100644
--- a/benchmarks/geo_bench.cu
+++ b/benchmarks/geo_bench.cu
@@ -30,9 +30,9 @@ struct parameters_t {
     // Add command line options
     options.add_options()("h,help", "Print help")  // help
         ("m,market", "Matrix file",
-         cxxopts::value<std::string>())  // coords
+         cxxopts::value<std::string>())  // mtx
         ("c,coordinates", "Coordinates file",
-         cxxopts::value<std::string>());  // mtx
+         cxxopts::value<std::string>());  // coords
 
     // Parse command line arguments
     auto result = options.parse(argc, argv);
@@ -56,7 +56,7 @@ struct parameters_t {
         std::exit(0);
       }
       if (result.count("coordinates") == 1) {
-        coordinates_filename = result["market"].as<std::string>();
+        coordinates_filename = result["coordinates"].as<std::string>();
       } else {
         std::cout << options.help({""});
         std::cout << "  [optional nvbench args]" << std::endl << std::endl;
@@ -241,7 +241,7 @@ int main(int argc, char** argv) {
     int j = 0;
     for (int i = 0; i < argc; i++) {
       if (strcmp(argv[i], "--market") == 0 || strcmp(argv[i], "-m") == 0 ||
-          strcmp(argv[i], "--coordinates") == 0 || strcmp(argv[i], "-c")) {
+          strcmp(argv[i], "--coordinates") == 0 || strcmp(argv[i], "-c") == 0) {
         i++;
         continue;
       }
diff --git a/benchmarks/spgemm_bench.cu b/benchmarks/spgemm_bench.cu
index a42e73b3..fa859ae6 100644
--- a/benchmarks/spgemm_bench.cu
+++ b/benchmarks/spgemm_bench.cu
@@ -122,7 +122,7 @@ int main(int argc, char** argv) {
     const char* args[1] = {"-h"};
     NVBENCH_MAIN_BODY(1, args);
   } else {
-    // Create a new argument array without matrix filename to pass to NVBench.
+    // Create a new argument array without matrix filenames to pass to NVBench.
     char* args[argc - 4];
     int j = 0;
     for (int i = 0; i < argc; i++) {

From 37dc58693cfe02b927ef3bc76f2a467a67b0d3a3 Mon Sep 17 00:00:00 2001
From: Annie <annie@annielytical.com>
Date: Wed, 1 Jun 2022 17:45:03 -0700
Subject: [PATCH 18/58] cleanup

---
 benchmarks/hits_bench.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/hits_bench.cu b/benchmarks/hits_bench.cu
index 06696703..05407ba8 100644
--- a/benchmarks/hits_bench.cu
+++ b/benchmarks/hits_bench.cu
@@ -96,7 +96,7 @@ void hits_bench(nvbench::state& state) {
   hits::result_c result{G};
 
   // --
-  // Run BC with NVBench
+  // Run HITS with NVBench
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
     gunrock::hits::run(G, param, result);
   });

From 21e2a6f85400e76c408a9b0533b0413cef95e390 Mon Sep 17 00:00:00 2001
From: Annie <annie@annielytical.com>
Date: Wed, 1 Jun 2022 18:07:31 -0700
Subject: [PATCH 19/58] cleanup

---
 benchmarks/bc_bench.cu     | 3 ++-
 benchmarks/bfs_bench.cu    | 4 ++--
 benchmarks/color_bench.cu  | 3 ++-
 benchmarks/geo_bench.cu    | 4 ++--
 benchmarks/hits_bench.cu   | 3 ++-
 benchmarks/kcore_bench.cu  | 3 ++-
 benchmarks/mst_bench.cu    | 3 ++-
 benchmarks/ppr_bench.cu    | 2 +-
 benchmarks/pr_bench.cu     | 3 ++-
 benchmarks/spgemm_bench.cu | 3 ++-
 benchmarks/spmv_bench.cu   | 3 ++-
 benchmarks/sssp_bench.cu   | 3 ++-
 12 files changed, 23 insertions(+), 14 deletions(-)

diff --git a/benchmarks/bc_bench.cu b/benchmarks/bc_bench.cu
index 6091db48..bdce55ed 100644
--- a/benchmarks/bc_bench.cu
+++ b/benchmarks/bc_bench.cu
@@ -56,7 +56,8 @@ struct parameters_t {
 };
 
 void bc_bench(nvbench::state& state) {
-  // Add metrics.
+  // --
+  // Add metrics
   state.collect_dram_throughput();
   state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
diff --git a/benchmarks/bfs_bench.cu b/benchmarks/bfs_bench.cu
index 8296fa69..09b33497 100644
--- a/benchmarks/bfs_bench.cu
+++ b/benchmarks/bfs_bench.cu
@@ -56,7 +56,8 @@ struct parameters_t {
 };
 
 void bfs_bench(nvbench::state& state) {
-  // Add metrics.
+  // --
+  // Add metrics
   state.collect_dram_throughput();
   state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
@@ -95,7 +96,6 @@ void bfs_bench(nvbench::state& state) {
 
   // --
   // Params and memory allocation
-
   vertex_t single_source = 0;
 
   vertex_t n_vertices = G.get_number_of_vertices();
diff --git a/benchmarks/color_bench.cu b/benchmarks/color_bench.cu
index 5177c369..63fad4e7 100644
--- a/benchmarks/color_bench.cu
+++ b/benchmarks/color_bench.cu
@@ -57,7 +57,8 @@ struct parameters_t {
 };
 
 void color_bench(nvbench::state& state) {
-  // Add metrics.
+  // --
+  // Add metrics
   state.collect_dram_throughput();
   state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
diff --git a/benchmarks/geo_bench.cu b/benchmarks/geo_bench.cu
index 8b1f039b..36043d29 100644
--- a/benchmarks/geo_bench.cu
+++ b/benchmarks/geo_bench.cu
@@ -165,7 +165,8 @@ void read_coordinates_file(std::string filename,
 }
 
 void geo_bench(nvbench::state& state) {
-  // Add metrics.
+  // --
+  // Add metrics
   state.collect_dram_throughput();
   state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
@@ -201,7 +202,6 @@ void geo_bench(nvbench::state& state) {
 
   // --
   // Params and memory allocation
-
   unsigned int spatial_iterations = 1000;
   unsigned int total_iterations = 10;
 
diff --git a/benchmarks/hits_bench.cu b/benchmarks/hits_bench.cu
index 05407ba8..01d72d58 100644
--- a/benchmarks/hits_bench.cu
+++ b/benchmarks/hits_bench.cu
@@ -56,7 +56,8 @@ struct parameters_t {
 };
 
 void hits_bench(nvbench::state& state) {
-  // Add metrics.
+  // --
+  // Add metrics
   state.collect_dram_throughput();
   state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
diff --git a/benchmarks/kcore_bench.cu b/benchmarks/kcore_bench.cu
index 7fe57fa8..9946ef9b 100644
--- a/benchmarks/kcore_bench.cu
+++ b/benchmarks/kcore_bench.cu
@@ -57,7 +57,8 @@ struct parameters_t {
 };
 
 void kcore_bench(nvbench::state& state) {
-  // Add metrics.
+  // --
+  // Add metrics
   state.collect_dram_throughput();
   state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
diff --git a/benchmarks/mst_bench.cu b/benchmarks/mst_bench.cu
index b4a3a18e..1dd1b2e0 100644
--- a/benchmarks/mst_bench.cu
+++ b/benchmarks/mst_bench.cu
@@ -52,7 +52,8 @@ struct parameters_t {
 };
 
 void mst_bench(nvbench::state& state) {
-  // Add metrics.
+  // --
+  // Add metrics
   state.collect_dram_throughput();
   state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
diff --git a/benchmarks/ppr_bench.cu b/benchmarks/ppr_bench.cu
index 963c8f57..73753d7a 100644
--- a/benchmarks/ppr_bench.cu
+++ b/benchmarks/ppr_bench.cu
@@ -58,7 +58,7 @@ struct parameters_t {
 
 void ppr_bench(nvbench::state& state) {
   // --
-  // Add metrics.
+  // Add metrics
   state.collect_dram_throughput();
   state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
diff --git a/benchmarks/pr_bench.cu b/benchmarks/pr_bench.cu
index ac60b06f..86dd6f32 100644
--- a/benchmarks/pr_bench.cu
+++ b/benchmarks/pr_bench.cu
@@ -56,7 +56,8 @@ struct parameters_t {
 };
 
 void pr_bench(nvbench::state& state) {
-  // Add metrics.
+  // --
+  // Add metrics
   state.collect_dram_throughput();
   state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
diff --git a/benchmarks/spgemm_bench.cu b/benchmarks/spgemm_bench.cu
index fa859ae6..3e45139c 100644
--- a/benchmarks/spgemm_bench.cu
+++ b/benchmarks/spgemm_bench.cu
@@ -73,7 +73,8 @@ struct parameters_t {
 };
 
 void spgemm_bench(nvbench::state& state) {
-  // Add metrics.
+  // --
+  // Add metrics
   state.collect_dram_throughput();
   state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
diff --git a/benchmarks/spmv_bench.cu b/benchmarks/spmv_bench.cu
index 19d0c520..a0f323d2 100644
--- a/benchmarks/spmv_bench.cu
+++ b/benchmarks/spmv_bench.cu
@@ -57,7 +57,8 @@ struct parameters_t {
 };
 
 void spmv_bench(nvbench::state& state) {
-  // Add metrics.
+  // --
+  // Add metrics
   state.collect_dram_throughput();
   state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();
diff --git a/benchmarks/sssp_bench.cu b/benchmarks/sssp_bench.cu
index be386dbd..9ff67eb1 100644
--- a/benchmarks/sssp_bench.cu
+++ b/benchmarks/sssp_bench.cu
@@ -56,7 +56,8 @@ struct parameters_t {
 };
 
 void sssp_bench(nvbench::state& state) {
-  // Add metrics.
+  // --
+  // Add metrics
   state.collect_dram_throughput();
   state.collect_l1_hit_rates();
   state.collect_l2_hit_rates();

From dd66976ad9f57fc538c718d263dd9e0b889bcdab Mon Sep 17 00:00:00 2001
From: Annie <annie@annielytical.com>
Date: Wed, 1 Jun 2022 21:25:27 -0700
Subject: [PATCH 20/58] benchmark testing script

---
 benchmarks/test_benchmarks.sh | 48 +++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100755 benchmarks/test_benchmarks.sh

diff --git a/benchmarks/test_benchmarks.sh b/benchmarks/test_benchmarks.sh
new file mode 100755
index 00000000..2d613521
--- /dev/null
+++ b/benchmarks/test_benchmarks.sh
@@ -0,0 +1,48 @@
+# ------------------------------------------------------------------------
+# Algorithm benchmarking tests
+# Run this from build directory
+# ------------------------------------------------------------------------
+
+#!/bin/bash
+
+DATASET_DIR="../datasets"
+BIN_DIR="./bin"
+
+# Where to store output
+JSON_DIR="json"
+
+# Used for all algorithms except SPGEMM
+MATRIX_FILE="${DATASET_DIR}/chesapeake/chesapeake.mtx"
+
+# Used for Geo
+COORDINATES_FILE="${DATASET_DIR}/geolocation/sample.labels"
+
+# Used for SPGEMM 
+A_MATRIX="${DATASET_DIR}/spgemm/a.mtx"
+B_MATRIX="${DATASET_DIR}/spgemm/b.mtx"
+
+make bc_bench
+make bfs_bench
+make color_bench
+make geo_bench
+make hits_bench
+make kcore_bench
+make mst_bench
+make ppr_bench
+make pr_bench
+make spgemm_bench
+make spmv_bench
+make sssp_bench
+
+sudo ${BIN_DIR}/bc_bench -m ${MATRIX_FILE}  --json ${JSON_DIR}/bc.json
+sudo ${BIN_DIR}/bfs_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/bfs.json
+sudo ${BIN_DIR}/color_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/color.json
+sudo ${BIN_DIR}/geo_bench -m ${MATRIX_FILE} -c ${COORDINATES_FILE} --json ${JSON_DIR}/geo.json
+sudo ${BIN_DIR}/hits_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/hits.json
+sudo ${BIN_DIR}/kcore_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/kcore.json
+sudo ${BIN_DIR}/mst_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/mst.json
+sudo ${BIN_DIR}/ppr_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/ppr.json
+sudo ${BIN_DIR}/pr_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/pr.json
+sudo ${BIN_DIR}/spgemm_bench -a ${A_MATRIX} -b ${B_MATRIX} --json ${JSON_DIR}/spgemm.json
+sudo ${BIN_DIR}/spmv_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/spmv.json
+sudo ${BIN_DIR}/sssp_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/sssp.json

From 0766c56752508f9389bf9558230d42edcccc3320 Mon Sep 17 00:00:00 2001
From: Annie <annie@annielytical.com>
Date: Fri, 3 Jun 2022 10:59:14 -0700
Subject: [PATCH 21/58] [skip ci] test_benchmarks script updates

---
 benchmarks/test_benchmarks.sh | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/benchmarks/test_benchmarks.sh b/benchmarks/test_benchmarks.sh
index 2d613521..11d93600 100755
--- a/benchmarks/test_benchmarks.sh
+++ b/benchmarks/test_benchmarks.sh
@@ -1,6 +1,7 @@
 # ------------------------------------------------------------------------
 # Algorithm benchmarking tests
 # Run this from build directory
+# If error CUPTI_ERROR_INSUFFICIENT_PRIVILEGES: run with sudo
 # ------------------------------------------------------------------------
 
 #!/bin/bash
@@ -34,15 +35,15 @@ make spgemm_bench
 make spmv_bench
 make sssp_bench
 
-sudo ${BIN_DIR}/bc_bench -m ${MATRIX_FILE}  --json ${JSON_DIR}/bc.json
-sudo ${BIN_DIR}/bfs_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/bfs.json
-sudo ${BIN_DIR}/color_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/color.json
-sudo ${BIN_DIR}/geo_bench -m ${MATRIX_FILE} -c ${COORDINATES_FILE} --json ${JSON_DIR}/geo.json
-sudo ${BIN_DIR}/hits_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/hits.json
-sudo ${BIN_DIR}/kcore_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/kcore.json
-sudo ${BIN_DIR}/mst_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/mst.json
-sudo ${BIN_DIR}/ppr_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/ppr.json
-sudo ${BIN_DIR}/pr_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/pr.json
-sudo ${BIN_DIR}/spgemm_bench -a ${A_MATRIX} -b ${B_MATRIX} --json ${JSON_DIR}/spgemm.json
-sudo ${BIN_DIR}/spmv_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/spmv.json
-sudo ${BIN_DIR}/sssp_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/sssp.json
+${BIN_DIR}/bc_bench -m ${MATRIX_FILE}  --json ${JSON_DIR}/bc.json
+${BIN_DIR}/bfs_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/bfs.json
+${BIN_DIR}/color_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/color.json
+${BIN_DIR}/geo_bench -m ${MATRIX_FILE} -c ${COORDINATES_FILE} --json ${JSON_DIR}/geo.json
+${BIN_DIR}/hits_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/hits.json
+${BIN_DIR}/kcore_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/kcore.json
+${BIN_DIR}/mst_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/mst.json
+${BIN_DIR}/ppr_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/ppr.json
+${BIN_DIR}/pr_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/pr.json
+${BIN_DIR}/spgemm_bench -a ${A_MATRIX} -b ${B_MATRIX} --json ${JSON_DIR}/spgemm.json
+${BIN_DIR}/spmv_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/spmv.json
+${BIN_DIR}/sssp_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/sssp.json

From b7881e471f70102e5a2b4a47b46e3bee5d17cbf7 Mon Sep 17 00:00:00 2001
From: Muhammad Awad <mawad@ucdavis.edu>
Date: Fri, 17 Jun 2022 12:39:57 -0700
Subject: [PATCH 22/58] Add initial TC implementation

---
 examples/algorithms/CMakeLists.txt    |   1 +
 examples/algorithms/tc/CMakeLists.txt |  21 +++++
 examples/algorithms/tc/tc.cu          |  74 +++++++++++++++
 include/gunrock/algorithms/tc.hxx     | 130 ++++++++++++++++++++++++++
 include/gunrock/graph/csr.hxx         |  45 ++++++++-
 5 files changed, 270 insertions(+), 1 deletion(-)
 create mode 100644 examples/algorithms/tc/CMakeLists.txt
 create mode 100644 examples/algorithms/tc/tc.cu
 create mode 100644 include/gunrock/algorithms/tc.hxx

diff --git a/examples/algorithms/CMakeLists.txt b/examples/algorithms/CMakeLists.txt
index 0e4b4698..8b73a849 100644
--- a/examples/algorithms/CMakeLists.txt
+++ b/examples/algorithms/CMakeLists.txt
@@ -11,6 +11,7 @@ add_subdirectory(kcore)
 add_subdirectory(spmv)
 add_subdirectory(spgemm)
 add_subdirectory(mst)
+add_subdirectory(tc)
 # end /* Add algorithms' subdirectories */
 
 # begin /* Add experimental algorithms' subdirectories */
diff --git a/examples/algorithms/tc/CMakeLists.txt b/examples/algorithms/tc/CMakeLists.txt
new file mode 100644
index 00000000..ea61ae40
--- /dev/null
+++ b/examples/algorithms/tc/CMakeLists.txt
@@ -0,0 +1,21 @@
+# begin /* Set the application name. */
+set(APPLICATION_NAME tc)
+# end /* Set the application name. */
+
+# begin /* Add CUDA executables */
+add_executable(${APPLICATION_NAME})
+# The executable is declared empty; its sources are attached below.
+set(SOURCE_LIST
+    ${APPLICATION_NAME}.cu
+)
+
+target_sources(${APPLICATION_NAME} PRIVATE ${SOURCE_LIST})
+target_link_libraries(${APPLICATION_NAME} PRIVATE essentials)
+get_target_property(ESSENTIALS_ARCHITECTURES essentials CUDA_ARCHITECTURES)
+set_target_properties(${APPLICATION_NAME}
+    PROPERTIES
+        CUDA_ARCHITECTURES ${ESSENTIALS_ARCHITECTURES}
+) # XXX: Find a better way to inherit essentials properties.
+
+message(STATUS "Example Added: ${APPLICATION_NAME}")
+# end /* Add CUDA executables */
\ No newline at end of file
diff --git a/examples/algorithms/tc/tc.cu b/examples/algorithms/tc/tc.cu
new file mode 100644
index 00000000..efabd604
--- /dev/null
+++ b/examples/algorithms/tc/tc.cu
@@ -0,0 +1,74 @@
+#include <gunrock/algorithms/tc.hxx>
+
+using namespace gunrock;
+using namespace memory;
+
+void test_tc(int num_arguments, char** argument_array) {
+  if (num_arguments != 2) {
+    std::cerr << "usage: ./bin/tc filename.mtx" << std::endl;
+    exit(1);
+  }
+
+  // --
+  // Define types
+
+  using vertex_t = int;
+  using edge_t = int;
+  using weight_t = float;
+  using count_t = int;
+
+  using csr_t =
+      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
+  csr_t csr;  // filled from the input file below
+
+  // --
+  // IO
+
+  std::string filename = argument_array[1];
+
+  if (util::is_market(filename)) {
+    io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
+    csr.from_coo(mm.load(filename));
+  } else if (util::is_binary_csr(filename)) {
+    csr.read_binary(filename);
+  } else {
+    std::cerr << "Unknown file format: " << filename << std::endl;
+    exit(1);
+  }
+
+  // --
+  // Build a device-space CSR graph view from the loaded matrix
+  // NOTE(review): raw pointers come from csr; presumably csr must outlive G.
+  auto G = graph::build::from_csr<memory_space_t::device,
+                                  graph::view_t::csr>(
+      csr.number_of_rows,               // rows
+      csr.number_of_columns,            // columns
+      csr.number_of_nonzeros,           // nonzeros
+      csr.row_offsets.data().get(),     // row_offsets
+      csr.column_indices.data().get(),  // column_indices
+      csr.nonzero_values.data().get()   // values
+  );  // supports row_indices and column_offsets (default = nullptr)
+
+  // --
+  // Params and memory allocation
+  srand(time(NULL));
+
+  vertex_t n_vertices = G.get_number_of_vertices();
+  thrust::device_vector<count_t> triangles_count(n_vertices, 0);
+  std::cout << "TC for graph with n_vertices = " << n_vertices << std::endl;
+
+  // --
+  // GPU Run
+
+  float gpu_elapsed = tc::run(G, triangles_count.data().get());
+
+  // --
+  // Log + Validate
+  print::head(triangles_count, 40, "Per-vertex triangle count");
+
+  std::cout << "GPU Elapsed Time : " << gpu_elapsed << " (ms)" << std::endl;
+}
+
+int main(int argc, char** argv) {
+  test_tc(argc, argv);  // argument checking, loading, the run and logging
+}
diff --git a/include/gunrock/algorithms/tc.hxx b/include/gunrock/algorithms/tc.hxx
new file mode 100644
index 00000000..96565c73
--- /dev/null
+++ b/include/gunrock/algorithms/tc.hxx
@@ -0,0 +1,130 @@
+/**
+ * @file tc.hxx
+ * @author Muhammad A. Awad (mawad@ucdavis.edu)
+ * @brief Triangle Counting algorithm.
+ * @version 0.1
+ * @date 2022-08-06
+ *
+ * @copyright Copyright (c) 2022
+ *
+ */
+#pragma once
+
+#include <gunrock/algorithms/algorithms.hxx>
+
+namespace gunrock {
+namespace tc {
+
+template <typename vertex_t>
+struct param_t {
+  // Intentionally empty: TC takes no parameters, but run() still passes one on
+};
+
+template <typename vertex_t>
+struct result_t {
+  vertex_t* triangles_count;  // caller-supplied output, indexed by vertex id
+  result_t(vertex_t* _triangles_count) : triangles_count(_triangles_count) {}
+};
+
+template <typename graph_t, typename param_type, typename result_type>
+struct problem_t : gunrock::problem_t<graph_t> {
+  param_type param;    // copy of the caller's parameters
+  result_type result;  // copy of the caller's result handle
+
+  problem_t(graph_t& G,
+            param_type& _param,
+            result_type& _result,
+            std::shared_ptr<gcuda::multi_context_t> _context)
+      : gunrock::problem_t<graph_t>(G, _context),
+        param(_param),
+        result(_result) {}
+
+  using vertex_t = typename graph_t::vertex_type;
+  using edge_t = typename graph_t::edge_type;
+
+  void init() override {}  // intentionally empty
+
+  void reset() override {}  // intentionally empty; the output is not cleared
+};
+
+template <typename problem_t>
+struct enactor_t : gunrock::enactor_t<problem_t> {
+  enactor_t(problem_t* _problem,
+            std::shared_ptr<gcuda::multi_context_t> _context)
+      : gunrock::enactor_t<problem_t>(_problem, _context) {}
+
+  using vertex_t = typename problem_t::vertex_t;
+  using edge_t = typename problem_t::edge_t;
+  using weight_t = typename problem_t::weight_t;
+  using frontier_t = typename enactor_t<problem_t>::frontier_t;
+  // NOTE(review): tc::problem_t declares no weight_t; presumably from its base.
+  void prepare_frontier(frontier_t* f,
+                        gcuda::multi_context_t& context) override {
+    auto P = this->get_problem();
+    auto n_vertices = P->get_graph().get_number_of_vertices();
+    // Presumably fills the frontier with ids 0..n_vertices-1 (TODO confirm).
+    f->sequence((vertex_t)0, n_vertices, context.get_context(0)->stream());
+  }
+
+  void loop(gcuda::multi_context_t& context) override {
+    // Data slice: enactor, problem and graph handles used below
+    auto E = this->get_enactor();
+    auto P = this->get_problem();
+    auto G = P->get_graph();
+
+    auto triangles_count = P->result.triangles_count;
+    auto iteration = this->iteration;
+    // Per edge with source < neighbor: add the intersection count to source.
+    auto intersect = [G, triangles_count] __host__ __device__(
+                         vertex_t const& source,    // ... source
+                         vertex_t const& neighbor,  // neighbor
+                         edge_t const& edge,        // edge
+                         weight_t const& weight     // weight (tuple).
+                         ) -> bool {
+      if (source < neighbor) {
+        auto src_triangles_count = G.get_intersection_count(source, neighbor);
+        math::atomic::add(&(triangles_count[source]), src_triangles_count);
+      }
+      return false;  // presumably: emit nothing to the output (TODO confirm)
+    };
+
+    // Execute advance operator on the provided lambda
+    operators::advance::execute<operators::load_balance_t::block_mapped>(
+        G, E, intersect, context);
+    std::cout << "iteration: " << iteration << std::endl;  // debug output
+  }
+
+};  // struct enactor_t
+
+template <typename graph_t>
+float run(graph_t& G,
+          typename graph_t::vertex_type* triangles_count,  // Output; added to
+          std::shared_ptr<gcuda::multi_context_t> context =
+              std::shared_ptr<gcuda::multi_context_t>(
+                  new gcuda::multi_context_t(0))  // Context
+) {
+  // <user-defined>
+  using vertex_t = typename graph_t::vertex_type;
+  using weight_t = typename graph_t::weight_type;  // unused in this function
+
+  using param_type = param_t<vertex_t>;
+  using result_type = result_t<vertex_t>;
+
+  param_type param;
+  result_type result(triangles_count);
+  // </user-defined>
+  // <boiler-plate>
+  using problem_type = problem_t<graph_t, param_type, result_type>;
+  using enactor_type = enactor_t<problem_type>;
+
+  problem_type problem(G, param, result, context);
+  problem.init();
+  problem.reset();  // both are no-ops here, so the output is not zeroed
+
+  enactor_type enactor(&problem, context);
+  return enactor.enact();  // the tc example prints this as elapsed ms
+  // </boiler-plate>
+}
+
+}  // namespace tc
+}  // namespace gunrock
\ No newline at end of file
diff --git a/include/gunrock/graph/csr.hxx b/include/gunrock/graph/csr.hxx
index 32e416e7..dae8873a 100644
--- a/include/gunrock/graph/csr.hxx
+++ b/include/gunrock/graph/csr.hxx
@@ -94,6 +94,49 @@ class graph_csr_t {
                                               offsets[source + 1] - 1);
   }
 
+  __host__ __device__ __forceinline__ vertex_type
+  get_intersection_count(const vertex_type& source,
+                         const vertex_type& destination) const {
+    vertex_type intersection_count = 0;
+    // Counts ids found in both source's and destination's neighbor lists.
+    auto source_neighbors_count = get_number_of_neighbors(source);
+    auto destination_neighbors_count = get_number_of_neighbors(destination);
+    // A vertex's list starts at indices + offsets[v] and has that many entries.
+    auto source_offset = offsets[source];
+    auto destination_offset = offsets[destination];
+
+    // if (source_neighbors_count > destination_neighbors_count) {
+    //   std::swap(source_offset, destination_offset);
+    //   std::swap(source_neighbors_count, destination_neighbors_count);
+    // }
+
+    auto source_edges_iter = indices + source_offset;
+    auto destination_edges_iter = indices + destination_offset;
+    // Seed the walk by searching source's list for destination's first entry.
+    auto needle = *destination_edges_iter;  // NOTE(review): no emptiness check
+    auto source_search_start = search::binary::execute(
+        source_edges_iter, needle, 0, source_neighbors_count);
+    edge_type destination_search_start = 0;
+    // Lock-step walk; assumes both lists are sorted ascending (TODO confirm).
+    while (source_search_start < source_neighbors_count &&
+           destination_search_start < destination_neighbors_count) {
+      auto cur_edge_src = source_edges_iter[source_search_start];
+      auto cur_edge_dst = destination_edges_iter[destination_search_start];
+      if (cur_edge_src == cur_edge_dst) {
+        intersection_count++;
+        source_search_start++;
+        destination_search_start++;
+        printf("Triangle: %i, %i, %i\n", source, destination, cur_edge_src);
+      } else if (cur_edge_src > cur_edge_dst) {
+        destination_search_start++;  // destination's entry is the smaller one
+      } else {
+        source_search_start++;  // source's entry is the smaller one
+      }
+    }
+
+    return intersection_count;
+  }
+
   __host__ __device__ __forceinline__ weight_type
   get_edge_weight(edge_type const& e) const {
     return thread::load(&values[e]);
@@ -113,7 +156,7 @@ class graph_csr_t {
     return values;
   }
 
-  // Graph type (inherited from this class) has equivalents of this in graph 
+  // Graph type (inherited from this class) has equivalents of this in graph
   // terminology (vertices and edges). Also include these for linear algebra
   // terminology
   __host__ __device__ __forceinline__ auto get_number_of_rows() const {

From 992ab2765828b6866399062bf3df887a8194646b Mon Sep 17 00:00:00 2001
From: Muhammad Awad <mawad@ucdavis.edu>
Date: Fri, 17 Jun 2022 18:08:37 -0700
Subject: [PATCH 23/58] Add binary search using lower bound

---
 include/gunrock/algorithms/search/binary_search.hxx | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/include/gunrock/algorithms/search/binary_search.hxx b/include/gunrock/algorithms/search/binary_search.hxx
index 5d268170..18935d57 100644
--- a/include/gunrock/algorithms/search/binary_search.hxx
+++ b/include/gunrock/algorithms/search/binary_search.hxx
@@ -41,9 +41,11 @@ namespace binary {
 // that the element found will be leftmost or rightmost element.
 // XXX: Implement Search
 template <typename key_pointer_t, typename key_t, typename int_t>
-__host__ __device__ int_t
-execute(const key_pointer_t& keys, const key_t& key, int_t begin, int_t end) {
-  bound_t bounds = bound_t::upper;
+__host__ __device__ int_t execute(const key_pointer_t& keys,
+                                  const key_t& key,
+                                  int_t begin,
+                                  int_t end,
+                                  const bound_t bounds = bound_t::upper) {
   auto comp = [](const key_t& a, const key_t& b) { return a < b; };
   while (begin < end) {
     int_t mid = (begin + end) / 2;

From c9231d485783e2cf4b8cb9b40f0550dff2fcf8fd Mon Sep 17 00:00:00 2001
From: Muhammad Awad <mawad@ucdavis.edu>
Date: Fri, 17 Jun 2022 18:09:10 -0700
Subject: [PATCH 24/58] Count total number of triangles

---
 examples/algorithms/tc/tc.cu      | 36 ++++++++++++++---------
 include/gunrock/algorithms/tc.hxx | 48 +++++++++++++++++++++++--------
 2 files changed, 59 insertions(+), 25 deletions(-)

diff --git a/examples/algorithms/tc/tc.cu b/examples/algorithms/tc/tc.cu
index efabd604..fcd1143f 100644
--- a/examples/algorithms/tc/tc.cu
+++ b/examples/algorithms/tc/tc.cu
@@ -4,18 +4,18 @@ using namespace gunrock;
 using namespace memory;
 
 void test_tc(int num_arguments, char** argument_array) {
-  if (num_arguments != 2) {
-    std::cerr << "usage: ./bin/tc filename.mtx" << std::endl;
+  if (num_arguments != 3) {
+    std::cerr << "usage: ./bin/tc filename.mtx reduce" << std::endl;
     exit(1);
   }
 
   // --
   // Define types
 
-  using vertex_t = int;
-  using edge_t = int;
+  using vertex_t = uint32_t;
+  using edge_t = uint32_t;
   using weight_t = float;
-  using count_t = int;
+  using count_t = vertex_t;
 
   using csr_t =
       format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
@@ -24,11 +24,18 @@ void test_tc(int num_arguments, char** argument_array) {
   // --
   // IO
 
-  std::string filename = argument_array[1];
+  const std::string filename = argument_array[1];
+  const std::string reduce = argument_array[2];
+  const bool reduce_all_triangles = reduce.find("true") != std::string::npos;
 
   if (util::is_market(filename)) {
     io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
-    csr.from_coo(mm.load(filename));
+    auto mmatrix = mm.load(filename);
+    if (!mm_is_symmetric(mm.code)) {
+      std::cerr << "Error: input matrix must be symmetric" << std::endl;
+      exit(1);
+    }
+    csr.from_coo(mmatrix);
   } else if (util::is_binary_csr(filename)) {
     csr.read_binary(filename);
   } else {
@@ -47,25 +54,28 @@ void test_tc(int num_arguments, char** argument_array) {
       csr.row_offsets.data().get(),     // row_offsets
       csr.column_indices.data().get(),  // column_indices
       csr.nonzero_values.data().get()   // values
-  );  // supports row_indices and column_offsets (default = nullptr)
+  );
 
   // --
   // Params and memory allocation
-  srand(time(NULL));
 
   vertex_t n_vertices = G.get_number_of_vertices();
   thrust::device_vector<count_t> triangles_count(n_vertices, 0);
-  std::cout << "TC for graph with n_vertices = " << n_vertices << std::endl;
 
   // --
   // GPU Run
 
-  float gpu_elapsed = tc::run(G, triangles_count.data().get());
+  std::size_t total_triangles = 0;
+  float gpu_elapsed = tc::run(G, reduce_all_triangles,
+                              triangles_count.data().get(), &total_triangles);
 
   // --
-  // Log + Validate
-  print::head(triangles_count, 40, "Per-vertex triangle count");
+  // Log
 
+  print::head(triangles_count, 40, "Per-vertex triangle count");
+  if (reduce_all_triangles) {
+    std::cout << "Total Graph Traingles : " << total_triangles << std::endl;
+  }
   std::cout << "GPU Elapsed Time : " << gpu_elapsed << " (ms)" << std::endl;
 }
 
diff --git a/include/gunrock/algorithms/tc.hxx b/include/gunrock/algorithms/tc.hxx
index 96565c73..8424299d 100644
--- a/include/gunrock/algorithms/tc.hxx
+++ b/include/gunrock/algorithms/tc.hxx
@@ -17,13 +17,18 @@ namespace tc {
 
 template <typename vertex_t>
 struct param_t {
-  // No parameters for this algorithm
+  bool reduce_all_triangles;
+  param_t(bool _reduce_all_triangles)
+      : reduce_all_triangles(_reduce_all_triangles) {}
 };
 
 template <typename vertex_t>
 struct result_t {
-  vertex_t* triangles_count;
-  result_t(vertex_t* _triangles_count) : triangles_count(_triangles_count) {}
+  vertex_t* vertex_triangles_count;
+  std::size_t* total_triangles_count;
+  result_t(vertex_t* _vertex_triangles_count, uint64_t* _total_triangles_count)
+      : vertex_triangles_count(_vertex_triangles_count),
+        total_triangles_count(_total_triangles_count) {}
 };
 
 template <typename graph_t, typename param_type, typename result_type>
@@ -72,18 +77,22 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
     auto P = this->get_problem();
     auto G = P->get_graph();
 
-    auto triangles_count = P->result.triangles_count;
+    auto vertex_triangles_count = P->result.vertex_triangles_count;
     auto iteration = this->iteration;
 
-    auto intersect = [G, triangles_count] __host__ __device__(
+    auto intersect = [G, vertex_triangles_count] __host__ __device__(
                          vertex_t const& source,    // ... source
                          vertex_t const& neighbor,  // neighbor
                          edge_t const& edge,        // edge
                          weight_t const& weight     // weight (tuple).
                          ) -> bool {
-      if (source < neighbor) {
-        auto src_triangles_count = G.get_intersection_count(source, neighbor);
-        math::atomic::add(&(triangles_count[source]), src_triangles_count);
+      if (neighbor > source) {
+        auto src_vertex_triangles_count = G.get_intersection_count(
+            source, neighbor,
+            [vertex_triangles_count](auto intersection_vertex) {
+              math::atomic::add(&(vertex_triangles_count[intersection_vertex]),
+                                vertex_t{1});
+            });
       }
       return false;
     };
@@ -98,7 +107,9 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
 
 template <typename graph_t>
 float run(graph_t& G,
-          typename graph_t::vertex_type* triangles_count,  // Output
+          bool reduce_all_triangles,
+          typename graph_t::vertex_type* vertex_triangles_count,  // Output
+          std::size_t* total_triangles_count,                     // Output
           std::shared_ptr<gcuda::multi_context_t> context =
               std::shared_ptr<gcuda::multi_context_t>(
                   new gcuda::multi_context_t(0))  // Context
@@ -110,8 +121,8 @@ float run(graph_t& G,
   using param_type = param_t<vertex_t>;
   using result_type = result_t<vertex_t>;
 
-  param_type param;
-  result_type result(triangles_count);
+  param_type param(reduce_all_triangles);
+  result_type result(vertex_triangles_count, total_triangles_count);
   // </user-defined>
 
   using problem_type = problem_t<graph_t, param_type, result_type>;
@@ -122,8 +133,21 @@ float run(graph_t& G,
   problem.reset();
 
   enactor_type enactor(&problem, context);
-  return enactor.enact();
+  auto time = enactor.enact();
+
+  if (param.reduce_all_triangles) {
+    auto policy = context->get_context(0)->execution_policy();
+    *result.total_triangles_count = thrust::transform_reduce(
+        policy, result.vertex_triangles_count,
+        result.vertex_triangles_count + G.get_number_of_vertices(),
+        [] __device__(const vertex_t& vertex_triangles) {
+          return static_cast<std::size_t>(vertex_triangles);
+        },
+        std::size_t{0}, thrust::plus<std::size_t>());
+  }
+
   // </boiler-plate>
+  return time;
 }
 
 }  // namespace tc

From 79b0d7810a9d3de0c3e0b1d6001cf1381542b717 Mon Sep 17 00:00:00 2001
From: Muhammad Awad <mawad@ucdavis.edu>
Date: Fri, 17 Jun 2022 18:09:51 -0700
Subject: [PATCH 25/58] Use lower bound

---
 include/gunrock/graph/csr.hxx | 31 +++++++++++++++++++++++++++----
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/include/gunrock/graph/csr.hxx b/include/gunrock/graph/csr.hxx
index dae8873a..0ceafaae 100644
--- a/include/gunrock/graph/csr.hxx
+++ b/include/gunrock/graph/csr.hxx
@@ -94,9 +94,21 @@ class graph_csr_t {
                                               offsets[source + 1] - 1);
   }
 
+  /**
+   * @brief Count the number of vertices belonging to the set intersection
+   * between the source and destination vertices adjacency lists. Executes a
+   * function on each intersection.
+   *
+   * @param source Index of the source vertex
+   * @param destination Index of the destination
+   * @param on_intersection Lambda function executed at each intersection
+   * @return Number of shared vertices between source and destination
+   */
+  template <typename operator_type>
   __host__ __device__ __forceinline__ vertex_type
   get_intersection_count(const vertex_type& source,
-                         const vertex_type& destination) const {
+                         const vertex_type& destination,
+                         operator_type on_intersection) const {
     vertex_type intersection_count = 0;
 
     auto source_neighbors_count = get_number_of_neighbors(source);
@@ -114,19 +126,30 @@ class graph_csr_t {
     auto destination_edges_iter = indices + destination_offset;
 
     auto needle = *destination_edges_iter;
-    auto source_search_start = search::binary::execute(
-        source_edges_iter, needle, 0, source_neighbors_count);
+    auto source_search_start =
+        search::binary::execute(source_edges_iter, needle, vertex_t{0},
+                                source_neighbors_count, search::bound_t::lower);
     edge_type destination_search_start = 0;
+    // printf("[%i -> %i] %i, [%i, %i], [%i, %i]\n", source, destination,
+    // needle,
+    //        source_search_start, destination_search_start,
+    //        source_neighbors_count, destination_neighbors_count);
 
     while (source_search_start < source_neighbors_count &&
            destination_search_start < destination_neighbors_count) {
       auto cur_edge_src = source_edges_iter[source_search_start];
       auto cur_edge_dst = destination_edges_iter[destination_search_start];
+      // printf("%i, %i | %i, %i\n", cur_edge_src, cur_edge_dst,
+      //        source_search_start, destination_search_start);
+      // if (source == 1 and destination == 2) {
+      //   printf("%i, %i\n", cur_edge_src, cur_edge_dst);
+      // }
       if (cur_edge_src == cur_edge_dst) {
         intersection_count++;
         source_search_start++;
         destination_search_start++;
-        printf("Triangle: %i, %i, %i\n", source, destination, cur_edge_src);
+        on_intersection(cur_edge_src);
+        // printf("Triangle: %i, %i, %i\n", source, destination, cur_edge_src);
       } else if (cur_edge_src > cur_edge_dst) {
         destination_search_start++;
       } else {

From 19011d199aaf8ea3b0d2b848253ded37acf77f46 Mon Sep 17 00:00:00 2001
From: Muhammad Awad <mawad@ucdavis.edu>
Date: Fri, 17 Jun 2022 18:10:03 -0700
Subject: [PATCH 26/58] Add unit test

---
 unittests/algorithms/tc.cuh | 55 +++++++++++++++++++++++++++++++++++++
 unittests/unittests.hxx     |  2 ++
 2 files changed, 57 insertions(+)
 create mode 100644 unittests/algorithms/tc.cuh

diff --git a/unittests/algorithms/tc.cuh b/unittests/algorithms/tc.cuh
new file mode 100644
index 00000000..764b181f
--- /dev/null
+++ b/unittests/algorithms/tc.cuh
@@ -0,0 +1,55 @@
+/**
+ * @file tc.cuh
+ * @author Muhammad A. Awad (awad@ucdavis.edu)
+ * @brief Unit test for the triangle counting algorithm.
+ * @version 0.1
+ * @date 2022-17-06
+ *
+ * @copyright Copyright (c) 2022
+ *
+ */
+
+#include <gunrock/graph/graph.hxx>
+#include <gunrock/formats/formats.hxx>
+#include <gunrock/algorithms/tc.hxx>
+
+using namespace gunrock;
+using namespace memory;
+
+TEST(algorithm, tc) {
+  // CSR Matrix Representation
+  // V            = [ 1 1 1 1 ]
+  // ROW_OFFSETS  = [ 0 3 5 8 10 ]
+  // COL_INDEX    = [ 1 2 3 0 2 0 1 3 0 2]
+
+  using vertex_t = int;
+  using edge_t = int;
+  using weight_t = int;
+
+  vertex_t number_of_rows = 4, number_of_columns = 4;
+  edge_t number_of_nonzeros = 10;
+  thrust::device_vector<edge_t> Ap = std::vector{0, 3, 5, 8, 10};
+  thrust::device_vector<vertex_t> Aj =
+      std::vector{1, 2, 3, 0, 2, 0, 1, 3, 0, 2};
+  thrust::device_vector<weight_t> Ax(number_of_nonzeros, 0);
+
+  auto G = graph::build::from_csr<memory_space_t::device, graph::view_t::csr>(
+      number_of_rows, number_of_columns, number_of_nonzeros, Ap.data().get(),
+      Aj.data().get(), Ax.data().get());
+
+  std::size_t total_triangles = 0;
+  thrust::device_vector<vertex_t> d_triangles_count(number_of_rows, 0);
+  tc::run(G, true, d_triangles_count.data().get(), &total_triangles);
+
+  thrust::host_vector<vertex_t> h_triangles_count(d_triangles_count);
+
+  std::size_t reference_total_triangles = 6;
+  thrust::host_vector<vertex_t> reference_traingles_count =
+      std::vector{2, 1, 2, 1};
+
+  for (std::size_t v = 0; v < number_of_rows; v++) {
+    EXPECT_EQ(h_triangles_count[v], reference_traingles_count[v]);
+  }
+
+  EXPECT_EQ(total_triangles, reference_total_triangles);
+}
diff --git a/unittests/unittests.hxx b/unittests/unittests.hxx
index 4bfba366..927e8f7f 100644
--- a/unittests/unittests.hxx
+++ b/unittests/unittests.hxx
@@ -41,3 +41,5 @@
 // #include "io/matrix_market.cuh"
 #include "io/smtx.cuh"
 // #include "io/mtxbin.cuh"
+
+#include "algorithms/tc.cuh"

From 166809f66497512d479abf62b783deeb0a0244a1 Mon Sep 17 00:00:00 2001
From: Muhammad Awad <mawad@ucdavis.edu>
Date: Wed, 22 Jun 2022 09:37:15 -0700
Subject: [PATCH 27/58] Add switch source and destination optimization

---
 include/gunrock/graph/csr.hxx | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/include/gunrock/graph/csr.hxx b/include/gunrock/graph/csr.hxx
index 0ceafaae..8903499e 100644
--- a/include/gunrock/graph/csr.hxx
+++ b/include/gunrock/graph/csr.hxx
@@ -111,16 +111,19 @@ class graph_csr_t {
                          operator_type on_intersection) const {
     vertex_type intersection_count = 0;
 
+    auto intersection_source = source;
+    auto intersection_destination = destination;
+
     auto source_neighbors_count = get_number_of_neighbors(source);
     auto destination_neighbors_count = get_number_of_neighbors(destination);
 
-    auto source_offset = offsets[source];
-    auto destination_offset = offsets[destination];
+    if (source_neighbors_count > destination_neighbors_count) {
+      std::swap(intersection_source, intersection_destination);
+      std::swap(source_neighbors_count, destination_neighbors_count);
+    }
 
-    // if (source_neighbors_count > destination_neighbors_count) {
-    //   std::swap(source_offset, destination_offset);
-    //   std::swap(source_neighbors_count, destination_neighbors_count);
-    // }
+    auto source_offset = offsets[intersection_source];
+    auto destination_offset = offsets[intersection_destination];
 
     auto source_edges_iter = indices + source_offset;
     auto destination_edges_iter = indices + destination_offset;

From 9fd00ac3630c6d5a4f1279cae850313a9c41cd9b Mon Sep 17 00:00:00 2001
From: Muhammad Awad <mawad@ucdavis.edu>
Date: Wed, 22 Jun 2022 09:38:38 -0700
Subject: [PATCH 28/58] Remove comments

---
 include/gunrock/graph/csr.hxx | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/include/gunrock/graph/csr.hxx b/include/gunrock/graph/csr.hxx
index 8903499e..2df6a664 100644
--- a/include/gunrock/graph/csr.hxx
+++ b/include/gunrock/graph/csr.hxx
@@ -133,26 +133,16 @@ class graph_csr_t {
         search::binary::execute(source_edges_iter, needle, vertex_t{0},
                                 source_neighbors_count, search::bound_t::lower);
     edge_type destination_search_start = 0;
-    // printf("[%i -> %i] %i, [%i, %i], [%i, %i]\n", source, destination,
-    // needle,
-    //        source_search_start, destination_search_start,
-    //        source_neighbors_count, destination_neighbors_count);
 
     while (source_search_start < source_neighbors_count &&
            destination_search_start < destination_neighbors_count) {
       auto cur_edge_src = source_edges_iter[source_search_start];
       auto cur_edge_dst = destination_edges_iter[destination_search_start];
-      // printf("%i, %i | %i, %i\n", cur_edge_src, cur_edge_dst,
-      //        source_search_start, destination_search_start);
-      // if (source == 1 and destination == 2) {
-      //   printf("%i, %i\n", cur_edge_src, cur_edge_dst);
-      // }
       if (cur_edge_src == cur_edge_dst) {
         intersection_count++;
         source_search_start++;
         destination_search_start++;
         on_intersection(cur_edge_src);
-        // printf("Triangle: %i, %i, %i\n", source, destination, cur_edge_src);
       } else if (cur_edge_src > cur_edge_dst) {
         destination_search_start++;
       } else {

From 115a1b16579b393f2e761b2b0789ddfcb1de0906 Mon Sep 17 00:00:00 2001
From: Muhammad Awad <mawad@ucdavis.edu>
Date: Wed, 22 Jun 2022 09:40:34 -0700
Subject: [PATCH 29/58] Fix documentation

---
 include/gunrock/algorithms/tc.hxx | 4 +---
 unittests/algorithms/tc.cuh       | 2 +-
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/include/gunrock/algorithms/tc.hxx b/include/gunrock/algorithms/tc.hxx
index 8424299d..3d86b14a 100644
--- a/include/gunrock/algorithms/tc.hxx
+++ b/include/gunrock/algorithms/tc.hxx
@@ -1,5 +1,5 @@
 /**
- * @file sssp.hxx
+ * @file tc.hxx
  * @author Muhammad A. Awad (mawad@ucdavis.edu)
  * @brief Triangle Counting algorithm.
  * @version 0.1
@@ -100,9 +100,7 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
     // Execute advance operator on the provided lambda
     operators::advance::execute<operators::load_balance_t::block_mapped>(
         G, E, intersect, context);
-    std::cout << "iteration: " << iteration << std::endl;
   }
-
 };  // struct enactor_t
 
 template <typename graph_t>
diff --git a/unittests/algorithms/tc.cuh b/unittests/algorithms/tc.cuh
index 764b181f..41ece23f 100644
--- a/unittests/algorithms/tc.cuh
+++ b/unittests/algorithms/tc.cuh
@@ -1,6 +1,6 @@
 /**
  * @file tc.cuh
- * @author Muhammad A. Awad (awad@ucdavis.edu)
+ * @author Muhammad A. Awad (mawad@ucdavis.edu)
  * @brief Unit test for the triangle counting algorithm.
  * @version 0.1
  * @date 2022-17-06

From 2b0874249a2d57b821842f3bbf251a47abb72d39 Mon Sep 17 00:00:00 2001
From: Muhammad Awad <mawad@ucdavis.edu>
Date: Wed, 22 Jun 2022 12:56:35 -0700
Subject: [PATCH 30/58] Add post processing step

---
 include/gunrock/algorithms/tc.hxx | 32 ++++++++++++++++++++-----------
 1 file changed, 21 insertions(+), 11 deletions(-)

diff --git a/include/gunrock/algorithms/tc.hxx b/include/gunrock/algorithms/tc.hxx
index 3d86b14a..de1aaef1 100644
--- a/include/gunrock/algorithms/tc.hxx
+++ b/include/gunrock/algorithms/tc.hxx
@@ -11,6 +11,7 @@
 #pragma once
 
 #include <gunrock/algorithms/algorithms.hxx>
+#include <gunrock/util/timer.hxx>
 
 namespace gunrock {
 namespace tc {
@@ -101,6 +102,25 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
     operators::advance::execute<operators::load_balance_t::block_mapped>(
         G, E, intersect, context);
   }
+
+  float post_process() {
+    util::timer_t timer;
+    timer.begin();
+    auto P = this->get_problem();
+    auto G = P->get_graph();
+
+    if (P->param.reduce_all_triangles) {
+      auto policy = this->context->get_context(0)->execution_policy();
+      *P->result.total_triangles_count = thrust::transform_reduce(
+          policy, P->result.vertex_triangles_count,
+          P->result.vertex_triangles_count + G.get_number_of_vertices(),
+          [] __device__(const vertex_t& vertex_triangles) {
+            return static_cast<std::size_t>(vertex_triangles);
+          },
+          std::size_t{0}, thrust::plus<std::size_t>());
+    }
+    return timer.end();
+  }
 };  // struct enactor_t
 
 template <typename graph_t>
@@ -132,17 +152,7 @@ float run(graph_t& G,
 
   enactor_type enactor(&problem, context);
   auto time = enactor.enact();
-
-  if (param.reduce_all_triangles) {
-    auto policy = context->get_context(0)->execution_policy();
-    *result.total_triangles_count = thrust::transform_reduce(
-        policy, result.vertex_triangles_count,
-        result.vertex_triangles_count + G.get_number_of_vertices(),
-        [] __device__(const vertex_t& vertex_triangles) {
-          return static_cast<std::size_t>(vertex_triangles);
-        },
-        std::size_t{0}, thrust::plus<std::size_t>());
-  }
+  time += enactor.post_process();
 
   // </boiler-plate>
   return time;

From c9ee76801ad9ae247b4e7149da79f1b304337e77 Mon Sep 17 00:00:00 2001
From: Muhammad Awad <mawad@ucdavis.edu>
Date: Wed, 22 Jun 2022 17:37:59 -0700
Subject: [PATCH 31/58] [skip ci] Use thrust algorithm for lower bound

---
 include/gunrock/graph/csr.hxx | 30 ++++++++++++++++++++++++------
 1 file changed, 24 insertions(+), 6 deletions(-)

diff --git a/include/gunrock/graph/csr.hxx b/include/gunrock/graph/csr.hxx
index 2df6a664..35e7bc7d 100644
--- a/include/gunrock/graph/csr.hxx
+++ b/include/gunrock/graph/csr.hxx
@@ -10,6 +10,9 @@
 #include <gunrock/graph/vertex_pair.hxx>
 #include <gunrock/algorithms/search/binary_search.hxx>
 
+#include <thrust/binary_search.h>
+#include <thrust/execution_policy.h>
+
 namespace gunrock {
 namespace graph {
 
@@ -117,10 +120,14 @@ class graph_csr_t {
     auto source_neighbors_count = get_number_of_neighbors(source);
     auto destination_neighbors_count = get_number_of_neighbors(destination);
 
-    if (source_neighbors_count > destination_neighbors_count) {
-      std::swap(intersection_source, intersection_destination);
-      std::swap(source_neighbors_count, destination_neighbors_count);
+    if (source_neighbors_count == 0 || destination_neighbors_count == 0) {
+      printf("Singleton node\n");
+      return 0;
     }
+    // if (source_neighbors_count > destination_neighbors_count) {
+    //   std::swap(intersection_source, intersection_destination);
+    //   std::swap(source_neighbors_count, destination_neighbors_count);
+    // }
 
     auto source_offset = offsets[intersection_source];
     auto destination_offset = offsets[intersection_destination];
@@ -129,9 +136,20 @@ class graph_csr_t {
     auto destination_edges_iter = indices + destination_offset;
 
     auto needle = *destination_edges_iter;
-    auto source_search_start =
-        search::binary::execute(source_edges_iter, needle, vertex_t{0},
-                                source_neighbors_count, search::bound_t::lower);
+    // auto source_search_start =
+    //     search::binary::execute(source_edges_iter, needle, vertex_t{0},
+    //                             source_neighbors_count,
+    //                             search::bound_t::lower);
+
+    auto source_search_start = thrust::distance(
+        source_edges_iter,
+        thrust::lower_bound(thrust::seq, source_edges_iter,
+                            source_edges_iter + source_neighbors_count,
+                            needle));
+
+    if (source_search_start == source_neighbors_count) {
+      return 0;
+    }
     edge_type destination_search_start = 0;
 
     while (source_search_start < source_neighbors_count &&

From d89537d6be6f0e0c46b4d6b7fd79c7d83e49c645 Mon Sep 17 00:00:00 2001
From: Muhammad Awad <mawad@ucdavis.edu>
Date: Wed, 22 Jun 2022 17:38:41 -0700
Subject: [PATCH 32/58] [skip ci] Use entire graph as input frontier

---
 include/gunrock/algorithms/tc.hxx | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/include/gunrock/algorithms/tc.hxx b/include/gunrock/algorithms/tc.hxx
index de1aaef1..2d543d93 100644
--- a/include/gunrock/algorithms/tc.hxx
+++ b/include/gunrock/algorithms/tc.hxx
@@ -56,8 +56,9 @@ struct problem_t : gunrock::problem_t<graph_t> {
 template <typename problem_t>
 struct enactor_t : gunrock::enactor_t<problem_t> {
   enactor_t(problem_t* _problem,
-            std::shared_ptr<gcuda::multi_context_t> _context)
-      : gunrock::enactor_t<problem_t>(_problem, _context) {}
+            std::shared_ptr<gcuda::multi_context_t> _context,
+            enactor_properties_t _properties)
+      : gunrock::enactor_t<problem_t>(_problem, _context, _properties) {}
 
   using vertex_t = typename problem_t::vertex_t;
   using edge_t = typename problem_t::edge_t;
@@ -79,7 +80,6 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
     auto G = P->get_graph();
 
     auto vertex_triangles_count = P->result.vertex_triangles_count;
-    auto iteration = this->iteration;
 
     auto intersect = [G, vertex_triangles_count] __host__ __device__(
                          vertex_t const& source,    // ... source
@@ -99,10 +99,19 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
     };
 
     // Execute advance operator on the provided lambda
-    operators::advance::execute<operators::load_balance_t::block_mapped>(
+    operators::advance::execute<operators::load_balance_t::block_mapped,
+                                operators::advance_direction_t::forward,
+                                operators::advance_io_type_t::graph,
+                                operators::advance_io_type_t::none>(
         G, E, intersect, context);
   }
 
+  virtual bool is_converged(gcuda::multi_context_t& context) {
+    if (this->iteration == 1)
+      return true;
+    return false;
+  }
+
   float post_process() {
     util::timer_t timer;
     timer.begin();
@@ -150,7 +159,10 @@ float run(graph_t& G,
   problem.init();
   problem.reset();
 
-  enactor_type enactor(&problem, context);
+  // Disable internal-frontiers:
+  enactor_properties_t props;
+  props.self_manage_frontiers = true;
+  enactor_type enactor(&problem, context, props);
   auto time = enactor.enact();
   time += enactor.post_process();
 

From f514314df5b3bc92210a88913a611d0a67d55cbd Mon Sep 17 00:00:00 2001
From: Muhammad Awad <mawad@ucdavis.edu>
Date: Wed, 22 Jun 2022 17:53:53 -0700
Subject: [PATCH 33/58] [skip ci] Remove dead code

---
 include/gunrock/algorithms/tc.hxx | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/include/gunrock/algorithms/tc.hxx b/include/gunrock/algorithms/tc.hxx
index 2d543d93..ee5abcd9 100644
--- a/include/gunrock/algorithms/tc.hxx
+++ b/include/gunrock/algorithms/tc.hxx
@@ -65,14 +65,6 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
   using weight_t = typename problem_t::weight_t;
   using frontier_t = typename enactor_t<problem_t>::frontier_t;
 
-  void prepare_frontier(frontier_t* f,
-                        gcuda::multi_context_t& context) override {
-    auto P = this->get_problem();
-    auto n_vertices = P->get_graph().get_number_of_vertices();
-
-    f->sequence((vertex_t)0, n_vertices, context.get_context(0)->stream());
-  }
-
   void loop(gcuda::multi_context_t& context) override {
     // Data slice
     auto E = this->get_enactor();

From c1f1b551ba0920121dcfbb0ce13021d6878c8509 Mon Sep 17 00:00:00 2001
From: Muhammad Awad <muhammad.a.awad@gmail.com>
Date: Thu, 23 Jun 2022 15:35:17 -0700
Subject: [PATCH 34/58] Remove edge and vertices count from `graph`

---
 include/gunrock/graph/coo.hxx   | 12 ++++++++++--
 include/gunrock/graph/csc.hxx   | 12 ++++++++++--
 include/gunrock/graph/csr.hxx   | 12 ++++++++++--
 include/gunrock/graph/graph.hxx | 10 ++++------
 4 files changed, 34 insertions(+), 12 deletions(-)

diff --git a/include/gunrock/graph/coo.hxx b/include/gunrock/graph/coo.hxx
index 51971861..8d4f28fa 100644
--- a/include/gunrock/graph/coo.hxx
+++ b/include/gunrock/graph/coo.hxx
@@ -105,6 +105,14 @@ class graph_coo_t {
     return values;
   }
 
+  __host__ __device__ __forceinline__ auto get_number_of_vertices() const {
+    return number_of_vertices;
+  }
+
+  __host__ __device__ __forceinline__ auto get_number_of_edges() const {
+    return number_of_edges;
+  }
+
  protected:
   __host__ __device__ void set(vertex_type const& _number_of_vertices,
                                edge_type const& _number_of_edges,
@@ -121,8 +129,8 @@ class graph_coo_t {
 
  private:
   // Underlying data storage
-  vertex_type number_of_vertices;  // XXX: redundant
-  edge_type number_of_edges;       // XXX: redundant
+  vertex_type number_of_vertices;
+  edge_type number_of_edges;
 
   vertex_type* row_indices;
   vertex_type* column_indices;
diff --git a/include/gunrock/graph/csc.hxx b/include/gunrock/graph/csc.hxx
index 0555a14a..6e3e4f11 100644
--- a/include/gunrock/graph/csc.hxx
+++ b/include/gunrock/graph/csc.hxx
@@ -108,6 +108,14 @@ class graph_csc_t {
     return values;
   }
 
+  __host__ __device__ __forceinline__ auto get_number_of_vertices() const {
+    return number_of_vertices;
+  }
+
+  __host__ __device__ __forceinline__ auto get_number_of_edges() const {
+    return number_of_edges;
+  }
+
  protected:
   __host__ __device__ void set(vertex_type const& _number_of_vertices,
                                edge_type const& _number_of_edges,
@@ -124,8 +132,8 @@ class graph_csc_t {
 
  private:
   // Underlying data storage
-  vertex_type number_of_vertices;  // XXX: redundant
-  edge_type number_of_edges;       // XXX: redundant
+  vertex_type number_of_vertices;
+  edge_type number_of_edges;
 
   edge_type* offsets;
   vertex_type* indices;
diff --git a/include/gunrock/graph/csr.hxx b/include/gunrock/graph/csr.hxx
index 35e7bc7d..c008f1c9 100644
--- a/include/gunrock/graph/csr.hxx
+++ b/include/gunrock/graph/csr.hxx
@@ -205,6 +205,14 @@ class graph_csr_t {
     return number_of_edges;
   }
 
+  __host__ __device__ __forceinline__ auto get_number_of_vertices() const {
+    return number_of_vertices;
+  }
+
+  __host__ __device__ __forceinline__ auto get_number_of_edges() const {
+    return number_of_edges;
+  }
+
  protected:
   __host__ __device__ void set(vertex_type const& _number_of_vertices,
                                edge_type const& _number_of_edges,
@@ -221,8 +229,8 @@ class graph_csr_t {
 
  private:
   // Underlying data storage
-  vertex_type number_of_vertices;  // XXX: redundant
-  edge_type number_of_edges;       // XXX: redundant
+  vertex_type number_of_vertices;  
+  edge_type number_of_edges;       
 
   edge_type* offsets;
   vertex_type* indices;
diff --git a/include/gunrock/graph/graph.hxx b/include/gunrock/graph/graph.hxx
index 7a39e55e..9a4e77da 100644
--- a/include/gunrock/graph/graph.hxx
+++ b/include/gunrock/graph/graph.hxx
@@ -99,9 +99,10 @@ class graph_t : public graph_view_t... {
    *
    * @return vertex_type number of vertices in the graph.
    */
+  template <class input_view_t = default_view_t>
   __host__ __device__ __forceinline__ const vertex_type
   get_number_of_vertices() const {
-    return number_of_vertices;
+    return input_view_t::get_number_of_vertices();
   }
 
   /**
@@ -110,9 +111,10 @@ class graph_t : public graph_view_t... {
    *
    * @return edge_type number of edges in the graph.
    */
+  template <class input_view_t = default_view_t>
   __host__ __device__ __forceinline__ const edge_type
   get_number_of_edges() const {
-    return number_of_edges;
+    return input_view_t::get_number_of_edges();
   }
 
   /**
@@ -190,8 +192,6 @@ class graph_t : public graph_view_t... {
   __host__ __device__ void set(vertex_type const& _number_of_vertices,
                                edge_type const& _number_of_edges,
                                args_t... args) {
-    this->number_of_vertices = _number_of_vertices;
-    this->number_of_edges = _number_of_edges;
     input_view_t::set(_number_of_vertices, _number_of_edges, args...);
   }
 
@@ -316,8 +316,6 @@ class graph_t : public graph_view_t... {
   static constexpr std::size_t number_of_formats_inherited =
       std::tuple_size_v<true_view_t>;
 
-  vertex_type number_of_vertices;
-  edge_type number_of_edges;
   graph_properties_t properties;
 
 };  // namespace graph

From c53f7d88992124572457ecf75da1193417b473c4 Mon Sep 17 00:00:00 2001
From: Muhammad Awad <muhammad.a.awad@gmail.com>
Date: Thu, 23 Jun 2022 15:37:10 -0700
Subject: [PATCH 35/58] Move `syncthreads` to correct location

---
 include/gunrock/framework/operators/advance/block_mapped.hxx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/gunrock/framework/operators/advance/block_mapped.hxx b/include/gunrock/framework/operators/advance/block_mapped.hxx
index 72add815..bdd0a754 100644
--- a/include/gunrock/framework/operators/advance/block_mapped.hxx
+++ b/include/gunrock/framework/operators/advance/block_mapped.hxx
@@ -99,8 +99,8 @@ __global__ void __launch_bounds__(THREADS_PER_BLOCK, 2)
     if (local_idx == 0)
       offset[0] = math::atomic::add(
           &block_offsets[0], (offset_counter_t)aggregate_degree_per_block);
-    __syncthreads();
   }
+  __syncthreads();
 
   auto length = global_idx - local_idx + gcuda::block::size::x();
 

From 3e163751ef72beeb752eb53a04ad89bfbdea0571 Mon Sep 17 00:00:00 2001
From: Muhammad Awad <muhammad.a.awad@gmail.com>
Date: Thu, 23 Jun 2022 15:40:31 -0700
Subject: [PATCH 36/58] Correct initializer list order

---
 include/gunrock/framework/enactor.hxx | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/gunrock/framework/enactor.hxx b/include/gunrock/framework/enactor.hxx
index ede978d5..99bde9a9 100644
--- a/include/gunrock/framework/enactor.hxx
+++ b/include/gunrock/framework/enactor.hxx
@@ -161,15 +161,15 @@ struct enactor_t {
   enactor_t(algorithm_problem_t* _problem,
             std::shared_ptr<gcuda::multi_context_t> _context,
             enactor_properties_t _properties = enactor_properties_t())
-      : problem(_problem),
-        properties(_properties),
+      : properties(_properties),
         context(_context),
+        problem(_problem),
         frontiers(properties.number_of_frontier_buffers),
+        scanned_work_domain(problem->get_graph().get_number_of_vertices() + 1),
         active_frontier(reinterpret_cast<frontier_t*>(&frontiers[0])),
         inactive_frontier(reinterpret_cast<frontier_t*>(&frontiers[1])),
         buffer_selector(0),
-        iteration(0),
-        scanned_work_domain(problem->get_graph().get_number_of_vertices() + 1) {
+        iteration(0) {
     /*!
      * If the self manage frontiers property is false, the enactor interface
      * will resize the frontier buffers ahead of time to avoid the first

From 197325fe0c0cab9fed2f52c79a820e446a045bc5 Mon Sep 17 00:00:00 2001
From: Muhammad Awad <mawad@ucdavis.edu>
Date: Thu, 23 Jun 2022 16:39:30 -0700
Subject: [PATCH 37/58] [skip ci] Add intersection optimization back

---
 include/gunrock/graph/csr.hxx | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/include/gunrock/graph/csr.hxx b/include/gunrock/graph/csr.hxx
index c008f1c9..4d7467ae 100644
--- a/include/gunrock/graph/csr.hxx
+++ b/include/gunrock/graph/csr.hxx
@@ -121,13 +121,12 @@ class graph_csr_t {
     auto destination_neighbors_count = get_number_of_neighbors(destination);
 
     if (source_neighbors_count == 0 || destination_neighbors_count == 0) {
-      printf("Singleton node\n");
       return 0;
     }
-    // if (source_neighbors_count > destination_neighbors_count) {
-    //   std::swap(intersection_source, intersection_destination);
-    //   std::swap(source_neighbors_count, destination_neighbors_count);
-    // }
+    if (source_neighbors_count > destination_neighbors_count) {
+      std::swap(intersection_source, intersection_destination);
+      std::swap(source_neighbors_count, destination_neighbors_count);
+    }
 
     auto source_offset = offsets[intersection_source];
     auto destination_offset = offsets[intersection_destination];
@@ -229,8 +228,8 @@ class graph_csr_t {
 
  private:
   // Underlying data storage
-  vertex_type number_of_vertices;  
-  edge_type number_of_edges;       
+  vertex_type number_of_vertices;
+  edge_type number_of_edges;
 
   edge_type* offsets;
   vertex_type* indices;

From 17567b8fdb169ccb5d0fa2da049ac877473acd91 Mon Sep 17 00:00:00 2001
From: Muhammad Awad <mawad@ucdavis.edu>
Date: Thu, 23 Jun 2022 16:39:48 -0700
Subject: [PATCH 38/58] Fix compile error

---
 include/gunrock/graph/graph.hxx | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/include/gunrock/graph/graph.hxx b/include/gunrock/graph/graph.hxx
index 9a4e77da..29f95d69 100644
--- a/include/gunrock/graph/graph.hxx
+++ b/include/gunrock/graph/graph.hxx
@@ -87,11 +87,7 @@ class graph_t : public graph_view_t... {
   /**
    * @brief Default constructor for the graph.
    */
-  __host__ __device__ graph_t()
-      : number_of_vertices(0),
-        number_of_edges(0),
-        properties(),
-        graph_view_t()... {}
+  __host__ __device__ graph_t() : properties(), graph_view_t()... {}
 
   /**
    * @brief Get the number of vertices in the graph. Callable from both host and

From bc7cb06a42248405c32250acf94b647820652bca Mon Sep 17 00:00:00 2001
From: Muhammad Awad <mawad@ucdavis.edu>
Date: Thu, 23 Jun 2022 16:59:09 -0700
Subject: [PATCH 39/58] Remove dead code

---
 include/gunrock/graph/csr.hxx | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/include/gunrock/graph/csr.hxx b/include/gunrock/graph/csr.hxx
index 4d7467ae..ca4e31a3 100644
--- a/include/gunrock/graph/csr.hxx
+++ b/include/gunrock/graph/csr.hxx
@@ -135,10 +135,6 @@ class graph_csr_t {
     auto destination_edges_iter = indices + destination_offset;
 
     auto needle = *destination_edges_iter;
-    // auto source_search_start =
-    //     search::binary::execute(source_edges_iter, needle, vertex_t{0},
-    //                             source_neighbors_count,
-    //                             search::bound_t::lower);
 
     auto source_search_start = thrust::distance(
         source_edges_iter,

From 0cc81b13b5ff1976bda422f96cd3029cb33d33be Mon Sep 17 00:00:00 2001
From: Muhammad Awad <mawad@ucdavis.edu>
Date: Fri, 24 Jun 2022 13:09:55 -0700
Subject: [PATCH 40/58] Add TC benchmarking

---
 benchmarks/CMakeLists.txt |  13 ++--
 benchmarks/tc_bench.cu    | 146 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 153 insertions(+), 6 deletions(-)
 create mode 100644 benchmarks/tc_bench.cu

diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 74b0e450..36f5fa02 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -12,27 +12,28 @@ set(BENCHMARK_SOURCES
   spgemm_bench.cu
   spmv_bench.cu
   sssp_bench.cu
+  tc_bench.cu
 )
 
 foreach(SOURCE IN LISTS BENCHMARK_SOURCES)
   get_filename_component(BENCHMARK_NAME ${SOURCE} NAME_WLE)
   add_executable(${BENCHMARK_NAME} ${SOURCE})
   if(SOURCE MATCHES "for.cu")
-    target_link_libraries(${BENCHMARK_NAME} 
+    target_link_libraries(${BENCHMARK_NAME}
       PRIVATE essentials
       PRIVATE nvbench::main
     )
   else()
-    target_link_libraries(${BENCHMARK_NAME} 
+    target_link_libraries(${BENCHMARK_NAME}
       PRIVATE essentials
       PRIVATE nvbench::nvbench
     )
-  endif()  
-  get_target_property(ESSENTIALS_ARCHITECTURES 
+  endif()
+  get_target_property(ESSENTIALS_ARCHITECTURES
     essentials CUDA_ARCHITECTURES
   )
-  set_target_properties(${BENCHMARK_NAME} 
-    PROPERTIES 
+  set_target_properties(${BENCHMARK_NAME}
+    PROPERTIES
       CUDA_ARCHITECTURES ${ESSENTIALS_ARCHITECTURES}
   )
   message(STATUS "Benchmark Added: ${BENCHMARK_NAME}")
diff --git a/benchmarks/tc_bench.cu b/benchmarks/tc_bench.cu
new file mode 100644
index 00000000..fbf4e00d
--- /dev/null
+++ b/benchmarks/tc_bench.cu
@@ -0,0 +1,146 @@
+#include <nvbench/nvbench.cuh>
+#include <cxxopts.hpp>
+#include <gunrock/algorithms/algorithms.hxx>
+#include <gunrock/algorithms/tc.hxx>
+
+using namespace gunrock;
+using namespace memory;
+
+using vertex_t = uint32_t;
+using edge_t = uint32_t;
+using weight_t = float;
+using count_t = vertex_t;
+
+std::string filename_;
+bool reduce_all_triangles_;
+struct parameters_t {
+  std::string filename;
+  bool reduce_all_triangles;
+  bool help = false;
+  cxxopts::Options options;
+
+  /**
+   * @brief Construct a new parameters object and parse command line arguments.
+   *
+   * @param argc Number of command line arguments.
+   * @param argv Command line arguments.
+   */
+  parameters_t(int argc, char** argv) : options(argv[0], "TC Benchmarking") {
+    options.allow_unrecognised_options();
+    // Add command line options
+    options.add_options()("h,help", "Print help")  // help
+        ("m,market", "Matrix file", cxxopts::value<std::string>())(
+            "r,reduce",
+            "Compute a single triangle count for the entire graph (default = "
+            "false)",
+            cxxopts::value<bool>()->default_value("false"));
+
+    // Parse command line arguments
+    auto result = options.parse(argc, argv);
+
+    if (result.count("help")) {
+      help = true;
+      std::cout << options.help({""});
+      std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+      // Do not exit so we also print NVBench help.
+    } else {
+      if (result.count("market") == 1) {
+        filename = result["market"].as<std::string>();
+        if (!util::is_market(filename)) {
+          std::cout << options.help({""});
+          std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+          std::exit(0);
+        }
+        reduce_all_triangles = result["reduce"].as<bool>();
+      } else {
+        std::cout << options.help({""});
+        std::cout << "  [optional nvbench args]" << std::endl << std::endl;
+        std::exit(0);
+      }
+    }
+  }
+};
+
+void tc_bench(nvbench::state& state) {
+  // --
+  // Add metrics
+  state.collect_dram_throughput();
+  state.collect_l1_hit_rates();
+  state.collect_l2_hit_rates();
+  state.collect_loads_efficiency();
+  state.collect_stores_efficiency();
+
+  // --
+  // Define types
+  using csr_t =
+      format::csr_t<memory_space_t::device, vertex_t, edge_t, weight_t>;
+
+  // --
+  // Build graph + metadata
+  csr_t csr;
+  io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
+  auto mmatrix = mm.load(filename_);
+  if (!mm_is_symmetric(mm.code)) {
+    std::cerr << "Error: input matrix must be symmetric" << std::endl;
+    exit(1);
+  }
+  csr.from_coo(mmatrix);
+
+  thrust::device_vector<vertex_t> row_indices(csr.number_of_nonzeros);
+  thrust::device_vector<vertex_t> column_indices(csr.number_of_nonzeros);
+  thrust::device_vector<edge_t> column_offsets(csr.number_of_columns + 1);
+
+  auto G = graph::build::from_csr<memory_space_t::device,
+                                  graph::view_t::csr>(
+      csr.number_of_rows,               // rows
+      csr.number_of_columns,            // columns
+      csr.number_of_nonzeros,           // nonzeros
+      csr.row_offsets.data().get(),     // row_offsets
+      csr.column_indices.data().get(),  // column_indices
+      csr.nonzero_values.data().get()   // values
+  );
+
+  // --
+  // Params and memory allocation
+  vertex_t n_vertices = G.get_number_of_vertices();
+  thrust::device_vector<count_t> triangles_count(n_vertices, 0);
+  std::size_t total_triangles = 0;
+  // --
+  // Run TC with NVBench
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    tc::run(G, reduce_all_triangles_, triangles_count.data().get(),
+            &total_triangles);
+  });
+}
+
+int main(int argc, char** argv) {
+  parameters_t params(argc, argv);
+  filename_ = params.filename;
+  reduce_all_triangles_ = params.reduce_all_triangles;
+
+  if (params.help) {
+    // Print NVBench help.
+    const char* args[1] = {"-h"};
+    NVBENCH_MAIN_BODY(1, args);
+  } else {
+    // Create a new argument array without TC options to pass to NVBench.
+    char* args[argc];
+    int j = 0;
+    int num_tc_arguments = 0;
+    for (int i = 0; i < argc; i++) {
+      if (strcmp(argv[i], "--market") == 0 || strcmp(argv[i], "-m") == 0) {
+        num_tc_arguments += 2;
+        i++;
+        continue;
+      }
+      if (strcmp(argv[i], "--reduce") == 0 || strcmp(argv[i], "-r") == 0) {
+        num_tc_arguments += 1;
+        continue;
+      }
+      args[j] = argv[i];
+      j++;
+    }
+    NVBENCH_BENCH(tc_bench);
+    NVBENCH_MAIN_BODY(argc - num_tc_arguments, args);
+  }
+}

From f8efd99d89a08298e15a8185991b57af69f9ae55 Mon Sep 17 00:00:00 2001
From: Muhammad Awad <mawad@ucdavis.edu>
Date: Fri, 24 Jun 2022 13:10:18 -0700
Subject: [PATCH 41/58] Add TC to script

---
 benchmarks/test_benchmarks.sh | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/benchmarks/test_benchmarks.sh b/benchmarks/test_benchmarks.sh
index 11d93600..93d7e863 100755
--- a/benchmarks/test_benchmarks.sh
+++ b/benchmarks/test_benchmarks.sh
@@ -2,6 +2,7 @@
 # Algorithm benchmarking tests
 # Run this from build directory
 # If error CUPTI_ERROR_INSUFFICIENT_PRIVILEGES: run with sudo
+# Make sure to pass -DESSENTIALS_BUILD_BENCHMARKS=ON -DNVBench_ENABLE_CUPTI=ON to CMake
 # ------------------------------------------------------------------------
 
 #!/bin/bash
@@ -18,7 +19,7 @@ MATRIX_FILE="${DATASET_DIR}/chesapeake/chesapeake.mtx"
 # Used for Geo
 COORDINATES_FILE="${DATASET_DIR}/geolocation/sample.labels"
 
-# Used for SPGEMM 
+# Used for SPGEMM
 A_MATRIX="${DATASET_DIR}/spgemm/a.mtx"
 B_MATRIX="${DATASET_DIR}/spgemm/b.mtx"
 
@@ -34,6 +35,7 @@ make pr_bench
 make spgemm_bench
 make spmv_bench
 make sssp_bench
+make tc_bench
 
 ${BIN_DIR}/bc_bench -m ${MATRIX_FILE}  --json ${JSON_DIR}/bc.json
 ${BIN_DIR}/bfs_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/bfs.json
@@ -47,3 +49,4 @@ ${BIN_DIR}/pr_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/pr.json
 ${BIN_DIR}/spgemm_bench -a ${A_MATRIX} -b ${B_MATRIX} --json ${JSON_DIR}/spgemm.json
 ${BIN_DIR}/spmv_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/spmv.json
 ${BIN_DIR}/sssp_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/sssp.json
+${BIN_DIR}/tc_bench -m ${MATRIX_FILE} --json ${JSON_DIR}/tc.json

From efd574b73e6fa3931df031068a2b7002dc0251d2 Mon Sep 17 00:00:00 2001
From: Muhammad Awad <mawad@ucdavis.edu>
Date: Fri, 24 Jun 2022 13:12:45 -0700
Subject: [PATCH 42/58] Remove extra comment

---
 benchmarks/tc_bench.cu | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/benchmarks/tc_bench.cu b/benchmarks/tc_bench.cu
index fbf4e00d..0cb5eb2d 100644
--- a/benchmarks/tc_bench.cu
+++ b/benchmarks/tc_bench.cu
@@ -28,12 +28,12 @@ struct parameters_t {
   parameters_t(int argc, char** argv) : options(argv[0], "TC Benchmarking") {
     options.allow_unrecognised_options();
     // Add command line options
-    options.add_options()("h,help", "Print help")  // help
-        ("m,market", "Matrix file", cxxopts::value<std::string>())(
-            "r,reduce",
-            "Compute a single triangle count for the entire graph (default = "
-            "false)",
-            cxxopts::value<bool>()->default_value("false"));
+    options.add_options()("h,help", "Print help")(
+        "m,market", "Matrix file", cxxopts::value<std::string>())(
+        "r,reduce",
+        "Compute a single triangle count for the entire graph (default = "
+        "false)",
+        cxxopts::value<bool>()->default_value("false"));
 
     // Parse command line arguments
     auto result = options.parse(argc, argv);

From bd2d44d38b45a25ce68a6f38db009990bfbe089d Mon Sep 17 00:00:00 2001
From: Muhammad Osama <osama94@gmail.com>
Date: Fri, 24 Jun 2022 18:57:55 -0700
Subject: [PATCH 43/58] Simplifying readme.

---
 README.md | 40 +++++++++++++++++++++++++---------------
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index fe734ee6..5eb70bb2 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,13 @@
-# **Essentials:** High-Performance C++ GPU Graph Analytics 
+# [Essentials: High-Performance C++ GPU Graph Analytics](https://github.com/gunrock/essentials/wiki)
 [![Ubuntu](https://github.com/gunrock/essentials/actions/workflows/ubuntu.yml/badge.svg)](https://github.com/gunrock/essentials/actions/workflows/ubuntu.yml) [![Windows](https://github.com/gunrock/essentials/actions/workflows/windows.yml/badge.svg)](https://github.com/gunrock/essentials/actions/workflows/windows.yml) [![Code Quality](https://github.com/gunrock/essentials/actions/workflows/codeql-analysis.yml/badge.svg)](https://github.com/gunrock/essentials/actions/workflows/codeql-analysis.yml) [![Ubuntu: Testing](https://github.com/gunrock/essentials/actions/workflows/ubuntu-tests.yml/badge.svg)](https://github.com/gunrock/essentials/actions/workflows/ubuntu-tests.yml)
 
 **Gunrock/Essentials** is a CUDA library for graph-processing designed specifically for the GPU. It uses a **high-level**, **bulk-synchronous**, **data-centric abstraction** focused on operations on vertex or edge frontiers. Gunrock achieves a balance between performance and expressiveness by coupling high-performance GPU computing primitives and optimization strategies, particularly in the area of fine-grained load balancing, with a high-level programming model that allows programmers to quickly develop new graph primitives that scale from one to many GPUs on a node with small code size and minimal GPU programming knowledge.
 
 ## Quick Start Guide
 
-Before building Gunrock make sure you have **CUDA Toolkit**[^1] installed on your system. Other external dependencies such as `NVIDIA/thrust`, `NVIDIA/cub`, etc. are automatically fetched using `cmake`.
+- [Gunrock's Documentation](https://github.com/gunrock/essentials/wiki)
+
+Before building Gunrock make sure you have **CUDA Toolkit**[<sup>[1]</sup>](#footnotes) installed on your system. Other external dependencies such as `NVIDIA/thrust`, `NVIDIA/cub`, etc. are automatically fetched using `cmake`.
 
 ```shell
 git clone https://github.com/gunrock/essentials.git
@@ -15,20 +17,8 @@ cmake ..
 make sssp # or for all algorithms, use: make -j$(nproc)
 bin/sssp ../datasets/chesapeake/chesapeake.mtx
 ```
-[^1]: Preferred **CUDA v11.5.1 or higher** due to support for stream ordered memory allocators (e.g. `cudaFreeAsync()`).
-
-## Getting Started with Gunrock
-
-- [👻 (GitHub Template) `essentials` project example](https://github.com/gunrock/applications)
-- [Gunrock's documentation](https://github.com/gunrock/essentials/wiki)
-- [Gunrock's overview](https://github.com/gunrock/essentials/wiki/Overview)
-- [Gunrock's programming model](https://github.com/gunrock/essentials/wiki/Programming-Model)
-- [Publications](https://github.com/gunrock/essentials/wiki/Publications) and [presentations](https://github.com/gunrock/essentials/wiki/Presentations)
-- [Essentials](https://github.com/gunrock/essentials) versus [Gunrock](https://github.com/gunrock/gunrock)[^2]
 
-[^2]: Essentials is the future of Gunrock. The idea is to take the lessons learned from Gunrock to a new design, which simplifies the effort it takes to **(1)** implement graph algorithms, **(2)** add internal optimizations, **(3)** conduct future research. One example is Gunrock's SSSP, implemented in 4-5 files with 1000s of lines of code versus in essentials, it is a single file with less than 200 lines of code. Our end goal with essentials is possibly releasing it as a `v2.0.0` for Gunrock.
-
-## How to Cite Gunrock
+## How to Cite Gunrock & Essentials
 Thank you for citing our work.
 
 ```tex
@@ -53,6 +43,26 @@ Thank you for citing our work.
 }
 ```
 
+```tex
+@InProceedings{Osama:2022:EOP,
+  author =	 {Muhammad Osama and Serban D. Porumbescu and John D. Owens},
+  title =	 {Essentials of Parallel Graph Analytics},
+  booktitle =	 {Proceedings of the Workshop on Graphs,
+                  Architectures, Programming, and Learning},
+  year =	 2022,
+  series =	 {GrAPL 2022},
+  month =	 may,
+  pages =	 {314--317},
+  doi =		 {10.1109/IPDPSW55747.2022.00061},
+  url =          {https://escholarship.org/uc/item/2p19z28q},
+}
+```
+
 ## Copyright and License
 
 Gunrock is copyright The Regents of the University of California. The library, examples, and all source code are released under [Apache 2.0](https://github.com/gunrock/essentials/blob/master/LICENSE).
+
+<a class="anchor" id="1"></a>
+## Footnotes
+1. Preferred **CUDA v11.5.1 or higher** due to support for stream ordered memory allocators (e.g. `cudaFreeAsync()`).
+2. Essentials is intended as a future release of Gunrock. You can read more about it in our vision paper: [Essentials of Parallel Graph Analytics](https://escholarship.org/content/qt2p19z28q/qt2p19z28q_noSplash_38a658bccc817ba025517311a776840f.pdf).

From 51f005bc2754c75f3fd814029db051fb8b55cfe8 Mon Sep 17 00:00:00 2001
From: Muhammad Awad <mawad@ucdavis.edu>
Date: Fri, 24 Jun 2022 19:06:19 -0700
Subject: [PATCH 44/58] Add reference CPU TC implementation

---
 examples/algorithms/tc/tc_cpu.hxx | 93 +++++++++++++++++++++++++++++++
 1 file changed, 93 insertions(+)
 create mode 100644 examples/algorithms/tc/tc_cpu.hxx

diff --git a/examples/algorithms/tc/tc_cpu.hxx b/examples/algorithms/tc/tc_cpu.hxx
new file mode 100644
index 00000000..11627321
--- /dev/null
+++ b/examples/algorithms/tc/tc_cpu.hxx
@@ -0,0 +1,93 @@
+/**
+ * @file tc_cpu.hxx
+ * @author Muhammad A. Awad (mawad@ucdavis.edu)
+ * @brief
+ * @version 0.1
+ * @date 2022-06-24
+ *
+ * @copyright Copyright (c) 2022
+ *
+ */
+
+#pragma once
+
+#include <chrono>
+#include <vector>
+#include <algorithm>
+
+namespace tc_cpu {
+
+using namespace std::chrono;
+
+template <typename csr_t, typename count_t>
+float run(const csr_t& csr,
+          std::vector<count_t>& triangles_count,
+          std::size_t& total_triangles) {
+  using edge_t = typename csr_t::offset_type;
+  using vertex_t = typename csr_t::index_type;
+
+  // Copy data to CPU
+  thrust::host_vector<edge_t> row_offsets(csr.row_offsets);
+  thrust::host_vector<vertex_t> column_indices(csr.column_indices);
+  vertex_t n_vertices = csr.number_of_rows;
+  vertex_t n_edges = csr.number_of_nonzeros;
+
+  auto t_start = high_resolution_clock::now();
+
+  for (vertex_t source = 0; source < n_vertices; source++) {
+    auto source_offset_start = row_offsets[source];
+    auto source_offset_end = row_offsets[source + 1];
+    auto source_neighbors_count = source_offset_end - source_offset_start;
+    if (source_neighbors_count == 0)
+      continue;
+    auto source_neighbors_ptr = column_indices.data() + source_offset_start;
+
+    auto needle = source_neighbors_ptr[0];
+
+    for (vertex_t i = 0; i < source_neighbors_count; i++) {
+      auto destination = source_neighbors_ptr[i];
+      continue;
+      if (destination >= source)
+        break;
+      auto destination_offset_start = row_offsets[destination];
+      auto destination_offset_end = row_offsets[destination + 1];
+      auto destination_neighbors_count =
+          destination_offset_end - destination_offset_start;
+      if (destination_neighbors_count == 0)
+        continue;
+      auto destination_neighbors_ptr =
+          column_indices.data() + destination_offset_start;
+
+      auto destination_search_end =
+          destination_neighbors_ptr + destination_neighbors_count;
+      auto destination_search_begin = std::lower_bound(
+          destination_neighbors_ptr, destination_search_end, needle);
+
+      auto source_search_begin = source_neighbors_ptr;
+      auto source_search_end = source_neighbors_ptr + source_neighbors_count;
+
+      while (source_search_begin < source_search_end &&
+             destination_search_begin < destination_search_end) {
+        auto source_neighbor = *source_search_begin;
+        auto destination_neighbor = *destination_search_begin;
+        if (source_neighbor == destination_neighbor) {
+          if (source_neighbor != source && source_neighbor != destination) {
+            triangles_count[source_neighbor]++;
+            total_triangles++;
+          }
+          destination_search_begin++;
+          source_search_begin++;
+        } else if (source_neighbor > destination_neighbor) {
+          destination_search_begin++;
+        } else {
+          source_search_begin++;
+        }
+      }
+    }
+  }
+  auto t_stop = high_resolution_clock::now();
+  auto elapsed = duration_cast<microseconds>(t_stop - t_start).count();
+  return (float)elapsed / 1000;
+}
+
+}  // namespace tc_cpu
\ No newline at end of file

From 101b6301675fab8cb3f68818c31424d7de886fed Mon Sep 17 00:00:00 2001
From: Muhammad Awad <mawad@ucdavis.edu>
Date: Fri, 24 Jun 2022 19:07:00 -0700
Subject: [PATCH 45/58] Use `cxxopts` and add validation option

---
 examples/algorithms/tc/tc.cu | 86 +++++++++++++++++++++++++++++-------
 1 file changed, 71 insertions(+), 15 deletions(-)

diff --git a/examples/algorithms/tc/tc.cu b/examples/algorithms/tc/tc.cu
index fcd1143f..abf04217 100644
--- a/examples/algorithms/tc/tc.cu
+++ b/examples/algorithms/tc/tc.cu
@@ -1,14 +1,52 @@
+#include <vector>
+
 #include <gunrock/algorithms/tc.hxx>
+#include "tc_cpu.hxx"
+
+#include <cxxopts.hpp>
 
 using namespace gunrock;
 using namespace memory;
 
-void test_tc(int num_arguments, char** argument_array) {
-  if (num_arguments != 3) {
-    std::cerr << "usage: ./bin/tc filename.mtx reduce" << std::endl;
-    exit(1);
+struct parameters_t {
+  std::string filename;
+  cxxopts::Options options;
+  bool validate;
+  bool reduce_all_triangles;
+
+  /**
+   * @brief Construct a new parameters object and parse command line arguments.
+   *
+   * @param argc Number of command line arguments.
+   * @param argv Command line arguments.
+   */
+  parameters_t(int argc, char** argv)
+      : options(argv[0], "Traingle Counting example") {
+    // Add command line options
+    options.add_options()       //
+        ("help", "Print help")  //
+        ("validate", "CPU validation",
+         cxxopts::value<bool>()->default_value("false"))            //
+        ("m,market", "Matrix file", cxxopts::value<std::string>())  //
+        ("r,reduce",
+         "Compute a single triangle count for the entire graph (default = "
+         "false)",
+         cxxopts::value<bool>()->default_value("false"));
+
+    // Parse command line arguments
+    auto result = options.parse(argc, argv);
+
+    if (result.count("help") || (result.count("market") == 0)) {
+      std::cout << options.help({""}) << std::endl;
+      std::exit(0);
+    }
+    filename = result["market"].as<std::string>();
+    validate = result["validate"].as<bool>();
+    reduce_all_triangles = result["reduce"].as<bool>();
   }
+};
 
+void test_tc(int num_arguments, char** argument_array) {
   // --
   // Define types
 
@@ -23,23 +61,20 @@ void test_tc(int num_arguments, char** argument_array) {
 
   // --
   // IO
+  parameters_t params(num_arguments, argument_array);
 
-  const std::string filename = argument_array[1];
-  const std::string reduce = argument_array[2];
-  const bool reduce_all_triangles = reduce.find("true") != std::string::npos;
-
-  if (util::is_market(filename)) {
+  if (util::is_market(params.filename)) {
     io::matrix_market_t<vertex_t, edge_t, weight_t> mm;
-    auto mmatrix = mm.load(filename);
+    auto mmatrix = mm.load(params.filename);
     if (!mm_is_symmetric(mm.code)) {
       std::cerr << "Error: input matrix must be symmetric" << std::endl;
       exit(1);
     }
     csr.from_coo(mmatrix);
-  } else if (util::is_binary_csr(filename)) {
-    csr.read_binary(filename);
+  } else if (util::is_binary_csr(params.filename)) {
+    csr.read_binary(params.filename);
   } else {
-    std::cerr << "Unknown file format: " << filename << std::endl;
+    std::cerr << "Unknown file format: " << params.filename << std::endl;
     exit(1);
   }
 
@@ -66,17 +101,38 @@ void test_tc(int num_arguments, char** argument_array) {
   // GPU Run
 
   std::size_t total_triangles = 0;
-  float gpu_elapsed = tc::run(G, reduce_all_triangles,
+  float gpu_elapsed = tc::run(G, params.reduce_all_triangles,
                               triangles_count.data().get(), &total_triangles);
 
   // --
   // Log
 
   print::head(triangles_count, 40, "Per-vertex triangle count");
-  if (reduce_all_triangles) {
+  if (params.reduce_all_triangles) {
     std::cout << "Total Graph Traingles : " << total_triangles << std::endl;
   }
   std::cout << "GPU Elapsed Time : " << gpu_elapsed << " (ms)" << std::endl;
+
+  // --
+  // CPU validation
+  if (params.validate) {
+    std::vector<count_t> reference_triangles_count(n_vertices, 0);
+    std::size_t reference_total_triangles = 0;
+
+    float cpu_elapsed =
+        tc_cpu::run(csr, reference_triangles_count, reference_total_triangles);
+    uint32_t n_errors = 0;
+    if (total_triangles != reference_total_triangles) {
+      std::cout << "Error: Total TC mismatch: " << total_triangles
+                << "! = " << reference_total_triangles << std::endl;
+      n_errors++;
+    }
+    n_errors += util::compare(
+        triangles_count.data().get(), reference_triangles_count.data(),
+        n_vertices, [](const auto x, const auto y) { return x != y; }, true);
+    std::cout << "CPU Elapsed Time : " << cpu_elapsed << " (ms)" << std::endl;
+    std::cout << "Number of errors : " << n_errors << std::endl;
+  }
 }
 
 int main(int argc, char** argv) {

From 914b2a24c1f5c2a44ea27457911ebc98e75daed8 Mon Sep 17 00:00:00 2001
From: Muhammad Awad <mawad@ucdavis.edu>
Date: Fri, 24 Jun 2022 19:08:04 -0700
Subject: [PATCH 46/58] Add self loop unit test

---
 unittests/algorithms/tc.cuh | 40 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 39 insertions(+), 1 deletion(-)

diff --git a/unittests/algorithms/tc.cuh b/unittests/algorithms/tc.cuh
index 41ece23f..fe786cdd 100644
--- a/unittests/algorithms/tc.cuh
+++ b/unittests/algorithms/tc.cuh
@@ -20,7 +20,7 @@ TEST(algorithm, tc) {
   // CSR Matrix Representation
   // V            = [ 1 1 1 1 ]
   // ROW_OFFSETS  = [ 0 3 5 8 10 ]
-  // COL_INDEX    = [ 1 2 3 0 2 0 1 3 0 2]
+  // COL_INDEX    = [ 1 2 3 | 0 2 | 0 1 3 | 0 2]
 
   using vertex_t = int;
   using edge_t = int;
@@ -53,3 +53,41 @@ TEST(algorithm, tc) {
 
   EXPECT_EQ(total_triangles, reference_total_triangles);
 }
+
+TEST(algorithm, tc_self_loop_vertex) {
+  // CSR Matrix Representation
+  // V            = [ 1 1 1 1 ]
+  // ROW_OFFSETS  = [ 0 4 7 10 12 ]
+  // COL_INDEX    = [ 0 1 2 3 | 0 1 2 | 0 1 3 | 0 2]
+
+  using vertex_t = int;
+  using edge_t = int;
+  using weight_t = int;
+
+  vertex_t number_of_rows = 4, number_of_columns = 4;
+  edge_t number_of_nonzeros = 12;
+  thrust::device_vector<edge_t> Ap = std::vector{0, 4, 7, 10, 12};
+  thrust::device_vector<vertex_t> Aj =
+      std::vector{0, 1, 2, 3, 0, 1, 2, 0, 1, 3, 0, 2};
+  thrust::device_vector<weight_t> Ax(number_of_nonzeros, 0);
+
+  auto G = graph::build::from_csr<memory_space_t::device, graph::view_t::csr>(
+      number_of_rows, number_of_columns, number_of_nonzeros, Ap.data().get(),
+      Aj.data().get(), Ax.data().get());
+
+  std::size_t total_triangles = 0;
+  thrust::device_vector<vertex_t> d_triangles_count(number_of_rows, 0);
+  tc::run(G, true, d_triangles_count.data().get(), &total_triangles);
+
+  thrust::host_vector<vertex_t> h_triangles_count(d_triangles_count);
+
+  std::size_t reference_total_triangles = 6;
+  thrust::host_vector<vertex_t> reference_traingles_count =
+      std::vector{2, 1, 2, 1};
+
+  for (std::size_t v = 0; v < number_of_rows; v++) {
+    EXPECT_EQ(h_triangles_count[v], reference_traingles_count[v]);
+  }
+
+  EXPECT_EQ(total_triangles, reference_total_triangles);
+}

From d69f7624541590edefb5f6a3df2d24830f8680dc Mon Sep 17 00:00:00 2001
From: Muhammad Awad <mawad@ucdavis.edu>
Date: Fri, 24 Jun 2022 19:08:34 -0700
Subject: [PATCH 47/58] Handle self loops in intersection lambda

---
 include/gunrock/algorithms/tc.hxx | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/include/gunrock/algorithms/tc.hxx b/include/gunrock/algorithms/tc.hxx
index ee5abcd9..58ba4e1b 100644
--- a/include/gunrock/algorithms/tc.hxx
+++ b/include/gunrock/algorithms/tc.hxx
@@ -82,9 +82,14 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
       if (neighbor > source) {
         auto src_vertex_triangles_count = G.get_intersection_count(
             source, neighbor,
-            [vertex_triangles_count](auto intersection_vertex) {
-              math::atomic::add(&(vertex_triangles_count[intersection_vertex]),
-                                vertex_t{1});
+            [vertex_triangles_count, source,
+             neighbor](auto intersection_vertex) {
+              if (source != intersection_vertex &&
+                  neighbor != intersection_vertex) {
+                math::atomic::add(
+                    &(vertex_triangles_count[intersection_vertex]),
+                    vertex_t{1});
+              }
             });
       }
       return false;

From 10dc2bf1c1bbd459fd504492abd209bf34faf1fc Mon Sep 17 00:00:00 2001
From: Muhammad Awad <mawad@ucdavis.edu>
Date: Fri, 24 Jun 2022 19:09:01 -0700
Subject: [PATCH 48/58] Add self-loops note to `get_intersection_count`

---
 include/gunrock/graph/csr.hxx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/gunrock/graph/csr.hxx b/include/gunrock/graph/csr.hxx
index ca4e31a3..1dfc2b6f 100644
--- a/include/gunrock/graph/csr.hxx
+++ b/include/gunrock/graph/csr.hxx
@@ -100,7 +100,7 @@ class graph_csr_t {
   /**
    * @brief Count the number of vertices belonging to the set intersection
    * between the source and destination vertices adjacency lists. Executes a
-   * function on each intersection.
+   * function on each intersection. This function does not handle self-loops.
    *
    * @param source Index of the source vertex
    * @param destination Index of the destination

From ba0decab497fd6f817a102215d604e9d33509343 Mon Sep 17 00:00:00 2001
From: Muhammad Awad <mawad@ucdavis.edu>
Date: Fri, 24 Jun 2022 19:18:39 -0700
Subject: [PATCH 49/58] Remove extra comments

---
 examples/algorithms/tc/tc.cu      | 17 ++++++++---------
 examples/algorithms/tc/tc_cpu.hxx |  1 -
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/examples/algorithms/tc/tc.cu b/examples/algorithms/tc/tc.cu
index abf04217..527bd1be 100644
--- a/examples/algorithms/tc/tc.cu
+++ b/examples/algorithms/tc/tc.cu
@@ -23,15 +23,14 @@ struct parameters_t {
   parameters_t(int argc, char** argv)
       : options(argv[0], "Traingle Counting example") {
     // Add command line options
-    options.add_options()       //
-        ("help", "Print help")  //
-        ("validate", "CPU validation",
-         cxxopts::value<bool>()->default_value("false"))            //
-        ("m,market", "Matrix file", cxxopts::value<std::string>())  //
-        ("r,reduce",
-         "Compute a single triangle count for the entire graph (default = "
-         "false)",
-         cxxopts::value<bool>()->default_value("false"));
+    options.add_options()("help", "Print help")(
+        "validate", "CPU validation",
+        cxxopts::value<bool>()->default_value("false"))(
+        "m,market", "Matrix file", cxxopts::value<std::string>())(
+        "r,reduce",
+        "Compute a single triangle count for the entire graph (default = "
+        "false)",
+        cxxopts::value<bool>()->default_value("false"));
 
     // Parse command line arguments
     auto result = options.parse(argc, argv);
diff --git a/examples/algorithms/tc/tc_cpu.hxx b/examples/algorithms/tc/tc_cpu.hxx
index 11627321..67b0ebde 100644
--- a/examples/algorithms/tc/tc_cpu.hxx
+++ b/examples/algorithms/tc/tc_cpu.hxx
@@ -26,7 +26,6 @@ float run(const csr_t& csr,
   using edge_t = typename csr_t::offset_type;
   using vertex_t = typename csr_t::index_type;
 
-  // Copy data to CPU
   thrust::host_vector<edge_t> row_offsets(csr.row_offsets);
   thrust::host_vector<vertex_t> column_indices(csr.column_indices);
   vertex_t n_vertices = csr.number_of_rows;

From 7463dbedbaf35e1f6630ff903eca4581e06bbb0d Mon Sep 17 00:00:00 2001
From: Muhammad Awad <mawad@ucdavis.edu>
Date: Fri, 24 Jun 2022 19:25:42 -0700
Subject: [PATCH 50/58] Remove debugging code

---
 examples/algorithms/tc/tc_cpu.hxx | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/algorithms/tc/tc_cpu.hxx b/examples/algorithms/tc/tc_cpu.hxx
index 67b0ebde..73d0456d 100644
--- a/examples/algorithms/tc/tc_cpu.hxx
+++ b/examples/algorithms/tc/tc_cpu.hxx
@@ -45,9 +45,8 @@ float run(const csr_t& csr,
 
     for (vertex_t i = 0; i < source_neighbors_count; i++) {
       auto destination = source_neighbors_ptr[i];
-      continue;
       if (destination >= source)
-        break;
+        continue;
       auto destination_offset_start = row_offsets[destination];
       auto destination_offset_end = row_offsets[destination + 1];
       auto destination_neighbors_count =

From ab7b3ec46f3344b6e4b0bd22b290920399a2f77d Mon Sep 17 00:00:00 2001
From: Muhammad Awad <mawad@ucdavis.edu>
Date: Fri, 24 Jun 2022 19:32:33 -0700
Subject: [PATCH 51/58] Add optimization back

---
 examples/algorithms/tc/tc_cpu.hxx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/algorithms/tc/tc_cpu.hxx b/examples/algorithms/tc/tc_cpu.hxx
index 73d0456d..6dd592bc 100644
--- a/examples/algorithms/tc/tc_cpu.hxx
+++ b/examples/algorithms/tc/tc_cpu.hxx
@@ -46,7 +46,7 @@ float run(const csr_t& csr,
     for (vertex_t i = 0; i < source_neighbors_count; i++) {
       auto destination = source_neighbors_ptr[i];
       if (destination >= source)
-        continue;
+        break;
       auto destination_offset_start = row_offsets[destination];
       auto destination_offset_end = row_offsets[destination + 1];
       auto destination_neighbors_count =

From df0f3c19f9dc3933824c3e4ff9eae1021a963734 Mon Sep 17 00:00:00 2001
From: neoblizz <osama94@gmail.com>
Date: Sat, 25 Jun 2022 19:15:16 -0700
Subject: [PATCH 52/58] Improved dataset collection from gunrock.

---
 datasets/Makefile                  | 47 ++++++++++++++++++++++++++++++
 datasets/ak2010/Makefile           |  5 ++++
 datasets/arabic-2005/Makefile      |  5 ++++
 datasets/asia_osm/Makefile         |  5 ++++
 datasets/belgium_osm/Makefile      |  5 ++++
 datasets/cit-Patents/Makefile      |  5 ++++
 datasets/coAuthorsDBLP/Makefile    |  5 ++++
 datasets/common.mk                 | 27 +++++++++++++++++
 datasets/delaunay_n13/Makefile     |  6 ++++
 datasets/delaunay_n21/Makefile     |  6 ++++
 datasets/delaunay_n24/Makefile     |  6 ++++
 datasets/europe_osm/Makefile       |  5 ++++
 datasets/germany_osm/Makefile      |  5 ++++
 datasets/hollywood-2009/Makefile   |  5 ++++
 datasets/indochina-2004/Makefile   |  5 ++++
 datasets/kron_g500-logn21/Makefile |  6 ++++
 datasets/networkrepo.mk            | 17 +++++++++++
 datasets/rgg_n_2_24_s0/readme.txt  |  2 ++
 datasets/roadNet-CA/Makefile       |  6 ++++
 datasets/road_central/Makefile     |  5 ++++
 datasets/road_usa/Makefile         |  5 ++++
 datasets/soc-LiveJournal1/Makefile |  5 ++++
 datasets/soc-orkut/Makefile        |  7 +++++
 datasets/soc-sinaweibo/Makefile    |  7 +++++
 datasets/soc-twitter-2010/Makefile |  7 +++++
 datasets/toy.mtx                   |  5 ----
 datasets/ufl.mk                    | 18 ++++++++++++
 datasets/uk-2002/Makefile          |  5 ++++
 datasets/uk-2005/Makefile          |  5 ++++
 datasets/webbase-1M/Makefile       |  5 ++++
 datasets/webbase-2001/Makefile     |  5 ++++
 31 files changed, 247 insertions(+), 5 deletions(-)
 create mode 100644 datasets/Makefile
 create mode 100644 datasets/ak2010/Makefile
 create mode 100644 datasets/arabic-2005/Makefile
 create mode 100644 datasets/asia_osm/Makefile
 create mode 100644 datasets/belgium_osm/Makefile
 create mode 100644 datasets/cit-Patents/Makefile
 create mode 100644 datasets/coAuthorsDBLP/Makefile
 create mode 100644 datasets/common.mk
 create mode 100644 datasets/delaunay_n13/Makefile
 create mode 100644 datasets/delaunay_n21/Makefile
 create mode 100644 datasets/delaunay_n24/Makefile
 create mode 100644 datasets/europe_osm/Makefile
 create mode 100644 datasets/germany_osm/Makefile
 create mode 100755 datasets/hollywood-2009/Makefile
 create mode 100644 datasets/indochina-2004/Makefile
 create mode 100644 datasets/kron_g500-logn21/Makefile
 create mode 100644 datasets/networkrepo.mk
 create mode 100644 datasets/rgg_n_2_24_s0/readme.txt
 create mode 100644 datasets/roadNet-CA/Makefile
 create mode 100644 datasets/road_central/Makefile
 create mode 100644 datasets/road_usa/Makefile
 create mode 100644 datasets/soc-LiveJournal1/Makefile
 create mode 100644 datasets/soc-orkut/Makefile
 create mode 100644 datasets/soc-sinaweibo/Makefile
 create mode 100644 datasets/soc-twitter-2010/Makefile
 delete mode 100644 datasets/toy.mtx
 create mode 100644 datasets/ufl.mk
 create mode 100644 datasets/uk-2002/Makefile
 create mode 100644 datasets/uk-2005/Makefile
 create mode 100644 datasets/webbase-1M/Makefile
 create mode 100644 datasets/webbase-2001/Makefile

diff --git a/datasets/Makefile b/datasets/Makefile
new file mode 100644
index 00000000..f3cc6a8d
--- /dev/null
+++ b/datasets/Makefile
@@ -0,0 +1,57 @@
+#Makefile to fetch and install graph data for regression
+#testing borrowed from Royal Caliber
+
+#Each graph lives in its own directory
+SUBDIRS = ak2010 belgium_osm delaunay_n13 delaunay_n21 delaunay_n24 coAuthorsDBLP kron_g500-logn21 soc-LiveJournal1 webbase-1M europe_osm road_usa cit-Patents soc-orkut indochina-2004 hollywood-2009 roadNet-CA
+
+SUBDIRS_IPDPS17 = soc-LiveJournal1 hollywood-2009 soc-orkut soc-sinaweibo soc-twitter-2010 indochina-2004 uk-2002 arabic-2005 uk-2005 webbase-2001 germany_osm asia_osm europe_osm road_central road_usa kron_g500-logn21
+
+SUBDIRS_TOPC = soc-LiveJournal1 hollywood-2009 soc-orkut indochina-2004 road_usa
+
+SUBDIRS_STANDARD = soc-LiveJournal1 hollywood-2009 soc-orkut indochina-2004 road_usa
+
+#union of the lists above, sorted and de-duplicated; every entry is also a
+#target of its own (see the last rule), e.g. "make ak2010"
+GRAPHS = $(sort $(SUBDIRS) $(SUBDIRS_IPDPS17) $(SUBDIRS_TOPC) $(SUBDIRS_STANDARD))
+
+#none of these targets produce a file of the same name; the graph targets
+#match existing directories and would otherwise always look up to date
+.PHONY: all fetch clean realclean recurse
+.PHONY: IPDPS17 recurse_ipdps17 TOPC recurse_topc STANDARD recurse_standard
+.PHONY: $(GRAPHS)
+
+#fetches all graphs, extracts and sets up files for tests
+all: recurse
+
+#only download the graphs, but do not proceed further
+fetch: recurse
+
+#clean everything except the downloaded graphs
+clean: recurse
+
+#clean everything including the downloaded graphs
+realclean: recurse
+
+#recurse into each subdirectory and try to build the provided targets
+recurse:
+	for subdir in $(SUBDIRS); do $(MAKE) -C $$subdir $(MAKECMDGOALS); done
+
+#the named subsets below run make in each of their subdirectories without
+#passing a goal, so each subdirectory builds its own default goal
+IPDPS17: recurse_ipdps17
+
+recurse_ipdps17:
+	for subdir in $(SUBDIRS_IPDPS17); do $(MAKE) -C $$subdir; done
+
+TOPC: recurse_topc
+
+recurse_topc:
+	for subdir in $(SUBDIRS_TOPC); do $(MAKE) -C $$subdir; done
+
+STANDARD: recurse_standard
+
+recurse_standard:
+	for subdir in $(SUBDIRS_STANDARD); do $(MAKE) -C $$subdir; done
+
+#build a single graph directory by name
+$(GRAPHS):
+	$(MAKE) -C $@
diff --git a/datasets/ak2010/Makefile b/datasets/ak2010/Makefile
new file mode 100644
index 00000000..4c9a2971
--- /dev/null
+++ b/datasets/ak2010/Makefile
@@ -0,0 +1,5 @@
+GRAPH_NAME = ak2010
+GRAPH_URL  = https://www.cise.ufl.edu/research/sparse/MM/DIMACS10/ak2010.tar.gz
+
+include ../common.mk
+include ../ufl.mk
diff --git a/datasets/arabic-2005/Makefile b/datasets/arabic-2005/Makefile
new file mode 100644
index 00000000..7746f017
--- /dev/null
+++ b/datasets/arabic-2005/Makefile
@@ -0,0 +1,5 @@
+GRAPH_NAME = arabic-2005
+GRAPH_URL = https://www.cise.ufl.edu/research/sparse/MM/LAW/arabic-2005.tar.gz
+
+include ../ufl.mk
+include ../common.mk
diff --git a/datasets/asia_osm/Makefile b/datasets/asia_osm/Makefile
new file mode 100644
index 00000000..574822d0
--- /dev/null
+++ b/datasets/asia_osm/Makefile
@@ -0,0 +1,5 @@
+GRAPH_NAME = asia_osm
+GRAPH_URL  = https://www.cise.ufl.edu/research/sparse/MM/DIMACS10/asia_osm.tar.gz
+
+include ../common.mk
+include ../ufl.mk
diff --git a/datasets/belgium_osm/Makefile b/datasets/belgium_osm/Makefile
new file mode 100644
index 00000000..7e29c753
--- /dev/null
+++ b/datasets/belgium_osm/Makefile
@@ -0,0 +1,5 @@
+GRAPH_NAME = belgium_osm
+GRAPH_URL  = https://www.cise.ufl.edu/research/sparse/MM/DIMACS10/belgium_osm.tar.gz
+
+include ../common.mk
+include ../ufl.mk
diff --git a/datasets/cit-Patents/Makefile b/datasets/cit-Patents/Makefile
new file mode 100644
index 00000000..fcd7b3a1
--- /dev/null
+++ b/datasets/cit-Patents/Makefile
@@ -0,0 +1,5 @@
+GRAPH_NAME = cit-Patents
+GRAPH_URL  = https://www.cise.ufl.edu/research/sparse/MM/SNAP/cit-Patents.tar.gz
+
+include ../common.mk
+include ../ufl.mk
diff --git a/datasets/coAuthorsDBLP/Makefile b/datasets/coAuthorsDBLP/Makefile
new file mode 100644
index 00000000..9a730013
--- /dev/null
+++ b/datasets/coAuthorsDBLP/Makefile
@@ -0,0 +1,5 @@
+GRAPH_NAME = coAuthorsDBLP
+GRAPH_URL = https://www.cise.ufl.edu/research/sparse/MM/DIMACS10/coAuthorsDBLP.tar.gz
+
+include ../ufl.mk
+include ../common.mk
diff --git a/datasets/common.mk b/datasets/common.mk
new file mode 100644
index 00000000..667b63bd
--- /dev/null
+++ b/datasets/common.mk
@@ -0,0 +1,32 @@
+#The following variables must be defined prior to including this
+#makefile fragment
+#
+#GRAPH_URL:  the url path to the file
+
+#the character classes are quoted so the shell cannot glob-expand them
+#against file names in the current directory before tr sees them
+OSUPPER := $(shell uname -s 2>/dev/null | tr '[:lower:]' '[:upper:]')
+
+#curl needs -L to follow HTTP redirects; wget follows them by default
+ifeq (DARWIN, $(findstring DARWIN, $(OSUPPER)))
+    WGET := curl -L -O
+else
+    WGET := wget -N
+endif
+
+TAR  := tar
+GZIP := gzip
+MATRIX2SNAP := ../matrix2snap.py
+
+#local name of the download: the last path component of GRAPH_URL
+GRAPH_FILE := $(notdir $(GRAPH_URL))
+
+#"setup" is not defined here; it comes from ufl.mk or networkrepo.mk
+all: setup
+
+fetch: $(GRAPH_FILE)
+
+$(GRAPH_FILE):
+	$(WGET) $(GRAPH_URL)
+
+IPDPS17: setup
diff --git a/datasets/delaunay_n13/Makefile b/datasets/delaunay_n13/Makefile
new file mode 100644
index 00000000..23a3f8aa
--- /dev/null
+++ b/datasets/delaunay_n13/Makefile
@@ -0,0 +1,6 @@
+GRAPH_NAME = delaunay_n13
+GRAPH_URL = https://www.cise.ufl.edu/research/sparse/MM/DIMACS10/delaunay_n13.tar.gz
+
+
+include ../ufl.mk
+include ../common.mk
diff --git a/datasets/delaunay_n21/Makefile b/datasets/delaunay_n21/Makefile
new file mode 100644
index 00000000..027fe139
--- /dev/null
+++ b/datasets/delaunay_n21/Makefile
@@ -0,0 +1,6 @@
+GRAPH_NAME = delaunay_n21
+GRAPH_URL = https://www.cise.ufl.edu/research/sparse/MM/DIMACS10/delaunay_n21.tar.gz
+
+
+include ../ufl.mk
+include ../common.mk
diff --git a/datasets/delaunay_n24/Makefile b/datasets/delaunay_n24/Makefile
new file mode 100644
index 00000000..52e449e4
--- /dev/null
+++ b/datasets/delaunay_n24/Makefile
@@ -0,0 +1,6 @@
+GRAPH_NAME = delaunay_n24
+GRAPH_URL = https://www.cise.ufl.edu/research/sparse/MM/DIMACS10/delaunay_n24.tar.gz
+
+
+include ../ufl.mk
+include ../common.mk
diff --git a/datasets/europe_osm/Makefile b/datasets/europe_osm/Makefile
new file mode 100644
index 00000000..b736f187
--- /dev/null
+++ b/datasets/europe_osm/Makefile
@@ -0,0 +1,5 @@
+GRAPH_NAME = europe_osm
+GRAPH_URL  = https://www.cise.ufl.edu/research/sparse/MM/DIMACS10/europe_osm.tar.gz
+
+include ../common.mk
+include ../ufl.mk
diff --git a/datasets/germany_osm/Makefile b/datasets/germany_osm/Makefile
new file mode 100644
index 00000000..25dac8a0
--- /dev/null
+++ b/datasets/germany_osm/Makefile
@@ -0,0 +1,5 @@
+GRAPH_NAME = germany_osm
+GRAPH_URL  = https://www.cise.ufl.edu/research/sparse/MM/DIMACS10/germany_osm.tar.gz
+
+include ../common.mk
+include ../ufl.mk
diff --git a/datasets/hollywood-2009/Makefile b/datasets/hollywood-2009/Makefile
new file mode 100755
index 00000000..58ab0ed9
--- /dev/null
+++ b/datasets/hollywood-2009/Makefile
@@ -0,0 +1,5 @@
+GRAPH_NAME = hollywood-2009
+GRAPH_URL = https://www.cise.ufl.edu/research/sparse/MM/LAW/hollywood-2009.tar.gz
+
+include ../ufl.mk
+include ../common.mk
diff --git a/datasets/indochina-2004/Makefile b/datasets/indochina-2004/Makefile
new file mode 100644
index 00000000..27ccaa03
--- /dev/null
+++ b/datasets/indochina-2004/Makefile
@@ -0,0 +1,5 @@
+GRAPH_NAME = indochina-2004
+GRAPH_URL = https://www.cise.ufl.edu/research/sparse/MM/LAW/indochina-2004.tar.gz
+
+include ../ufl.mk
+include ../common.mk
diff --git a/datasets/kron_g500-logn21/Makefile b/datasets/kron_g500-logn21/Makefile
new file mode 100644
index 00000000..cba9de6c
--- /dev/null
+++ b/datasets/kron_g500-logn21/Makefile
@@ -0,0 +1,6 @@
+GRAPH_NAME = kron_g500-logn21
+GRAPH_URL = https://www.cise.ufl.edu/research/sparse/MM/DIMACS10/kron_g500-logn21.tar.gz
+
+
+include ../ufl.mk
+include ../common.mk
diff --git a/datasets/networkrepo.mk b/datasets/networkrepo.mk
new file mode 100644
index 00000000..0fd5c83d
--- /dev/null
+++ b/datasets/networkrepo.mk
@@ -0,0 +1,21 @@
+#common make file fragment for networkrepository.com
+#just define GRAPH_NAME prior to including this fragment
+
+GRAPH_ZIP  = $(GRAPH_NAME).zip
+
+setup: $(GRAPH_NAME).mtx
+
+#-o overwrites without prompting, so re-extracting over an existing .mtx
+#cannot stall a non-interactive build waiting for an answer
+$(GRAPH_NAME).mtx: $(GRAPH_ZIP)
+	unzip -o $(GRAPH_ZIP)
+	rm -rf readme.txt
+
+#-f: a file that is already gone is not an error, so clean and realclean
+#also succeed before anything was downloaded or extracted
+clean:
+	rm -f $(GRAPH_NAME).mtx
+
+realclean: clean
+	rm -f $(GRAPH_ZIP)
+
diff --git a/datasets/rgg_n_2_24_s0/readme.txt b/datasets/rgg_n_2_24_s0/readme.txt
new file mode 100644
index 00000000..448705c1
--- /dev/null
+++ b/datasets/rgg_n_2_24_s0/readme.txt
@@ -0,0 +1,2 @@
+Dataset download link:
+https://drive.google.com/uc?export=download&id=0Bw6LwCuER0a3VWNrVUV6eTZyeFUI
diff --git a/datasets/roadNet-CA/Makefile b/datasets/roadNet-CA/Makefile
new file mode 100644
index 00000000..2025342f
--- /dev/null
+++ b/datasets/roadNet-CA/Makefile
@@ -0,0 +1,6 @@
+GRAPH_NAME = roadNet-CA
+GRAPH_URL = https://www.cise.ufl.edu/research/sparse/MM/SNAP/roadNet-CA.tar.gz
+
+
+include ../ufl.mk
+include ../common.mk
diff --git a/datasets/road_central/Makefile b/datasets/road_central/Makefile
new file mode 100644
index 00000000..7f4cdbc9
--- /dev/null
+++ b/datasets/road_central/Makefile
@@ -0,0 +1,5 @@
+GRAPH_NAME = road_central
+GRAPH_URL  = https://www.cise.ufl.edu/research/sparse/MM/DIMACS10/road_central.tar.gz
+
+include ../common.mk
+include ../ufl.mk
diff --git a/datasets/road_usa/Makefile b/datasets/road_usa/Makefile
new file mode 100644
index 00000000..5b8cd30f
--- /dev/null
+++ b/datasets/road_usa/Makefile
@@ -0,0 +1,5 @@
+GRAPH_NAME = road_usa
+GRAPH_URL  = https://www.cise.ufl.edu/research/sparse/MM/DIMACS10/road_usa.tar.gz
+
+include ../common.mk
+include ../ufl.mk
diff --git a/datasets/soc-LiveJournal1/Makefile b/datasets/soc-LiveJournal1/Makefile
new file mode 100644
index 00000000..e34b11fb
--- /dev/null
+++ b/datasets/soc-LiveJournal1/Makefile
@@ -0,0 +1,5 @@
+GRAPH_NAME = soc-LiveJournal1
+GRAPH_URL = https://www.cise.ufl.edu/research/sparse/MM/SNAP/soc-LiveJournal1.tar.gz
+
+include ../ufl.mk
+include ../common.mk
diff --git a/datasets/soc-orkut/Makefile b/datasets/soc-orkut/Makefile
new file mode 100644
index 00000000..3c73a1a4
--- /dev/null
+++ b/datasets/soc-orkut/Makefile
@@ -0,0 +1,7 @@
+GRAPH_NAME = soc-orkut
+GRAPH_URL = http://nrvis.com/download/data/soc/soc-orkut.zip
+
+
+include ../networkrepo.mk
+include ../common.mk
+
diff --git a/datasets/soc-sinaweibo/Makefile b/datasets/soc-sinaweibo/Makefile
new file mode 100644
index 00000000..16c47d2e
--- /dev/null
+++ b/datasets/soc-sinaweibo/Makefile
@@ -0,0 +1,7 @@
+GRAPH_NAME = soc-sinaweibo
+GRAPH_URL = http://nrvis.com/download/data/massive/soc-sinaweibo.zip
+
+
+include ../networkrepo.mk
+include ../common.mk
+
diff --git a/datasets/soc-twitter-2010/Makefile b/datasets/soc-twitter-2010/Makefile
new file mode 100644
index 00000000..42bca34e
--- /dev/null
+++ b/datasets/soc-twitter-2010/Makefile
@@ -0,0 +1,7 @@
+GRAPH_NAME = soc-twitter-2010
+GRAPH_URL = http://nrvis.com/download/data/massive/soc-twitter-2010.zip
+
+
+include ../networkrepo.mk
+include ../common.mk
+
diff --git a/datasets/toy.mtx b/datasets/toy.mtx
deleted file mode 100644
index 8b5a958c..00000000
--- a/datasets/toy.mtx
+++ /dev/null
@@ -1,5 +0,0 @@
-%%MatrixMarket matrix coordinate pattern symmetric
-3 3 3
-1 2
-1 3
-2 3
diff --git a/datasets/ufl.mk b/datasets/ufl.mk
new file mode 100644
index 00000000..0c894dd5
--- /dev/null
+++ b/datasets/ufl.mk
@@ -0,0 +1,21 @@
+#common make file fragment for ufl graph datasets
+#just define GRAPH_NAME prior to including this fragment
+
+GRAPH_TAR  = $(GRAPH_NAME).tar.gz
+
+setup: $(GRAPH_NAME).mtx
+
+#unpack the archive, keep only the .mtx and delete the extracted directory
+$(GRAPH_NAME).mtx: $(GRAPH_TAR)
+	tar xvfz $(GRAPH_TAR)
+	cp $(GRAPH_NAME)/$(GRAPH_NAME).mtx $(GRAPH_NAME).mtx
+	rm -rf $(GRAPH_NAME)
+
+#-f: a file that is already gone is not an error, so clean and realclean
+#also succeed before anything was downloaded or extracted
+clean:
+	rm -f $(GRAPH_NAME).mtx
+
+realclean: clean
+	rm -f $(GRAPH_TAR)
+
diff --git a/datasets/uk-2002/Makefile b/datasets/uk-2002/Makefile
new file mode 100644
index 00000000..fa07e72f
--- /dev/null
+++ b/datasets/uk-2002/Makefile
@@ -0,0 +1,5 @@
+GRAPH_NAME = uk-2002
+GRAPH_URL = https://www.cise.ufl.edu/research/sparse/MM/LAW/uk-2002.tar.gz
+
+include ../ufl.mk
+include ../common.mk
diff --git a/datasets/uk-2005/Makefile b/datasets/uk-2005/Makefile
new file mode 100644
index 00000000..d221da0f
--- /dev/null
+++ b/datasets/uk-2005/Makefile
@@ -0,0 +1,5 @@
+GRAPH_NAME = uk-2005
+GRAPH_URL = https://www.cise.ufl.edu/research/sparse/MM/LAW/uk-2005.tar.gz
+
+include ../ufl.mk
+include ../common.mk
diff --git a/datasets/webbase-1M/Makefile b/datasets/webbase-1M/Makefile
new file mode 100644
index 00000000..080d648c
--- /dev/null
+++ b/datasets/webbase-1M/Makefile
@@ -0,0 +1,5 @@
+GRAPH_NAME = webbase-1M
+GRAPH_URL = https://www.cise.ufl.edu/research/sparse/MM/Williams/webbase-1M.tar.gz
+
+include ../ufl.mk
+include ../common.mk
diff --git a/datasets/webbase-2001/Makefile b/datasets/webbase-2001/Makefile
new file mode 100644
index 00000000..cfc8394f
--- /dev/null
+++ b/datasets/webbase-2001/Makefile
@@ -0,0 +1,5 @@
+GRAPH_NAME = webbase-2001
+GRAPH_URL = https://www.cise.ufl.edu/research/sparse/MM/LAW/webbase-2001.tar.gz
+
+include ../ufl.mk
+include ../common.mk

From 36373c08aad37628fe0137a16024ba5bdacefcaf Mon Sep 17 00:00:00 2001
From: neoblizz <osama94@gmail.com>
Date: Sat, 25 Jun 2022 19:15:53 -0700
Subject: [PATCH 53/58] Points generator imported from gunrock.

---
 examples/tools/CMakeLists.txt                 |   3 +-
 .../tools/nearest_neighbor/CMakeLists.txt     |  21 +++
 .../nearest_neighbor/nearest_neighbor.cu      |  32 +++++
 include/gunrock/io/points.hxx                 | 130 ++++++++++++++++++
 4 files changed, 185 insertions(+), 1 deletion(-)
 create mode 100644 examples/tools/nearest_neighbor/CMakeLists.txt
 create mode 100644 examples/tools/nearest_neighbor/nearest_neighbor.cu
 create mode 100644 include/gunrock/io/points.hxx

diff --git a/examples/tools/CMakeLists.txt b/examples/tools/CMakeLists.txt
index 09ae4eeb..aa31d41f 100644
--- a/examples/tools/CMakeLists.txt
+++ b/examples/tools/CMakeLists.txt
@@ -1,4 +1,5 @@
 # begin /* Add tools' subdirectories */
 add_subdirectory(cmd)
 add_subdirectory(csr_binary)
-# end /* Add tools' subdirectories */
\ No newline at end of file
+add_subdirectory(nearest_neighbor)
+# end /* Add tools' subdirectories */
diff --git a/examples/tools/nearest_neighbor/CMakeLists.txt b/examples/tools/nearest_neighbor/CMakeLists.txt
new file mode 100644
index 00000000..a03ccf8b
--- /dev/null
+++ b/examples/tools/nearest_neighbor/CMakeLists.txt
@@ -0,0 +1,21 @@
+# begin /* Set the application name. */
+set(APPLICATION_NAME nearest_neighbor)
+# end /* Set the application name. */
+
+# begin /* Add CUDA executables */
+add_executable(${APPLICATION_NAME})
+
+set(SOURCE_LIST 
+    ${APPLICATION_NAME}.cu
+)
+
+target_sources(${APPLICATION_NAME} PRIVATE ${SOURCE_LIST})
+target_link_libraries(${APPLICATION_NAME} PRIVATE essentials)
+get_target_property(ESSENTIALS_ARCHITECTURES essentials CUDA_ARCHITECTURES)
+set_target_properties(${APPLICATION_NAME} 
+    PROPERTIES 
+        CUDA_ARCHITECTURES ${ESSENTIALS_ARCHITECTURES}
+) # XXX: Find a better way to inherit essentials properties.
+
+message(STATUS "Example Added: ${APPLICATION_NAME}")
+# end /* Add CUDA executables */
\ No newline at end of file
diff --git a/examples/tools/nearest_neighbor/nearest_neighbor.cu b/examples/tools/nearest_neighbor/nearest_neighbor.cu
new file mode 100644
index 00000000..f163e23f
--- /dev/null
+++ b/examples/tools/nearest_neighbor/nearest_neighbor.cu
@@ -0,0 +1,57 @@
+/**
+ * @file nearest_neighbor.cu
+ * @author Agnieszka Lupinska (lupinska.agnieszka@gmail.com)
+ * @brief Tool that generates points for nearest neighbor.
+ * @version 0.1
+ * @date 2019-11-01
+ *
+ * @copyright Copyright (c) 2022
+ *
+ */
+
+#include <gunrock/io/points.hxx>
+
+using namespace gunrock;
+using namespace io;
+
+/**
+ * @brief Reads "dim1 dim2 n" from stdin, generates n randomly placed stars,
+ * draws the resulting points to stderr and writes them to stdout.
+ *
+ * @return 0 on success, 1 when the input is missing or out of range.
+ */
+int main() {
+  int dim1 = 0;
+  int dim2 = 0;
+  int n = 0;
+
+  // scanf returns the number of fields converted; anything short of three
+  // means at least one of the values above was never read.
+  if (std::scanf("%d %d %d", &dim1, &dim2, &n) != 3) {
+    std::fprintf(stderr, "expected three integers on stdin: dim1 dim2 n\n");
+    return 1;
+  }
+
+  // generate() draws offsets with rand() % (dim - 2), so both dimensions
+  // have to exceed 2.
+  if (dim1 <= 2 || dim2 <= 2 || n < 0) {
+    std::fprintf(stderr, "dim1 and dim2 must be > 2 and n must be >= 0\n");
+    return 1;
+  }
+
+  // Template star: a center and its four axis-aligned unit neighbors.
+  point_t<int> o(1, 1);
+  point_t<int> left(0, 1);
+  point_t<int> right(2, 1);
+  point_t<int> up(1, 2);
+  point_t<int> down(1, 0);
+  star_t<int> S(o, left, right, up, down);
+
+  std::set<point_t<int>> points_set;
+  generate(points_set, S, dim1, dim2, n);
+
+  std::vector<point_t<int>> points(points_set.begin(), points_set.end());
+  draw(points, dim1, dim2, n);
+  write(points);
+  return 0;
+}
\ No newline at end of file
diff --git a/include/gunrock/io/points.hxx b/include/gunrock/io/points.hxx
new file mode 100644
index 00000000..1b761f6a
--- /dev/null
+++ b/include/gunrock/io/points.hxx
@@ -0,0 +1,191 @@
+/**
+ * @file points.hxx
+ * @author Agnieszka Lupinska (lupinska.agnieszka@gmail.com)
+ * @brief Class for generating points for nearest neighbor.
+ * @version 0.1
+ * @date 2019-11-01
+ *
+ * @copyright Copyright (c) 2022
+ *
+ */
+
+#pragma once
+
+#include <cstdio>
+#include <vector>
+#include <algorithm>
+#include <cstdlib>
+#include <set>
+#include <time.h>
+
+// NOTE(review): a using-directive at header scope leaks into every file that
+// includes this header. It is kept so that existing includers keep compiling.
+using namespace std;
+
+namespace gunrock {
+namespace io {
+
+/**
+ * @brief A point in the plane, orderable so it can be kept in a std::set.
+ *
+ * @tparam type_t Coordinate type.
+ */
+template <typename type_t = int>
+struct point_t {
+  type_t x;
+  type_t y;
+
+  // Value-initialize so a default-constructed point has defined coordinates.
+  point_t() : x(), y() {}
+  point_t(type_t X, type_t Y) : x(X), y(Y) {}
+
+  // Component-wise sum; does not modify either operand.
+  point_t operator+(const point_t& p) const {
+    return point_t(x + p.x, y + p.y);
+  }
+
+  // Orders by ascending y, ties broken by descending x. Note that this is not
+  // the order `comp` produces; draw() and write() re-sort with `comp`.
+  bool operator<(const point_t& p1) const {
+    return (p1.y > y) or (p1.y == y and p1.x < x);
+  }
+};
+
+/**
+ * @brief Output order for points: descending y, ties broken by ascending x.
+ * This is the order in which draw() walks the grid cells.
+ */
+struct comp {
+  template <typename type_t = int>
+  inline bool operator()(const point_t<type_t>& p1,
+                         const point_t<type_t>& p2) const {
+    return (p1.y > p2.y) or (p1.y == p2.y and p1.x < p2.x);
+  }
+};
+
+/**
+ * @brief Five points: a center `o` and one point each named left, right, up
+ * and down.
+ *
+ * @tparam type_t Coordinate type.
+ */
+template <typename type_t = int>
+struct star_t {
+  point_t<type_t> o;
+  point_t<type_t> left;
+  point_t<type_t> right;
+  point_t<type_t> up;
+  point_t<type_t> down;
+
+  star_t(point_t<type_t> O,
+         point_t<type_t> Left,
+         point_t<type_t> Right,
+         point_t<type_t> Up,
+         point_t<type_t> Down)
+      : o(O), left(Left), right(Right), up(Up), down(Down) {}
+
+  // Copy of S with every point translated by (x, y).
+  star_t(star_t<type_t> S, int x, int y) {
+    const point_t<type_t> shift(x, y);
+    o = S.o + shift;
+    left = S.left + shift;
+    right = S.right + shift;
+    up = S.up + shift;
+    down = S.down + shift;
+  }
+};
+
+/**
+ * @brief Inserts the points of n randomly translated copies of S into stars.
+ * Each copy is shifted by an offset drawn from [1, dim1 - 2] x [1, dim2 - 2].
+ * Reseeds rand() from the current time on every call.
+ *
+ * @param stars Set receiving the points (duplicates collapse).
+ * @param S Star to translate.
+ * @param dim1 Extent for the x offset; nothing is generated unless > 2.
+ * @param dim2 Extent for the y offset; nothing is generated unless > 2.
+ * @param n Number of copies to generate.
+ */
+template <typename type_t = int>
+void generate(std::set<point_t<type_t>>& stars,
+              star_t<type_t> S,
+              int dim1,
+              int dim2,
+              int n) {
+  // For dim == 2 `rand() % (dim - 2)` divides by zero, and for smaller values
+  // the offsets leave the intended range; generate nothing instead.
+  if (dim1 <= 2 or dim2 <= 2)
+    return;
+
+  srand(static_cast<unsigned int>(time(nullptr)));
+  for (int i = 0; i < n; ++i) {
+    const int x = 1 + rand() % (dim1 - 2);
+    const int y = 1 + rand() % (dim2 - 2);
+    const star_t<type_t> ns(S, x, y);
+    stars.insert(ns.o);
+    stars.insert(ns.up);
+    stars.insert(ns.down);
+    stars.insert(ns.left);
+    stars.insert(ns.right);
+  }
+}
+
+/**
+ * @brief Sorts points with `comp`, then prints the list and a dim1 x dim2
+ * character grid to stderr with a mark in every cell that holds a point.
+ *
+ * @param points Points to draw; sorted in place. A point outside the grid is
+ * never matched and also stops the points after it from being marked.
+ * @param dim1 Number of grid columns (x runs from 1 to dim1).
+ * @param dim2 Number of grid rows (y runs from dim2 down to 1).
+ * @param n Unused.
+ */
+template <typename type_t = int>
+void draw(std::vector<point_t<type_t>>& points, int dim1, int dim2, int n) {
+  std::sort(points.begin(), points.end(), comp());
+  for (std::size_t i = 0; i < points.size(); ++i)
+    fprintf(stderr, "(%d,%d) ", points[i].x, points[i].y);
+  fprintf(stderr, "\n");
+
+  // Column header.
+  fprintf(stderr, "|_|");
+  for (int i = 1; i <= dim1; ++i)
+    fprintf(stderr, "%2.d ", i);
+  fprintf(stderr, "\n");
+
+  const int O = 1;           // mark printed in an occupied cell
+  std::size_t iterator = 0;  // next point (in sorted order) still to place
+  for (int i = dim2; i > 0; i--) {
+    fprintf(stderr, "%2.d|", i);
+    for (int j = 1; j <= dim1; ++j) {
+      // The bounds check comes first: once every point has been placed there
+      // is no points[iterator] left to compare against.
+      if (iterator < points.size() and points[iterator].y == i and
+          points[iterator].x == j) {
+        fprintf(stderr, "%2.d ", O);
+        ++iterator;
+      } else {
+        fprintf(stderr, "   ");
+      }
+    }
+    fprintf(stderr, "\n");
+  }
+}
+
+/**
+ * @brief Sorts points with `comp` and prints them to stdout: a header line
+ * "<count> 2" followed by one "<index> <x> <y>" line per point, with indices
+ * starting at 1.
+ *
+ * @param points Points to write; sorted in place.
+ */
+template <typename type_t = int>
+void write(std::vector<point_t<type_t>>& points) {
+  std::sort(points.begin(), points.end(), comp());
+  printf("%d 2\n", static_cast<int>(points.size()));
+  for (std::size_t i = 0; i < points.size(); ++i)
+    printf("%d %d %d\n", static_cast<int>(i + 1), points[i].x, points[i].y);
+}
+
+}  // namespace io
+}  // namespace gunrock
\ No newline at end of file

From ff50e260555bad671db45d169c9a9e2a62ceadf3 Mon Sep 17 00:00:00 2001
From: neoblizz <osama94@gmail.com>
Date: Sat, 25 Jun 2022 19:16:24 -0700
Subject: [PATCH 54/58] (wip) improvements to coloring.

---
 examples/algorithms/color/color_cpu.hxx | 19 +++++++++++----
 include/gunrock/algorithms/color.hxx    | 32 ++++++++++++++++---------
 2 files changed, 35 insertions(+), 16 deletions(-)

diff --git a/examples/algorithms/color/color_cpu.hxx b/examples/algorithms/color/color_cpu.hxx
index b019a790..eca300f3 100644
--- a/examples/algorithms/color/color_cpu.hxx
+++ b/examples/algorithms/color/color_cpu.hxx
@@ -42,8 +42,8 @@ float run(csr_t& csr, vertex_t* colors) {
       for (edge_t e = start_edge; e < start_edge + num_neighbors; ++e) {
         vertex_t u = column_indices[e];
 
-        if ((colors[u] != -1) && (colors[u] != color + 1) &&
-                (colors[u] != color + 2) ||
+        if ((colors[u] != -1) && (colors[u] != color) &&
+                (colors[u] != color + 1) ||
             (v == u))
           continue;
 
@@ -54,9 +54,9 @@ float run(csr_t& csr, vertex_t* colors) {
       }
 
       if (colormax) {
-        colors[v] = color + 1;
+        colors[v] = color;
       } else if (colormin) {
-        colors[v] = color + 2;
+        colors[v] = color + 1;
       }
 
       if (colormax || colormin)
@@ -87,8 +87,17 @@ float compute_error(csr_t& csr,
   for (vertex_t v = 0; v < n_vertices; v++) {
     for (edge_t e = row_offsets[v]; e < row_offsets[v + 1]; e++) {
       vertex_t u = column_indices[e];
-      if (gpu_colors[u] == gpu_colors[v] || gpu_colors[v] == -1)
+
+      // Do not check self-loops.
+      if(v == u)
+        continue;
+        
+      // Check if colors are the same among neighborhoods.
+      if (gpu_colors[u] == gpu_colors[v] || gpu_colors[v] == -1) {
+        std::cout << "Error: " << v << " " << u << " " << gpu_colors[v] << " "
+                  << gpu_colors[u] << std::endl;
         gpu_errors++;
+      }
       // if(cpu_colors[u] == cpu_colors[v] || cpu_colors[v] == -1) cpu_errors++;
     }
   }
diff --git a/include/gunrock/algorithms/color.hxx b/include/gunrock/algorithms/color.hxx
index 049499cf..c862a548 100644
--- a/include/gunrock/algorithms/color.hxx
+++ b/include/gunrock/algorithms/color.hxx
@@ -44,7 +44,7 @@ struct problem_t : gunrock::problem_t<graph_t> {
   using edge_t = typename graph_t::edge_type;
   using weight_t = typename graph_t::weight_type;
 
-  thrust::device_vector<weight_t> randoms;
+  thrust::device_vector<float> randoms;
 
   void init() override {
     auto g = this->get_graph();
@@ -62,7 +62,7 @@ struct problem_t : gunrock::problem_t<graph_t> {
                  gunrock::numeric_limits<vertex_t>::invalid());
 
     // Generate random numbers.
-    generate::random::uniform_distribution(randoms);
+    generate::random::uniform_distribution(randoms, float(0.0f), float(n_vertices));
   }
 };
 
@@ -98,14 +98,22 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
 
     auto color_me_in = [G, colors, randoms, iteration] __host__ __device__(
                            vertex_t const& vertex) -> bool {
-      edge_t start_edge = G.get_starting_edge(vertex);
       edge_t num_neighbors = G.get_number_of_neighbors(vertex);
 
+      // Color two nodes at the same time.
+      const int color = iteration * 2;
+
+      // Exit early if the vertex has no neighbors.
+      if (num_neighbors == 0) {
+        colors[vertex] = color;
+        return false; // remove (colored)
+      }
+
       bool colormax = true;
       bool colormin = true;
 
-      // Color two nodes at the same time.
-      int color = iteration * 2;
+      edge_t start_edge = G.get_starting_edge(vertex);
+      auto rand_v = randoms[vertex];
 
       // Main loop that goes over all the neighbors and finds the maximum or
       // minimum random number vertex.
@@ -113,22 +121,24 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
         vertex_t u = G.get_destination_vertex(e);
 
         if (gunrock::util::limits::is_valid(colors[u]) &&
-                (colors[u] != color + 1) && (colors[u] != color + 2) ||
+                (colors[u] != color) && (colors[u] != color + 1) ||
             (vertex == u))
           continue;
-        if (randoms[vertex] <= randoms[u])
+
+        auto rand_u = randoms[u];
+        if (rand_v <= rand_u)
           colormax = false;
-        if (randoms[vertex] >= randoms[u])
+        if (rand_v >= rand_u)
           colormin = false;
       }
 
       // Color if the node has the maximum OR minimum random number, this way,
       // per iteration we can possibly fill 2 colors at the same time.
       if (colormax) {
-        colors[vertex] = color + 1;
+        colors[vertex] = color;
         return false;  // remove (colored).
       } else if (colormin) {
-        colors[vertex] = color + 2;
+        colors[vertex] = color + 1;
         return false;  // remove (colored).
       } else {
         return true;  // keep (not colored).
@@ -136,7 +146,7 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
     };
 
     // Execute filter operator on the provided lambda.
-    operators::filter::execute<operators::filter_algorithm_t::compact>(
+    operators::filter::execute<operators::filter_algorithm_t::predicated>(
         G, E, color_me_in, context);
   }
 

From 0df4c41604c83e557c0b3958daa8957a1d36f52f Mon Sep 17 00:00:00 2001
From: neoblizz <osama94@gmail.com>
Date: Sat, 25 Jun 2022 19:16:54 -0700
Subject: [PATCH 55/58] Advance does NOT remove self-loops for the user.

---
 include/gunrock/framework/operators/advance/advance.hxx     | 6 ++++++
 .../gunrock/framework/operators/advance/block_mapped.hxx    | 2 +-
 .../gunrock/framework/operators/advance/merge_path_v2.hxx   | 5 ++---
 .../gunrock/framework/operators/advance/thread_mapped.hxx   | 5 ++---
 4 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/include/gunrock/framework/operators/advance/advance.hxx b/include/gunrock/framework/operators/advance/advance.hxx
index 43ee0611..6964bc0e 100644
--- a/include/gunrock/framework/operators/advance/advance.hxx
+++ b/include/gunrock/framework/operators/advance/advance.hxx
@@ -42,6 +42,9 @@ namespace advance {
  * Thus a vertex in an input frontier map to multiple output items. An efficient
  * advance is the most significant challenge of a GPU implementation.
  *
+ * @note Advance does not remove self-loops, i.e., a vertex can be a neighbor of
+ * itself.
+ *
  * @par Example
  *  The following code is a simple snippet on how to use advance within the
  * enactor loop.
@@ -139,6 +142,9 @@ void execute(graph_t& G,
  * Thus a vertex in an input frontier map to multiple output items. An efficient
  * advance is the most significant challenge of a GPU implementation.
  *
+ * @note Advance does not remove self-loops, i.e., a vertex can be a neighbor of
+ * itself.
+ *
  * @par Example
  *  The following code is a simple snippet on how to use advance within the
  * enactor loop.
diff --git a/include/gunrock/framework/operators/advance/block_mapped.hxx b/include/gunrock/framework/operators/advance/block_mapped.hxx
index bdd0a754..00f17af8 100644
--- a/include/gunrock/framework/operators/advance/block_mapped.hxx
+++ b/include/gunrock/framework/operators/advance/block_mapped.hxx
@@ -141,7 +141,7 @@ __global__ void __launch_bounds__(THREADS_PER_BLOCK, 2)
     // Store [neighbor] into the output frontier.
     if constexpr (output_type != advance_io_type_t::none) {
       output[offset[0] + i] =
-          (cond && n != v) ? n : gunrock::numeric_limits<vertex_t>::invalid();
+          cond ? n : gunrock::numeric_limits<vertex_t>::invalid();
     }
   }
 }
diff --git a/include/gunrock/framework/operators/advance/merge_path_v2.hxx b/include/gunrock/framework/operators/advance/merge_path_v2.hxx
index 6c2b6ea1..c3b3f7e3 100644
--- a/include/gunrock/framework/operators/advance/merge_path_v2.hxx
+++ b/include/gunrock/framework/operators/advance/merge_path_v2.hxx
@@ -169,9 +169,8 @@ __global__ void merge_path_v2_kernel(graph_t G,
 
       if (output_type != advance_io_type_t::none) {
         // std::size_t out_idx = ;
-        // type_t element = (cond && neighbor != source)
-        //                      ? neighbor
-        //                      : gunrock::numeric_limits<type_t>::invalid();
+        // type_t element = cond ? neighbor
+        //                  : gunrock::numeric_limits<type_t>::invalid();
         // output.set_element_at(element, out_idx);
       }
 
diff --git a/include/gunrock/framework/operators/advance/thread_mapped.hxx b/include/gunrock/framework/operators/advance/thread_mapped.hxx
index 6e2af4e4..65746867 100644
--- a/include/gunrock/framework/operators/advance/thread_mapped.hxx
+++ b/include/gunrock/framework/operators/advance/thread_mapped.hxx
@@ -64,10 +64,10 @@ void execute(graph_t& G,
     if (!gunrock::util::limits::is_valid(v))
       return;
 
-    auto starting_edge = G.get_starting_edge(v);
     auto total_edges = G.get_number_of_neighbors(v);
 
     for (auto i = 0; i < total_edges; ++i) {
+      auto starting_edge = G.get_starting_edge(v);
       auto e = i + starting_edge;            // edge id
       auto n = G.get_destination_vertex(e);  // neighbor id
       auto w = G.get_edge_weight(e);         // weight
@@ -75,8 +75,7 @@ void execute(graph_t& G,
 
       if (output_type != advance_io_type_t::none) {
         std::size_t out_idx = segments_ptr[tid] + i;
-        type_t element =
-            (cond && n != v) ? n : gunrock::numeric_limits<type_t>::invalid();
+        type_t element = cond ? n : gunrock::numeric_limits<type_t>::invalid();
         output.set_element_at(element, out_idx);
       }
     }

From 3e1a6bc3a00e8fcab55f5f3f21e9cfb23b1f2bd0 Mon Sep 17 00:00:00 2001
From: neoblizz <osama94@gmail.com>
Date: Sat, 25 Jun 2022 19:38:38 -0700
Subject: [PATCH 56/58] simpler logic for bfs.

---
 include/gunrock/algorithms/bfs.hxx | 20 +++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/include/gunrock/algorithms/bfs.hxx b/include/gunrock/algorithms/bfs.hxx
index 78bdb9b0..a2fd08d9 100644
--- a/include/gunrock/algorithms/bfs.hxx
+++ b/include/gunrock/algorithms/bfs.hxx
@@ -24,7 +24,7 @@ struct param_t {
 template <typename vertex_t>
 struct result_t {
   vertex_t* distances;
-  vertex_t* predecessors;
+  vertex_t* predecessors;  ///< @todo: implement this.
   result_t(vertex_t* _distances, vertex_t* _predecessors)
       : distances(_distances), predecessors(_predecessors) {}
 };
@@ -99,12 +99,18 @@ struct enactor_t : gunrock::enactor_t<problem_t> {
       // here means that the neighbor is not added to the output frontier, and
       // instead an invalid vertex is added in its place. These invalides (-1 in
       // most cases) can be removed using a filter operator or uniquify.
-      if (distances[neighbor] != std::numeric_limits<vertex_t>::max())
-        return false;
-      else
-        return (math::atomic::cas(
-                    &distances[neighbor], std::numeric_limits<vertex_t>::max(),
-                    iteration + 1) == std::numeric_limits<vertex_t>::max());
+      // if (distances[neighbor] != std::numeric_limits<vertex_t>::max())
+      //   return false;
+      // else
+      //   return (math::atomic::cas(
+      //               &distances[neighbor],
+      //               std::numeric_limits<vertex_t>::max(), iteration + 1) ==
+      //               std::numeric_limits<vertex_t>::max());
+
+      // Simpler logic for the above: true only if min lowered the distance.
+      auto old_distance =
+          math::atomic::min(&distances[neighbor], iteration + 1);
+      return (iteration + 1 < old_distance);
     };
 
     auto remove_invalids =

From 2db6b528daf995ee8c11e6df9e9947e7f7c86f82 Mon Sep 17 00:00:00 2001
From: neoblizz <osama94@gmail.com>
Date: Sat, 25 Jun 2022 20:03:02 -0700
Subject: [PATCH 57/58] or/and fixes.

---
 include/gunrock/io/points.hxx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/gunrock/io/points.hxx b/include/gunrock/io/points.hxx
index 1b761f6a..03ea7038 100644
--- a/include/gunrock/io/points.hxx
+++ b/include/gunrock/io/points.hxx
@@ -33,14 +33,14 @@ struct point_t {
 
   point_t operator+(const point_t& p) { return point_t(x + p.x, y + p.y); }
   bool operator<(const point_t& p1) const {
-    return (p1.y > y) or (p1.y == y and p1.x < x);
+    return (p1.y > y) || (p1.y == y && p1.x < x);
   }
 };
 
 struct comp {
   template <typename type_t = int>
   inline bool operator()(const point_t<type_t>& p1, const point_t<type_t>& p2) {
-    return (p1.y > p2.y) or (p1.y == p2.y and p1.x < p2.x);
+    return (p1.y > p2.y) || (p1.y == p2.y && p1.x < p2.x);
   }
 };
 
@@ -107,7 +107,7 @@ void draw(std::vector<point_t<type_t>>& points, int dim1, int dim2, int n) {
   for (int i = dim2; i > 0; i--) {
     fprintf(stderr, "%2.d|", i);
     for (int j = 1; j <= dim1; ++j) {
-      if (points[iterator].y == i and points[iterator].x == j) {
+      if (points[iterator].y == i && points[iterator].x == j) {
         fprintf(stderr, "%2.d ", O);
         ++iterator;
       } else {

From fa0fa0650946b6e9b245890392ca08160288fe36 Mon Sep 17 00:00:00 2001
From: neoblizz <osama94@gmail.com>
Date: Sat, 25 Jun 2022 20:26:44 -0700
Subject: [PATCH 58/58] [skip ubuntu] minor clarity in code.

---
 include/gunrock/io/points.hxx | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/gunrock/io/points.hxx b/include/gunrock/io/points.hxx
index 03ea7038..c9951140 100644
--- a/include/gunrock/io/points.hxx
+++ b/include/gunrock/io/points.hxx
@@ -33,14 +33,14 @@ struct point_t {
 
   point_t operator+(const point_t& p) { return point_t(x + p.x, y + p.y); }
   bool operator<(const point_t& p1) const {
-    return (p1.y > y) || (p1.y == y && p1.x < x);
+    return ((p1.y > y) || ((p1.y == y) && (p1.x < x)));
   }
 };
 
 struct comp {
   template <typename type_t = int>
   inline bool operator()(const point_t<type_t>& p1, const point_t<type_t>& p2) {
-    return (p1.y > p2.y) || (p1.y == p2.y && p1.x < p2.x);
+    return ((p1.y > p2.y) || ((p1.y == p2.y) && (p1.x < p2.x)));
   }
 };
 
@@ -107,7 +107,7 @@ void draw(std::vector<point_t<type_t>>& points, int dim1, int dim2, int n) {
   for (int i = dim2; i > 0; i--) {
     fprintf(stderr, "%2.d|", i);
     for (int j = 1; j <= dim1; ++j) {
-      if (points[iterator].y == i && points[iterator].x == j) {
+      if ((points[iterator].y == i) && (points[iterator].x == j)) {
         fprintf(stderr, "%2.d ", O);
         ++iterator;
       } else {