From 4f66be0c1ab0bcde4e6cb30940a3ac60c28866dd Mon Sep 17 00:00:00 2001 From: Davide Rossetti Date: Fri, 28 Feb 2020 16:37:31 -0800 Subject: [PATCH 1/3] print estimated bw in copylat That helps comparing performance for large buffer sizes --- tests/copylat.cpp | 51 +++++++++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 17 deletions(-) diff --git a/tests/copylat.cpp b/tests/copylat.cpp index dfb91c42..7d55c842 100644 --- a/tests/copylat.cpp +++ b/tests/copylat.cpp @@ -40,6 +40,7 @@ using namespace gdrcopy::test; // manually tuned... int num_write_iters = 10000; int num_read_iters = 100; +int small_size_iter_factor = 1000; int main(int argc, char *argv[]) { @@ -49,6 +50,7 @@ int main(int argc, char *argv[]) bool do_cumemcpy = false; struct timespec beg, end; double lat_us; + double bw; while(1) { int c; @@ -137,39 +139,46 @@ int main(int argc, char *argv[]) if (do_cumemcpy) { cout << endl; - cout << "cuMemcpy_H2D num iters for each size: " << num_write_iters << endl; - printf("Test \t\t Size(B) \t Avg.Time(us)\n"); + cout << "cuMemcpy_H2D num iters for each size: " << small_size_iter_factor * num_write_iters << "/" << num_write_iters << endl; + printf("Test \t\t Size(B) \t Avg.Time(us) \t Avg.BW(MB/s)\n"); BEGIN_CHECK { // cuMemcpy H2D benchmark copy_size = 1; while (copy_size <= size) { int iter = 0; + size_t num_iters = (size < 100000 ? num_write_iters*small_size_iter_factor: num_write_iters); clock_gettime(MYCLOCK, &beg); - for (iter = 0; iter < num_write_iters; ++iter) { + for (iter = 0; iter < num_iters; ++iter) { ASSERTDRV(cuMemcpy(d_A, (CUdeviceptr)init_buf, copy_size)); } clock_gettime(MYCLOCK, &end); - lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter; - printf("cuMemcpy_H2D \t %8zu \t %11.4f\n", copy_size, lat_us); + double dt_us = (end.tv_nsec - beg.tv_nsec)/1000.0 + (end.tv_sec - beg.tv_sec)*1000000.0; + lat_us = dt_us / (double)num_iters; + bw = copy_size / lat_us; + printf("cuMemcpy_H2D \t %8zu \t %11.4f\t %11.4f\n", copy_size, lat_us, bw); copy_size <<= 1; } } END_CHECK; cout << endl; - cout << "cuMemcpy_D2H num iters for each size: " << num_read_iters << endl; - printf("Test \t\t Size(B) \t Avg.Time(us)\n"); + cout << "cuMemcpy_D2H num iters for each size: " << small_size_iter_factor * num_read_iters << "/" << num_read_iters << endl; + printf("Test \t\t Size(B) \t Avg.Time(us) \t Avg.BW(MB/s)\n"); BEGIN_CHECK { // cuMemcpy D2H benchmark copy_size = 1; while (copy_size <= size) { int iter = 0; + size_t num_iters = (size < 100000 ? small_size_iter_factor*num_read_iters:num_read_iters); clock_gettime(MYCLOCK, &beg); - for (iter = 0; iter < num_read_iters; ++iter) { + for (iter = 0; iter < num_iters; ++iter) { ASSERTDRV(cuMemcpy((CUdeviceptr)h_buf, d_A, copy_size)); } clock_gettime(MYCLOCK, &end); - lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter; - printf("cuMemcpy_D2H \t %8zu \t %11.4f\n", copy_size, lat_us); + //lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter; + double dt_us = (end.tv_nsec - beg.tv_nsec)/1000.0 + (end.tv_sec - beg.tv_sec)*1000000.0; + lat_us = dt_us / (double)num_iters; + bw = copy_size / lat_us; + printf("cuMemcpy_D2H \t %8zu \t %11.4f\t %11.4f\n", copy_size, lat_us, bw); copy_size <<= 1; } } END_CHECK; @@ -216,17 +225,21 @@ int main(int argc, char *argv[]) cout << "WARNING: Measuring the issue overhead as observed by the CPU. Data might not be ordered all the way to the GPU internal visibility." << endl; // For more information, see // https://docs.nvidia.com/cuda/gpudirect-rdma/index.html#sync-behavior - printf("Test \t\t\t Size(B) \t Avg.Time(us)\n"); + printf("Test \t\t\t Size(B) \t Avg.Time(us) \t Avg.BW(MB/s)\n"); copy_size = 1; while (copy_size <= size) { int iter = 0; clock_gettime(MYCLOCK, &beg); - for (iter = 0; iter < num_write_iters; ++iter) { + size_t num_iters = (size < 100000 ? num_write_iters*small_size_iter_factor: num_write_iters); + clock_gettime(MYCLOCK, &beg); + for (iter = 0; iter < num_iters; ++iter) { gdr_copy_to_mapping(mh, buf_ptr, init_buf, copy_size); } clock_gettime(MYCLOCK, &end); - lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter; - printf("gdr_copy_to_mapping \t %8zu \t %11.4f\n", copy_size, lat_us); + double dt_us = (end.tv_nsec - beg.tv_nsec)/1000.0 + (end.tv_sec - beg.tv_sec)*1000000.0; + lat_us = dt_us / (double)num_iters; + bw = copy_size / lat_us; + printf("gdr_copy_to_mapping \t %8zu \t %11.4f\t %11.4f\n", copy_size, lat_us, bw); copy_size <<= 1; } @@ -239,12 +252,16 @@ int main(int argc, char *argv[]) copy_size = 1; while (copy_size <= size) { int iter = 0; + size_t num_iters = (size < 100000 ? small_size_iter_factor*num_read_iters:num_read_iters); clock_gettime(MYCLOCK, &beg); - for (iter = 0; iter < num_read_iters; ++iter) + for (iter = 0; iter < num_iters; ++iter) { gdr_copy_from_mapping(mh, h_buf, buf_ptr, copy_size); + } clock_gettime(MYCLOCK, &end); - lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter; - printf("gdr_copy_from_mapping \t %8zu \t %11.4f\n", copy_size, lat_us); + double dt_us = (end.tv_nsec - beg.tv_nsec)/1000.0 + (end.tv_sec - beg.tv_sec)*1000000.0; + lat_us = dt_us / (double)num_iters; + bw = copy_size / lat_us; + printf("gdr_copy_from_mapping \t %8zu \t %11.4f\t %11.4f\n", copy_size, lat_us, bw); copy_size <<= 1; } From 0647993c8039afde1ecc813fb48d03a9ef2f8d42 Mon Sep 17 00:00:00 2001 From: Davide Rossetti Date: Fri, 28 Feb 2020 16:53:00 -0800 Subject: [PATCH 2/3] add param -d gpuid to sanity --- tests/sanity.cpp | 36 ++++++++++++++++++++---------------- 1 file changed, 20 insertions(+), 16 deletions(-) diff --git a/tests/sanity.cpp b/tests/sanity.cpp index a958ef77..b207a7e9 100644 --- a/tests/sanity.cpp +++ b/tests/sanity.cpp @@ -48,6 +48,7 @@ using namespace std; using namespace gdrcopy::test; volatile bool expecting_exception_signal = false; +int gpu_id = 0; void exception_signal_handle(int sig) { @@ -168,7 +169,7 @@ BEGIN_GDRCOPY_TEST(basic) expecting_exception_signal = false; MB(); - init_cuda(0); + init_cuda(gpu_id); const size_t _size = 256*1024+16; const size_t size = (_size + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK; @@ -201,7 +202,7 @@ BEGIN_GDRCOPY_TEST(basic_with_tokens) expecting_exception_signal = false; MB(); - init_cuda(0); + init_cuda(gpu_id); const size_t _size = 256*1024+16; const size_t size = (_size + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK; @@ -242,7 +243,7 @@ BEGIN_GDRCOPY_TEST(basic_unaligned_mapping) expecting_exception_signal = false; MB(); - init_cuda(0); + init_cuda(gpu_id); // Allocate for a few bytes so that cuMemAlloc returns an unaligned address // in the next allocation. This behavior is observed in GPU Driver 410 and @@ -337,7 +338,7 @@ BEGIN_GDRCOPY_TEST(data_validation) expecting_exception_signal = false; MB(); - init_cuda(0); + init_cuda(gpu_id); const size_t _size = 256*1024+16; const size_t size = (_size + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK; @@ -461,7 +462,7 @@ BEGIN_GDRCOPY_TEST(invalidation_access_after_gdr_close) int mydata = (rand() % 1000) + 1; - init_cuda(0); + init_cuda(gpu_id); CUdeviceptr d_A; ASSERTDRV(gpuMemAlloc(&d_A, size)); @@ -537,7 +538,7 @@ BEGIN_GDRCOPY_TEST(invalidation_access_after_cumemfree) int mydata = (rand() % 1000) + 1; - init_cuda(0); + init_cuda(gpu_id); CUdeviceptr d_A; ASSERTDRV(gpuMemAlloc(&d_A, size)); @@ -613,7 +614,7 @@ BEGIN_GDRCOPY_TEST(invalidation_two_mappings) int mydata = (rand() % 1000) + 1; - init_cuda(0); + init_cuda(gpu_id); CUdeviceptr d_A[2]; @@ -762,7 +763,7 @@ BEGIN_GDRCOPY_TEST(invalidation_fork_access_after_cumemfree) if (pid == 0) mydata += 10; - init_cuda(0); + init_cuda(gpu_id); CUdeviceptr d_A; ASSERTDRV(gpuMemAlloc(&d_A, size)); @@ -871,7 +872,7 @@ BEGIN_GDRCOPY_TEST(invalidation_fork_after_gdr_map) const size_t size = (_size + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK; const char *myname; - init_cuda(0); + init_cuda(gpu_id); CUdeviceptr d_A; ASSERTDRV(gpuMemAlloc(&d_A, size)); @@ -1011,7 +1012,7 @@ BEGIN_GDRCOPY_TEST(invalidation_fork_child_gdr_map_parent) const size_t size = (_size + GPU_PAGE_SIZE - 1) & GPU_PAGE_MASK; const char *myname; - init_cuda(0); + init_cuda(gpu_id); CUdeviceptr d_A; ASSERTDRV(gpuMemAlloc(&d_A, size)); @@ -1124,7 +1125,7 @@ BEGIN_GDRCOPY_TEST(invalidation_fork_map_and_free) int mydata = (rand() % 1000) + 1; - init_cuda(0); + init_cuda(gpu_id); CUdeviceptr d_A; ASSERTDRV(gpuMemAlloc(&d_A, size)); @@ -1229,7 +1230,7 @@ BEGIN_GDRCOPY_TEST(invalidation_unix_sock_shared_fd_gdr_pin_buffer) print_dbg("%s: Start\n", myname); - init_cuda(0); + init_cuda(gpu_id); CUdeviceptr d_A; ASSERTDRV(gpuMemAlloc(&d_A, size)); @@ -1344,7 +1345,7 @@ BEGIN_GDRCOPY_TEST(invalidation_unix_sock_shared_fd_gdr_map) write_fd = filedes_1[1]; } - init_cuda(0); + init_cuda(gpu_id); CUdeviceptr d_A; ASSERTDRV(gpuMemAlloc(&d_A, size)); @@ -1487,7 +1488,7 @@ BEGIN_GDRCOPY_TEST(invalidation_fork_child_gdr_pin_parent_with_tokens) read_fd = filedes_0[0]; write_fd = filedes_1[1]; - init_cuda(0); + init_cuda(gpu_id); ASSERTDRV(gpuMemAlloc(&d_A, size)); ASSERTDRV(cuPointerGetAttribute(&tokens, CU_POINTER_ATTRIBUTE_P2P_TOKENS, d_A)); @@ -1512,13 +1513,16 @@ int main(int argc, char *argv[]) { int c; - while ((c = getopt(argc, argv, "h::v::")) != -1) { + while ((c = getopt(argc, argv, "d:h::v::")) != -1) { switch (c) { + case 'd': + gpu_id = atoi(optarg); + break; case 'v': gdrcopy::test::print_dbg_msg = true; break; case 'h': - cout << "Usage: " << argv[0] << " [-v] [-h]" << endl; + cout << "Usage: " << argv[0] << " [-d gpuid] [-v] [-h]" << endl; break; case '?': if (isprint(optopt)) From a6115ca1806f6765cf8b37e013374248f8a98c91 Mon Sep 17 00:00:00 2001 From: Davide Rossetti Date: Mon, 2 Mar 2020 16:20:21 -0800 Subject: [PATCH 3/3] add extra warmup iterations to latency test --- tests/copylat.cpp | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/tests/copylat.cpp b/tests/copylat.cpp index 7d55c842..e0482c0d 100644 --- a/tests/copylat.cpp +++ b/tests/copylat.cpp @@ -41,6 +41,7 @@ using namespace gdrcopy::test; int num_write_iters = 10000; int num_read_iters = 100; int small_size_iter_factor = 1000; +int warmup = 10; int main(int argc, char *argv[]) { @@ -54,7 +55,7 @@ int main(int argc, char *argv[]) while(1) { int c; - c = getopt(argc, argv, "s:d:w:r:hc"); + c = getopt(argc, argv, "s:d:w:r:hcW:"); if (c == -1) break; @@ -71,11 +72,18 @@ int main(int argc, char *argv[]) case 'r': num_read_iters = strtol(optarg, NULL, 0); break; + case 'W': + warmup = strtol(optarg, NULL, 0); + break; case 'c': do_cumemcpy = true; break; case 'h': - printf("syntax: %s -s -d -w -r -h[help] -c[do-cuMemcpy]\n", argv[0]); + printf("syntax: %s [-s ][-d ][-w ][-r ][-h][-c][-w]\n" + "-c benchmark cuMemcpy\n" + "-w <# iterations> modify warmup (default %d)\n", + argv[0], + warmup); exit(EXIT_FAILURE); break; default: @@ -147,8 +155,9 @@ int main(int argc, char *argv[]) while (copy_size <= size) { int iter = 0; size_t num_iters = (size < 100000 ? num_write_iters*small_size_iter_factor: num_write_iters); - clock_gettime(MYCLOCK, &beg); - for (iter = 0; iter < num_iters; ++iter) { + for (iter = 0; iter < num_iters+warmup; ++iter) { + if (iter == warmup) + clock_gettime(MYCLOCK, &beg); ASSERTDRV(cuMemcpy(d_A, (CUdeviceptr)init_buf, copy_size)); } clock_gettime(MYCLOCK, &end); @@ -169,8 +178,9 @@ int main(int argc, char *argv[]) while (copy_size <= size) { int iter = 0; size_t num_iters = (size < 100000 ? small_size_iter_factor*num_read_iters:num_read_iters); - clock_gettime(MYCLOCK, &beg); - for (iter = 0; iter < num_iters; ++iter) { + for (iter = 0; iter < num_iters+warmup; ++iter) { + if (iter == warmup) + clock_gettime(MYCLOCK, &beg); ASSERTDRV(cuMemcpy((CUdeviceptr)h_buf, d_A, copy_size)); } clock_gettime(MYCLOCK, &end); @@ -231,8 +241,9 @@ int main(int argc, char *argv[]) int iter = 0; clock_gettime(MYCLOCK, &beg); size_t num_iters = (size < 100000 ? num_write_iters*small_size_iter_factor: num_write_iters); - clock_gettime(MYCLOCK, &beg); - for (iter = 0; iter < num_iters; ++iter) { + for (iter = 0; iter < num_iters+warmup; ++iter) { + if (iter == warmup) + clock_gettime(MYCLOCK, &beg); gdr_copy_to_mapping(mh, buf_ptr, init_buf, copy_size); } clock_gettime(MYCLOCK, &end); @@ -253,8 +264,9 @@ int main(int argc, char *argv[]) while (copy_size <= size) { int iter = 0; size_t num_iters = (size < 100000 ? small_size_iter_factor*num_read_iters:num_read_iters); - clock_gettime(MYCLOCK, &beg); - for (iter = 0; iter < num_iters; ++iter) { + for (iter = 0; iter < num_iters+warmup; ++iter) { + if (iter == warmup) + clock_gettime(MYCLOCK, &beg); gdr_copy_from_mapping(mh, h_buf, buf_ptr, copy_size); } clock_gettime(MYCLOCK, &end);