-
Notifications
You must be signed in to change notification settings - Fork 143
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
improve latency test #112
base: master
Are you sure you want to change the base?
improve latency test #112
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -40,6 +40,8 @@ using namespace gdrcopy::test; | |
// manually tuned... | ||
int num_write_iters = 10000; | ||
int num_read_iters = 100; | ||
int small_size_iter_factor = 1000; | ||
int warmup = 10; | ||
|
||
int main(int argc, char *argv[]) | ||
{ | ||
|
@@ -49,10 +51,11 @@ int main(int argc, char *argv[]) | |
bool do_cumemcpy = false; | ||
struct timespec beg, end; | ||
double lat_us; | ||
double bw; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Isn’t this redundant with copybw? If you want to do a shmoo for bandwidth, would it be better to rename the test? “copylat” doesn’t sound right anymore in that case. |
||
|
||
while(1) { | ||
int c; | ||
c = getopt(argc, argv, "s:d:w:r:hc"); | ||
c = getopt(argc, argv, "s:d:w:r:hcW:"); | ||
if (c == -1) | ||
break; | ||
|
||
|
@@ -69,11 +72,18 @@ int main(int argc, char *argv[]) | |
case 'r': | ||
num_read_iters = strtol(optarg, NULL, 0); | ||
break; | ||
case 'W': | ||
warmup = strtol(optarg, NULL, 0); | ||
break; | ||
case 'c': | ||
do_cumemcpy = true; | ||
break; | ||
case 'h': | ||
printf("syntax: %s -s <buf size> -d <gpu dev id> -w <write iters> -r <read iters> -h[help] -c[do-cuMemcpy]\n", argv[0]); | ||
printf("syntax: %s [-s <buf size>][-d <gpu dev id>][-w <write iters>][-r <read iters>][-h][-c][-w]\n" | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The last option should be [-W <# iterations>]. You forgot to capitalize the letter. |
||
"-c benchmark cuMemcpy\n" | ||
"-w <# iterations> modify warmup (default %d)\n", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Capitalize the letter W. |
||
argv[0], | ||
warmup); | ||
exit(EXIT_FAILURE); | ||
break; | ||
default: | ||
|
@@ -137,39 +147,48 @@ int main(int argc, char *argv[]) | |
|
||
if (do_cumemcpy) { | ||
cout << endl; | ||
cout << "cuMemcpy_H2D num iters for each size: " << num_write_iters << endl; | ||
printf("Test \t\t Size(B) \t Avg.Time(us)\n"); | ||
cout << "cuMemcpy_H2D num iters for each size: " << small_size_iter_factor * num_write_iters << "/" << num_write_iters << endl; | ||
printf("Test \t\t Size(B) \t Avg.Time(us) \t Avg.BW(MB/s)\n"); | ||
BEGIN_CHECK { | ||
// cuMemcpy H2D benchmark | ||
copy_size = 1; | ||
while (copy_size <= size) { | ||
int iter = 0; | ||
clock_gettime(MYCLOCK, &beg); | ||
for (iter = 0; iter < num_write_iters; ++iter) { | ||
size_t num_iters = (size < 100000 ? num_write_iters*small_size_iter_factor: num_write_iters); | ||
for (iter = 0; iter < num_iters+warmup; ++iter) { | ||
if (iter == warmup) | ||
clock_gettime(MYCLOCK, &beg); | ||
ASSERTDRV(cuMemcpy(d_A, (CUdeviceptr)init_buf, copy_size)); | ||
} | ||
clock_gettime(MYCLOCK, &end); | ||
lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter; | ||
printf("cuMemcpy_H2D \t %8zu \t %11.4f\n", copy_size, lat_us); | ||
double dt_us = (end.tv_nsec - beg.tv_nsec)/1000.0 + (end.tv_sec - beg.tv_sec)*1000000.0; | ||
lat_us = dt_us / (double)num_iters; | ||
bw = copy_size / lat_us; | ||
printf("cuMemcpy_H2D \t %8zu \t %11.4f\t %11.4f\n", copy_size, lat_us, bw); | ||
copy_size <<= 1; | ||
} | ||
} END_CHECK; | ||
|
||
cout << endl; | ||
cout << "cuMemcpy_D2H num iters for each size: " << num_read_iters << endl; | ||
printf("Test \t\t Size(B) \t Avg.Time(us)\n"); | ||
cout << "cuMemcpy_D2H num iters for each size: " << small_size_iter_factor * num_read_iters << "/" << num_read_iters << endl; | ||
printf("Test \t\t Size(B) \t Avg.Time(us) \t Avg.BW(MB/s)\n"); | ||
BEGIN_CHECK { | ||
// cuMemcpy D2H benchmark | ||
copy_size = 1; | ||
while (copy_size <= size) { | ||
int iter = 0; | ||
clock_gettime(MYCLOCK, &beg); | ||
for (iter = 0; iter < num_read_iters; ++iter) { | ||
size_t num_iters = (size < 100000 ? small_size_iter_factor*num_read_iters:num_read_iters); | ||
for (iter = 0; iter < num_iters+warmup; ++iter) { | ||
if (iter == warmup) | ||
clock_gettime(MYCLOCK, &beg); | ||
ASSERTDRV(cuMemcpy((CUdeviceptr)h_buf, d_A, copy_size)); | ||
} | ||
clock_gettime(MYCLOCK, &end); | ||
lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter; | ||
printf("cuMemcpy_D2H \t %8zu \t %11.4f\n", copy_size, lat_us); | ||
//lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter; | ||
double dt_us = (end.tv_nsec - beg.tv_nsec)/1000.0 + (end.tv_sec - beg.tv_sec)*1000000.0; | ||
lat_us = dt_us / (double)num_iters; | ||
bw = copy_size / lat_us; | ||
printf("cuMemcpy_D2H \t %8zu \t %11.4f\t %11.4f\n", copy_size, lat_us, bw); | ||
copy_size <<= 1; | ||
} | ||
} END_CHECK; | ||
|
@@ -216,17 +235,22 @@ int main(int argc, char *argv[]) | |
cout << "WARNING: Measuring the issue overhead as observed by the CPU. Data might not be ordered all the way to the GPU internal visibility." << endl; | ||
// For more information, see | ||
// https://docs.nvidia.com/cuda/gpudirect-rdma/index.html#sync-behavior | ||
printf("Test \t\t\t Size(B) \t Avg.Time(us)\n"); | ||
printf("Test \t\t\t Size(B) \t Avg.Time(us) \t Avg.BW(MB/s)\n"); | ||
copy_size = 1; | ||
while (copy_size <= size) { | ||
int iter = 0; | ||
clock_gettime(MYCLOCK, &beg); | ||
for (iter = 0; iter < num_write_iters; ++iter) { | ||
size_t num_iters = (size < 100000 ? num_write_iters*small_size_iter_factor: num_write_iters); | ||
for (iter = 0; iter < num_iters+warmup; ++iter) { | ||
if (iter == warmup) | ||
clock_gettime(MYCLOCK, &beg); | ||
gdr_copy_to_mapping(mh, buf_ptr, init_buf, copy_size); | ||
} | ||
clock_gettime(MYCLOCK, &end); | ||
lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter; | ||
printf("gdr_copy_to_mapping \t %8zu \t %11.4f\n", copy_size, lat_us); | ||
double dt_us = (end.tv_nsec - beg.tv_nsec)/1000.0 + (end.tv_sec - beg.tv_sec)*1000000.0; | ||
lat_us = dt_us / (double)num_iters; | ||
bw = copy_size / lat_us; | ||
printf("gdr_copy_to_mapping \t %8zu \t %11.4f\t %11.4f\n", copy_size, lat_us, bw); | ||
copy_size <<= 1; | ||
} | ||
|
||
|
@@ -239,12 +263,17 @@ int main(int argc, char *argv[]) | |
copy_size = 1; | ||
while (copy_size <= size) { | ||
int iter = 0; | ||
clock_gettime(MYCLOCK, &beg); | ||
for (iter = 0; iter < num_read_iters; ++iter) | ||
size_t num_iters = (size < 100000 ? small_size_iter_factor*num_read_iters:num_read_iters); | ||
for (iter = 0; iter < num_iters+warmup; ++iter) { | ||
if (iter == warmup) | ||
clock_gettime(MYCLOCK, &beg); | ||
gdr_copy_from_mapping(mh, h_buf, buf_ptr, copy_size); | ||
} | ||
clock_gettime(MYCLOCK, &end); | ||
lat_us = ((end.tv_nsec-beg.tv_nsec)/1000.0 + (end.tv_sec-beg.tv_sec)*1000000.0) / (double)iter; | ||
printf("gdr_copy_from_mapping \t %8zu \t %11.4f\n", copy_size, lat_us); | ||
double dt_us = (end.tv_nsec - beg.tv_nsec)/1000.0 + (end.tv_sec - beg.tv_sec)*1000000.0; | ||
lat_us = dt_us / (double)num_iters; | ||
bw = copy_size / lat_us; | ||
printf("gdr_copy_from_mapping \t %8zu \t %11.4f\t %11.4f\n", copy_size, lat_us, bw); | ||
copy_size <<= 1; | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I understand the intention and its usefulness for small sizes. However, it changes the meaning of the iteration count that users specify. Is there a better way to do this, or could you provide an explanatory message? Currently, users need to read the code in order to know that small sizes and large sizes use different numbers of iterations.