From ecd3e05f2ae19d3d6228d747d94424bb3e152120 Mon Sep 17 00:00:00 2001 From: Mark Allen Date: Tue, 12 Jan 2021 14:50:56 -0500 Subject: [PATCH] [from mpich] make ADIOI_GEN_WriteStrided not step on itself Merging a romio commit that was accepted by mpich: > make ADIOI_GEN_WriteStrided not step on itself > > The ADIOI_GEN_WriteStrided funcion uses data sieving on non-contiguous > types. That is, if it wants to write data at locations > [W...X...Y...Z...] > it reads the whole buffer > [dddddddddddddddd] > changes the locations it wants to write to > [WdddXdddYdddZddd] > then writes the whole thing back. It uses locks to make this safe, but > the problem is this only protects against other parts of the product that > are using locks. And without this PR a peer who is simultaneously making > a simple non-contiguous write wouldn't have locked. > > A testcase to demonstrate the original problem is here: > https://gist.github.com/markalle/d7da240c19e57f095c5d1b13240dae24 > > % mpicc -o x romio_write_timing.c > % mpirun -np 4 ./x > > Note: you need to use a filesystem that uses ADIOI_GEN_WriteStrided to > hit the bug. I was using GPFS. > > This commit is pulled from wkliao after discussing where to put the > new lock. It adds locks to contiguous writes in independent write > functions when data sieving write is not disabled Signed-off-by: Mark Allen --- .../romio/adio/ad_lustre/ad_lustre_wrstr.c | 28 +++++++++------ .../romio321/romio/adio/common/ad_write_str.c | 34 ++++++++++++------- 2 files changed, 38 insertions(+), 24 deletions(-) diff --git a/ompi/mca/io/romio321/romio/adio/ad_lustre/ad_lustre_wrstr.c b/ompi/mca/io/romio321/romio/adio/ad_lustre/ad_lustre_wrstr.c index ce538d4a6b6..a0494d48751 100644 --- a/ompi/mca/io/romio321/romio/adio/ad_lustre/ad_lustre_wrstr.c +++ b/ompi/mca/io/romio321/romio/adio/ad_lustre/ad_lustre_wrstr.c @@ -18,7 +18,7 @@ ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \ ADIO_EXPLICIT_OFFSET, writebuf_off, \ &status1, error_code); \ - if (!(fd->atomicity)) \ + if (!fd->atomicity && fd->hints->ds_write == ADIOI_HINT_DISABLE) \ ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ if (*error_code != MPI_SUCCESS) { \ *error_code = MPIO_Err_create_code(*error_code, \ @@ -35,7 +35,7 @@ writebuf_len = (unsigned) ADIOI_MIN(end_offset - writebuf_off + 1, \ (writebuf_off / stripe_size + 1) * \ stripe_size - writebuf_off); \ - if (!(fd->atomicity)) \ + if (!fd->atomicity && fd->hints->ds_write == ADIOI_HINT_DISABLE) \ ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, \ ADIO_EXPLICIT_OFFSET, \ @@ -58,7 +58,7 @@ while (write_sz != req_len) { \ ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \ ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \ - if (!(fd->atomicity)) \ + if (!fd->atomicity && fd->hints->ds_write == ADIOI_HINT_DISABLE) \ ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ if (*error_code != MPI_SUCCESS) { \ *error_code = MPIO_Err_create_code(*error_code, \ @@ -75,7 +75,7 @@ writebuf_len = (unsigned) ADIOI_MIN(end_offset - writebuf_off + 1, \ (writebuf_off / stripe_size + 1) * \ stripe_size - writebuf_off); \ - if (!(fd->atomicity)) \ + if (!fd->atomicity && fd->hints->ds_write == ADIOI_HINT_DISABLE) \ ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, \ ADIO_EXPLICIT_OFFSET, \ @@ -221,8 +221,9 @@ void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, const void *buf, int count, writebuf_off = 0; writebuf_len = 0; - /* if atomicity is true, lock the region to be accessed */ - if (fd->atomicity) + /* if atomicity is true or data sieving is not disable, lock the region + * to be accessed */ + if (fd->atomicity || fd->hints->ds_write != ADIOI_HINT_DISABLE) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, bufsize); for (j = 0; j < count; j++) { @@ -241,7 +242,7 @@ void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, const void *buf, int count, ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); - if (fd->atomicity) + if (fd->atomicity || fd->hints->ds_write != ADIOI_HINT_DISABLE) ADIOI_UNLOCK(fd, start_off, SEEK_SET, bufsize); if (*error_code != MPI_SUCCESS) { ADIOI_Free(writebuf); @@ -325,9 +326,13 @@ void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, const void *buf, int count, userbuf_off = 0; ADIOI_BUFFERED_WRITE_WITHOUT_READ /* write the buffer out finally */ + if (fd->hints->ds_write != ADIOI_HINT_DISABLE) + ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); + if (fd->hints->ds_write != ADIOI_HINT_DISABLE) + ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); if (file_ptr_type == ADIO_INDIVIDUAL) { /* update MPI-IO file pointer to point to the first byte @@ -378,8 +383,9 @@ void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, const void *buf, int count, fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset); } -/* if atomicity is true, lock the region to be accessed */ - if (fd->atomicity) + /* if atomicity is true or data sieving is not disable, lock the region + * to be accessed */ + if (fd->atomicity || fd->hints->ds_write != ADIOI_HINT_DISABLE) ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); writebuf_off = 0; @@ -502,11 +508,11 @@ void ADIOI_LUSTRE_WriteStrided(ADIO_File fd, const void *buf, int count, ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); - if (!(fd->atomicity)) + if (!fd->atomicity && fd->hints->ds_write == ADIOI_HINT_DISABLE) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); if (*error_code != MPI_SUCCESS) return; } - if (fd->atomicity) + if (fd->atomicity || fd->hints->ds_write != ADIOI_HINT_DISABLE) ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); ADIOI_Free(writebuf); diff --git a/ompi/mca/io/romio321/romio/adio/common/ad_write_str.c b/ompi/mca/io/romio321/romio/adio/common/ad_write_str.c index 83f2420ddc9..34c7b123ea6 100644 --- a/ompi/mca/io/romio321/romio/adio/common/ad_write_str.c +++ b/ompi/mca/io/romio321/romio/adio/common/ad_write_str.c @@ -14,7 +14,8 @@ if (writebuf_len) { \ ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \ ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \ - if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ + if (!fd->atomicity && fd->hints->ds_write == ADIOI_HINT_DISABLE) \ + ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ if (*error_code != MPI_SUCCESS) { \ *error_code = MPIO_Err_create_code(*error_code, \ MPIR_ERR_RECOVERABLE, myname, \ @@ -25,7 +26,8 @@ } \ writebuf_off = req_off; \ writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\ - if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ + if (!fd->atomicity && fd->hints->ds_write == ADIOI_HINT_DISABLE) \ + ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, \ ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \ if (*error_code != MPI_SUCCESS) { \ @@ -42,7 +44,8 @@ while (write_sz != req_len) { \ ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, \ ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \ - if (!(fd->atomicity)) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ + if (!fd->atomicity && fd->hints->ds_write == ADIOI_HINT_DISABLE) \ + ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ if (*error_code != MPI_SUCCESS) { \ *error_code = MPIO_Err_create_code(*error_code, \ MPIR_ERR_RECOVERABLE, myname, \ @@ -54,7 +57,8 @@ userbuf_off += write_sz; \ writebuf_off += writebuf_len; \ writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize,end_offset-writebuf_off+1));\ - if (!(fd->atomicity)) ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ + if (!fd->atomicity && fd->hints->ds_write == ADIOI_HINT_DISABLE) \ + ADIOI_WRITE_LOCK(fd, writebuf_off, SEEK_SET, writebuf_len); \ ADIO_ReadContig(fd, writebuf, writebuf_len, MPI_BYTE, \ ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); \ if (*error_code != MPI_SUCCESS) { \ @@ -191,9 +195,10 @@ void ADIOI_GEN_WriteStrided(ADIO_File fd, const void *buf, int count, writebuf = (char *) ADIOI_Malloc(max_bufsize); writebuf_len = (unsigned) (ADIOI_MIN(max_bufsize, end_offset-writebuf_off+1)); -/* if atomicity is true, lock the region to be accessed */ - if (fd->atomicity) - ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); + /* if atomicity is true or data sieving is not disable, lock the region + * to be accessed */ + if (fd->atomicity || fd->hints->ds_write != ADIOI_HINT_DISABLE) + ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1); for (j=0; jatomicity) + if (fd->atomicity || fd->hints->ds_write != ADIOI_HINT_DISABLE) ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); if (*error_code != MPI_SUCCESS) goto fn_exit; @@ -287,8 +292,10 @@ void ADIOI_GEN_WriteStrided(ADIO_File fd, const void *buf, int count, * datatypes, instead of a count of bytes (which might overflow) * Other WriteContig calls in this path are operating on data * sieving buffer */ + ADIOI_WRITE_LOCK(fd, offset, SEEK_SET, bufsize); ADIO_WriteContig(fd, buf, count, datatype, ADIO_EXPLICIT_OFFSET, offset, status, error_code); + ADIOI_UNLOCK(fd, offset, SEEK_SET, bufsize); if (file_ptr_type == ADIO_INDIVIDUAL) { /* update MPI-IO file pointer to point to the first byte @@ -338,9 +345,10 @@ void ADIOI_GEN_WriteStrided(ADIO_File fd, const void *buf, int count, fwr_size = ADIOI_MIN(flat_file->blocklens[j], bufsize-i_offset); } -/* if atomicity is true, lock the region to be accessed */ - if (fd->atomicity) - ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); + /* if atomicity is true or data sieving is not disable, lock the region + * to be accessed */ + if (fd->atomicity || fd->hints->ds_write != ADIOI_HINT_DISABLE) + ADIOI_WRITE_LOCK(fd, start_off, SEEK_SET, end_offset - start_off + 1); writebuf_off = 0; writebuf_len = 0; @@ -460,11 +468,11 @@ void ADIOI_GEN_WriteStrided(ADIO_File fd, const void *buf, int count, if (writebuf_len) { ADIO_WriteContig(fd, writebuf, writebuf_len, MPI_BYTE, ADIO_EXPLICIT_OFFSET, writebuf_off, &status1, error_code); - if (!(fd->atomicity)) + if (!fd->atomicity && fd->hints->ds_write == ADIOI_HINT_DISABLE) ADIOI_UNLOCK(fd, writebuf_off, SEEK_SET, writebuf_len); if (*error_code != MPI_SUCCESS) goto fn_exit; } - if (fd->atomicity) + if (fd->atomicity || fd->hints->ds_write != ADIOI_HINT_DISABLE) ADIOI_UNLOCK(fd, start_off, SEEK_SET, end_offset-start_off+1); if (file_ptr_type == ADIO_INDIVIDUAL) fd->fp_ind = off;