Skip to content

Commit 931161d

Browse files
edgargabriel authored and KKraljic committed
common/ompio: fix calculation in simple-grouping option
This is based on a bug reported on the mailing list using a netcdf test case. The problem occurs if processes are using a custom file view, but on some of them it appears as if the default file view is being used. Because of that, the simple-grouping option led to a different number of aggregators being used on different processes, and ultimately to a deadlock. This patch fixes the problem by no longer using the file_view size for the calculation in the simple-grouping option, and using the contiguous chunk size (which is identical on all processes) instead. Fixes issue open-mpi#7109. Signed-off-by: Edgar Gabriel <egabriel@central.uh.edu>
1 parent 98fe1f2 commit 931161d

File tree

1 file changed

+6
-5
lines changed

1 file changed

+6
-5
lines changed

ompi/mca/common/ompio/common_ompio_aggregators.c

+6-5
Original file line number | Diff line number | Diff line change
@@ -126,17 +126,17 @@ int mca_common_ompio_simple_grouping(ompio_file_t *fh,
126126
}
127127

128128
P_a = 1;
129-
time_prev = cost_calc ( fh->f_size, P_a, fh->f_view_size, (size_t) fh->f_bytes_per_agg, mode );
129+
time_prev = cost_calc ( fh->f_size, P_a, fh->f_cc_size, (size_t) fh->f_bytes_per_agg, mode );
130130
P_a_prev = P_a;
131131
for ( P_a = incr; P_a <= fh->f_size; P_a += incr ) {
132-
time = cost_calc ( fh->f_size, P_a, fh->f_view_size, (size_t) fh->f_bytes_per_agg, mode );
132+
time = cost_calc ( fh->f_size, P_a, fh->f_cc_size, (size_t) fh->f_bytes_per_agg, mode );
133133
dtime_abs = (time_prev - time);
134134
dtime = dtime_abs / time_prev;
135135
dtime_diff = ( P_a == incr ) ? dtime : (dtime_prev - dtime);
136136
#ifdef OMPIO_DEBUG
137137
if ( 0 == fh->f_rank ){
138138
printf(" d_p = %ld P_a = %d time = %lf dtime = %lf dtime_abs =%lf dtime_diff=%lf\n",
139-
fh->f_view_size, P_a, time, dtime, dtime_abs, dtime_diff );
139+
fh->f_cc_size, P_a, time, dtime, dtime_abs, dtime_diff );
140140
}
141141
#endif
142142
if ( dtime_diff < dtime_threshold ) {
@@ -171,7 +171,7 @@ int mca_common_ompio_simple_grouping(ompio_file_t *fh,
171171
num_groups = P_a_prev;
172172
#ifdef OMPIO_DEBUG
173173
printf(" For P=%d d_p=%ld b_c=%d threshold=%f chosen P_a = %d \n",
174-
fh->f_size, fh->f_view_size, fh->f_bytes_per_agg, dtime_threshold, P_a_prev);
174+
fh->f_size, fh->f_cc_size, fh->f_bytes_per_agg, dtime_threshold, P_a_prev);
175175
#endif
176176

177177
/* Cap the maximum number of aggregators.*/
@@ -183,6 +183,7 @@ int mca_common_ompio_simple_grouping(ompio_file_t *fh,
183183
}
184184

185185
*num_groups_out = num_groups;
186+
186187
return mca_common_ompio_forced_grouping ( fh, num_groups, contg_groups);
187188
}
188189

@@ -576,7 +577,7 @@ int mca_common_ompio_create_groups(ompio_file_t *fh,
576577
opal_output (1, "mca_common_ompio_create_groups: error in mca_common_ompio_prepare_to_group\n");
577578
goto exit;
578579
}
579-
580+
580581
switch(ompio_grouping_flag){
581582

582583
case OMPIO_SPLIT:

0 commit comments

Comments
 (0)