Skip to content

Commit

Permalink
Optimize the datatype creation.
Browse files Browse the repository at this point in the history
The internal array of counts of predefined types is now only created
when needed, which is either in a heterogeneous environment or when
get_elements is called. This saves space and makes convertor creation a
little faster in some cases.

Rearrange the fields in the datatype description structs.

The macro OPAL_DATATYPE_INIT_PTYPES_ARRAY had a bug, and the
static array was only partially created. All predefined types should
have the ptypes array created and initialized.

Signed-off-by: George Bosilca <bosilca@icl.utk.edu>
  • Loading branch information
bosilca committed May 3, 2017
1 parent 043b22f commit 6f17464
Show file tree
Hide file tree
Showing 17 changed files with 230 additions and 128 deletions.
5 changes: 5 additions & 0 deletions ompi/datatype/ompi_datatype_create_darray.c
Original file line number Diff line number Diff line change
Expand Up @@ -216,9 +216,14 @@ int32_t ompi_datatype_create_darray(int size,
}

/* Build up array */
displs[0] = st_offsets[start_loop];
for (i = start_loop; i != end_loop; i += step) {
int nprocs, tmp_rank;

/* Update the lower bound of the local type */
tmp_size *= gsize_array[i - step];
displs[0] += tmp_size * st_offsets[i];

switch(distrib_array[i]) {
case MPI_DISTRIBUTE_BLOCK:
rc = block(gsize_array, i, ndims, psize_array[i], coords[i],
Expand Down
6 changes: 4 additions & 2 deletions ompi/datatype/ompi_datatype_get_elements.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@

#include "ompi/runtime/params.h"
#include "ompi/datatype/ompi_datatype.h"
#include "opal/datatype/opal_datatype_internal.h"

int ompi_datatype_get_elements (ompi_datatype_t *datatype, size_t ucount, size_t *count)
{
Expand All @@ -48,9 +49,10 @@ int ompi_datatype_get_elements (ompi_datatype_t *datatype, size_t ucount, size_t
there are no leftover bytes */
if (!ompi_datatype_is_predefined(datatype)) {
if (0 != internal_count) {
opal_datatype_compute_ptypes(&datatype->super);
/* count the basic elements in the datatype */
for (i = 4, total = 0 ; i < OPAL_DATATYPE_MAX_PREDEFINED ; ++i) {
total += datatype->super.btypes[i];
for (i = OPAL_DATATYPE_FIRST_TYPE, total = 0 ; i < OPAL_DATATYPE_MAX_PREDEFINED ; ++i) {
total += datatype->super.ptypes[i];
}
internal_count = total * internal_count;
}
Expand Down
2 changes: 1 addition & 1 deletion ompi/datatype/ompi_datatype_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -467,7 +467,7 @@ extern const ompi_datatype_t* ompi_datatype_basicDatatypes[OMPI_DATATYPE_MPI_MAX
.name = OPAL_DATATYPE_INIT_NAME(TYPE ## SIZE), \
.desc = OPAL_DATATYPE_INIT_DESC_PREDEFINED(TYPE ## SIZE), \
.opt_desc = OPAL_DATATYPE_INIT_DESC_PREDEFINED(TYPE ## SIZE), \
.btypes = OPAL_DATATYPE_INIT_BTYPES_ARRAY(TYPE ## SIZE) \
.ptypes = OPAL_DATATYPE_INIT_PTYPES_ARRAY(TYPE ## SIZE) \
}

#define OMPI_DATATYPE_INIT_PREDEFINED_BASIC_TYPE_FORTRAN( TYPE, NAME, SIZE, ALIGN, FLAGS ) \
Expand Down
18 changes: 9 additions & 9 deletions ompi/datatype/ompi_datatype_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -384,29 +384,29 @@ opal_pointer_array_t ompi_datatype_f_to_c_table = {{0}};
(PDST)->super.opt_desc = (PSRC)->super.opt_desc; \
(PDST)->packed_description = (PSRC)->packed_description; \
(PSRC)->packed_description = NULL; \
memcpy( (PDST)->super.btypes, (PSRC)->super.btypes, \
OPAL_DATATYPE_MAX_PREDEFINED * sizeof(uint32_t) ); \
memcpy( (PDST)->super.ptypes, (PSRC)->super.ptypes, \
OPAL_DATATYPE_MAX_PREDEFINED * sizeof(size_t) ); \
} while(0)

#define DECLARE_MPI2_COMPOSED_STRUCT_DDT( PDATA, MPIDDT, MPIDDTNAME, type1, type2, MPIType1, MPIType2, FLAGS) \
do { \
struct { type1 v1; type2 v2; } s[2]; \
ompi_datatype_t *types[2], *ptype; \
int bLength[2] = {1, 1}; \
ptrdiff_t base, displ[2]; \
ptrdiff_t base, displ[2]; \
\
types[0] = (ompi_datatype_t*)ompi_datatype_basicDatatypes[MPIType1]; \
types[1] = (ompi_datatype_t*)ompi_datatype_basicDatatypes[MPIType2]; \
base = (ptrdiff_t)(&(s[0])); \
displ[0] = (ptrdiff_t)(&(s[0].v1)); \
base = (ptrdiff_t)(&(s[0])); \
displ[0] = (ptrdiff_t)(&(s[0].v1)); \
displ[0] -= base; \
displ[1] = (ptrdiff_t)(&(s[0].v2)); \
displ[1] = (ptrdiff_t)(&(s[0].v2)); \
displ[1] -= base; \
\
ompi_datatype_create_struct( 2, bLength, displ, types, &ptype ); \
displ[0] = (ptrdiff_t)(&(s[1])); \
displ[0] = (ptrdiff_t)(&(s[1])); \
displ[0] -= base; \
if( displ[0] != (displ[1] + (ptrdiff_t)sizeof(type2)) ) \
if( displ[0] != (displ[1] + (ptrdiff_t)sizeof(type2)) ) \
ptype->super.ub = displ[0]; /* force a new extent for the datatype */ \
ptype->super.flags |= (FLAGS); \
ptype->id = MPIDDT; \
Expand Down Expand Up @@ -736,7 +736,7 @@ void ompi_datatype_dump( const ompi_datatype_t* pData )
(long)pData->super.size, (int)pData->super.align, pData->super.id, (int)pData->super.desc.length, (int)pData->super.desc.used,
(long)pData->super.true_lb, (long)pData->super.true_ub, (long)(pData->super.true_ub - pData->super.true_lb),
(long)pData->super.lb, (long)pData->super.ub, (long)(pData->super.ub - pData->super.lb),
(int)pData->super.nbElems, (int)pData->super.btypes[OPAL_DATATYPE_LOOP], (int)pData->super.flags );
(int)pData->super.nbElems, (int)pData->super.loops, (int)pData->super.flags );
/* dump the flags */
if( ompi_datatype_is_predefined(pData) ) {
index += snprintf( buffer + index, length - index, "predefined " );
Expand Down
3 changes: 2 additions & 1 deletion ompi/include/ompi/memchecker.h
Original file line number Diff line number Diff line change
Expand Up @@ -366,7 +366,8 @@ static inline int memchecker_datatype(MPI_Datatype type)
opal_memchecker_base_isdefined (&type->super.opt_desc.length, sizeof(opal_datatype_count_t));
opal_memchecker_base_isdefined (&type->super.opt_desc.used, sizeof(opal_datatype_count_t));
opal_memchecker_base_isdefined (&type->super.opt_desc.desc, sizeof(dt_elem_desc_t *));
opal_memchecker_base_isdefined (&type->super.btypes, OPAL_DATATYPE_MAX_PREDEFINED * sizeof(uint32_t));
if( NULL != type->super.ptypes )
opal_memchecker_base_isdefined (&type->super.ptypes, OPAL_DATATYPE_MAX_PREDEFINED * sizeof(size_t));

opal_memchecker_base_isdefined (&type->id, sizeof(int32_t));
opal_memchecker_base_isdefined (&type->d_f_to_c_index, sizeof(int32_t));
Expand Down
82 changes: 53 additions & 29 deletions opal/datatype/opal_convertor.c
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,6 @@
CONVERTOR->cbmemcpy( (DST), (SRC), (BLENGTH), (CONVERTOR) )
#endif

extern int opal_convertor_create_stack_with_pos_general( opal_convertor_t* convertor,
int starting_point, const int* sizes );

static void opal_convertor_construct( opal_convertor_t* convertor )
{
convertor->pStack = convertor->static_stack;
Expand Down Expand Up @@ -447,32 +444,62 @@ int32_t opal_convertor_set_position_nocheck( opal_convertor_t* convertor,
return rc;
}

/**
 * Compute the size of a single instance of a datatype from a table of
 * per-type sizes.
 *
 * For every predefined type whose bit is set in pData->bdt_used, the number
 * of occurrences recorded in pData->ptypes is multiplied by the matching
 * entry of sizes, and the products are added together.
 *
 * @param pData  datatype to evaluate; its ptypes array is created here when
 *               it is still NULL (this is why the const qualifier is cast
 *               away for the call below).
 * @param sizes  table of sizes indexed by predefined type id.
 *
 * @return the accumulated size for one instance of the datatype.
 *
 * NOTE(review): the value returned by opal_datatype_compute_ptypes() is not
 * inspected here - confirm it cannot leave ptypes set to NULL.
 */
static size_t
opal_datatype_compute_remote_size( const opal_datatype_t* pData,
                                   const size_t* sizes )
{
    uint32_t pending = pData->bdt_used;
    size_t total = 0;
    int type = OPAL_DATATYPE_FIRST_TYPE;

    /* The per-type counters are built lazily: fill them in on first use */
    if( OPAL_UNLIKELY(NULL == pData->ptypes) ) {
        opal_datatype_compute_ptypes( (opal_datatype_t*)pData );
    }

    /* Walk the type ids, stopping early once no requested bit is left */
    while( (0 != pending) && (type < OPAL_DATATYPE_MAX_PREDEFINED) ) {
        const uint32_t bit = (uint32_t)1 << type;

        if( 0 != (pending & bit) ) {
            total += (pData->ptypes[type] * sizes[type]);
            pending &= ~bit;  /* this type has been accounted for */
        }
        type++;
    }
    return total;
}

/**
 * Update and return the remote size of a convertor.
 *
 * - When the convertor carries CONVERTOR_HOMOGENEOUS, the remote size is
 *   simply the local size.
 * - Otherwise, if the remote size has not been cached yet (no
 *   CONVERTOR_HAS_REMOTE_SIZE flag), it is computed for a single datatype
 *   using the sizes stored in the master convertor (assumed to be the sizes
 *   of the remote architecture) and then multiplied by the convertor count.
 *
 * In every case CONVERTOR_HAS_REMOTE_SIZE is set before returning.
 *
 * @param pConvertor  convertor to update.
 * @return the value stored in pConvertor->remote_size.
 */
size_t opal_convertor_compute_remote_size( opal_convertor_t* pConvertor )
{
    if( 0 != (pConvertor->flags & CONVERTOR_HOMOGENEOUS) ) {
        pConvertor->remote_size = pConvertor->local_size;
    } else if( 0 == (pConvertor->flags & CONVERTOR_HAS_REMOTE_SIZE) ) {
        /* Size of one datatype first, then scale it by the count */
        pConvertor->remote_size =
            opal_datatype_compute_remote_size( pConvertor->pDesc,
                                               pConvertor->master->remote_sizes );
        pConvertor->remote_size *= pConvertor->count;
    }

    /* From now on the cached value can be reused */
    pConvertor->flags |= CONVERTOR_HAS_REMOTE_SIZE;
    return pConvertor->remote_size;
}


/**
 * Compute the remote size of a convertor (three-argument form).
 *
 * Nothing is done when bdt_mask is zero. Otherwise the homogeneous flag is
 * removed from the convertor, the remote size is accumulated as
 * btypes[i] * master->remote_sizes[i] for every type whose bit is set in
 * datatype->bdt_used, the sum is multiplied by the convertor count, and the
 * convertor description is redirected toward the non-optimized datatype
 * representation (datatype->desc).
 *
 * NOTE(review): convertor and datatype are expanded without parentheses and
 * more than once, so callers should pass plain pointer identifiers.
 */
#define OPAL_CONVERTOR_COMPUTE_REMOTE_SIZE(convertor, datatype, bdt_mask) \
{ \
if( OPAL_UNLIKELY(0 != (bdt_mask)) ) { \
opal_convertor_master_t* master; \
int i; \
uint32_t mask = datatype->bdt_used; \
convertor->flags &= (~CONVERTOR_HOMOGENEOUS); \
master = convertor->master; \
convertor->remote_size = 0; \
for( i = OPAL_DATATYPE_FIRST_TYPE; mask && (i < OPAL_DATATYPE_MAX_PREDEFINED); i++ ) { \
if( mask & ((uint32_t)1 << i) ) { \
convertor->remote_size += (datatype->btypes[i] * \
master->remote_sizes[i]); \
mask ^= ((uint32_t)1 << i); \
} \
} \
convertor->remote_size *= convertor->count; \
convertor->use_desc = &(datatype->desc); \
} \
}
/**
 * Compute the remote size of a convertor.
 *
 * If the datatype uses at least one basic type flagged in the master
 * convertor's hetero_mask, the homogeneous flag is removed from the convertor
 * and its description is redirected toward the non-optimized datatype
 * representation (datatype->desc). The remote size is then updated through
 * opal_convertor_compute_remote_size().
 *
 * Every use of a macro parameter is parenthesized so that arguments that are
 * expressions (for example &conv) expand with the intended precedence.
 * The arguments are still evaluated more than once: do not pass expressions
 * with side effects.
 *
 * @param convertor  pointer to the convertor to update.
 * @param datatype   pointer to the datatype attached to the convertor.
 */
#define OPAL_CONVERTOR_COMPUTE_REMOTE_SIZE(convertor, datatype)               \
    do {                                                                      \
        if( (datatype)->bdt_used & (convertor)->master->hetero_mask ) {       \
            (convertor)->flags &= (~CONVERTOR_HOMOGENEOUS);                   \
            (convertor)->use_desc = &((datatype)->desc);                      \
        }                                                                     \
        opal_convertor_compute_remote_size( (convertor) );                    \
    } while(0)

/**
* This macro will initialize a convertor based on a previously created
Expand All @@ -483,8 +510,6 @@ int32_t opal_convertor_set_position_nocheck( opal_convertor_t* convertor,
*/
#define OPAL_CONVERTOR_PREPARE( convertor, datatype, count, pUserBuf ) \
{ \
uint32_t bdt_mask; \
\
/* If the data is empty we just mark the convertor as \
* completed. With this flag set the pack and unpack functions \
* will not do anything. \
Expand Down Expand Up @@ -516,9 +541,8 @@ int32_t opal_convertor_set_position_nocheck( opal_convertor_t* convertor,
} \
} \
\
bdt_mask = datatype->bdt_used & convertor->master->hetero_mask; \
OPAL_CONVERTOR_COMPUTE_REMOTE_SIZE( convertor, datatype, \
bdt_mask ); \
assert( (convertor)->pDesc == (datatype) ); \
OPAL_CONVERTOR_COMPUTE_REMOTE_SIZE( convertor, datatype ); \
assert( NULL != convertor->use_desc->desc ); \
/* For predefined datatypes (contiguous) do nothing more */ \
/* if checksum is enabled then always continue */ \
Expand All @@ -530,7 +554,7 @@ int32_t opal_convertor_set_position_nocheck( opal_convertor_t* convertor,
} \
convertor->flags &= ~CONVERTOR_NO_OP; \
{ \
uint32_t required_stack_length = datatype->btypes[OPAL_DATATYPE_LOOP] + 1; \
uint32_t required_stack_length = datatype->loops + 1; \
\
if( required_stack_length > convertor->stack_size ) { \
assert(convertor->pStack == convertor->static_stack); \
Expand Down
29 changes: 23 additions & 6 deletions opal/datatype/opal_convertor.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ BEGIN_C_DECLS
#define CONVERTOR_STATE_ALLOC 0x04000000
#define CONVERTOR_COMPLETED 0x08000000
#define CONVERTOR_CUDA_UNIFIED 0x10000000
#define CONVERTOR_HAS_REMOTE_SIZE 0x20000000

union dt_elem_desc;
typedef struct opal_convertor_t opal_convertor_t;
Expand All @@ -72,7 +73,7 @@ struct dt_stack_t {
int32_t index; /**< index in the element description */
int16_t type; /**< the type used for the last pack/unpack (original or OPAL_DATATYPE_UINT1) */
size_t count; /**< number of times we still have to do it */
ptrdiff_t disp; /**< actual displacement depending on the count field */
ptrdiff_t disp; /**< actual displacement depending on the count field */
};
typedef struct dt_stack_t dt_stack_t;

Expand Down Expand Up @@ -186,9 +187,16 @@ static inline int32_t opal_convertor_need_buffers( const opal_convertor_t* pConv
return 1;
}

/**
* Update the size of the remote datatype representation. The size will
* depend on the configuration of the master convertor. In homogeneous
* environments, the local and remote sizes are identical.
*/
size_t
opal_convertor_compute_remote_size( opal_convertor_t* pConv );

/*
*
/**
* Return the local size of the convertor (count times the size of the datatype).
*/
static inline void opal_convertor_get_packed_size( const opal_convertor_t* pConv,
size_t* pSize )
Expand All @@ -197,16 +205,24 @@ static inline void opal_convertor_get_packed_size( const opal_convertor_t* pConv
}


/*
*
/**
 * Report the remote size of the convertor (count times the remote size of
 * the datatype) through pSize.
 *
 * For a convertor flagged CONVERTOR_HOMOGENEOUS the local size is reported.
 * Otherwise the remote size is computed first if it has not been cached yet
 * (CONVERTOR_HAS_REMOTE_SIZE not set), which updates the convertor even
 * though it is received through a pointer to const.
 *
 * @param pConv  convertor to query.
 * @param pSize  [OUT] receives the size.
 */
static inline void opal_convertor_get_unpacked_size( const opal_convertor_t* pConv,
                                                     size_t* pSize )
{
    if( 0 == (pConv->flags & CONVERTOR_HOMOGENEOUS) ) {
        /* Heterogeneous case: make sure the cached remote size is valid */
        if( 0 == (pConv->flags & CONVERTOR_HAS_REMOTE_SIZE) ) {
            opal_convertor_compute_remote_size( (opal_convertor_t*)pConv );
        }
        *pSize = pConv->remote_size;
    } else {
        *pSize = pConv->local_size;
    }
}


/**
* Return the current absolute position of the next pack/unpack. This function is
* mostly useful for contiguous datatypes, when we need to get the pointer to the
Expand Down Expand Up @@ -279,6 +295,7 @@ opal_convertor_raw( opal_convertor_t* convertor, /* [IN/OUT] */
uint32_t* iov_count, /* [IN/OUT] */
size_t* length ); /* [OUT] */


/*
* Upper level does not need to call the _nocheck function directly.
*/
Expand Down
27 changes: 16 additions & 11 deletions opal/datatype/opal_datatype.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,10 @@ BEGIN_C_DECLS
#endif
/*
* No more than this number of _Basic_ datatypes in C/CPP or Fortran
* are supported (in order to not change setup and usage of btypes).
* are supported (in order to not change setup and usage of the predefined
* datatypes).
*
* XXX TODO Adapt to whatever the OMPI-layer needs
* BEWARE: This constant should reflect whatever the OMPI-layer needs.
*/
#define OPAL_DATATYPE_MAX_SUPPORTED 47

Expand Down Expand Up @@ -108,13 +109,14 @@ struct opal_datatype_t {
uint32_t bdt_used; /**< bitset of which basic datatypes are used in the data description */
size_t size; /**< total size in bytes of the memory used by the data if
the data is put on a contiguous buffer */
ptrdiff_t true_lb; /**< the true lb of the data without user defined lb and ub */
ptrdiff_t true_ub; /**< the true ub of the data without user defined lb and ub */
ptrdiff_t lb; /**< lower bound in memory */
ptrdiff_t ub; /**< upper bound in memory */
ptrdiff_t true_lb; /**< the true lb of the data without user defined lb and ub */
ptrdiff_t true_ub; /**< the true ub of the data without user defined lb and ub */
ptrdiff_t lb; /**< lower bound in memory */
ptrdiff_t ub; /**< upper bound in memory */
/* --- cacheline 1 boundary (64 bytes) --- */
size_t nbElems; /**< total number of elements inside the datatype */
uint32_t align; /**< data should be aligned to */
uint32_t loops; /**< number of loops on the internal type stack */

/* Attribute fields */
char name[OPAL_MAX_OBJECT_NAME]; /**< name of the datatype */
Expand All @@ -123,11 +125,12 @@ struct opal_datatype_t {
dt_type_desc_t opt_desc; /**< short description of the data used when conversion is useless
or in the send case (without conversion) */

uint32_t btypes[OPAL_DATATYPE_MAX_SUPPORTED];
/**< basic elements count used to compute the size of the
datatype for remote nodes. The length of the array is dependent on
the maximum number of datatypes of all top layers.
Reason being is that Fortran is not at the OPAL layer. */
size_t *ptypes; /**< array of basic predefined types that facilitate the computing
of the remote size in heterogeneous environments. The length of the
array is dependent on the maximum number of predefined datatypes of
all language interfaces (because Fortran is not known at the OPAL
layer). This field should never be initialized in homogeneous
environments */
/* --- cacheline 5 boundary (320 bytes) was 32-36 bytes ago --- */

/* size: 352, cachelines: 6, members: 15 */
Expand Down Expand Up @@ -281,6 +284,8 @@ OPAL_DECLSPEC int32_t
opal_datatype_copy_content_same_ddt( const opal_datatype_t* pData, int32_t count,
char* pDestBuf, char* pSrcBuf );

OPAL_DECLSPEC int opal_datatype_compute_ptypes( opal_datatype_t* datatype );

OPAL_DECLSPEC const opal_datatype_t*
opal_datatype_match_size( int size, uint16_t datakind, uint16_t datalang );

Expand Down
Loading

0 comments on commit 6f17464

Please sign in to comment.