Skip to content

Commit

Permalink
osc/rdma: performance improvements and bug fixes
Browse files Browse the repository at this point in the history
This commit is a large update to the osc/rdma component. Included in
this commit:

 - Add support for using hardware atomics for fetch-and-op and single
   count accumulate when using the accumulate lock. This will improve
   the performance of these operations even when not setting the
   single intrinsic info key.

 - Rework how large accumulates are done. They now block on the get
   operation to fix some bugs discovered by an IBM one-sided test. I
   may roll back some of the changes if the underlying bug in the
   original design is discovered. There appears to be no real
   difference (on the hardware this was tested with) in performance so
   it's probably a non-issue. References #2530.

 - Add support for an additional lock-all algorithm: on-demand. The
   on-demand algorithm will attempt to acquire the peer lock when
   starting an RMA operation. The lock algorithm default has not
   changed. The algorithm can be selected by setting the
   osc_rdma_locking_mode MCA variable. The valid values are two_level
   and on_demand.

 - Make use of the btl_flush function if available. This can improve
   performance with some btls.

 - When using btl_flush do not keep track of the number of put
   operations. This reduces the number of atomic operations in the
   critical path.

 - Make the window buffers more friendly to multi-threaded
   applications. This was done by dropping support for multiple
   buffers per MPI window. I intend to re-add that support once the
   underlying performance bug under the old buffering scheme is
   fixed.

 - Fix a bug in request completion in the accumulate, get, and put
   paths. This also helps with #2530.

 - General code cleanup and fixes.

Signed-off-by: Nathan Hjelm <hjelmn@lanl.gov>
  • Loading branch information
hjelmn committed Mar 15, 2018
1 parent 5f58e7b commit db32f17
Show file tree
Hide file tree
Showing 16 changed files with 964 additions and 880 deletions.
113 changes: 106 additions & 7 deletions ompi/mca/osc/rdma/osc_rdma.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
* University of Stuttgart. All rights reserved.
* Copyright (c) 2004-2005 The Regents of the University of California.
* All rights reserved.
* Copyright (c) 2007-2017 Los Alamos National Security, LLC. All rights
* Copyright (c) 2007-2018 Los Alamos National Security, LLC. All rights
* reserved.
* Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
* Copyright (c) 2012-2013 Sandia National Laboratories. All rights reserved.
Expand Down Expand Up @@ -50,6 +50,11 @@

#include "opal_stdint.h"

/**
 * @brief locking algorithms available to the osc/rdma component
 *
 * These are the values stored in the locking_mode fields of the component
 * and module structures.
 */
enum {
    /** two-level locking algorithm */
    OMPI_OSC_RDMA_LOCKING_TWO_LEVEL = 0,
    /** on-demand locking: a peer is locked when it is looked up for an operation */
    OMPI_OSC_RDMA_LOCKING_ON_DEMAND = 1,
};

/**
* @brief osc rdma component structure
*/
Expand Down Expand Up @@ -87,6 +92,9 @@ struct ompi_osc_rdma_component_t {
/** Default value of the no_locks info key for new windows */
bool no_locks;

/** Locking mode to use as the default for all windows */
int locking_mode;

/** Accumulate operations will only operate on a single intrinsic datatype */
bool acc_single_intrinsic;

Expand Down Expand Up @@ -119,6 +127,8 @@ struct ompi_osc_rdma_module_t {
/** Mutex lock protecting module data */
opal_mutex_t lock;

/** locking mode to use */
int locking_mode;

/* window configuration */

Expand Down Expand Up @@ -147,10 +157,12 @@ struct ompi_osc_rdma_module_t {
/** Local displacement unit. */
int disp_unit;


/** global leader */
ompi_osc_rdma_peer_t *leader;

/** my peer structure */
ompi_osc_rdma_peer_t *my_peer;

/** pointer to free on cleanup (may be NULL) */
void *free_after;

Expand Down Expand Up @@ -276,6 +288,16 @@ int ompi_osc_rdma_free (struct ompi_win_t *win);
*/
int ompi_osc_module_add_peer (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer);

/**
 * @brief demand lock a peer
 *
 * Invoked from ompi_osc_rdma_module_sync_lookup() for a lock-type access
 * epoch when the module's locking mode is OMPI_OSC_RDMA_LOCKING_ON_DEMAND
 * and the target peer is not yet demand locked.
 *
 * @param[in] module  osc rdma module
 * @param[in] peer    peer to lock
 *
 * @returns OMPI_SUCCESS on success
 *
 * NOTE(review): the error codes returned on failure are not visible from
 * this header, and the lookup path discards the return value -- confirm
 * that this is intended.
 */
int ompi_osc_rdma_demand_lock_peer (ompi_osc_rdma_module_t *module, ompi_osc_rdma_peer_t *peer);

/**
* @brief check if a peer object is cached for a remote rank
*
Expand Down Expand Up @@ -449,10 +471,18 @@ static inline ompi_osc_rdma_sync_t *ompi_osc_rdma_module_sync_lookup (ompi_osc_r
}

return NULL;
case OMPI_OSC_RDMA_SYNC_TYPE_FENCE:
case OMPI_OSC_RDMA_SYNC_TYPE_LOCK:
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "found fence/lock_all access epoch for target %d", target);
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "found lock_all access epoch for target %d", target);

*peer = ompi_osc_rdma_module_peer (module, target);
if (OPAL_UNLIKELY(OMPI_OSC_RDMA_LOCKING_ON_DEMAND == module->locking_mode &&
!ompi_osc_rdma_peer_is_demand_locked (*peer))) {
ompi_osc_rdma_demand_lock_peer (module, *peer);
}

return &module->all_sync;
case OMPI_OSC_RDMA_SYNC_TYPE_FENCE:
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "found fence access epoch for target %d", target);
/* fence epoch is now active */
module->all_sync.epoch_active = true;
*peer = ompi_osc_rdma_module_peer (module, target);
Expand All @@ -470,25 +500,94 @@ static inline ompi_osc_rdma_sync_t *ompi_osc_rdma_module_sync_lookup (ompi_osc_r
return NULL;
}

/**
 * @brief check whether the selected btl's flush function can be used
 *
 * @param[in] module  osc rdma module
 *
 * @returns true if BTL_VERSION is defined and at least 310 and the btl
 *          selected for this module has a non-NULL btl_flush function
 * @returns false otherwise
 *
 * Declared static inline like the other helpers in this header so that
 * translation units which include the header without calling this function
 * do not get an unused-function warning.
 */
static inline bool ompi_osc_rdma_use_btl_flush (ompi_osc_rdma_module_t *module)
{
#if defined(BTL_VERSION) && (BTL_VERSION >= 310)
    return !!(module->selected_btl->btl_flush);
#else
    /* btl_flush is not part of this btl interface version */
    (void) module;
    return false;
#endif
}

/**
 * @brief increment the outstanding rdma operation counter (atomic)
 *
 * Unlike ompi_osc_rdma_sync_rdma_inc() this always updates the counter,
 * whether or not the btl flush function is in use.
 *
 * @param[in] rdma_sync  osc rdma synchronization object
 */
static inline void ompi_osc_rdma_sync_rdma_inc_always (ompi_osc_rdma_sync_t *rdma_sync)
{
    ompi_osc_rdma_counter_add (&rdma_sync->outstanding_rdma.counter, 1);

    /* the argument type must agree with the %ld conversion */
    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "inc: there are %ld outstanding rdma operations",
                     (long) rdma_sync->outstanding_rdma.counter);
}

/**
 * @brief increment the outstanding rdma operation counter unless btl flush is in use
 *
 * When ompi_osc_rdma_use_btl_flush() reports that the btl flush function is
 * used for this module the counter is left untouched.
 *
 * @param[in] rdma_sync  osc rdma synchronization object
 */
static inline void ompi_osc_rdma_sync_rdma_inc (ompi_osc_rdma_sync_t *rdma_sync)
{
#if defined(BTL_VERSION) && (BTL_VERSION >= 310)
    const bool skip_counter = ompi_osc_rdma_use_btl_flush (rdma_sync->module);
#else
    const bool skip_counter = false;
#endif

    if (!skip_counter) {
        ompi_osc_rdma_sync_rdma_inc_always (rdma_sync);
    }
}

/**
 * @brief decrement the outstanding rdma operation counter (atomic)
 *
 * Unlike ompi_osc_rdma_sync_rdma_dec() this always updates the counter,
 * whether or not the btl flush function is in use.
 *
 * @param[in] rdma_sync  osc rdma synchronization object
 */
static inline void ompi_osc_rdma_sync_rdma_dec_always (ompi_osc_rdma_sync_t *rdma_sync)
{
    /* write memory barrier issued before the counter is decremented.
     * NOTE(review): presumably this orders earlier stores ahead of the
     * decrement as seen by a thread waiting on the counter -- confirm. */
    opal_atomic_wmb ();
    ompi_osc_rdma_counter_add (&rdma_sync->outstanding_rdma.counter, -1);

    /* the argument type must agree with the %ld conversion */
    OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "dec: there are %ld outstanding rdma operations",
                     (long) rdma_sync->outstanding_rdma.counter);
}

/**
 * @brief decrement the outstanding rdma operation counter unless btl flush is in use
 *
 * When ompi_osc_rdma_use_btl_flush() reports that the btl flush function is
 * used for this module the counter is left untouched.
 *
 * @param[in] rdma_sync  osc rdma synchronization object
 */
static inline void ompi_osc_rdma_sync_rdma_dec (ompi_osc_rdma_sync_t *rdma_sync)
{
#if defined(BTL_VERSION) && (BTL_VERSION >= 310)
    const bool skip_counter = ompi_osc_rdma_use_btl_flush (rdma_sync->module);
#else
    const bool skip_counter = false;
#endif

    if (!skip_counter) {
        ompi_osc_rdma_sync_rdma_dec_always (rdma_sync);
    }
}

/**
 * @brief complete all outstanding rdma operations to all peers
 *
 * If the synchronization object has pending aggregations, each one is
 * flushed while holding sync->lock. The function then loops until
 * ompi_osc_rdma_sync_get_count() reports zero. When BTL_VERSION is at least
 * 310 the loop calls the btl's flush function instead of opal_progress()
 * whenever ompi_osc_rdma_use_btl_flush() is true, and it also keeps looping
 * while the module's rdma_frag has a pending count greater than one.
 *
 * @param[in] sync  osc rdma synchronization object
 */
static inline void ompi_osc_rdma_sync_rdma_complete (ompi_osc_rdma_sync_t *sync)
{
    if (opal_list_get_size (&sync->aggregations)) {
        ompi_osc_rdma_aggregation_t *aggregation, *next;

        OPAL_THREAD_SCOPED_LOCK(&sync->lock,
                                OPAL_LIST_FOREACH_SAFE(aggregation, next, &sync->aggregations, ompi_osc_rdma_aggregation_t) {
                                    fprintf (stderr, "Flushing aggregation %p, peeer %p\n", (void *) aggregation, (void *) aggregation->peer);
                                    ompi_osc_rdma_peer_aggregate_flush (aggregation->peer);
                                });
    }

#if !defined(BTL_VERSION) || (BTL_VERSION < 310)
    do {
        opal_progress ();
    } while (ompi_osc_rdma_sync_get_count (sync));
#else
    mca_btl_base_module_t *btl_module = sync->module->selected_btl;

    do {
        if (!ompi_osc_rdma_use_btl_flush (sync->module)) {
            opal_progress ();
        } else {
            btl_module->btl_flush (btl_module, NULL);
        }
    } while (ompi_osc_rdma_sync_get_count (sync) || (sync->module->rdma_frag && (sync->module->rdma_frag->pending > 1)));
#endif
}

/**
Expand Down
Loading

0 comments on commit db32f17

Please sign in to comment.