Skip to content

Commit

Permalink
DAOS-14443 pool: Fix pool create performance (#13003) (#13508)
Browse files Browse the repository at this point in the history
The "nominated" optimization for pool create operations in ds_rsvc has
become less effective because of certain recent timing changes. This
patch replaces the "nominated" optimization with a new one that piggybacks
on the first POOL_CREATE sent to a pool service.

  - Employ the unused pool_create_in.pri_op.pi_hdl to ask a PS replica
    to campaign, so that no RPC protocol bump is necessary.

In theory, the absolute safety of both optimizations relies on a raft
update (to be made soon).

This patch also changes rsvc_client to begin each leader search from a
random replica, instead of a fixed one. When a lot of pool services are
created on the same set of ranks, this change distributes the pool
service leaders more evenly.

Signed-off-by: Li Wei <wei.g.li@intel.com>
  • Loading branch information
liw committed Jan 3, 2024
1 parent b65ab3c commit c0c5de6
Show file tree
Hide file tree
Showing 6 changed files with 55 additions and 54 deletions.
38 changes: 21 additions & 17 deletions src/common/rsvc.c
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/**
* (C) Copyright 2017-2022 Intel Corporation.
* (C) Copyright 2017-2023 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -53,7 +53,7 @@ rsvc_client_init(struct rsvc_client *client, const d_rank_list_t *ranks)
return -DER_NOMEM;
}
rsvc_client_reset_leader(client);
client->sc_next = 0;
client->sc_next = -1;
return 0;
}

Expand All @@ -78,26 +78,28 @@ rsvc_client_fini(struct rsvc_client *client)
int
rsvc_client_choose(struct rsvc_client *client, crt_endpoint_t *ep)
{
int chosen = -1;
int chosen;

D_DEBUG(DB_MD, DF_CLI"\n", DP_CLI(client));

if (client->sc_ranks->rl_nr == 0) {
D_DEBUG(DB_MD, "replica list empty\n");
return -DER_NOTREPLICA;
}

if (client->sc_leader_known && client->sc_leader_aliveness > 0) {
chosen = client->sc_leader_index;
} else if (client->sc_ranks->rl_nr > 0) {
} else {
if (client->sc_next < 0)
client->sc_next = d_randn(client->sc_ranks->rl_nr);
chosen = client->sc_next;
/* The hintless search is a round robin of all replicas. */
client->sc_next++;
client->sc_next %= client->sc_ranks->rl_nr;
}

if (chosen == -1) {
D_DEBUG(DB_MD, "replica list empty\n");
return -DER_NOTREPLICA;
} else {
D_ASSERTF(chosen >= 0 && chosen < client->sc_ranks->rl_nr,
"%d\n", chosen);
ep->ep_rank = client->sc_ranks->rl_ranks[chosen];
}
D_ASSERTF(chosen >= 0 && chosen < client->sc_ranks->rl_nr, "chosen=%d\n", chosen);
ep->ep_rank = client->sc_ranks->rl_ranks[chosen];
ep->ep_tag = 0;
return 0;
}
Expand Down Expand Up @@ -127,7 +129,7 @@ rsvc_client_process_error(struct rsvc_client *client, int rc,
(rl->rl_nr - pos) * sizeof(*rl->rl_ranks));
client->sc_next = pos;
} else {
client->sc_next = 0;
client->sc_next = (rl->rl_nr > 0 ? 0 : -1);
}
D_ERROR("removed rank %u from replica list due to "DF_RC"\n",
ep->ep_rank, DP_RC(rc));
Expand All @@ -145,10 +147,12 @@ rsvc_client_process_error(struct rsvc_client *client, int rc,
* Gave up this leader. Start the hintless
* search.
*/
D_DEBUG(DB_MD, "give up leader rank %u\n",
ep->ep_rank);
client->sc_next = client->sc_leader_index + 1;
client->sc_next %= client->sc_ranks->rl_nr;
D_DEBUG(DB_MD, "give up leader rank %u\n", ep->ep_rank);
client->sc_next = d_randn(client->sc_ranks->rl_nr);
if (client->sc_next == leader_index) {
client->sc_next++;
client->sc_next %= client->sc_ranks->rl_nr;
}
}
}
}
Expand Down
13 changes: 13 additions & 0 deletions src/gurt/misc.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,19 @@ d_rand()
return result;
}

/**
 * Pick a pseudo-random integer from the half-open range [0, n).
 *
 * The value is obtained by scaling d_rand() / D_RAND_MAX by \a n and
 * truncating the product to an integer.
 *
 * \param[in]	n	Exclusive upper bound; asserted to be positive.
 *
 * \return		An integer r such that 0 <= r < n.
 */
long int
d_randn(long int n)
{
	long int	pick;

	D_ASSERT(n > 0);

	pick = ((double)d_rand() / D_RAND_MAX) * n;

	/*
	 * The scaled value reaches n when the ratio evaluates to 1.0; fold
	 * that case back to 0 so the result stays inside [0, n).
	 */
	return (pick < n) ? pick : 0;
}

void
d_free(void *ptr)
{
Expand Down
1 change: 1 addition & 0 deletions src/include/gurt/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ extern "C" {

void d_srand(long int);
long int d_rand(void);
long int d_randn(long int n);

/* memory allocating macros */
void d_free(void *);
Expand Down
3 changes: 2 additions & 1 deletion src/pool/rpc.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* (C) Copyright 2016-2022 Intel Corporation.
* (C) Copyright 2016-2023 Intel Corporation.
*
* SPDX-License-Identifier: BSD-2-Clause-Patent
*/
Expand Down Expand Up @@ -169,6 +169,7 @@ extern int dc_pool_proto_version;

CRT_RPC_DECLARE(pool_op, DAOS_ISEQ_POOL_OP, DAOS_OSEQ_POOL_OP)

/* If pri_op.pi_hdl is not null, call rdb_campaign. */
#define DAOS_ISEQ_POOL_CREATE /* input fields */ \
((struct pool_op_in) (pri_op) CRT_VAR) \
((d_rank_list_t) (pri_tgt_ranks) CRT_PTR) \
Expand Down
18 changes: 18 additions & 0 deletions src/pool/srv_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -860,6 +860,7 @@ ds_pool_svc_dist_create(const uuid_t pool_uuid, int ntargets, const char *group,
struct pool_create_in *in;
struct pool_create_out *out;
struct d_backoff_seq backoff_seq;
int n_attempts = 0;
int rc;

/* Check for default label supplied via property. */
Expand Down Expand Up @@ -927,9 +928,16 @@ ds_pool_svc_dist_create(const uuid_t pool_uuid, int ntargets, const char *group,
in->pri_ndomains = ndomains;
in->pri_domains.ca_count = ndomains;
in->pri_domains.ca_arrays = (uint32_t *)domains;
if (n_attempts == 0)
/*
* This is our first attempt. Use a non-null pi_hdl to ask the
* chosen PS replica to campaign.
*/
uuid_generate(in->pri_op.pi_hdl);

/* Send the POOL_CREATE request. */
rc = dss_rpc_send(rpc);
n_attempts++;
out = crt_reply_get(rpc);
D_ASSERT(out != NULL);
rc = rsvc_client_complete_rpc(&client, &ep, rc,
Expand Down Expand Up @@ -2605,6 +2613,16 @@ ds_pool_create_handler(crt_rpc_t *rpc)
D_GOTO(out_mutex, rc = -DER_CANCELED);
}

if (!uuid_is_null(in->pri_op.pi_hdl)) {
/*
* Try starting a campaign without waiting for the election
* timeout. Since this is a performance optimization, ignore
* errors.
*/
rc = rdb_campaign(svc->ps_rsvc.s_db);
D_DEBUG(DB_MD, DF_UUID": campaign: "DF_RC"\n", DP_UUID(svc->ps_uuid), DP_RC(rc));
}

rc = rdb_tx_begin(svc->ps_rsvc.s_db, RDB_NIL_TERM, &tx);
if (rc != 0)
D_GOTO(out_mutex, rc);
Expand Down
36 changes: 0 additions & 36 deletions src/rsvc/srv.c
Original file line number Diff line number Diff line change
Expand Up @@ -659,28 +659,6 @@ ds_rsvc_request_map_dist(struct ds_rsvc *svc)
D_DEBUG(DB_MD, "%s: requested map distribution\n", svc->s_name);
}

/**
 * Decide whether the local rank is the "nominated" replica of a DB.
 *
 * The nomination is derived purely from \a db_uuid and the replica list, so
 * every replica evaluating it over the same inputs reaches the same answer.
 *
 * \param[in]	replicas	Initial membership; may be NULL.
 * \param[in]	db_uuid		UUID of the DB, hashed to select a replica.
 *
 * \return	true if the selected replica's rank equals dss_self_rank(),
 *		or if the list holds exactly one replica; false if there is no
 *		initial membership or another replica is selected.
 */
static bool
nominated(d_rank_list_t *replicas, uuid_t db_uuid)
{
	int	slot;

	/* Without an initial membership there is nobody to nominate. */
	if (replicas == NULL || replicas->rl_nr < 1)
		return false;

	/* A sole replica is nominated without hashing. */
	if (replicas->rl_nr == 1)
		return true;

	/*
	 * Hash the DB UUID into an index of the replica list. Any mapping
	 * would do, provided all replicas compute the same one.
	 */
	slot = d_hash_murmur64(db_uuid, sizeof(uuid_t), 0x2db) % replicas->rl_nr;

	return (dss_self_rank() == replicas->rl_ranks[slot]);
}

static bool
self_only(d_rank_list_t *replicas)
{
Expand Down Expand Up @@ -713,20 +691,6 @@ start(enum ds_rsvc_class_id class, d_iov_t *id, uuid_t db_uuid, uint64_t term, b
if (rc != 0)
goto err_storage;

/*
* If creating a replica with an initial membership, we are
* bootstrapping the DB (via sc_bootstrap or an external mechanism). If
* we are the "nominated" replica, start a campaign without waiting for
* the election timeout.
*/
if (create && nominated(replicas, svc->s_db_uuid)) {
/* Give others a chance to get ready for voting. */
dss_sleep(1 /* ms */);
rc = rdb_campaign(svc->s_db);
if (rc != 0)
goto err_db;
}

if (create && self_only(replicas) &&
rsvc_class(class)->sc_bootstrap != NULL) {
rc = bootstrap_self(svc, arg);
Expand Down

0 comments on commit c0c5de6

Please sign in to comment.