From eb1f9c8bc2f527fc66b28e3805177ac14a4ccac0 Mon Sep 17 00:00:00 2001 From: Johann Lombardi Date: Wed, 4 Sep 2024 22:21:53 +0200 Subject: [PATCH 1/6] DAOS-16508 cksum: retry a few times on checksum mismatch on update Unlike fetch, we return DER_CSUM on update (turned into EIO by dfs) without any retry. We should retry a few times in case it is a transient error. The patch also prints more information about the actual checksum mismatch. Signed-off-by: Johann Lombardi --- src/common/checksum.c | 4 ++++ src/object/cli_csum.h | 3 +++ src/object/cli_obj.c | 17 ++++++++++++++--- src/object/obj_internal.h | 3 ++- 4 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/common/checksum.c b/src/common/checksum.c index 69cf89ab07a..c36f14e3c6d 100644 --- a/src/common/checksum.c +++ b/src/common/checksum.c @@ -278,6 +278,10 @@ daos_csummer_compare_csum_info(struct daos_csummer *obj, match = daos_csummer_csum_compare(obj, ci_idx2csum(a, i), ci_idx2csum(b, i), a->cs_len); + if (unlikely(!match)) + D_ERROR("Checksum mismatch at index %d/%d "DF_CI_BUF" != "DF_CI_BUF"\n", i, + a->cs_nr, DP_CI_BUF(ci_idx2csum(a, i), a->cs_len), + DP_CI_BUF(ci_idx2csum(b, i), b->cs_len)); } return match; diff --git a/src/object/cli_csum.h b/src/object/cli_csum.h index 3b1431b8c1f..72951cef893 100644 --- a/src/object/cli_csum.h +++ b/src/object/cli_csum.h @@ -11,6 +11,9 @@ #include #include "obj_internal.h" +/** How many times to retry UPDATE RPCs on checksum error */ +#define MAX_CSUM_UPDATE_RETRY 10 + int dc_obj_csum_update(struct daos_csummer *csummer, struct cont_props props, daos_obj_id_t param, daos_key_t *dkey, daos_iod_t *iods, d_sg_list_t *sgls, const uint32_t iod_nr, struct dcs_layout *layout, struct dcs_csum_info **dkey_csum, diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index 2c0c0a952ea..1ce124f5ef7 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -4760,9 +4760,20 @@ obj_comp_cb(tse_task_t *task, void *data) obj_auxi->tx_uncertain = 1; else obj_auxi->nvme_io_err = 1; - } else if (task->dt_result != -DER_NVME_IO) { - /* Don't retry update for CSUM & UNCERTAIN errors */ - obj_auxi->io_retry = 0; + } else { + if (task->dt_result == -DER_CSUM) { + /** Retry a few times on checksum error on update */ + if (!obj_auxi->csum_retry && + obj_auxi->csum_retry_count++ < MAX_CSUM_UPDATE_RETRY) { + obj_auxi->csum_retry = 1; + obj_auxi->csum_retry_count++; + } else { + obj_auxi->io_retry = 0; + } + } else if (task->dt_result != -DER_NVME_IO) { + /* Don't retry update for UNCERTAIN errors */ + obj_auxi->io_retry = 0; + } } } else { obj_auxi->io_retry = 0; diff --git a/src/object/obj_internal.h b/src/object/obj_internal.h index ec6aa3b817e..8887c46e44e 100644 --- a/src/object/obj_internal.h +++ b/src/object/obj_internal.h @@ -474,9 +474,10 @@ struct obj_auxi_args { rebuilding:1, for_migrate:1; /* request flags. currently only: ORF_RESEND */ - uint32_t flags; uint32_t specified_shard; + uint16_t flags; uint16_t retry_cnt; + uint16_t csm_retry_cnt;; uint16_t inprogress_cnt; struct obj_req_tgts req_tgts; d_sg_list_t *sgls_dup; From 4ba7f27cb661393010a9d9c35422c1c1cacb4e74 Mon Sep 17 00:00:00 2001 From: Johann Lombardi Date: Wed, 4 Sep 2024 22:24:48 +0200 Subject: [PATCH 2/6] Fix typo Signed-off-by: Johann Lombardi --- src/object/obj_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/object/obj_internal.h b/src/object/obj_internal.h index 8887c46e44e..6f4ab397f68 100644 --- a/src/object/obj_internal.h +++ b/src/object/obj_internal.h @@ -477,7 +477,7 @@ struct obj_auxi_args { uint32_t specified_shard; uint16_t flags; uint16_t retry_cnt; - uint16_t csm_retry_cnt;; + uint16_t csum_retry_cnt; uint16_t inprogress_cnt; struct obj_req_tgts req_tgts; d_sg_list_t *sgls_dup; From 979ebc9aab439a918fc62cc0fb2495f1697371b4 Mon Sep 17 00:00:00 2001 From: Johann Lombardi Date: Wed, 4 Sep 2024 22:37:11 +0200 Subject: [PATCH 3/6] Fix another typo. Signed-off-by: Johann Lombardi --- src/object/cli_csum.h | 2 +- src/object/cli_obj.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/object/cli_csum.h b/src/object/cli_csum.h index 72951cef893..566c707054d 100644 --- a/src/object/cli_csum.h +++ b/src/object/cli_csum.h @@ -12,7 +12,7 @@ #include "obj_internal.h" /** How many times to retry UPDATE RPCs on checksum error */ -#define MAX_CSUM_UPDATE_RETRY 10 +#define MAX_CSUM_RETRY 10 int dc_obj_csum_update(struct daos_csummer *csummer, struct cont_props props, daos_obj_id_t param, daos_key_t *dkey, daos_iod_t *iods, d_sg_list_t *sgls, const uint32_t iod_nr, diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index 1ce124f5ef7..0aba5cef642 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -4764,9 +4764,9 @@ obj_comp_cb(tse_task_t *task, void *data) if (task->dt_result == -DER_CSUM) { /** Retry a few times on checksum error on update */ if (!obj_auxi->csum_retry && - obj_auxi->csum_retry_count++ < MAX_CSUM_UPDATE_RETRY) { + obj_auxi->csum_retry_cnt < MAX_CSUM_RETRY) { obj_auxi->csum_retry = 1; - obj_auxi->csum_retry_count++; + obj_auxi->csum_retry_cnt++; } else { obj_auxi->io_retry = 0; } From 90656e31c2a3efce768ab080919634e47a9167f9 Mon Sep 17 00:00:00 2001 From: Johann Lombardi Date: Thu, 5 Sep 2024 14:55:16 +0200 Subject: [PATCH 4/6] Address Xuehzao's feedback. Signed-off-by: Johann Lombardi --- src/object/cli_obj.c | 26 ++++++++++++++++++-------- src/object/obj_internal.h | 4 ++-- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index 0aba5cef642..0d300f8544b 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -4684,12 +4684,15 @@ obj_comp_cb(tse_task_t *task, void *data) int rc; obj_auxi = tse_task_stack_pop(task, sizeof(*obj_auxi)); - obj_auxi->io_retry = 0; - obj_auxi->result = 0; - obj_auxi->csum_retry = 0; - obj_auxi->tx_uncertain = 0; - obj_auxi->nvme_io_err = 0; obj = obj_auxi->obj; + + /** Clear various bits for a new attempt */ + obj_auxi->io_retry = 0; + obj_auxi->result = 0; + obj_auxi->csum_retry = 0; + obj_auxi->tx_uncertain = 0; + obj_auxi->nvme_io_err = 0; + rc = obj_comp_cb_internal(obj_auxi); if (rc != 0 || obj_auxi->result) { if (task->dt_result == 0) @@ -4762,12 +4765,19 @@ obj_comp_cb(tse_task_t *task, void *data) obj_auxi->nvme_io_err = 1; } else { if (task->dt_result == -DER_CSUM) { + struct shard_rw_args *rw_arg = &obj_auxi->rw_args; + /** Retry a few times on checksum error on update */ - if (!obj_auxi->csum_retry && - obj_auxi->csum_retry_cnt < MAX_CSUM_RETRY) { + if (rw_arg->csum_retry_cnt < MAX_CSUM_RETRY) { obj_auxi->csum_retry = 1; - obj_auxi->csum_retry_cnt++; + rw_arg->csum_retry_cnt++; + D_DEBUG(DB_IO, DF_OID" checksum error on " + "update, retrying\n", + DP_OID(obj->cob_md.omd_id)); } else { + D_ERROR(DF_OID" checksum error on udpate, " + "too many retries. Failing I/O\n", + DP_OID(obj->cob_md.omd_id)); obj_auxi->io_retry = 0; } } else if (task->dt_result != -DER_NVME_IO) { diff --git a/src/object/obj_internal.h b/src/object/obj_internal.h index 6f4ab397f68..ae5c9c82fd1 100644 --- a/src/object/obj_internal.h +++ b/src/object/obj_internal.h @@ -281,6 +281,7 @@ struct shard_rw_args { struct dcs_csum_info *dkey_csum; struct dcs_iod_csums *iod_csums; struct obj_reasb_req *reasb_req; + uint16_t csum_retry_cnt; }; struct coll_sparse_targets { @@ -475,9 +476,8 @@ struct obj_auxi_args { for_migrate:1; /* request flags. currently only: ORF_RESEND */ uint32_t specified_shard; - uint16_t flags; + uint32_t flags; uint16_t retry_cnt; - uint16_t csum_retry_cnt; uint16_t inprogress_cnt; struct obj_req_tgts req_tgts; d_sg_list_t *sgls_dup; From 3fc0272f51da6f2ad627aeba311e720e3a4dd83f Mon Sep 17 00:00:00 2001 From: Johann Lombardi Date: Thu, 5 Sep 2024 15:36:52 +0200 Subject: [PATCH 5/6] Fix lint error Signed-off-by: Johann Lombardi --- src/object/cli_obj.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index 0d300f8544b..a365d60087b 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -4775,7 +4775,7 @@ obj_comp_cb(tse_task_t *task, void *data) "update, retrying\n", DP_OID(obj->cob_md.omd_id)); } else { - D_ERROR(DF_OID" checksum error on udpate, " + D_ERROR(DF_OID" checksum error on update, " "too many retries. Failing I/O\n", DP_OID(obj->cob_md.omd_id)); obj_auxi->io_retry = 0; From 171126062c95e06ab70779934e7decd1657b1067 Mon Sep 17 00:00:00 2001 From: Johann Lombardi Date: Fri, 6 Sep 2024 17:26:04 +0200 Subject: [PATCH 6/6] Addres Xuezhao's feedback Signed-off-by: Johann Lombardi --- src/object/cli_obj.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/object/cli_obj.c b/src/object/cli_obj.c index a365d60087b..318ba0cb51e 100644 --- a/src/object/cli_obj.c +++ b/src/object/cli_obj.c @@ -4764,7 +4764,8 @@ obj_comp_cb(tse_task_t *task, void *data) else obj_auxi->nvme_io_err = 1; } else { - if (task->dt_result == -DER_CSUM) { + if (obj_auxi->opc == DAOS_OBJ_RPC_UPDATE && + task->dt_result == -DER_CSUM) { struct shard_rw_args *rw_arg = &obj_auxi->rw_args; /** Retry a few times on checksum error on update */