From 5cdae942c0c6cea60b3c57be15922b10b221c6c7 Mon Sep 17 00:00:00 2001 From: Nasf-Fan Date: Mon, 5 Aug 2024 23:26:12 +0800 Subject: [PATCH] DAOS-16089 object: more check when transfer bitmap for coll punch (#14743) To detect some potential DRAM corruption and logic bug. Signed-off-by: Fan Yong --- src/object/cli_coll.c | 5 ++++- src/object/obj_class.c | 4 ++-- src/object/obj_rpc.c | 25 +++++++++++++++++++------ src/object/srv_coll.c | 11 +++++++++-- src/object/srv_obj.c | 11 ++++++----- 5 files changed, 40 insertions(+), 16 deletions(-) diff --git a/src/object/cli_coll.c b/src/object/cli_coll.c index a19e4c06b89..12ba634813a 100644 --- a/src/object/cli_coll.c +++ b/src/object/cli_coll.c @@ -451,8 +451,11 @@ obj_coll_prep_one(struct coll_oper_args *coa, struct dc_object *obj, dcs->dcs_buf[dcs->dcs_nr++] = shard->do_id.id_shard; - if (unlikely(dct->dct_tgt_nr == (uint8_t)(-1))) + if (unlikely(dct->dct_tgt_nr == (uint8_t)(-1))) { + D_WARN("Too much shards for obj "DF_OID"reside on the same target %u/%u\n", + DP_OID(obj->cob_md.omd_id), shard->do_target_rank, shard->do_target_idx); goto out; + } if (coa->coa_for_modify) { if (dct->dct_tgt_nr >= dct->dct_tgt_cap) { diff --git a/src/object/obj_class.c b/src/object/obj_class.c index f533ccd7417..3b3cc5e5362 100644 --- a/src/object/obj_class.c +++ b/src/object/obj_class.c @@ -44,8 +44,8 @@ daos_oclass_attr_find(daos_obj_id_t oid, uint32_t *nr_grps) /* see daos_objid_generate */ oc = oclass_ident2cl(daos_obj_id2class(oid), nr_grps); if (!oc) { - D_DEBUG(DB_PL, "Unknown object class %u for "DF_OID"\n", - (unsigned int)daos_obj_id2class(oid), DP_OID(oid)); + D_WARN("Unknown object class %u for "DF_OID"\n", + (unsigned int)daos_obj_id2class(oid), DP_OID(oid)); return NULL; } D_DEBUG(DB_PL, "Find class %s for oid "DF_OID"\n", diff --git a/src/object/obj_rpc.c b/src/object/obj_rpc.c index ff810c73127..e48c75d5e1d 100644 --- a/src/object/obj_rpc.c +++ b/src/object/obj_rpc.c @@ -1134,11 +1134,8 @@ crt_proc_struct_daos_coll_shard(crt_proc_t proc, crt_proc_op_t proc_op, struct d if (DECODING(proc_op)) dcs->dcs_cap = dcs->dcs_nr; - if (dcs->dcs_nr <= 1) { - if (DECODING(proc_op)) - dcs->dcs_buf = &dcs->dcs_inline; + if (dcs->dcs_nr <= 1) return 0; - } if (DECODING(proc_op)) { D_ALLOC_ARRAY(dcs->dcs_buf, dcs->dcs_nr); @@ -1161,6 +1158,7 @@ crt_proc_struct_daos_coll_shard(crt_proc_t proc, crt_proc_op_t proc_op, struct d int crt_proc_struct_daos_coll_target(crt_proc_t proc, crt_proc_op_t proc_op, struct daos_coll_target *dct) { + int size; int rc; int i; @@ -1172,6 +1170,12 @@ crt_proc_struct_daos_coll_target(crt_proc_t proc, crt_proc_op_t proc_op, struct if (unlikely(rc)) return rc; + size = dct->dct_bitmap_sz << 3; + if (unlikely(size == 0)) { + D_ERROR("Invalid bitmap size (0) for collective operation!\n"); + return -DER_INVAL; + } + rc = crt_proc_uint8_t(proc, proc_op, &dct->dct_max_shard); if (unlikely(rc)) return rc; @@ -1185,14 +1189,23 @@ crt_proc_struct_daos_coll_target(crt_proc_t proc, crt_proc_op_t proc_op, struct return rc; if (DECODING(proc_op)) { + if (unlikely(dct->dct_max_shard >= size)) { + D_ERROR("Too small bitmap for collective operation: %u vs %u\n", + dct->dct_max_shard, size); + return -DER_INVAL; + } + D_ALLOC(dct->dct_bitmap, dct->dct_bitmap_sz); if (dct->dct_bitmap == NULL) return -DER_NOMEM; /* When decode, allocate enough buffer to avoid some XS accessing invalid DRAM. */ - D_ALLOC_ARRAY(dct->dct_shards, dct->dct_bitmap_sz << 3); + D_ALLOC_ARRAY(dct->dct_shards, size); if (dct->dct_shards == NULL) - goto out; + D_GOTO(out, rc = -DER_NOMEM); + + for (i = 0; i < size; i++) + dct->dct_shards[i].dcs_buf = &dct->dct_shards[i].dcs_inline; } rc = crt_proc_memcpy(proc, proc_op, dct->dct_bitmap, dct->dct_bitmap_sz); diff --git a/src/object/srv_coll.c b/src/object/srv_coll.c index 2783756df53..9e421810861 100644 --- a/src/object/srv_coll.c +++ b/src/object/srv_coll.c @@ -184,8 +184,11 @@ obj_coll_punch_bulk(crt_rpc_t *rpc, d_iov_t *iov, crt_proc_t *p_proc, rc = obj_bulk_transfer(rpc, CRT_BULK_GET, false, &ocpi->ocpi_tgt_bulk, NULL, NULL, DAOS_HDL_INVAL, &sgls, 1, NULL, NULL); - if (rc != 0) + if (rc != 0) { + D_ERROR("Failed to prepare bulk transfer for coll_punch, size %u: "DF_RC"\n", + ocpi->ocpi_bulk_tgt_sz, DP_RC(rc)); goto out; + } rc = crt_proc_create(dss_get_module_info()->dmi_ctx, iov->iov_buf, iov->iov_len, CRT_PROC_DECODE, &proc); @@ -238,8 +241,12 @@ obj_coll_punch_prep(struct obj_coll_punch_in *ocpi, struct daos_coll_target *dct /* dcts[0] is for current engine. */ if (dcts[0].dct_bitmap == NULL || dcts[0].dct_bitmap_sz == 0 || - dcts[0].dct_shards == NULL) + dcts[0].dct_shards == NULL) { + D_ERROR("Invalid input for current engine: bitmap %s, bitmap_sz %u, shards %s\n", + dcts[0].dct_bitmap == NULL ? "empty" : "non-empty", dcts[0].dct_bitmap_sz, + dcts[0].dct_shards == NULL ? "empty" : "non-empty"); D_GOTO(out, rc = -DER_INVAL); + } /* Already allocated enough space in MBS when decode to hold the targets and bitmap. */ target = (struct dtx_coll_target *)(ddt + mbs->dm_tgt_cnt); diff --git a/src/object/srv_obj.c b/src/object/srv_obj.c index 27617c81a80..159fa7822dd 100644 --- a/src/object/srv_obj.c +++ b/src/object/srv_obj.c @@ -5701,14 +5701,15 @@ ds_obj_coll_punch_handler(crt_rpc_t *rpc) max_ver = version; DL_CDEBUG(rc != 0 && rc != -DER_INPROGRESS && rc != -DER_TX_RESTART, DLOG_ERR, DB_IO, rc, - "(%s) handled collective punch RPC %p for obj " - DF_UOID" on XS %u/%u epc "DF_X64" pmv %u/%u, with dti " - DF_DTI", forward width %u, forward depth %u", + "(%s) handled collective punch RPC %p for obj "DF_UOID" on XS %u/%u epc " + DF_X64" pmv %u/%u, with dti "DF_DTI", bulk_tgt_sz %u, bulk_tgt_nr %u, " + "tgt_nr %u, forward width %u, forward depth %u", (ocpi->ocpi_flags & ORF_LEADER) ? "leader" : (ocpi->ocpi_tgts.ca_count == 1 ? "non-leader" : "relay-engine"), rpc, DP_UOID(ocpi->ocpi_oid), dmi->dmi_xs_id, dmi->dmi_tgt_id, ocpi->ocpi_epoch, - ocpi->ocpi_map_ver, max_ver, DP_DTI(&ocpi->ocpi_xid), ocpi->ocpi_disp_width, - ocpi->ocpi_disp_depth); + ocpi->ocpi_map_ver, max_ver, DP_DTI(&ocpi->ocpi_xid), ocpi->ocpi_bulk_tgt_sz, + ocpi->ocpi_bulk_tgt_nr, (unsigned int)ocpi->ocpi_tgts.ca_count, + ocpi->ocpi_disp_width, ocpi->ocpi_disp_depth); obj_punch_complete(rpc, rc, max_ver);