Skip to content

Commit

Permalink
RDMA/hns: Fix cpu stuck caused by printings during reset
Browse files Browse the repository at this point in the history
During reset, cmd to destroy resources such as qp, cq, and mr may fail,
and error logs will be printed. When a large number of resources are
destroyed, there will be lots of printings, and it may lead to a cpu
stuck.

Delete some unnecessary printings and replace other printing functions
in these paths with the ratelimited version.

Fixes: 9a44353 ("IB/hns: Add driver files for hns RoCE driver")
Fixes: c7bcb13 ("RDMA/hns: Add SRQ support for hip08 kernel mode")
Fixes: 70f9252 ("RDMA/hns: Use the reserved loopback QPs to free MR before destroying MPT")
Fixes: 926a01d ("RDMA/hns: Add QP operations support for hip08 SoC")
Signed-off-by: wenglianfa <wenglianfa@huawei.com>
Signed-off-by: Junxian Huang <huangjunxian6@hisilicon.com>
Link: https://patch.msgid.link/20241024124000.2931869-6-huangjunxian6@hisilicon.com
Signed-off-by: Leon Romanovsky <leon@kernel.org>
  • Loading branch information
wenglianfa authored and rleon committed Oct 30, 2024
1 parent d81fb65 commit 323275a
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 48 deletions.
4 changes: 2 additions & 2 deletions drivers/infiniband/hw/hns/hns_roce_cq.c
Original file line number Diff line number Diff line change
Expand Up @@ -179,8 +179,8 @@ static void free_cqc(struct hns_roce_dev *hr_dev, struct hns_roce_cq *hr_cq)
ret = hns_roce_destroy_hw_ctx(hr_dev, HNS_ROCE_CMD_DESTROY_CQC,
hr_cq->cqn);
if (ret)
dev_err(dev, "DESTROY_CQ failed (%d) for CQN %06lx\n", ret,
hr_cq->cqn);
dev_err_ratelimited(dev, "DESTROY_CQ failed (%d) for CQN %06lx\n",
ret, hr_cq->cqn);

xa_erase_irq(&cq_table->array, hr_cq->cqn);

Expand Down
4 changes: 2 additions & 2 deletions drivers/infiniband/hw/hns/hns_roce_hem.c
Original file line number Diff line number Diff line change
Expand Up @@ -672,8 +672,8 @@ void hns_roce_table_put(struct hns_roce_dev *hr_dev,

ret = hr_dev->hw->clear_hem(hr_dev, table, obj, HEM_HOP_STEP_DIRECT);
if (ret)
dev_warn(dev, "failed to clear HEM base address, ret = %d.\n",
ret);
dev_warn_ratelimited(dev, "failed to clear HEM base address, ret = %d.\n",
ret);

hns_roce_free_hem(hr_dev, table->hem[i]);
table->hem[i] = NULL;
Expand Down
73 changes: 33 additions & 40 deletions drivers/infiniband/hw/hns/hns_roce_hw_v2.c
Original file line number Diff line number Diff line change
Expand Up @@ -373,19 +373,12 @@ static int set_rwqe_data_seg(struct ib_qp *ibqp, const struct ib_send_wr *wr,
static int check_send_valid(struct hns_roce_dev *hr_dev,
struct hns_roce_qp *hr_qp)
{
struct ib_device *ibdev = &hr_dev->ib_dev;

if (unlikely(hr_qp->state == IB_QPS_RESET ||
hr_qp->state == IB_QPS_INIT ||
hr_qp->state == IB_QPS_RTR)) {
ibdev_err(ibdev, "failed to post WQE, QP state %u!\n",
hr_qp->state);
hr_qp->state == IB_QPS_RTR))
return -EINVAL;
} else if (unlikely(hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN)) {
ibdev_err(ibdev, "failed to post WQE, dev state %d!\n",
hr_dev->state);
else if (unlikely(hr_dev->state >= HNS_ROCE_DEVICE_STATE_RST_DOWN))
return -EIO;
}

return 0;
}
Expand Down Expand Up @@ -2775,8 +2768,8 @@ static int free_mr_modify_rsv_qp(struct hns_roce_dev *hr_dev,
ret = hr_dev->hw->modify_qp(&hr_qp->ibqp, attr, mask, IB_QPS_INIT,
IB_QPS_INIT, NULL);
if (ret) {
ibdev_err(ibdev, "failed to modify qp to init, ret = %d.\n",
ret);
ibdev_err_ratelimited(ibdev, "failed to modify qp to init, ret = %d.\n",
ret);
return ret;
}

Expand Down Expand Up @@ -3421,8 +3414,8 @@ static int free_mr_post_send_lp_wqe(struct hns_roce_qp *hr_qp)

ret = hns_roce_v2_post_send(&hr_qp->ibqp, send_wr, &bad_wr);
if (ret) {
ibdev_err(ibdev, "failed to post wqe for free mr, ret = %d.\n",
ret);
ibdev_err_ratelimited(ibdev, "failed to post wqe for free mr, ret = %d.\n",
ret);
return ret;
}

Expand Down Expand Up @@ -3461,9 +3454,9 @@ static void free_mr_send_cmd_to_hw(struct hns_roce_dev *hr_dev)

ret = free_mr_post_send_lp_wqe(hr_qp);
if (ret) {
ibdev_err(ibdev,
"failed to send wqe (qp:0x%lx) for free mr, ret = %d.\n",
hr_qp->qpn, ret);
ibdev_err_ratelimited(ibdev,
"failed to send wqe (qp:0x%lx) for free mr, ret = %d.\n",
hr_qp->qpn, ret);
break;
}

Expand All @@ -3474,16 +3467,16 @@ static void free_mr_send_cmd_to_hw(struct hns_roce_dev *hr_dev)
while (cqe_cnt) {
npolled = hns_roce_v2_poll_cq(&free_mr->rsv_cq->ib_cq, cqe_cnt, wc);
if (npolled < 0) {
ibdev_err(ibdev,
"failed to poll cqe for free mr, remain %d cqe.\n",
cqe_cnt);
ibdev_err_ratelimited(ibdev,
"failed to poll cqe for free mr, remain %d cqe.\n",
cqe_cnt);
goto out;
}

if (time_after(jiffies, end)) {
ibdev_err(ibdev,
"failed to poll cqe for free mr and timeout, remain %d cqe.\n",
cqe_cnt);
ibdev_err_ratelimited(ibdev,
"failed to poll cqe for free mr and timeout, remain %d cqe.\n",
cqe_cnt);
goto out;
}
cqe_cnt -= npolled;
Expand Down Expand Up @@ -5061,10 +5054,8 @@ static int hns_roce_v2_set_abs_fields(struct ib_qp *ibqp,
struct hns_roce_dev *hr_dev = to_hr_dev(ibqp->device);
int ret = 0;

if (!check_qp_state(cur_state, new_state)) {
ibdev_err(&hr_dev->ib_dev, "Illegal state for QP!\n");
if (!check_qp_state(cur_state, new_state))
return -EINVAL;
}

if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
memset(qpc_mask, 0, hr_dev->caps.qpc_sz);
Expand Down Expand Up @@ -5325,7 +5316,7 @@ static int hns_roce_v2_modify_qp(struct ib_qp *ibqp,
/* SW pass context to HW */
ret = hns_roce_v2_qp_modify(hr_dev, context, qpc_mask, hr_qp);
if (ret) {
ibdev_err(ibdev, "failed to modify QP, ret = %d.\n", ret);
ibdev_err_ratelimited(ibdev, "failed to modify QP, ret = %d.\n", ret);
goto out;
}

Expand Down Expand Up @@ -5463,15 +5454,17 @@ static int hns_roce_v2_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr,

ret = hns_roce_v2_query_qpc(hr_dev, hr_qp->qpn, &context);
if (ret) {
ibdev_err(ibdev, "failed to query QPC, ret = %d.\n", ret);
ibdev_err_ratelimited(ibdev,
"failed to query QPC, ret = %d.\n",
ret);
ret = -EINVAL;
goto out;
}

state = hr_reg_read(&context, QPC_QP_ST);
tmp_qp_state = to_ib_qp_st((enum hns_roce_v2_qp_state)state);
if (tmp_qp_state == -1) {
ibdev_err(ibdev, "Illegal ib_qp_state\n");
ibdev_err_ratelimited(ibdev, "Illegal ib_qp_state\n");
ret = -EINVAL;
goto out;
}
Expand Down Expand Up @@ -5564,9 +5557,9 @@ static int hns_roce_v2_destroy_qp_common(struct hns_roce_dev *hr_dev,
ret = hns_roce_v2_modify_qp(&hr_qp->ibqp, NULL, 0,
hr_qp->state, IB_QPS_RESET, udata);
if (ret)
ibdev_err(ibdev,
"failed to modify QP to RST, ret = %d.\n",
ret);
ibdev_err_ratelimited(ibdev,
"failed to modify QP to RST, ret = %d.\n",
ret);
}

send_cq = hr_qp->ibqp.send_cq ? to_hr_cq(hr_qp->ibqp.send_cq) : NULL;
Expand Down Expand Up @@ -5609,9 +5602,9 @@ int hns_roce_v2_destroy_qp(struct ib_qp *ibqp, struct ib_udata *udata)

ret = hns_roce_v2_destroy_qp_common(hr_dev, hr_qp, udata);
if (ret)
ibdev_err(&hr_dev->ib_dev,
"failed to destroy QP, QPN = 0x%06lx, ret = %d.\n",
hr_qp->qpn, ret);
ibdev_err_ratelimited(&hr_dev->ib_dev,
"failed to destroy QP, QPN = 0x%06lx, ret = %d.\n",
hr_qp->qpn, ret);

hns_roce_qp_destroy(hr_dev, hr_qp, udata);

Expand Down Expand Up @@ -5905,9 +5898,9 @@ static int hns_roce_v2_modify_cq(struct ib_cq *cq, u16 cq_count, u16 cq_period)
HNS_ROCE_CMD_MODIFY_CQC, hr_cq->cqn);
hns_roce_free_cmd_mailbox(hr_dev, mailbox);
if (ret)
ibdev_err(&hr_dev->ib_dev,
"failed to process cmd when modifying CQ, ret = %d.\n",
ret);
ibdev_err_ratelimited(&hr_dev->ib_dev,
"failed to process cmd when modifying CQ, ret = %d.\n",
ret);

err_out:
if (ret)
Expand All @@ -5931,9 +5924,9 @@ static int hns_roce_v2_query_cqc(struct hns_roce_dev *hr_dev, u32 cqn,
ret = hns_roce_cmd_mbox(hr_dev, 0, mailbox->dma,
HNS_ROCE_CMD_QUERY_CQC, cqn);
if (ret) {
ibdev_err(&hr_dev->ib_dev,
"failed to process cmd when querying CQ, ret = %d.\n",
ret);
ibdev_err_ratelimited(&hr_dev->ib_dev,
"failed to process cmd when querying CQ, ret = %d.\n",
ret);
goto err_mailbox;
}

Expand Down
4 changes: 2 additions & 2 deletions drivers/infiniband/hw/hns/hns_roce_mr.c
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,8 @@ static void hns_roce_mr_free(struct hns_roce_dev *hr_dev, struct hns_roce_mr *mr
key_to_hw_index(mr->key) &
(hr_dev->caps.num_mtpts - 1));
if (ret)
ibdev_warn(ibdev, "failed to destroy mpt, ret = %d.\n",
ret);
ibdev_warn_ratelimited(ibdev, "failed to destroy mpt, ret = %d.\n",
ret);
}

free_mr_pbl(hr_dev, mr);
Expand Down
4 changes: 2 additions & 2 deletions drivers/infiniband/hw/hns/hns_roce_srq.c
Original file line number Diff line number Diff line change
Expand Up @@ -151,8 +151,8 @@ static void free_srqc(struct hns_roce_dev *hr_dev, struct hns_roce_srq *srq)
ret = hns_roce_destroy_hw_ctx(hr_dev, HNS_ROCE_CMD_DESTROY_SRQ,
srq->srqn);
if (ret)
dev_err(hr_dev->dev, "DESTROY_SRQ failed (%d) for SRQN %06lx\n",
ret, srq->srqn);
dev_err_ratelimited(hr_dev->dev, "DESTROY_SRQ failed (%d) for SRQN %06lx\n",
ret, srq->srqn);

xa_erase_irq(&srq_table->xa, srq->srqn);

Expand Down

0 comments on commit 323275a

Please sign in to comment.