Merge branch 'mlxsw-use-page-pool-for-rx-buffers-allocation'
Petr Machata says:

====================
mlxsw: Use page pool for Rx buffers allocation

Amit Cohen writes:

After using NAPI to process events from hardware, the next step is to
use the page pool for Rx buffer allocation, which also enhances
performance.

To simplify this change, first use the page pool to allocate one
contiguous buffer for each packet; memory consumption can later be
improved by using fragmented buffers.

This set significantly enhances mlxsw driver performance: the CPU can
handle about 370% of the packets per second it previously handled.

The next planned improvement is using XDP to optimize telemetry.

Patch set overview:
Patches #1-#2 are small preparations for page pool usage
Patch #3 initializes the page pool, but does not use it yet
Patch #4 converts the driver to use the page pool for buffer allocation
Patch #5 is an optimization for buffer access
Patch #6 cleans up an unused structure
Patch #7 uses napi_consume_skb() as part of Tx completion
====================

Link: https://lore.kernel.org/r/cover.1718709196.git.petrm@nvidia.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
kuba-moo committed Jun 20, 2024
2 parents 89f5e60 + d94ae64 commit 6f80fcd
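
Before the diff, a minimal sketch of the Rx pattern the series adopts.
The helper name and flow are illustrative only, distilled from the
changes below rather than copied from the driver:

#include <linux/mm.h>
#include <linux/skbuff.h>
#include <net/page_pool/helpers.h>

/* Sketch: wrap a page that HW has filled into an skb and leave the
 * page's lifetime to the page pool. Mirrors the flow of
 * mlxsw_pci_rdq_build_skb()/mlxsw_pci_cqe_rdq_handle() below, with
 * simplified error handling. Assumes the page came from
 * page_pool_dev_alloc_pages() on a pool created with PP_FLAG_DMA_MAP.
 */
static struct sk_buff *rx_page_to_skb(struct page_pool *pool,
				      struct page *page, u16 len)
{
	struct sk_buff *skb;

	skb = napi_build_skb(page_address(page), page_size(page));
	if (unlikely(!skb)) {
		/* Lockless recycle; legal only in NAPI context. */
		page_pool_recycle_direct(pool, page);
		return NULL;
	}

	skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN); /* headroom */
	skb_put(skb, len);          /* frame length from the CQE */
	skb_mark_for_recycle(skb);  /* recycle the page on free */
	return skb;
}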
Showing 2 changed files with 142 additions and 58 deletions.
1 change: 1 addition & 0 deletions drivers/net/ethernet/mellanox/mlxsw/Kconfig
@@ -33,6 +33,7 @@ config MLXSW_CORE_THERMAL
config MLXSW_PCI
tristate "PCI bus implementation for Mellanox Technologies Switch ASICs"
depends on PCI && HAS_IOMEM && MLXSW_CORE
select PAGE_POOL
default m
help
This is PCI bus implementation for Mellanox Technologies Switch ASICs.
199 changes: 141 additions & 58 deletions drivers/net/ethernet/mellanox/mlxsw/pci.c
@@ -13,6 +13,7 @@
#include <linux/if_vlan.h>
#include <linux/log2.h>
#include <linux/string.h>
#include <net/page_pool/helpers.h>

#include "pci_hw.h"
#include "pci.h"
@@ -61,15 +62,11 @@ struct mlxsw_pci_mem_item {
};

struct mlxsw_pci_queue_elem_info {
struct page *page;
char *elem; /* pointer to actual dma mapped element mem chunk */
union {
struct {
struct sk_buff *skb;
} sdq;
struct {
struct sk_buff *skb;
} rdq;
} u;
struct {
struct sk_buff *skb;
} sdq;
};

struct mlxsw_pci_queue {
@@ -88,10 +85,14 @@ struct mlxsw_pci_queue {
enum mlxsw_pci_cqe_v v;
struct mlxsw_pci_queue *dq;
struct napi_struct napi;
struct page_pool *page_pool;
} cq;
struct {
struct tasklet_struct tasklet;
} eq;
struct {
struct mlxsw_pci_queue *cq;
} rdq;
} u;
};

@@ -335,6 +336,25 @@ static void mlxsw_pci_sdq_fini(struct mlxsw_pci *mlxsw_pci,
mlxsw_cmd_hw2sw_sdq(mlxsw_pci->core, q->num);
}

#define MLXSW_PCI_SKB_HEADROOM (NET_SKB_PAD + NET_IP_ALIGN)

#define MLXSW_PCI_RX_BUF_SW_OVERHEAD \
(MLXSW_PCI_SKB_HEADROOM + \
SKB_DATA_ALIGN(sizeof(struct skb_shared_info)))

static void
mlxsw_pci_wqe_rx_frag_set(struct mlxsw_pci *mlxsw_pci, struct page *page,
char *wqe, int index, size_t frag_len)
{
dma_addr_t mapaddr;

mapaddr = page_pool_get_dma_addr(page);
mapaddr += MLXSW_PCI_SKB_HEADROOM;

mlxsw_pci_wqe_address_set(wqe, index, mapaddr);
mlxsw_pci_wqe_byte_count_set(wqe, index, frag_len);
}

static int mlxsw_pci_wqe_frag_map(struct mlxsw_pci *mlxsw_pci, char *wqe,
int index, char *frag_data, size_t frag_len,
int direction)
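
The two macros above size a single page-pool buffer so that a full-MTU
frame plus skb overheads fits in one allocation. A worked example,
assuming MLXSW_PORT_MAX_MTU of 10240 bytes and typical values for the
other constants (the real numbers are config- and arch-dependent):

/* Assumed values, for illustration only (4 KiB pages):
 *   NET_SKB_PAD + NET_IP_ALIGN                       ~ 64 + 2
 *   SKB_DATA_ALIGN(sizeof(struct skb_shared_info))   ~ 320
 *
 *   max_pkt_size = 10240 + 66 + 320 = 10626 bytes
 *   get_order(10626) == 2  ->  one 16 KiB buffer per packet
 *
 * The WQE gets the page's DMA address plus MLXSW_PCI_SKB_HEADROOM,
 * so the device writes the frame past the reserved headroom and the
 * tail still has room for the struct skb_shared_info that
 * napi_build_skb() places there.
 */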
@@ -364,43 +384,47 @@ static void mlxsw_pci_wqe_frag_unmap(struct mlxsw_pci *mlxsw_pci, char *wqe,
dma_unmap_single(&pdev->dev, mapaddr, frag_len, direction);
}

static int mlxsw_pci_rdq_skb_alloc(struct mlxsw_pci *mlxsw_pci,
struct mlxsw_pci_queue_elem_info *elem_info,
gfp_t gfp)
static struct sk_buff *mlxsw_pci_rdq_build_skb(struct page *page,
u16 byte_count)
{
void *data = page_address(page);
unsigned int allocated_size;
struct sk_buff *skb;

net_prefetch(data);
allocated_size = page_size(page);
skb = napi_build_skb(data, allocated_size);
if (unlikely(!skb))
return ERR_PTR(-ENOMEM);

skb_reserve(skb, MLXSW_PCI_SKB_HEADROOM);
skb_put(skb, byte_count);
return skb;
}

static int mlxsw_pci_rdq_page_alloc(struct mlxsw_pci_queue *q,
struct mlxsw_pci_queue_elem_info *elem_info)
{
struct mlxsw_pci_queue *cq = q->u.rdq.cq;
size_t buf_len = MLXSW_PORT_MAX_MTU;
char *wqe = elem_info->elem;
struct sk_buff *skb;
int err;
struct page *page;

skb = __netdev_alloc_skb_ip_align(NULL, buf_len, gfp);
if (!skb)
page = page_pool_dev_alloc_pages(cq->u.cq.page_pool);
if (unlikely(!page))
return -ENOMEM;

err = mlxsw_pci_wqe_frag_map(mlxsw_pci, wqe, 0, skb->data,
buf_len, DMA_FROM_DEVICE);
if (err)
goto err_frag_map;

elem_info->u.rdq.skb = skb;
mlxsw_pci_wqe_rx_frag_set(q->pci, page, wqe, 0, buf_len);
elem_info->page = page;
return 0;

err_frag_map:
dev_kfree_skb_any(skb);
return err;
}

static void mlxsw_pci_rdq_skb_free(struct mlxsw_pci *mlxsw_pci,
struct mlxsw_pci_queue_elem_info *elem_info)
static void mlxsw_pci_rdq_page_free(struct mlxsw_pci_queue *q,
struct mlxsw_pci_queue_elem_info *elem_info)
{
struct sk_buff *skb;
char *wqe;

skb = elem_info->u.rdq.skb;
wqe = elem_info->elem;
struct mlxsw_pci_queue *cq = q->u.rdq.cq;

mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, 0, DMA_FROM_DEVICE);
dev_kfree_skb_any(skb);
page_pool_put_page(cq->u.cq.page_pool, elem_info->page, -1, false);
}

static int mlxsw_pci_rdq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
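
A note on the arguments to page_pool_put_page() in
mlxsw_pci_rdq_page_free() above, per the generic page pool API (not
mlxsw-specific documentation):

/* page_pool_put_page(pool, page, dma_sync_size, allow_direct):
 *   dma_sync_size == -1   sync the whole buffer for device reuse;
 *                         a no-op here since the pool is created
 *                         without PP_FLAG_DMA_SYNC_DEV (see
 *                         mlxsw_pci_cq_page_pool_init() below);
 *   allow_direct == false this path runs from init rollback/fini
 *                         (process context), so the lockless
 *                         NAPI-context recycle path must not be used.
 */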
@@ -434,13 +458,14 @@ static int mlxsw_pci_rdq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,

cq = mlxsw_pci_cq_get(mlxsw_pci, cq_num);
cq->u.cq.dq = q;
q->u.rdq.cq = cq;

mlxsw_pci_queue_doorbell_producer_ring(mlxsw_pci, q);

for (i = 0; i < q->count; i++) {
elem_info = mlxsw_pci_queue_elem_info_producer_get(q);
BUG_ON(!elem_info);
err = mlxsw_pci_rdq_skb_alloc(mlxsw_pci, elem_info, GFP_KERNEL);
err = mlxsw_pci_rdq_page_alloc(q, elem_info);
if (err)
goto rollback;
/* Everything is set up, ring doorbell to pass elem to HW */
@@ -453,8 +478,9 @@ static int mlxsw_pci_rdq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
rollback:
for (i--; i >= 0; i--) {
elem_info = mlxsw_pci_queue_elem_info_get(q, i);
mlxsw_pci_rdq_skb_free(mlxsw_pci, elem_info);
mlxsw_pci_rdq_page_free(q, elem_info);
}
q->u.rdq.cq = NULL;
cq->u.cq.dq = NULL;
mlxsw_cmd_hw2sw_rdq(mlxsw_pci->core, q->num);

@@ -470,7 +496,7 @@ static void mlxsw_pci_rdq_fini(struct mlxsw_pci *mlxsw_pci,
mlxsw_cmd_hw2sw_rdq(mlxsw_pci->core, q->num);
for (i = 0; i < q->count; i++) {
elem_info = mlxsw_pci_queue_elem_info_get(q, i);
mlxsw_pci_rdq_skb_free(mlxsw_pci, elem_info);
mlxsw_pci_rdq_page_free(q, elem_info);
}
}

@@ -515,7 +541,7 @@ static void mlxsw_pci_cqe_sdq_handle(struct mlxsw_pci *mlxsw_pci,
struct mlxsw_pci_queue *q,
u16 consumer_counter_limit,
enum mlxsw_pci_cqe_v cqe_v,
char *cqe)
char *cqe, int budget)
{
struct pci_dev *pdev = mlxsw_pci->pdev;
struct mlxsw_pci_queue_elem_info *elem_info;
@@ -526,8 +552,8 @@ static void mlxsw_pci_cqe_sdq_handle(struct mlxsw_pci *mlxsw_pci,

spin_lock(&q->lock);
elem_info = mlxsw_pci_queue_elem_info_consumer_get(q);
tx_info = mlxsw_skb_cb(elem_info->u.sdq.skb)->tx_info;
skb = elem_info->u.sdq.skb;
tx_info = mlxsw_skb_cb(elem_info->sdq.skb)->tx_info;
skb = elem_info->sdq.skb;
wqe = elem_info->elem;
for (i = 0; i < MLXSW_PCI_WQE_SG_ENTRIES; i++)
mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, i, DMA_TO_DEVICE);
@@ -541,8 +567,8 @@ static void mlxsw_pci_cqe_sdq_handle(struct mlxsw_pci *mlxsw_pci,
}

if (skb)
dev_kfree_skb_any(skb);
elem_info->u.sdq.skb = NULL;
napi_consume_skb(skb, budget);
elem_info->sdq.skb = NULL;

if (q->consumer_counter++ != consumer_counter_limit)
dev_dbg_ratelimited(&pdev->dev, "Consumer counter does not match limit in SDQ\n");
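
The dev_kfree_skb_any() to napi_consume_skb() switch above (patch #7)
relies on the budget argument; in rough terms:

/* napi_consume_skb(skb, budget):
 *   budget > 0    called from NAPI poll; skbs may be bulk-freed into
 *                 the per-CPU NAPI cache, cheaper under Tx-completion
 *                 load than freeing them one at a time;
 *   budget == 0   non-NAPI caller (e.g. netpoll); falls back to
 *                 dev_consume_skb_any().
 * mlxsw_pci_napi_poll_cq_tx() below passes its poll budget through.
 */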
@@ -605,26 +631,38 @@ static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci,
{
struct pci_dev *pdev = mlxsw_pci->pdev;
struct mlxsw_pci_queue_elem_info *elem_info;
struct mlxsw_pci_queue *cq = q->u.rdq.cq;
struct mlxsw_rx_info rx_info = {};
char wqe[MLXSW_PCI_WQE_SIZE];
struct sk_buff *skb;
struct page *page;
u16 byte_count;
int err;

elem_info = mlxsw_pci_queue_elem_info_consumer_get(q);
skb = elem_info->u.rdq.skb;
memcpy(wqe, elem_info->elem, MLXSW_PCI_WQE_SIZE);

if (q->consumer_counter++ != consumer_counter_limit)
dev_dbg_ratelimited(&pdev->dev, "Consumer counter does not match limit in RDQ\n");

err = mlxsw_pci_rdq_skb_alloc(mlxsw_pci, elem_info, GFP_ATOMIC);
byte_count = mlxsw_pci_cqe_byte_count_get(cqe);
if (mlxsw_pci_cqe_crc_get(cqe_v, cqe))
byte_count -= ETH_FCS_LEN;

page = elem_info->page;

err = mlxsw_pci_rdq_page_alloc(q, elem_info);
if (err) {
dev_err_ratelimited(&pdev->dev, "Failed to alloc skb for RDQ\n");
dev_err_ratelimited(&pdev->dev, "Failed to alloc page\n");
goto out;
}

skb = mlxsw_pci_rdq_build_skb(page, byte_count);
if (IS_ERR(skb)) {
dev_err_ratelimited(&pdev->dev, "Failed to build skb for RDQ\n");
page_pool_recycle_direct(cq->u.cq.page_pool, page);
goto out;
}

mlxsw_pci_wqe_frag_unmap(mlxsw_pci, wqe, 0, DMA_FROM_DEVICE);
skb_mark_for_recycle(skb);

if (mlxsw_pci_cqe_lag_get(cqe_v, cqe)) {
rx_info.is_lag = true;
@@ -657,10 +695,6 @@ static void mlxsw_pci_cqe_rdq_handle(struct mlxsw_pci *mlxsw_pci,

mlxsw_pci_skb_cb_ts_set(mlxsw_pci, skb, cqe_v, cqe);

byte_count = mlxsw_pci_cqe_byte_count_get(cqe);
if (mlxsw_pci_cqe_crc_get(cqe_v, cqe))
byte_count -= ETH_FCS_LEN;
skb_put(skb, byte_count);
mlxsw_core_skb_receive(mlxsw_pci->core, skb, &rx_info);

out:
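
Two recycling details in the Rx handler above, again per generic page
pool semantics:

/* - skb_mark_for_recycle() ties the skb's pages back to their pool:
 *   when the stack eventually frees the skb, the page returns to the
 *   page pool rather than to the page allocator.
 * - page_pool_recycle_direct() on the skb-build error path recycles
 *   immediately and locklessly, which is safe only because the CQE
 *   handler runs in the pool's registered NAPI context.
 */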
@@ -785,7 +819,7 @@ static int mlxsw_pci_napi_poll_cq_tx(struct napi_struct *napi, int budget)
mlxsw_pci_queue_doorbell_consumer_ring(mlxsw_pci, q);

mlxsw_pci_cqe_sdq_handle(mlxsw_pci, sdq,
wqe_counter, q->u.cq.v, ncqe);
wqe_counter, q->u.cq.v, ncqe, budget);

work_done++;
}
@@ -832,19 +866,54 @@ static void mlxsw_pci_cq_napi_setup(struct mlxsw_pci_queue *q,
mlxsw_pci_napi_poll_cq_rx);
break;
}

napi_enable(&q->u.cq.napi);
}

static void mlxsw_pci_cq_napi_teardown(struct mlxsw_pci_queue *q)
{
napi_disable(&q->u.cq.napi);
netif_napi_del(&q->u.cq.napi);
}

static int mlxsw_pci_cq_page_pool_init(struct mlxsw_pci_queue *q,
enum mlxsw_pci_cq_type cq_type)
{
struct page_pool_params pp_params = {};
struct mlxsw_pci *mlxsw_pci = q->pci;
struct page_pool *page_pool;
u32 max_pkt_size;

if (cq_type != MLXSW_PCI_CQ_RDQ)
return 0;

max_pkt_size = MLXSW_PORT_MAX_MTU + MLXSW_PCI_RX_BUF_SW_OVERHEAD;
pp_params.order = get_order(max_pkt_size);
pp_params.flags = PP_FLAG_DMA_MAP;
pp_params.pool_size = MLXSW_PCI_WQE_COUNT;
pp_params.nid = dev_to_node(&mlxsw_pci->pdev->dev);
pp_params.dev = &mlxsw_pci->pdev->dev;
pp_params.napi = &q->u.cq.napi;
pp_params.dma_dir = DMA_FROM_DEVICE;

page_pool = page_pool_create(&pp_params);
if (IS_ERR(page_pool))
return PTR_ERR(page_pool);

q->u.cq.page_pool = page_pool;
return 0;
}

static void mlxsw_pci_cq_page_pool_fini(struct mlxsw_pci_queue *q,
enum mlxsw_pci_cq_type cq_type)
{
if (cq_type != MLXSW_PCI_CQ_RDQ)
return;

page_pool_destroy(q->u.cq.page_pool);
}

static int mlxsw_pci_cq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
struct mlxsw_pci_queue *q)
{
enum mlxsw_pci_cq_type cq_type = mlxsw_pci_cq_type(mlxsw_pci, q);
int i;
int err;

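The parameters chosen in mlxsw_pci_cq_page_pool_init() above carry
most of the DMA handling; roughly:

/* - PP_FLAG_DMA_MAP: the pool maps each page once when the page
 *   enters the pool and unmaps it when the page leaves, so the
 *   driver only reads the cached mapping via page_pool_get_dma_addr()
 *   (see mlxsw_pci_wqe_rx_frag_set()) instead of mapping every Rx
 *   buffer individually.
 * - pp_params.napi binds the pool to this CQ's NAPI instance,
 *   enabling lockless direct recycling from that context.
 * - pool_size of MLXSW_PCI_WQE_COUNT covers one buffer per RDQ WQE.
 */
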
@@ -874,15 +943,29 @@ static int mlxsw_pci_cq_init(struct mlxsw_pci *mlxsw_pci, char *mbox,
err = mlxsw_cmd_sw2hw_cq(mlxsw_pci->core, mbox, q->num);
if (err)
return err;
mlxsw_pci_cq_napi_setup(q, mlxsw_pci_cq_type(mlxsw_pci, q));
mlxsw_pci_cq_napi_setup(q, cq_type);

err = mlxsw_pci_cq_page_pool_init(q, cq_type);
if (err)
goto err_page_pool_init;

napi_enable(&q->u.cq.napi);
mlxsw_pci_queue_doorbell_consumer_ring(mlxsw_pci, q);
mlxsw_pci_queue_doorbell_arm_consumer_ring(mlxsw_pci, q);
return 0;

err_page_pool_init:
mlxsw_pci_cq_napi_teardown(q);
return err;
}

static void mlxsw_pci_cq_fini(struct mlxsw_pci *mlxsw_pci,
struct mlxsw_pci_queue *q)
{
enum mlxsw_pci_cq_type cq_type = mlxsw_pci_cq_type(mlxsw_pci, q);

napi_disable(&q->u.cq.napi);
mlxsw_pci_cq_page_pool_fini(q, cq_type);
mlxsw_pci_cq_napi_teardown(q);
mlxsw_cmd_hw2sw_cq(mlxsw_pci->core, q->num);
}
@@ -1919,7 +2002,7 @@ static int mlxsw_pci_skb_transmit(void *bus_priv, struct sk_buff *skb,
goto unlock;
}
mlxsw_skb_cb(skb)->tx_info = *tx_info;
elem_info->u.sdq.skb = skb;
elem_info->sdq.skb = skb;

wqe = elem_info->elem;
mlxsw_pci_wqe_c_set(wqe, 1); /* always report completion */
