Skip to content

Commit

Permalink
Merge branch '100GbE' of git://git.kernel.org/pub/scm/linux/kernel/gi…
Browse files Browse the repository at this point in the history
…t/tnguy/net-queue

Tony Nguyen says:

====================
ice: fix Rx data path for heavy 9k MTU traffic

Maciej Fijalkowski says:

This patchset fixes a pretty nasty issue that was reported by RedHat
folks which occurred after ~30 minutes (this value varied, just trying
here to state that it was not observed immediately but rather after a
considerable longer amount of time) when ice driver was tortured with
jumbo frames via mix of iperf traffic executed simultaneously with
wrk/nginx on client/server sides (HTTP and TCP workloads basically).

The reported splats were spanning across all the bad things that can
happen to the state of page - refcount underflow, use-after-free, etc.
One of these looked as follows:

[ 2084.019891] BUG: Bad page state in process swapper/34  pfn:97fcd0
[ 2084.025990] page:00000000a60ee772 refcount:-1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x97fcd0
[ 2084.035462] flags: 0x17ffffc0000000(node=0|zone=2|lastcpupid=0x1fffff)
[ 2084.041990] raw: 0017ffffc0000000 dead000000000100 dead000000000122 0000000000000000
[ 2084.049730] raw: 0000000000000000 0000000000000000 ffffffffffffffff 0000000000000000
[ 2084.057468] page dumped because: nonzero _refcount
[ 2084.062260] Modules linked in: bonding tls sunrpc intel_rapl_msr intel_rapl_common intel_uncore_frequency intel_uncore_frequency_common i10nm_edac nfit libnvdimm x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm mgag200 irqd
[ 2084.137829] CPU: 34 PID: 0 Comm: swapper/34 Kdump: loaded Not tainted 5.14.0-427.37.1.el9_4.x86_64 #1
[ 2084.147039] Hardware name: Dell Inc. PowerEdge R750/0216NK, BIOS 1.13.2 12/19/2023
[ 2084.154604] Call Trace:
[ 2084.157058]  <IRQ>
[ 2084.159080]  dump_stack_lvl+0x34/0x48
[ 2084.162752]  bad_page.cold+0x63/0x94
[ 2084.166333]  check_new_pages+0xb3/0xe0
[ 2084.170083]  rmqueue_bulk+0x2d2/0x9e0
[ 2084.173749]  ? ktime_get+0x35/0xa0
[ 2084.177159]  rmqueue_pcplist+0x13b/0x210
[ 2084.181081]  rmqueue+0x7d3/0xd40
[ 2084.184316]  ? xas_load+0x9/0xa0
[ 2084.187547]  ? xas_find+0x183/0x1d0
[ 2084.191041]  ? xa_find_after+0xd0/0x130
[ 2084.194879]  ? intel_iommu_iotlb_sync_map+0x89/0xe0
[ 2084.199759]  get_page_from_freelist+0x11f/0x530
[ 2084.204291]  __alloc_pages+0xf2/0x250
[ 2084.207958]  ice_alloc_rx_bufs+0xcc/0x1c0 [ice]
[ 2084.212543]  ice_clean_rx_irq+0x631/0xa20 [ice]
[ 2084.217111]  ice_napi_poll+0xdf/0x2a0 [ice]
[ 2084.221330]  __napi_poll+0x27/0x170
[ 2084.224824]  net_rx_action+0x233/0x2f0
[ 2084.228575]  __do_softirq+0xc7/0x2ac
[ 2084.232155]  __irq_exit_rcu+0xa1/0xc0
[ 2084.235821]  common_interrupt+0x80/0xa0
[ 2084.239662]  </IRQ>
[ 2084.241768]  <TASK>

The fix is mostly about reverting what was done in commit 1dc1a7e
("ice: Centrallize Rx buffer recycling") followed by proper timing on
page_count() storage and then removing the ice_rx_buf::act related logic
(which was mostly introduced for purposes from cited commit).

Special thanks to Xu Du for providing reproducer and Jacob Keller for
initial extensive analysis.

* '100GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/net-queue:
  ice: stop storing XDP verdict within ice_rx_buf
  ice: gather page_count()'s of each frag right before XDP prog call
  ice: put Rx buffers after being done with current frame
====================

Link: https://patch.msgid.link/20250131185415.3741532-1-anthony.l.nguyen@intel.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
  • Loading branch information
kuba-moo committed Feb 3, 2025
2 parents 235174b + 468a195 commit 88be092
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 91 deletions.
150 changes: 103 additions & 47 deletions drivers/net/ethernet/intel/ice/ice_txrx.c
Original file line number Diff line number Diff line change
Expand Up @@ -527,15 +527,14 @@ int ice_setup_rx_ring(struct ice_rx_ring *rx_ring)
* @xdp: xdp_buff used as input to the XDP program
* @xdp_prog: XDP program to run
* @xdp_ring: ring to be used for XDP_TX action
* @rx_buf: Rx buffer to store the XDP action
* @eop_desc: Last descriptor in packet to read metadata from
*
* Returns any of ICE_XDP_{PASS, CONSUMED, TX, REDIR}
*/
static void
static u32
ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog, struct ice_tx_ring *xdp_ring,
struct ice_rx_buf *rx_buf, union ice_32b_rx_flex_desc *eop_desc)
union ice_32b_rx_flex_desc *eop_desc)
{
unsigned int ret = ICE_XDP_PASS;
u32 act;
Expand Down Expand Up @@ -574,7 +573,7 @@ ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
ret = ICE_XDP_CONSUMED;
}
exit:
ice_set_rx_bufs_act(xdp, rx_ring, ret);
return ret;
}

/**
Expand Down Expand Up @@ -860,10 +859,8 @@ ice_add_xdp_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
xdp_buff_set_frags_flag(xdp);
}

if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) {
ice_set_rx_bufs_act(xdp, rx_ring, ICE_XDP_CONSUMED);
if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS))
return -ENOMEM;
}

__skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, rx_buf->page,
rx_buf->page_offset, size);
Expand Down Expand Up @@ -924,7 +921,6 @@ ice_get_rx_buf(struct ice_rx_ring *rx_ring, const unsigned int size,
struct ice_rx_buf *rx_buf;

rx_buf = &rx_ring->rx_buf[ntc];
rx_buf->pgcnt = page_count(rx_buf->page);
prefetchw(rx_buf->page);

if (!size)
Expand All @@ -940,6 +936,31 @@ ice_get_rx_buf(struct ice_rx_ring *rx_ring, const unsigned int size,
return rx_buf;
}

/**
* ice_get_pgcnts - grab page_count() for gathered fragments
* @rx_ring: Rx descriptor ring to store the page counts on
*
* This function is intended to be called right before running XDP
* program so that the page recycling mechanism will be able to take
* a correct decision regarding underlying pages; this is done in such
* way as XDP program can change the refcount of page
*/
static void ice_get_pgcnts(struct ice_rx_ring *rx_ring)
{
u32 nr_frags = rx_ring->nr_frags + 1;
u32 idx = rx_ring->first_desc;
struct ice_rx_buf *rx_buf;
u32 cnt = rx_ring->count;

for (int i = 0; i < nr_frags; i++) {
rx_buf = &rx_ring->rx_buf[idx];
rx_buf->pgcnt = page_count(rx_buf->page);

if (++idx == cnt)
idx = 0;
}
}

/**
* ice_build_skb - Build skb around an existing buffer
* @rx_ring: Rx descriptor ring to transact packets on
Expand Down Expand Up @@ -1051,12 +1072,12 @@ ice_construct_skb(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp)
rx_buf->page_offset + headlen, size,
xdp->frame_sz);
} else {
/* buffer is unused, change the act that should be taken later
* on; data was copied onto skb's linear part so there's no
/* buffer is unused, restore biased page count in Rx buffer;
* data was copied onto skb's linear part so there's no
* need for adjusting page offset and we can reuse this buffer
* as-is
*/
rx_buf->act = ICE_SKB_CONSUMED;
rx_buf->pagecnt_bias++;
}

if (unlikely(xdp_buff_has_frags(xdp))) {
Expand Down Expand Up @@ -1103,6 +1124,65 @@ ice_put_rx_buf(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf)
rx_buf->page = NULL;
}

/**
* ice_put_rx_mbuf - ice_put_rx_buf() caller, for all frame frags
* @rx_ring: Rx ring with all the auxiliary data
* @xdp: XDP buffer carrying linear + frags part
* @xdp_xmit: XDP_TX/XDP_REDIRECT verdict storage
* @ntc: a current next_to_clean value to be stored at rx_ring
* @verdict: return code from XDP program execution
*
* Walk through gathered fragments and satisfy internal page
* recycle mechanism; we take here an action related to verdict
* returned by XDP program;
*/
static void ice_put_rx_mbuf(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
u32 *xdp_xmit, u32 ntc, u32 verdict)
{
u32 nr_frags = rx_ring->nr_frags + 1;
u32 idx = rx_ring->first_desc;
u32 cnt = rx_ring->count;
u32 post_xdp_frags = 1;
struct ice_rx_buf *buf;
int i;

if (unlikely(xdp_buff_has_frags(xdp)))
post_xdp_frags += xdp_get_shared_info_from_buff(xdp)->nr_frags;

for (i = 0; i < post_xdp_frags; i++) {
buf = &rx_ring->rx_buf[idx];

if (verdict & (ICE_XDP_TX | ICE_XDP_REDIR)) {
ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz);
*xdp_xmit |= verdict;
} else if (verdict & ICE_XDP_CONSUMED) {
buf->pagecnt_bias++;
} else if (verdict == ICE_XDP_PASS) {
ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz);
}

ice_put_rx_buf(rx_ring, buf);

if (++idx == cnt)
idx = 0;
}
/* handle buffers that represented frags released by XDP prog;
* for these we keep pagecnt_bias as-is; refcount from struct page
* has been decremented within XDP prog and we do not have to increase
* the biased refcnt
*/
for (; i < nr_frags; i++) {
buf = &rx_ring->rx_buf[idx];
ice_put_rx_buf(rx_ring, buf);
if (++idx == cnt)
idx = 0;
}

xdp->data = NULL;
rx_ring->first_desc = ntc;
rx_ring->nr_frags = 0;
}

/**
* ice_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf
* @rx_ring: Rx descriptor ring to transact packets on
Expand All @@ -1120,15 +1200,13 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
unsigned int total_rx_bytes = 0, total_rx_pkts = 0;
unsigned int offset = rx_ring->rx_offset;
struct xdp_buff *xdp = &rx_ring->xdp;
u32 cached_ntc = rx_ring->first_desc;
struct ice_tx_ring *xdp_ring = NULL;
struct bpf_prog *xdp_prog = NULL;
u32 ntc = rx_ring->next_to_clean;
u32 cached_ntu, xdp_verdict;
u32 cnt = rx_ring->count;
u32 xdp_xmit = 0;
u32 cached_ntu;
bool failure;
u32 first;

xdp_prog = READ_ONCE(rx_ring->xdp_prog);
if (xdp_prog) {
Expand Down Expand Up @@ -1190,6 +1268,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
xdp_prepare_buff(xdp, hard_start, offset, size, !!offset);
xdp_buff_clear_frags_flag(xdp);
} else if (ice_add_xdp_frag(rx_ring, xdp, rx_buf, size)) {
ice_put_rx_mbuf(rx_ring, xdp, NULL, ntc, ICE_XDP_CONSUMED);
break;
}
if (++ntc == cnt)
Expand All @@ -1199,15 +1278,15 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
if (ice_is_non_eop(rx_ring, rx_desc))
continue;

ice_run_xdp(rx_ring, xdp, xdp_prog, xdp_ring, rx_buf, rx_desc);
if (rx_buf->act == ICE_XDP_PASS)
ice_get_pgcnts(rx_ring);
xdp_verdict = ice_run_xdp(rx_ring, xdp, xdp_prog, xdp_ring, rx_desc);
if (xdp_verdict == ICE_XDP_PASS)
goto construct_skb;
total_rx_bytes += xdp_get_buff_len(xdp);
total_rx_pkts++;

xdp->data = NULL;
rx_ring->first_desc = ntc;
rx_ring->nr_frags = 0;
ice_put_rx_mbuf(rx_ring, xdp, &xdp_xmit, ntc, xdp_verdict);

continue;
construct_skb:
if (likely(ice_ring_uses_build_skb(rx_ring)))
Expand All @@ -1217,18 +1296,12 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
/* exit if we failed to retrieve a buffer */
if (!skb) {
rx_ring->ring_stats->rx_stats.alloc_page_failed++;
rx_buf->act = ICE_XDP_CONSUMED;
if (unlikely(xdp_buff_has_frags(xdp)))
ice_set_rx_bufs_act(xdp, rx_ring,
ICE_XDP_CONSUMED);
xdp->data = NULL;
rx_ring->first_desc = ntc;
rx_ring->nr_frags = 0;
break;
xdp_verdict = ICE_XDP_CONSUMED;
}
xdp->data = NULL;
rx_ring->first_desc = ntc;
rx_ring->nr_frags = 0;
ice_put_rx_mbuf(rx_ring, xdp, &xdp_xmit, ntc, xdp_verdict);

if (!skb)
break;

stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_RXE_S);
if (unlikely(ice_test_staterr(rx_desc->wb.status_error0,
Expand Down Expand Up @@ -1257,23 +1330,6 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
total_rx_pkts++;
}

first = rx_ring->first_desc;
while (cached_ntc != first) {
struct ice_rx_buf *buf = &rx_ring->rx_buf[cached_ntc];

if (buf->act & (ICE_XDP_TX | ICE_XDP_REDIR)) {
ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz);
xdp_xmit |= buf->act;
} else if (buf->act & ICE_XDP_CONSUMED) {
buf->pagecnt_bias++;
} else if (buf->act == ICE_XDP_PASS) {
ice_rx_buf_adjust_pg_offset(buf, xdp->frame_sz);
}

ice_put_rx_buf(rx_ring, buf);
if (++cached_ntc >= cnt)
cached_ntc = 0;
}
rx_ring->next_to_clean = ntc;
/* return up to cleaned_count buffers to hardware */
failure = ice_alloc_rx_bufs(rx_ring, ICE_RX_DESC_UNUSED(rx_ring));
Expand Down
1 change: 0 additions & 1 deletion drivers/net/ethernet/intel/ice/ice_txrx.h
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,6 @@ struct ice_rx_buf {
struct page *page;
unsigned int page_offset;
unsigned int pgcnt;
unsigned int act;
unsigned int pagecnt_bias;
};

Expand Down
43 changes: 0 additions & 43 deletions drivers/net/ethernet/intel/ice/ice_txrx_lib.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,49 +5,6 @@
#define _ICE_TXRX_LIB_H_
#include "ice.h"

/**
* ice_set_rx_bufs_act - propagate Rx buffer action to frags
* @xdp: XDP buffer representing frame (linear and frags part)
* @rx_ring: Rx ring struct
* act: action to store onto Rx buffers related to XDP buffer parts
*
* Set action that should be taken before putting Rx buffer from first frag
* to the last.
*/
static inline void
ice_set_rx_bufs_act(struct xdp_buff *xdp, const struct ice_rx_ring *rx_ring,
const unsigned int act)
{
u32 sinfo_frags = xdp_get_shared_info_from_buff(xdp)->nr_frags;
u32 nr_frags = rx_ring->nr_frags + 1;
u32 idx = rx_ring->first_desc;
u32 cnt = rx_ring->count;
struct ice_rx_buf *buf;

for (int i = 0; i < nr_frags; i++) {
buf = &rx_ring->rx_buf[idx];
buf->act = act;

if (++idx == cnt)
idx = 0;
}

/* adjust pagecnt_bias on frags freed by XDP prog */
if (sinfo_frags < rx_ring->nr_frags && act == ICE_XDP_CONSUMED) {
u32 delta = rx_ring->nr_frags - sinfo_frags;

while (delta) {
if (idx == 0)
idx = cnt - 1;
else
idx--;
buf = &rx_ring->rx_buf[idx];
buf->pagecnt_bias--;
delta--;
}
}
}

/**
* ice_test_staterr - tests bits in Rx descriptor status and error fields
* @status_err_n: Rx descriptor status_error0 or status_error1 bits
Expand Down

0 comments on commit 88be092

Please sign in to comment.