From 67bcf375d3b341f5aea7bb92fb7509ff439339e2 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin
Date: Wed, 8 Mar 2023 17:47:46 +0100
Subject: [PATCH 01/32] net: intel: introduce Intel Ethernet common library

It's no secret that there's a ton of code duplication between two or
more Intel Ethernet modules. Before introducing new changes, which would
need to be copied over again, start decoupling the already existing
duplicate functionality into a new module, which will be shared between
several Intel Ethernet drivers.

Add the lookup table which converts the 8/10-bit hardware packet type
into a parsed bitfield structure for easy checking of packet format
parameters, such as payload level, IP version, etc. This is currently
used by i40e, ice and iavf and is identical in all three drivers.
The only difference introduced in this implementation is that instead
of defining a 256-element (or 1024-element in the case of ice) array,
an unlikely() condition limits the input to 154 (the current maximum
non-reserved packet type). There's no reason to waste 600 (or even 3600)
bytes just to avoid penalizing very unlikely exception packets.

The hash computation function now takes the payload level directly as
a pkt_hash_type. There are a couple of cases where non-IP ptypes are
marked as L3 payload, and in the previous versions their hash level
would be 2, not 3. But skb_set_hash() only distinguishes between L4 and
non-L4, so this doesn't change anything at all.

The module is behind a hidden Kconfig symbol, which the drivers select
when needed. The exports are placed in the 'LIBIE' namespace to limit
the scope of the functions.

Signed-off-by: Alexander Lobakin
---
 MAINTAINERS                                   |   3 +-
 drivers/net/ethernet/intel/Kconfig            |  11 +-
 drivers/net/ethernet/intel/Makefile           |   1 +
 drivers/net/ethernet/intel/i40e/i40e_common.c | 253 --------------
 drivers/net/ethernet/intel/i40e/i40e_main.c   |   1 +
 .../net/ethernet/intel/i40e/i40e_prototype.h  |   7 -
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   |  74 +---
 drivers/net/ethernet/intel/i40e/i40e_type.h   |  88 -----
 drivers/net/ethernet/intel/iavf/iavf_common.c | 253 --------------
 drivers/net/ethernet/intel/iavf/iavf_main.c   |   1 +
 .../net/ethernet/intel/iavf/iavf_prototype.h  |   7 -
 drivers/net/ethernet/intel/iavf/iavf_txrx.c   |  70 +---
 drivers/net/ethernet/intel/iavf/iavf_type.h   |  88 -----
 .../net/ethernet/intel/ice/ice_lan_tx_rx.h    | 316 ------------------
 drivers/net/ethernet/intel/ice/ice_main.c     |   1 +
 drivers/net/ethernet/intel/ice/ice_txrx_lib.c |  74 +---
 drivers/net/ethernet/intel/libie/Makefile     |   6 +
 drivers/net/ethernet/intel/libie/rx.c         | 110 ++++++
 include/linux/net/intel/libie/rx.h            | 128 +++++++
 19 files changed, 312 insertions(+), 1180 deletions(-)
 create mode 100644 drivers/net/ethernet/intel/libie/Makefile
 create mode 100644 drivers/net/ethernet/intel/libie/rx.c
 create mode 100644 include/linux/net/intel/libie/rx.h

diff --git a/MAINTAINERS b/MAINTAINERS
index fbbda4671e734d..f0bb5ee1787068 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -10260,7 +10260,8 @@
 F: Documentation/networking/device_drivers/ethernet/intel/
 F: drivers/net/ethernet/intel/
 F: drivers/net/ethernet/intel/*/
 F: include/linux/avf/virtchnl.h
-F: include/linux/net/intel/iidc.h
+F: include/linux/net/intel/
+F: include/linux/net/intel/*/
 INTEL ETHERNET PROTOCOL DRIVER FOR RDMA
 M: Mustafa Ismail
diff --git a/drivers/net/ethernet/intel/Kconfig b/drivers/net/ethernet/intel/Kconfig
index 9bc0a951989964..cec4a938fbd0f6 100644
--- a/drivers/net/ethernet/intel/Kconfig
+++ b/drivers/net/ethernet/intel/Kconfig
@@ -84,6 +84,12 @@ config E1000E_HWTS
 	  devices.
The cross-timestamp is available through the PTP clock driver precise cross-timestamp ioctl (PTP_SYS_OFFSET_PRECISE). +config LIBIE + tristate + help + libie (Intel Ethernet library) is a common library containing + routines shared by several Intel Ethernet drivers. + config IGB tristate "Intel(R) 82575/82576 PCI-Express Gigabit Ethernet support" depends on PCI @@ -225,6 +231,7 @@ config I40E depends on PTP_1588_CLOCK_OPTIONAL depends on PCI select AUXILIARY_BUS + select LIBIE help This driver supports Intel(R) Ethernet Controller XL710 Family of devices. For more information on how to identify your adapter, go @@ -254,8 +261,9 @@ config IAVF tristate config I40EVF tristate "Intel(R) Ethernet Adaptive Virtual Function support" - select IAVF depends on PCI_MSI + select IAVF + select LIBIE help This driver supports virtual functions for Intel XL710, X710, X722, XXV710, and all devices advertising support for @@ -282,6 +290,7 @@ config ICE depends on GNSS || GNSS = n select AUXILIARY_BUS select DIMLIB + select LIBIE select NET_DEVLINK select PLDMFW help diff --git a/drivers/net/ethernet/intel/Makefile b/drivers/net/ethernet/intel/Makefile index d80d04132073ca..ce622b4d825df7 100644 --- a/drivers/net/ethernet/intel/Makefile +++ b/drivers/net/ethernet/intel/Makefile @@ -15,3 +15,4 @@ obj-$(CONFIG_I40E) += i40e/ obj-$(CONFIG_IAVF) += iavf/ obj-$(CONFIG_FM10K) += fm10k/ obj-$(CONFIG_ICE) += ice/ +obj-$(CONFIG_LIBIE) += libie/ diff --git a/drivers/net/ethernet/intel/i40e/i40e_common.c b/drivers/net/ethernet/intel/i40e/i40e_common.c index ed88e38d488b2d..25bb858268fcd6 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_common.c +++ b/drivers/net/ethernet/intel/i40e/i40e_common.c @@ -383,259 +383,6 @@ int i40e_aq_set_rss_key(struct i40e_hw *hw, return i40e_aq_get_set_rss_key(hw, vsi_id, key, true); } -/* The i40e_ptype_lookup table is used to convert from the 8-bit ptype in the - * hardware to a bit-field that can be used by SW to more easily determine the - * packet type. - * - * Macros are used to shorten the table lines and make this table human - * readable. - * - * We store the PTYPE in the top byte of the bit field - this is just so that - * we can check that the table doesn't have a row missing, as the index into - * the table should be the PTYPE. 
- * - * Typical work flow: - * - * IF NOT i40e_ptype_lookup[ptype].known - * THEN - * Packet is unknown - * ELSE IF i40e_ptype_lookup[ptype].outer_ip == I40E_RX_PTYPE_OUTER_IP - * Use the rest of the fields to look at the tunnels, inner protocols, etc - * ELSE - * Use the enum i40e_rx_l2_ptype to decode the packet type - * ENDIF - */ - -/* macro to make the table lines short, use explicit indexing with [PTYPE] */ -#define I40E_PTT(PTYPE, OUTER_IP, OUTER_IP_VER, OUTER_FRAG, T, TE, TEF, I, PL)\ - [PTYPE] = { \ - 1, \ - I40E_RX_PTYPE_OUTER_##OUTER_IP, \ - I40E_RX_PTYPE_OUTER_##OUTER_IP_VER, \ - I40E_RX_PTYPE_##OUTER_FRAG, \ - I40E_RX_PTYPE_TUNNEL_##T, \ - I40E_RX_PTYPE_TUNNEL_END_##TE, \ - I40E_RX_PTYPE_##TEF, \ - I40E_RX_PTYPE_INNER_PROT_##I, \ - I40E_RX_PTYPE_PAYLOAD_LAYER_##PL } - -#define I40E_PTT_UNUSED_ENTRY(PTYPE) [PTYPE] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 } - -/* shorter macros makes the table fit but are terse */ -#define I40E_RX_PTYPE_NOF I40E_RX_PTYPE_NOT_FRAG -#define I40E_RX_PTYPE_FRG I40E_RX_PTYPE_FRAG -#define I40E_RX_PTYPE_INNER_PROT_TS I40E_RX_PTYPE_INNER_PROT_TIMESYNC - -/* Lookup table mapping in the 8-bit HW PTYPE to the bit field for decoding */ -struct i40e_rx_ptype_decoded i40e_ptype_lookup[BIT(8)] = { - /* L2 Packet types */ - I40E_PTT_UNUSED_ENTRY(0), - I40E_PTT(1, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), - I40E_PTT(2, L2, NONE, NOF, NONE, NONE, NOF, TS, PAY2), - I40E_PTT(3, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), - I40E_PTT_UNUSED_ENTRY(4), - I40E_PTT_UNUSED_ENTRY(5), - I40E_PTT(6, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), - I40E_PTT(7, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), - I40E_PTT_UNUSED_ENTRY(8), - I40E_PTT_UNUSED_ENTRY(9), - I40E_PTT(10, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), - I40E_PTT(11, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE), - I40E_PTT(12, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - I40E_PTT(13, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - I40E_PTT(14, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - I40E_PTT(15, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - I40E_PTT(16, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - I40E_PTT(17, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - I40E_PTT(18, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - I40E_PTT(19, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - I40E_PTT(20, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - I40E_PTT(21, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - - /* Non Tunneled IPv4 */ - I40E_PTT(22, IP, IPV4, FRG, NONE, NONE, NOF, NONE, PAY3), - I40E_PTT(23, IP, IPV4, NOF, NONE, NONE, NOF, NONE, PAY3), - I40E_PTT(24, IP, IPV4, NOF, NONE, NONE, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(25), - I40E_PTT(26, IP, IPV4, NOF, NONE, NONE, NOF, TCP, PAY4), - I40E_PTT(27, IP, IPV4, NOF, NONE, NONE, NOF, SCTP, PAY4), - I40E_PTT(28, IP, IPV4, NOF, NONE, NONE, NOF, ICMP, PAY4), - - /* IPv4 --> IPv4 */ - I40E_PTT(29, IP, IPV4, NOF, IP_IP, IPV4, FRG, NONE, PAY3), - I40E_PTT(30, IP, IPV4, NOF, IP_IP, IPV4, NOF, NONE, PAY3), - I40E_PTT(31, IP, IPV4, NOF, IP_IP, IPV4, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(32), - I40E_PTT(33, IP, IPV4, NOF, IP_IP, IPV4, NOF, TCP, PAY4), - I40E_PTT(34, IP, IPV4, NOF, IP_IP, IPV4, NOF, SCTP, PAY4), - I40E_PTT(35, IP, IPV4, NOF, IP_IP, IPV4, NOF, ICMP, PAY4), - - /* IPv4 --> IPv6 */ - I40E_PTT(36, IP, IPV4, NOF, IP_IP, IPV6, FRG, NONE, PAY3), - I40E_PTT(37, IP, IPV4, NOF, IP_IP, IPV6, NOF, NONE, PAY3), - I40E_PTT(38, IP, IPV4, NOF, IP_IP, IPV6, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(39), - I40E_PTT(40, IP, IPV4, NOF, IP_IP, IPV6, NOF, TCP, PAY4), - 
I40E_PTT(41, IP, IPV4, NOF, IP_IP, IPV6, NOF, SCTP, PAY4), - I40E_PTT(42, IP, IPV4, NOF, IP_IP, IPV6, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT */ - I40E_PTT(43, IP, IPV4, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3), - - /* IPv4 --> GRE/NAT --> IPv4 */ - I40E_PTT(44, IP, IPV4, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3), - I40E_PTT(45, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3), - I40E_PTT(46, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(47), - I40E_PTT(48, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, TCP, PAY4), - I40E_PTT(49, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4), - I40E_PTT(50, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT --> IPv6 */ - I40E_PTT(51, IP, IPV4, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3), - I40E_PTT(52, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3), - I40E_PTT(53, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(54), - I40E_PTT(55, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, TCP, PAY4), - I40E_PTT(56, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4), - I40E_PTT(57, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT --> MAC */ - I40E_PTT(58, IP, IPV4, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3), - - /* IPv4 --> GRE/NAT --> MAC --> IPv4 */ - I40E_PTT(59, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3), - I40E_PTT(60, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3), - I40E_PTT(61, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(62), - I40E_PTT(63, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP, PAY4), - I40E_PTT(64, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4), - I40E_PTT(65, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT -> MAC --> IPv6 */ - I40E_PTT(66, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3), - I40E_PTT(67, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3), - I40E_PTT(68, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(69), - I40E_PTT(70, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP, PAY4), - I40E_PTT(71, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4), - I40E_PTT(72, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT --> MAC/VLAN */ - I40E_PTT(73, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3), - - /* IPv4 ---> GRE/NAT -> MAC/VLAN --> IPv4 */ - I40E_PTT(74, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3), - I40E_PTT(75, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3), - I40E_PTT(76, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(77), - I40E_PTT(78, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP, PAY4), - I40E_PTT(79, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4), - I40E_PTT(80, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4), - - /* IPv4 -> GRE/NAT -> MAC/VLAN --> IPv6 */ - I40E_PTT(81, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3), - I40E_PTT(82, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3), - I40E_PTT(83, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(84), - I40E_PTT(85, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP, PAY4), - I40E_PTT(86, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4), - I40E_PTT(87, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4), - - /* Non Tunneled IPv6 */ - I40E_PTT(88, IP, IPV6, FRG, NONE, NONE, NOF, NONE, PAY3), - I40E_PTT(89, IP, IPV6, NOF, NONE, NONE, NOF, NONE, PAY3), - I40E_PTT(90, IP, IPV6, NOF, NONE, NONE, NOF, UDP, PAY4), - 
I40E_PTT_UNUSED_ENTRY(91), - I40E_PTT(92, IP, IPV6, NOF, NONE, NONE, NOF, TCP, PAY4), - I40E_PTT(93, IP, IPV6, NOF, NONE, NONE, NOF, SCTP, PAY4), - I40E_PTT(94, IP, IPV6, NOF, NONE, NONE, NOF, ICMP, PAY4), - - /* IPv6 --> IPv4 */ - I40E_PTT(95, IP, IPV6, NOF, IP_IP, IPV4, FRG, NONE, PAY3), - I40E_PTT(96, IP, IPV6, NOF, IP_IP, IPV4, NOF, NONE, PAY3), - I40E_PTT(97, IP, IPV6, NOF, IP_IP, IPV4, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(98), - I40E_PTT(99, IP, IPV6, NOF, IP_IP, IPV4, NOF, TCP, PAY4), - I40E_PTT(100, IP, IPV6, NOF, IP_IP, IPV4, NOF, SCTP, PAY4), - I40E_PTT(101, IP, IPV6, NOF, IP_IP, IPV4, NOF, ICMP, PAY4), - - /* IPv6 --> IPv6 */ - I40E_PTT(102, IP, IPV6, NOF, IP_IP, IPV6, FRG, NONE, PAY3), - I40E_PTT(103, IP, IPV6, NOF, IP_IP, IPV6, NOF, NONE, PAY3), - I40E_PTT(104, IP, IPV6, NOF, IP_IP, IPV6, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(105), - I40E_PTT(106, IP, IPV6, NOF, IP_IP, IPV6, NOF, TCP, PAY4), - I40E_PTT(107, IP, IPV6, NOF, IP_IP, IPV6, NOF, SCTP, PAY4), - I40E_PTT(108, IP, IPV6, NOF, IP_IP, IPV6, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT */ - I40E_PTT(109, IP, IPV6, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3), - - /* IPv6 --> GRE/NAT -> IPv4 */ - I40E_PTT(110, IP, IPV6, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3), - I40E_PTT(111, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3), - I40E_PTT(112, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(113), - I40E_PTT(114, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, TCP, PAY4), - I40E_PTT(115, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4), - I40E_PTT(116, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> IPv6 */ - I40E_PTT(117, IP, IPV6, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3), - I40E_PTT(118, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3), - I40E_PTT(119, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(120), - I40E_PTT(121, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, TCP, PAY4), - I40E_PTT(122, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4), - I40E_PTT(123, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> MAC */ - I40E_PTT(124, IP, IPV6, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3), - - /* IPv6 --> GRE/NAT -> MAC -> IPv4 */ - I40E_PTT(125, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3), - I40E_PTT(126, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3), - I40E_PTT(127, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(128), - I40E_PTT(129, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP, PAY4), - I40E_PTT(130, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4), - I40E_PTT(131, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> MAC -> IPv6 */ - I40E_PTT(132, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3), - I40E_PTT(133, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3), - I40E_PTT(134, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(135), - I40E_PTT(136, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP, PAY4), - I40E_PTT(137, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4), - I40E_PTT(138, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> MAC/VLAN */ - I40E_PTT(139, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3), - - /* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv4 */ - I40E_PTT(140, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3), - I40E_PTT(141, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3), - I40E_PTT(142, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP, PAY4), - 
I40E_PTT_UNUSED_ENTRY(143), - I40E_PTT(144, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP, PAY4), - I40E_PTT(145, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4), - I40E_PTT(146, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv6 */ - I40E_PTT(147, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3), - I40E_PTT(148, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3), - I40E_PTT(149, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP, PAY4), - I40E_PTT_UNUSED_ENTRY(150), - I40E_PTT(151, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP, PAY4), - I40E_PTT(152, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4), - I40E_PTT(153, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4), - - /* unused entries */ - [154 ... 255] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 } -}; - /** * i40e_init_shared_code - Initialize the shared code * @hw: pointer to hardware structure diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index c8ff5675b29d8b..d89a5fff15983d 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -97,6 +97,7 @@ MODULE_PARM_DESC(debug, "Debug level (0=none,...,16=all), Debug mask (0x8XXXXXXX MODULE_AUTHOR("Intel Corporation, "); MODULE_DESCRIPTION("Intel(R) Ethernet Connection XL710 Network Driver"); +MODULE_IMPORT_NS(LIBIE); MODULE_LICENSE("GPL v2"); static struct workqueue_struct *i40e_wq; diff --git a/drivers/net/ethernet/intel/i40e/i40e_prototype.h b/drivers/net/ethernet/intel/i40e/i40e_prototype.h index fe845987d99a55..5287d0ef32d5c7 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_prototype.h +++ b/drivers/net/ethernet/intel/i40e/i40e_prototype.h @@ -380,13 +380,6 @@ void i40e_set_pci_config_data(struct i40e_hw *hw, u16 link_status); int i40e_set_mac_type(struct i40e_hw *hw); -extern struct i40e_rx_ptype_decoded i40e_ptype_lookup[]; - -static inline struct i40e_rx_ptype_decoded decode_rx_desc_ptype(u8 ptype) -{ - return i40e_ptype_lookup[ptype]; -} - /** * i40e_virtchnl_link_speed - Convert AdminQ link_speed to virtchnl definition * @link_speed: the speed to convert diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index c8c2cbaa0ede6c..e4bfc7e3c076e1 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1,8 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2013 - 2018 Intel Corporation. */ -#include #include +#include +#include #include #include #include "i40e.h" @@ -1758,40 +1759,32 @@ static inline void i40e_rx_checksum(struct i40e_vsi *vsi, struct sk_buff *skb, union i40e_rx_desc *rx_desc) { - struct i40e_rx_ptype_decoded decoded; + struct libie_rx_ptype_parsed parsed; u32 rx_error, rx_status; bool ipv4, ipv6; u8 ptype; u64 qword; + skb->ip_summed = CHECKSUM_NONE; + qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); ptype = (qword & I40E_RXD_QW1_PTYPE_MASK) >> I40E_RXD_QW1_PTYPE_SHIFT; + + parsed = libie_parse_rx_ptype(ptype); + if (!libie_has_rx_checksum(vsi->netdev, parsed)) + return; + rx_error = (qword & I40E_RXD_QW1_ERROR_MASK) >> I40E_RXD_QW1_ERROR_SHIFT; rx_status = (qword & I40E_RXD_QW1_STATUS_MASK) >> I40E_RXD_QW1_STATUS_SHIFT; - decoded = decode_rx_desc_ptype(ptype); - - skb->ip_summed = CHECKSUM_NONE; - - skb_checksum_none_assert(skb); - - /* Rx csum enabled and ip headers found? 
*/ - if (!(vsi->netdev->features & NETIF_F_RXCSUM)) - return; /* did the hardware decode the packet and checksum? */ if (!(rx_status & BIT(I40E_RX_DESC_STATUS_L3L4P_SHIFT))) return; - /* both known and outer_ip must be set for the below code to work */ - if (!(decoded.known && decoded.outer_ip)) - return; - - ipv4 = (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP) && - (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV4); - ipv6 = (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP) && - (decoded.outer_ip_ver == I40E_RX_PTYPE_OUTER_IPV6); + ipv4 = parsed.outer_ip == LIBIE_RX_PTYPE_OUTER_IPV4; + ipv6 = parsed.outer_ip == LIBIE_RX_PTYPE_OUTER_IPV6; if (ipv4 && (rx_error & (BIT(I40E_RX_DESC_ERROR_IPE_SHIFT) | @@ -1819,49 +1812,16 @@ static inline void i40e_rx_checksum(struct i40e_vsi *vsi, * we need to bump the checksum level by 1 to reflect the fact that * we are indicating we validated the inner checksum. */ - if (decoded.tunnel_type >= I40E_RX_PTYPE_TUNNEL_IP_GRENAT) + if (parsed.tunnel_type >= LIBIE_RX_PTYPE_TUNNEL_IP_GRENAT) skb->csum_level = 1; - /* Only report checksum unnecessary for TCP, UDP, or SCTP */ - switch (decoded.inner_prot) { - case I40E_RX_PTYPE_INNER_PROT_TCP: - case I40E_RX_PTYPE_INNER_PROT_UDP: - case I40E_RX_PTYPE_INNER_PROT_SCTP: - skb->ip_summed = CHECKSUM_UNNECESSARY; - fallthrough; - default: - break; - } - + skb->ip_summed = CHECKSUM_UNNECESSARY; return; checksum_fail: vsi->back->hw_csum_rx_error++; } -/** - * i40e_ptype_to_htype - get a hash type - * @ptype: the ptype value from the descriptor - * - * Returns a hash type to be used by skb_set_hash - **/ -static inline int i40e_ptype_to_htype(u8 ptype) -{ - struct i40e_rx_ptype_decoded decoded = decode_rx_desc_ptype(ptype); - - if (!decoded.known) - return PKT_HASH_TYPE_NONE; - - if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP && - decoded.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4) - return PKT_HASH_TYPE_L4; - else if (decoded.outer_ip == I40E_RX_PTYPE_OUTER_IP && - decoded.payload_layer == I40E_RX_PTYPE_PAYLOAD_LAYER_PAY3) - return PKT_HASH_TYPE_L3; - else - return PKT_HASH_TYPE_L2; -} - /** * i40e_rx_hash - set the hash value in the skb * @ring: descriptor ring @@ -1874,17 +1834,19 @@ static inline void i40e_rx_hash(struct i40e_ring *ring, struct sk_buff *skb, u8 rx_ptype) { + struct libie_rx_ptype_parsed parsed; u32 hash; const __le64 rss_mask = cpu_to_le64((u64)I40E_RX_DESC_FLTSTAT_RSS_HASH << I40E_RX_DESC_STATUS_FLTSTAT_SHIFT); - if (!(ring->netdev->features & NETIF_F_RXHASH)) + parsed = libie_parse_rx_ptype(rx_ptype); + if (!libie_has_rx_hash(ring->netdev, parsed)) return; if ((rx_desc->wb.qword1.status_error_len & rss_mask) == rss_mask) { hash = le32_to_cpu(rx_desc->wb.qword0.hi_dword.rss); - skb_set_hash(skb, hash, i40e_ptype_to_htype(rx_ptype)); + libie_skb_set_hash(skb, hash, parsed); } } diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h b/drivers/net/ethernet/intel/i40e/i40e_type.h index 388c3d36d96a55..05b8510f99a930 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_type.h +++ b/drivers/net/ethernet/intel/i40e/i40e_type.h @@ -773,94 +773,6 @@ enum i40e_rx_desc_error_l3l4e_fcoe_masks { #define I40E_RXD_QW1_PTYPE_SHIFT 30 #define I40E_RXD_QW1_PTYPE_MASK (0xFFULL << I40E_RXD_QW1_PTYPE_SHIFT) -/* Packet type non-ip values */ -enum i40e_rx_l2_ptype { - I40E_RX_PTYPE_L2_RESERVED = 0, - I40E_RX_PTYPE_L2_MAC_PAY2 = 1, - I40E_RX_PTYPE_L2_TIMESYNC_PAY2 = 2, - I40E_RX_PTYPE_L2_FIP_PAY2 = 3, - I40E_RX_PTYPE_L2_OUI_PAY2 = 4, - I40E_RX_PTYPE_L2_MACCNTRL_PAY2 = 5, - I40E_RX_PTYPE_L2_LLDP_PAY2 = 6, - 
I40E_RX_PTYPE_L2_ECP_PAY2 = 7, - I40E_RX_PTYPE_L2_EVB_PAY2 = 8, - I40E_RX_PTYPE_L2_QCN_PAY2 = 9, - I40E_RX_PTYPE_L2_EAPOL_PAY2 = 10, - I40E_RX_PTYPE_L2_ARP = 11, - I40E_RX_PTYPE_L2_FCOE_PAY3 = 12, - I40E_RX_PTYPE_L2_FCOE_FCDATA_PAY3 = 13, - I40E_RX_PTYPE_L2_FCOE_FCRDY_PAY3 = 14, - I40E_RX_PTYPE_L2_FCOE_FCRSP_PAY3 = 15, - I40E_RX_PTYPE_L2_FCOE_FCOTHER_PA = 16, - I40E_RX_PTYPE_L2_FCOE_VFT_PAY3 = 17, - I40E_RX_PTYPE_L2_FCOE_VFT_FCDATA = 18, - I40E_RX_PTYPE_L2_FCOE_VFT_FCRDY = 19, - I40E_RX_PTYPE_L2_FCOE_VFT_FCRSP = 20, - I40E_RX_PTYPE_L2_FCOE_VFT_FCOTHER = 21, - I40E_RX_PTYPE_GRENAT4_MAC_PAY3 = 58, - I40E_RX_PTYPE_GRENAT4_MACVLAN_IPV6_ICMP_PAY4 = 87, - I40E_RX_PTYPE_GRENAT6_MAC_PAY3 = 124, - I40E_RX_PTYPE_GRENAT6_MACVLAN_IPV6_ICMP_PAY4 = 153 -}; - -struct i40e_rx_ptype_decoded { - u32 known:1; - u32 outer_ip:1; - u32 outer_ip_ver:1; - u32 outer_frag:1; - u32 tunnel_type:3; - u32 tunnel_end_prot:2; - u32 tunnel_end_frag:1; - u32 inner_prot:4; - u32 payload_layer:3; -}; - -enum i40e_rx_ptype_outer_ip { - I40E_RX_PTYPE_OUTER_L2 = 0, - I40E_RX_PTYPE_OUTER_IP = 1 -}; - -enum i40e_rx_ptype_outer_ip_ver { - I40E_RX_PTYPE_OUTER_NONE = 0, - I40E_RX_PTYPE_OUTER_IPV4 = 0, - I40E_RX_PTYPE_OUTER_IPV6 = 1 -}; - -enum i40e_rx_ptype_outer_fragmented { - I40E_RX_PTYPE_NOT_FRAG = 0, - I40E_RX_PTYPE_FRAG = 1 -}; - -enum i40e_rx_ptype_tunnel_type { - I40E_RX_PTYPE_TUNNEL_NONE = 0, - I40E_RX_PTYPE_TUNNEL_IP_IP = 1, - I40E_RX_PTYPE_TUNNEL_IP_GRENAT = 2, - I40E_RX_PTYPE_TUNNEL_IP_GRENAT_MAC = 3, - I40E_RX_PTYPE_TUNNEL_IP_GRENAT_MAC_VLAN = 4, -}; - -enum i40e_rx_ptype_tunnel_end_prot { - I40E_RX_PTYPE_TUNNEL_END_NONE = 0, - I40E_RX_PTYPE_TUNNEL_END_IPV4 = 1, - I40E_RX_PTYPE_TUNNEL_END_IPV6 = 2, -}; - -enum i40e_rx_ptype_inner_prot { - I40E_RX_PTYPE_INNER_PROT_NONE = 0, - I40E_RX_PTYPE_INNER_PROT_UDP = 1, - I40E_RX_PTYPE_INNER_PROT_TCP = 2, - I40E_RX_PTYPE_INNER_PROT_SCTP = 3, - I40E_RX_PTYPE_INNER_PROT_ICMP = 4, - I40E_RX_PTYPE_INNER_PROT_TIMESYNC = 5 -}; - -enum i40e_rx_ptype_payload_layer { - I40E_RX_PTYPE_PAYLOAD_LAYER_NONE = 0, - I40E_RX_PTYPE_PAYLOAD_LAYER_PAY2 = 1, - I40E_RX_PTYPE_PAYLOAD_LAYER_PAY3 = 2, - I40E_RX_PTYPE_PAYLOAD_LAYER_PAY4 = 3, -}; - #define I40E_RXD_QW1_LENGTH_PBUF_SHIFT 38 #define I40E_RXD_QW1_LENGTH_PBUF_MASK (0x3FFFULL << \ I40E_RXD_QW1_LENGTH_PBUF_SHIFT) diff --git a/drivers/net/ethernet/intel/iavf/iavf_common.c b/drivers/net/ethernet/intel/iavf/iavf_common.c index dd11dbbd5551a2..ba6c9f154d189a 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_common.c +++ b/drivers/net/ethernet/intel/iavf/iavf_common.c @@ -499,259 +499,6 @@ enum iavf_status iavf_aq_set_rss_key(struct iavf_hw *hw, u16 vsi_id, return iavf_aq_get_set_rss_key(hw, vsi_id, key, true); } -/* The iavf_ptype_lookup table is used to convert from the 8-bit ptype in the - * hardware to a bit-field that can be used by SW to more easily determine the - * packet type. - * - * Macros are used to shorten the table lines and make this table human - * readable. - * - * We store the PTYPE in the top byte of the bit field - this is just so that - * we can check that the table doesn't have a row missing, as the index into - * the table should be the PTYPE. 
- * - * Typical work flow: - * - * IF NOT iavf_ptype_lookup[ptype].known - * THEN - * Packet is unknown - * ELSE IF iavf_ptype_lookup[ptype].outer_ip == IAVF_RX_PTYPE_OUTER_IP - * Use the rest of the fields to look at the tunnels, inner protocols, etc - * ELSE - * Use the enum iavf_rx_l2_ptype to decode the packet type - * ENDIF - */ - -/* macro to make the table lines short, use explicit indexing with [PTYPE] */ -#define IAVF_PTT(PTYPE, OUTER_IP, OUTER_IP_VER, OUTER_FRAG, T, TE, TEF, I, PL)\ - [PTYPE] = { \ - 1, \ - IAVF_RX_PTYPE_OUTER_##OUTER_IP, \ - IAVF_RX_PTYPE_OUTER_##OUTER_IP_VER, \ - IAVF_RX_PTYPE_##OUTER_FRAG, \ - IAVF_RX_PTYPE_TUNNEL_##T, \ - IAVF_RX_PTYPE_TUNNEL_END_##TE, \ - IAVF_RX_PTYPE_##TEF, \ - IAVF_RX_PTYPE_INNER_PROT_##I, \ - IAVF_RX_PTYPE_PAYLOAD_LAYER_##PL } - -#define IAVF_PTT_UNUSED_ENTRY(PTYPE) [PTYPE] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 } - -/* shorter macros makes the table fit but are terse */ -#define IAVF_RX_PTYPE_NOF IAVF_RX_PTYPE_NOT_FRAG -#define IAVF_RX_PTYPE_FRG IAVF_RX_PTYPE_FRAG -#define IAVF_RX_PTYPE_INNER_PROT_TS IAVF_RX_PTYPE_INNER_PROT_TIMESYNC - -/* Lookup table mapping the 8-bit HW PTYPE to the bit field for decoding */ -struct iavf_rx_ptype_decoded iavf_ptype_lookup[BIT(8)] = { - /* L2 Packet types */ - IAVF_PTT_UNUSED_ENTRY(0), - IAVF_PTT(1, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), - IAVF_PTT(2, L2, NONE, NOF, NONE, NONE, NOF, TS, PAY2), - IAVF_PTT(3, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), - IAVF_PTT_UNUSED_ENTRY(4), - IAVF_PTT_UNUSED_ENTRY(5), - IAVF_PTT(6, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), - IAVF_PTT(7, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), - IAVF_PTT_UNUSED_ENTRY(8), - IAVF_PTT_UNUSED_ENTRY(9), - IAVF_PTT(10, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), - IAVF_PTT(11, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE), - IAVF_PTT(12, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - IAVF_PTT(13, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - IAVF_PTT(14, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - IAVF_PTT(15, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - IAVF_PTT(16, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - IAVF_PTT(17, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - IAVF_PTT(18, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - IAVF_PTT(19, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - IAVF_PTT(20, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - IAVF_PTT(21, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY3), - - /* Non Tunneled IPv4 */ - IAVF_PTT(22, IP, IPV4, FRG, NONE, NONE, NOF, NONE, PAY3), - IAVF_PTT(23, IP, IPV4, NOF, NONE, NONE, NOF, NONE, PAY3), - IAVF_PTT(24, IP, IPV4, NOF, NONE, NONE, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(25), - IAVF_PTT(26, IP, IPV4, NOF, NONE, NONE, NOF, TCP, PAY4), - IAVF_PTT(27, IP, IPV4, NOF, NONE, NONE, NOF, SCTP, PAY4), - IAVF_PTT(28, IP, IPV4, NOF, NONE, NONE, NOF, ICMP, PAY4), - - /* IPv4 --> IPv4 */ - IAVF_PTT(29, IP, IPV4, NOF, IP_IP, IPV4, FRG, NONE, PAY3), - IAVF_PTT(30, IP, IPV4, NOF, IP_IP, IPV4, NOF, NONE, PAY3), - IAVF_PTT(31, IP, IPV4, NOF, IP_IP, IPV4, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(32), - IAVF_PTT(33, IP, IPV4, NOF, IP_IP, IPV4, NOF, TCP, PAY4), - IAVF_PTT(34, IP, IPV4, NOF, IP_IP, IPV4, NOF, SCTP, PAY4), - IAVF_PTT(35, IP, IPV4, NOF, IP_IP, IPV4, NOF, ICMP, PAY4), - - /* IPv4 --> IPv6 */ - IAVF_PTT(36, IP, IPV4, NOF, IP_IP, IPV6, FRG, NONE, PAY3), - IAVF_PTT(37, IP, IPV4, NOF, IP_IP, IPV6, NOF, NONE, PAY3), - IAVF_PTT(38, IP, IPV4, NOF, IP_IP, IPV6, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(39), - IAVF_PTT(40, IP, IPV4, NOF, IP_IP, IPV6, NOF, TCP, PAY4), - IAVF_PTT(41, 
IP, IPV4, NOF, IP_IP, IPV6, NOF, SCTP, PAY4), - IAVF_PTT(42, IP, IPV4, NOF, IP_IP, IPV6, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT */ - IAVF_PTT(43, IP, IPV4, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3), - - /* IPv4 --> GRE/NAT --> IPv4 */ - IAVF_PTT(44, IP, IPV4, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3), - IAVF_PTT(45, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3), - IAVF_PTT(46, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(47), - IAVF_PTT(48, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, TCP, PAY4), - IAVF_PTT(49, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4), - IAVF_PTT(50, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT --> IPv6 */ - IAVF_PTT(51, IP, IPV4, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3), - IAVF_PTT(52, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3), - IAVF_PTT(53, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(54), - IAVF_PTT(55, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, TCP, PAY4), - IAVF_PTT(56, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4), - IAVF_PTT(57, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT --> MAC */ - IAVF_PTT(58, IP, IPV4, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3), - - /* IPv4 --> GRE/NAT --> MAC --> IPv4 */ - IAVF_PTT(59, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3), - IAVF_PTT(60, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3), - IAVF_PTT(61, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(62), - IAVF_PTT(63, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP, PAY4), - IAVF_PTT(64, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4), - IAVF_PTT(65, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT -> MAC --> IPv6 */ - IAVF_PTT(66, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3), - IAVF_PTT(67, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3), - IAVF_PTT(68, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(69), - IAVF_PTT(70, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP, PAY4), - IAVF_PTT(71, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4), - IAVF_PTT(72, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT --> MAC/VLAN */ - IAVF_PTT(73, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3), - - /* IPv4 ---> GRE/NAT -> MAC/VLAN --> IPv4 */ - IAVF_PTT(74, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3), - IAVF_PTT(75, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3), - IAVF_PTT(76, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(77), - IAVF_PTT(78, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP, PAY4), - IAVF_PTT(79, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4), - IAVF_PTT(80, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4), - - /* IPv4 -> GRE/NAT -> MAC/VLAN --> IPv6 */ - IAVF_PTT(81, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3), - IAVF_PTT(82, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3), - IAVF_PTT(83, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(84), - IAVF_PTT(85, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP, PAY4), - IAVF_PTT(86, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4), - IAVF_PTT(87, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4), - - /* Non Tunneled IPv6 */ - IAVF_PTT(88, IP, IPV6, FRG, NONE, NONE, NOF, NONE, PAY3), - IAVF_PTT(89, IP, IPV6, NOF, NONE, NONE, NOF, NONE, PAY3), - IAVF_PTT(90, IP, IPV6, NOF, NONE, NONE, NOF, UDP, PAY4), - 
IAVF_PTT_UNUSED_ENTRY(91), - IAVF_PTT(92, IP, IPV6, NOF, NONE, NONE, NOF, TCP, PAY4), - IAVF_PTT(93, IP, IPV6, NOF, NONE, NONE, NOF, SCTP, PAY4), - IAVF_PTT(94, IP, IPV6, NOF, NONE, NONE, NOF, ICMP, PAY4), - - /* IPv6 --> IPv4 */ - IAVF_PTT(95, IP, IPV6, NOF, IP_IP, IPV4, FRG, NONE, PAY3), - IAVF_PTT(96, IP, IPV6, NOF, IP_IP, IPV4, NOF, NONE, PAY3), - IAVF_PTT(97, IP, IPV6, NOF, IP_IP, IPV4, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(98), - IAVF_PTT(99, IP, IPV6, NOF, IP_IP, IPV4, NOF, TCP, PAY4), - IAVF_PTT(100, IP, IPV6, NOF, IP_IP, IPV4, NOF, SCTP, PAY4), - IAVF_PTT(101, IP, IPV6, NOF, IP_IP, IPV4, NOF, ICMP, PAY4), - - /* IPv6 --> IPv6 */ - IAVF_PTT(102, IP, IPV6, NOF, IP_IP, IPV6, FRG, NONE, PAY3), - IAVF_PTT(103, IP, IPV6, NOF, IP_IP, IPV6, NOF, NONE, PAY3), - IAVF_PTT(104, IP, IPV6, NOF, IP_IP, IPV6, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(105), - IAVF_PTT(106, IP, IPV6, NOF, IP_IP, IPV6, NOF, TCP, PAY4), - IAVF_PTT(107, IP, IPV6, NOF, IP_IP, IPV6, NOF, SCTP, PAY4), - IAVF_PTT(108, IP, IPV6, NOF, IP_IP, IPV6, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT */ - IAVF_PTT(109, IP, IPV6, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3), - - /* IPv6 --> GRE/NAT -> IPv4 */ - IAVF_PTT(110, IP, IPV6, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3), - IAVF_PTT(111, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3), - IAVF_PTT(112, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(113), - IAVF_PTT(114, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, TCP, PAY4), - IAVF_PTT(115, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4), - IAVF_PTT(116, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> IPv6 */ - IAVF_PTT(117, IP, IPV6, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3), - IAVF_PTT(118, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3), - IAVF_PTT(119, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(120), - IAVF_PTT(121, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, TCP, PAY4), - IAVF_PTT(122, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4), - IAVF_PTT(123, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> MAC */ - IAVF_PTT(124, IP, IPV6, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3), - - /* IPv6 --> GRE/NAT -> MAC -> IPv4 */ - IAVF_PTT(125, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3), - IAVF_PTT(126, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3), - IAVF_PTT(127, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(128), - IAVF_PTT(129, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP, PAY4), - IAVF_PTT(130, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4), - IAVF_PTT(131, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> MAC -> IPv6 */ - IAVF_PTT(132, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3), - IAVF_PTT(133, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3), - IAVF_PTT(134, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(135), - IAVF_PTT(136, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP, PAY4), - IAVF_PTT(137, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4), - IAVF_PTT(138, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> MAC/VLAN */ - IAVF_PTT(139, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3), - - /* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv4 */ - IAVF_PTT(140, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3), - IAVF_PTT(141, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3), - IAVF_PTT(142, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP, PAY4), - 
IAVF_PTT_UNUSED_ENTRY(143), - IAVF_PTT(144, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP, PAY4), - IAVF_PTT(145, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4), - IAVF_PTT(146, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv6 */ - IAVF_PTT(147, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3), - IAVF_PTT(148, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3), - IAVF_PTT(149, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP, PAY4), - IAVF_PTT_UNUSED_ENTRY(150), - IAVF_PTT(151, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP, PAY4), - IAVF_PTT(152, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4), - IAVF_PTT(153, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4), - - /* unused entries */ - [154 ... 255] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 } -}; - /** * iavf_aq_send_msg_to_pf * @hw: pointer to the hardware structure diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 095201e83c9db0..9f2e67a6cde3db 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -46,6 +46,7 @@ MODULE_DEVICE_TABLE(pci, iavf_pci_tbl); MODULE_ALIAS("i40evf"); MODULE_AUTHOR("Intel Corporation, "); MODULE_DESCRIPTION("Intel(R) Ethernet Adaptive Virtual Function Network Driver"); +MODULE_IMPORT_NS(LIBIE); MODULE_LICENSE("GPL v2"); static const struct net_device_ops iavf_netdev_ops; diff --git a/drivers/net/ethernet/intel/iavf/iavf_prototype.h b/drivers/net/ethernet/intel/iavf/iavf_prototype.h index edebfbbcffdc2e..c2e5dbc0a75a35 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_prototype.h +++ b/drivers/net/ethernet/intel/iavf/iavf_prototype.h @@ -51,13 +51,6 @@ enum iavf_status iavf_aq_set_rss_key(struct iavf_hw *hw, u16 seid, enum iavf_status iavf_set_mac_type(struct iavf_hw *hw); -extern struct iavf_rx_ptype_decoded iavf_ptype_lookup[]; - -static inline struct iavf_rx_ptype_decoded decode_rx_desc_ptype(u8 ptype) -{ - return iavf_ptype_lookup[ptype]; -} - void iavf_vf_parse_hw_config(struct iavf_hw *hw, struct virtchnl_vf_resource *msg); enum iavf_status iavf_vf_reset(struct iavf_hw *hw); diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index e989feda133c1e..a83b96e9b6fcf4 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2013 - 2018 Intel Corporation. */ +#include #include #include "iavf.h" @@ -982,40 +983,32 @@ static inline void iavf_rx_checksum(struct iavf_vsi *vsi, struct sk_buff *skb, union iavf_rx_desc *rx_desc) { - struct iavf_rx_ptype_decoded decoded; + struct libie_rx_ptype_parsed parsed; u32 rx_error, rx_status; bool ipv4, ipv6; u8 ptype; u64 qword; + skb->ip_summed = CHECKSUM_NONE; + qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); ptype = (qword & IAVF_RXD_QW1_PTYPE_MASK) >> IAVF_RXD_QW1_PTYPE_SHIFT; + + parsed = libie_parse_rx_ptype(ptype); + if (!libie_has_rx_checksum(vsi->netdev, parsed)) + return; + rx_error = (qword & IAVF_RXD_QW1_ERROR_MASK) >> IAVF_RXD_QW1_ERROR_SHIFT; rx_status = (qword & IAVF_RXD_QW1_STATUS_MASK) >> IAVF_RXD_QW1_STATUS_SHIFT; - decoded = decode_rx_desc_ptype(ptype); - - skb->ip_summed = CHECKSUM_NONE; - - skb_checksum_none_assert(skb); - - /* Rx csum enabled and ip headers found? */ - if (!(vsi->netdev->features & NETIF_F_RXCSUM)) - return; /* did the hardware decode the packet and checksum? 
*/ if (!(rx_status & BIT(IAVF_RX_DESC_STATUS_L3L4P_SHIFT))) return; - /* both known and outer_ip must be set for the below code to work */ - if (!(decoded.known && decoded.outer_ip)) - return; - - ipv4 = (decoded.outer_ip == IAVF_RX_PTYPE_OUTER_IP) && - (decoded.outer_ip_ver == IAVF_RX_PTYPE_OUTER_IPV4); - ipv6 = (decoded.outer_ip == IAVF_RX_PTYPE_OUTER_IP) && - (decoded.outer_ip_ver == IAVF_RX_PTYPE_OUTER_IPV6); + ipv4 = parsed.outer_ip == LIBIE_RX_PTYPE_OUTER_IPV4; + ipv6 = parsed.outer_ip == LIBIE_RX_PTYPE_OUTER_IPV6; if (ipv4 && (rx_error & (BIT(IAVF_RX_DESC_ERROR_IPE_SHIFT) | @@ -1039,46 +1032,13 @@ static inline void iavf_rx_checksum(struct iavf_vsi *vsi, if (rx_error & BIT(IAVF_RX_DESC_ERROR_PPRS_SHIFT)) return; - /* Only report checksum unnecessary for TCP, UDP, or SCTP */ - switch (decoded.inner_prot) { - case IAVF_RX_PTYPE_INNER_PROT_TCP: - case IAVF_RX_PTYPE_INNER_PROT_UDP: - case IAVF_RX_PTYPE_INNER_PROT_SCTP: - skb->ip_summed = CHECKSUM_UNNECESSARY; - fallthrough; - default: - break; - } - + skb->ip_summed = CHECKSUM_UNNECESSARY; return; checksum_fail: vsi->back->hw_csum_rx_error++; } -/** - * iavf_ptype_to_htype - get a hash type - * @ptype: the ptype value from the descriptor - * - * Returns a hash type to be used by skb_set_hash - **/ -static inline int iavf_ptype_to_htype(u8 ptype) -{ - struct iavf_rx_ptype_decoded decoded = decode_rx_desc_ptype(ptype); - - if (!decoded.known) - return PKT_HASH_TYPE_NONE; - - if (decoded.outer_ip == IAVF_RX_PTYPE_OUTER_IP && - decoded.payload_layer == IAVF_RX_PTYPE_PAYLOAD_LAYER_PAY4) - return PKT_HASH_TYPE_L4; - else if (decoded.outer_ip == IAVF_RX_PTYPE_OUTER_IP && - decoded.payload_layer == IAVF_RX_PTYPE_PAYLOAD_LAYER_PAY3) - return PKT_HASH_TYPE_L3; - else - return PKT_HASH_TYPE_L2; -} - /** * iavf_rx_hash - set the hash value in the skb * @ring: descriptor ring @@ -1091,17 +1051,19 @@ static inline void iavf_rx_hash(struct iavf_ring *ring, struct sk_buff *skb, u8 rx_ptype) { + struct libie_rx_ptype_parsed parsed; u32 hash; const __le64 rss_mask = cpu_to_le64((u64)IAVF_RX_DESC_FLTSTAT_RSS_HASH << IAVF_RX_DESC_STATUS_FLTSTAT_SHIFT); - if (!(ring->netdev->features & NETIF_F_RXHASH)) + parsed = libie_parse_rx_ptype(rx_ptype); + if (!libie_has_rx_hash(ring->netdev, parsed)) return; if ((rx_desc->wb.qword1.status_error_len & rss_mask) == rss_mask) { hash = le32_to_cpu(rx_desc->wb.qword0.hi_dword.rss); - skb_set_hash(skb, hash, iavf_ptype_to_htype(rx_ptype)); + libie_skb_set_hash(skb, hash, parsed); } } diff --git a/drivers/net/ethernet/intel/iavf/iavf_type.h b/drivers/net/ethernet/intel/iavf/iavf_type.h index 9f1f523807c4e6..3030ba33032603 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_type.h +++ b/drivers/net/ethernet/intel/iavf/iavf_type.h @@ -339,94 +339,6 @@ enum iavf_rx_desc_error_l3l4e_fcoe_masks { #define IAVF_RXD_QW1_PTYPE_SHIFT 30 #define IAVF_RXD_QW1_PTYPE_MASK (0xFFULL << IAVF_RXD_QW1_PTYPE_SHIFT) -/* Packet type non-ip values */ -enum iavf_rx_l2_ptype { - IAVF_RX_PTYPE_L2_RESERVED = 0, - IAVF_RX_PTYPE_L2_MAC_PAY2 = 1, - IAVF_RX_PTYPE_L2_TIMESYNC_PAY2 = 2, - IAVF_RX_PTYPE_L2_FIP_PAY2 = 3, - IAVF_RX_PTYPE_L2_OUI_PAY2 = 4, - IAVF_RX_PTYPE_L2_MACCNTRL_PAY2 = 5, - IAVF_RX_PTYPE_L2_LLDP_PAY2 = 6, - IAVF_RX_PTYPE_L2_ECP_PAY2 = 7, - IAVF_RX_PTYPE_L2_EVB_PAY2 = 8, - IAVF_RX_PTYPE_L2_QCN_PAY2 = 9, - IAVF_RX_PTYPE_L2_EAPOL_PAY2 = 10, - IAVF_RX_PTYPE_L2_ARP = 11, - IAVF_RX_PTYPE_L2_FCOE_PAY3 = 12, - IAVF_RX_PTYPE_L2_FCOE_FCDATA_PAY3 = 13, - IAVF_RX_PTYPE_L2_FCOE_FCRDY_PAY3 = 14, - IAVF_RX_PTYPE_L2_FCOE_FCRSP_PAY3 = 15, - 
IAVF_RX_PTYPE_L2_FCOE_FCOTHER_PA = 16, - IAVF_RX_PTYPE_L2_FCOE_VFT_PAY3 = 17, - IAVF_RX_PTYPE_L2_FCOE_VFT_FCDATA = 18, - IAVF_RX_PTYPE_L2_FCOE_VFT_FCRDY = 19, - IAVF_RX_PTYPE_L2_FCOE_VFT_FCRSP = 20, - IAVF_RX_PTYPE_L2_FCOE_VFT_FCOTHER = 21, - IAVF_RX_PTYPE_GRENAT4_MAC_PAY3 = 58, - IAVF_RX_PTYPE_GRENAT4_MACVLAN_IPV6_ICMP_PAY4 = 87, - IAVF_RX_PTYPE_GRENAT6_MAC_PAY3 = 124, - IAVF_RX_PTYPE_GRENAT6_MACVLAN_IPV6_ICMP_PAY4 = 153 -}; - -struct iavf_rx_ptype_decoded { - u32 known:1; - u32 outer_ip:1; - u32 outer_ip_ver:1; - u32 outer_frag:1; - u32 tunnel_type:3; - u32 tunnel_end_prot:2; - u32 tunnel_end_frag:1; - u32 inner_prot:4; - u32 payload_layer:3; -}; - -enum iavf_rx_ptype_outer_ip { - IAVF_RX_PTYPE_OUTER_L2 = 0, - IAVF_RX_PTYPE_OUTER_IP = 1 -}; - -enum iavf_rx_ptype_outer_ip_ver { - IAVF_RX_PTYPE_OUTER_NONE = 0, - IAVF_RX_PTYPE_OUTER_IPV4 = 0, - IAVF_RX_PTYPE_OUTER_IPV6 = 1 -}; - -enum iavf_rx_ptype_outer_fragmented { - IAVF_RX_PTYPE_NOT_FRAG = 0, - IAVF_RX_PTYPE_FRAG = 1 -}; - -enum iavf_rx_ptype_tunnel_type { - IAVF_RX_PTYPE_TUNNEL_NONE = 0, - IAVF_RX_PTYPE_TUNNEL_IP_IP = 1, - IAVF_RX_PTYPE_TUNNEL_IP_GRENAT = 2, - IAVF_RX_PTYPE_TUNNEL_IP_GRENAT_MAC = 3, - IAVF_RX_PTYPE_TUNNEL_IP_GRENAT_MAC_VLAN = 4, -}; - -enum iavf_rx_ptype_tunnel_end_prot { - IAVF_RX_PTYPE_TUNNEL_END_NONE = 0, - IAVF_RX_PTYPE_TUNNEL_END_IPV4 = 1, - IAVF_RX_PTYPE_TUNNEL_END_IPV6 = 2, -}; - -enum iavf_rx_ptype_inner_prot { - IAVF_RX_PTYPE_INNER_PROT_NONE = 0, - IAVF_RX_PTYPE_INNER_PROT_UDP = 1, - IAVF_RX_PTYPE_INNER_PROT_TCP = 2, - IAVF_RX_PTYPE_INNER_PROT_SCTP = 3, - IAVF_RX_PTYPE_INNER_PROT_ICMP = 4, - IAVF_RX_PTYPE_INNER_PROT_TIMESYNC = 5 -}; - -enum iavf_rx_ptype_payload_layer { - IAVF_RX_PTYPE_PAYLOAD_LAYER_NONE = 0, - IAVF_RX_PTYPE_PAYLOAD_LAYER_PAY2 = 1, - IAVF_RX_PTYPE_PAYLOAD_LAYER_PAY3 = 2, - IAVF_RX_PTYPE_PAYLOAD_LAYER_PAY4 = 3, -}; - #define IAVF_RXD_QW1_LENGTH_PBUF_SHIFT 38 #define IAVF_RXD_QW1_LENGTH_PBUF_MASK (0x3FFFULL << \ IAVF_RXD_QW1_LENGTH_PBUF_SHIFT) diff --git a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h index 89f986a75cc855..611577ebc29d82 100644 --- a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h +++ b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h @@ -160,64 +160,6 @@ struct ice_fltr_desc { (0x1ULL << ICE_FXD_FLTR_WB_QW1_FAIL_PROF_S) #define ICE_FXD_FLTR_WB_QW1_FAIL_PROF_YES 0x1ULL -struct ice_rx_ptype_decoded { - u32 known:1; - u32 outer_ip:1; - u32 outer_ip_ver:2; - u32 outer_frag:1; - u32 tunnel_type:3; - u32 tunnel_end_prot:2; - u32 tunnel_end_frag:1; - u32 inner_prot:4; - u32 payload_layer:3; -}; - -enum ice_rx_ptype_outer_ip { - ICE_RX_PTYPE_OUTER_L2 = 0, - ICE_RX_PTYPE_OUTER_IP = 1, -}; - -enum ice_rx_ptype_outer_ip_ver { - ICE_RX_PTYPE_OUTER_NONE = 0, - ICE_RX_PTYPE_OUTER_IPV4 = 1, - ICE_RX_PTYPE_OUTER_IPV6 = 2, -}; - -enum ice_rx_ptype_outer_fragmented { - ICE_RX_PTYPE_NOT_FRAG = 0, - ICE_RX_PTYPE_FRAG = 1, -}; - -enum ice_rx_ptype_tunnel_type { - ICE_RX_PTYPE_TUNNEL_NONE = 0, - ICE_RX_PTYPE_TUNNEL_IP_IP = 1, - ICE_RX_PTYPE_TUNNEL_IP_GRENAT = 2, - ICE_RX_PTYPE_TUNNEL_IP_GRENAT_MAC = 3, - ICE_RX_PTYPE_TUNNEL_IP_GRENAT_MAC_VLAN = 4, -}; - -enum ice_rx_ptype_tunnel_end_prot { - ICE_RX_PTYPE_TUNNEL_END_NONE = 0, - ICE_RX_PTYPE_TUNNEL_END_IPV4 = 1, - ICE_RX_PTYPE_TUNNEL_END_IPV6 = 2, -}; - -enum ice_rx_ptype_inner_prot { - ICE_RX_PTYPE_INNER_PROT_NONE = 0, - ICE_RX_PTYPE_INNER_PROT_UDP = 1, - ICE_RX_PTYPE_INNER_PROT_TCP = 2, - ICE_RX_PTYPE_INNER_PROT_SCTP = 3, - ICE_RX_PTYPE_INNER_PROT_ICMP = 4, - ICE_RX_PTYPE_INNER_PROT_TIMESYNC = 5, -}; - 
-enum ice_rx_ptype_payload_layer { - ICE_RX_PTYPE_PAYLOAD_LAYER_NONE = 0, - ICE_RX_PTYPE_PAYLOAD_LAYER_PAY2 = 1, - ICE_RX_PTYPE_PAYLOAD_LAYER_PAY3 = 2, - ICE_RX_PTYPE_PAYLOAD_LAYER_PAY4 = 3, -}; - /* Rx Flex Descriptor * This descriptor is used instead of the legacy version descriptor when * ice_rlan_ctx.adv_desc is set @@ -651,262 +593,4 @@ struct ice_tlan_ctx { u8 int_q_state; /* width not needed - internal - DO NOT WRITE!!! */ }; -/* The ice_ptype_lkup table is used to convert from the 10-bit ptype in the - * hardware to a bit-field that can be used by SW to more easily determine the - * packet type. - * - * Macros are used to shorten the table lines and make this table human - * readable. - * - * We store the PTYPE in the top byte of the bit field - this is just so that - * we can check that the table doesn't have a row missing, as the index into - * the table should be the PTYPE. - * - * Typical work flow: - * - * IF NOT ice_ptype_lkup[ptype].known - * THEN - * Packet is unknown - * ELSE IF ice_ptype_lkup[ptype].outer_ip == ICE_RX_PTYPE_OUTER_IP - * Use the rest of the fields to look at the tunnels, inner protocols, etc - * ELSE - * Use the enum ice_rx_l2_ptype to decode the packet type - * ENDIF - */ - -/* macro to make the table lines short, use explicit indexing with [PTYPE] */ -#define ICE_PTT(PTYPE, OUTER_IP, OUTER_IP_VER, OUTER_FRAG, T, TE, TEF, I, PL)\ - [PTYPE] = { \ - 1, \ - ICE_RX_PTYPE_OUTER_##OUTER_IP, \ - ICE_RX_PTYPE_OUTER_##OUTER_IP_VER, \ - ICE_RX_PTYPE_##OUTER_FRAG, \ - ICE_RX_PTYPE_TUNNEL_##T, \ - ICE_RX_PTYPE_TUNNEL_END_##TE, \ - ICE_RX_PTYPE_##TEF, \ - ICE_RX_PTYPE_INNER_PROT_##I, \ - ICE_RX_PTYPE_PAYLOAD_LAYER_##PL } - -#define ICE_PTT_UNUSED_ENTRY(PTYPE) [PTYPE] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 } - -/* shorter macros makes the table fit but are terse */ -#define ICE_RX_PTYPE_NOF ICE_RX_PTYPE_NOT_FRAG -#define ICE_RX_PTYPE_FRG ICE_RX_PTYPE_FRAG - -/* Lookup table mapping in the 10-bit HW PTYPE to the bit field for decoding */ -static const struct ice_rx_ptype_decoded ice_ptype_lkup[BIT(10)] = { - /* L2 Packet types */ - ICE_PTT_UNUSED_ENTRY(0), - ICE_PTT(1, L2, NONE, NOF, NONE, NONE, NOF, NONE, PAY2), - ICE_PTT_UNUSED_ENTRY(2), - ICE_PTT_UNUSED_ENTRY(3), - ICE_PTT_UNUSED_ENTRY(4), - ICE_PTT_UNUSED_ENTRY(5), - ICE_PTT(6, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE), - ICE_PTT(7, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE), - ICE_PTT_UNUSED_ENTRY(8), - ICE_PTT_UNUSED_ENTRY(9), - ICE_PTT(10, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE), - ICE_PTT(11, L2, NONE, NOF, NONE, NONE, NOF, NONE, NONE), - ICE_PTT_UNUSED_ENTRY(12), - ICE_PTT_UNUSED_ENTRY(13), - ICE_PTT_UNUSED_ENTRY(14), - ICE_PTT_UNUSED_ENTRY(15), - ICE_PTT_UNUSED_ENTRY(16), - ICE_PTT_UNUSED_ENTRY(17), - ICE_PTT_UNUSED_ENTRY(18), - ICE_PTT_UNUSED_ENTRY(19), - ICE_PTT_UNUSED_ENTRY(20), - ICE_PTT_UNUSED_ENTRY(21), - - /* Non Tunneled IPv4 */ - ICE_PTT(22, IP, IPV4, FRG, NONE, NONE, NOF, NONE, PAY3), - ICE_PTT(23, IP, IPV4, NOF, NONE, NONE, NOF, NONE, PAY3), - ICE_PTT(24, IP, IPV4, NOF, NONE, NONE, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(25), - ICE_PTT(26, IP, IPV4, NOF, NONE, NONE, NOF, TCP, PAY4), - ICE_PTT(27, IP, IPV4, NOF, NONE, NONE, NOF, SCTP, PAY4), - ICE_PTT(28, IP, IPV4, NOF, NONE, NONE, NOF, ICMP, PAY4), - - /* IPv4 --> IPv4 */ - ICE_PTT(29, IP, IPV4, NOF, IP_IP, IPV4, FRG, NONE, PAY3), - ICE_PTT(30, IP, IPV4, NOF, IP_IP, IPV4, NOF, NONE, PAY3), - ICE_PTT(31, IP, IPV4, NOF, IP_IP, IPV4, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(32), - ICE_PTT(33, IP, IPV4, NOF, IP_IP, IPV4, NOF, TCP, PAY4), - ICE_PTT(34, IP, 
IPV4, NOF, IP_IP, IPV4, NOF, SCTP, PAY4), - ICE_PTT(35, IP, IPV4, NOF, IP_IP, IPV4, NOF, ICMP, PAY4), - - /* IPv4 --> IPv6 */ - ICE_PTT(36, IP, IPV4, NOF, IP_IP, IPV6, FRG, NONE, PAY3), - ICE_PTT(37, IP, IPV4, NOF, IP_IP, IPV6, NOF, NONE, PAY3), - ICE_PTT(38, IP, IPV4, NOF, IP_IP, IPV6, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(39), - ICE_PTT(40, IP, IPV4, NOF, IP_IP, IPV6, NOF, TCP, PAY4), - ICE_PTT(41, IP, IPV4, NOF, IP_IP, IPV6, NOF, SCTP, PAY4), - ICE_PTT(42, IP, IPV4, NOF, IP_IP, IPV6, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT */ - ICE_PTT(43, IP, IPV4, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3), - - /* IPv4 --> GRE/NAT --> IPv4 */ - ICE_PTT(44, IP, IPV4, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3), - ICE_PTT(45, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3), - ICE_PTT(46, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(47), - ICE_PTT(48, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, TCP, PAY4), - ICE_PTT(49, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4), - ICE_PTT(50, IP, IPV4, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT --> IPv6 */ - ICE_PTT(51, IP, IPV4, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3), - ICE_PTT(52, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3), - ICE_PTT(53, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(54), - ICE_PTT(55, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, TCP, PAY4), - ICE_PTT(56, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4), - ICE_PTT(57, IP, IPV4, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT --> MAC */ - ICE_PTT(58, IP, IPV4, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3), - - /* IPv4 --> GRE/NAT --> MAC --> IPv4 */ - ICE_PTT(59, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3), - ICE_PTT(60, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3), - ICE_PTT(61, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(62), - ICE_PTT(63, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP, PAY4), - ICE_PTT(64, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4), - ICE_PTT(65, IP, IPV4, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT -> MAC --> IPv6 */ - ICE_PTT(66, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3), - ICE_PTT(67, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3), - ICE_PTT(68, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(69), - ICE_PTT(70, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP, PAY4), - ICE_PTT(71, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4), - ICE_PTT(72, IP, IPV4, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4), - - /* IPv4 --> GRE/NAT --> MAC/VLAN */ - ICE_PTT(73, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, PAY3), - - /* IPv4 ---> GRE/NAT -> MAC/VLAN --> IPv4 */ - ICE_PTT(74, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3), - ICE_PTT(75, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3), - ICE_PTT(76, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(77), - ICE_PTT(78, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP, PAY4), - ICE_PTT(79, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4), - ICE_PTT(80, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4), - - /* IPv4 -> GRE/NAT -> MAC/VLAN --> IPv6 */ - ICE_PTT(81, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3), - ICE_PTT(82, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3), - ICE_PTT(83, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(84), - ICE_PTT(85, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP, PAY4), - 
ICE_PTT(86, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4), - ICE_PTT(87, IP, IPV4, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4), - - /* Non Tunneled IPv6 */ - ICE_PTT(88, IP, IPV6, FRG, NONE, NONE, NOF, NONE, PAY3), - ICE_PTT(89, IP, IPV6, NOF, NONE, NONE, NOF, NONE, PAY3), - ICE_PTT(90, IP, IPV6, NOF, NONE, NONE, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(91), - ICE_PTT(92, IP, IPV6, NOF, NONE, NONE, NOF, TCP, PAY4), - ICE_PTT(93, IP, IPV6, NOF, NONE, NONE, NOF, SCTP, PAY4), - ICE_PTT(94, IP, IPV6, NOF, NONE, NONE, NOF, ICMP, PAY4), - - /* IPv6 --> IPv4 */ - ICE_PTT(95, IP, IPV6, NOF, IP_IP, IPV4, FRG, NONE, PAY3), - ICE_PTT(96, IP, IPV6, NOF, IP_IP, IPV4, NOF, NONE, PAY3), - ICE_PTT(97, IP, IPV6, NOF, IP_IP, IPV4, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(98), - ICE_PTT(99, IP, IPV6, NOF, IP_IP, IPV4, NOF, TCP, PAY4), - ICE_PTT(100, IP, IPV6, NOF, IP_IP, IPV4, NOF, SCTP, PAY4), - ICE_PTT(101, IP, IPV6, NOF, IP_IP, IPV4, NOF, ICMP, PAY4), - - /* IPv6 --> IPv6 */ - ICE_PTT(102, IP, IPV6, NOF, IP_IP, IPV6, FRG, NONE, PAY3), - ICE_PTT(103, IP, IPV6, NOF, IP_IP, IPV6, NOF, NONE, PAY3), - ICE_PTT(104, IP, IPV6, NOF, IP_IP, IPV6, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(105), - ICE_PTT(106, IP, IPV6, NOF, IP_IP, IPV6, NOF, TCP, PAY4), - ICE_PTT(107, IP, IPV6, NOF, IP_IP, IPV6, NOF, SCTP, PAY4), - ICE_PTT(108, IP, IPV6, NOF, IP_IP, IPV6, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT */ - ICE_PTT(109, IP, IPV6, NOF, IP_GRENAT, NONE, NOF, NONE, PAY3), - - /* IPv6 --> GRE/NAT -> IPv4 */ - ICE_PTT(110, IP, IPV6, NOF, IP_GRENAT, IPV4, FRG, NONE, PAY3), - ICE_PTT(111, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, NONE, PAY3), - ICE_PTT(112, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(113), - ICE_PTT(114, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, TCP, PAY4), - ICE_PTT(115, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, SCTP, PAY4), - ICE_PTT(116, IP, IPV6, NOF, IP_GRENAT, IPV4, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> IPv6 */ - ICE_PTT(117, IP, IPV6, NOF, IP_GRENAT, IPV6, FRG, NONE, PAY3), - ICE_PTT(118, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, NONE, PAY3), - ICE_PTT(119, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(120), - ICE_PTT(121, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, TCP, PAY4), - ICE_PTT(122, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, SCTP, PAY4), - ICE_PTT(123, IP, IPV6, NOF, IP_GRENAT, IPV6, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> MAC */ - ICE_PTT(124, IP, IPV6, NOF, IP_GRENAT_MAC, NONE, NOF, NONE, PAY3), - - /* IPv6 --> GRE/NAT -> MAC -> IPv4 */ - ICE_PTT(125, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, FRG, NONE, PAY3), - ICE_PTT(126, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, NONE, PAY3), - ICE_PTT(127, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(128), - ICE_PTT(129, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, TCP, PAY4), - ICE_PTT(130, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, SCTP, PAY4), - ICE_PTT(131, IP, IPV6, NOF, IP_GRENAT_MAC, IPV4, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> MAC -> IPv6 */ - ICE_PTT(132, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, FRG, NONE, PAY3), - ICE_PTT(133, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, NONE, PAY3), - ICE_PTT(134, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(135), - ICE_PTT(136, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, TCP, PAY4), - ICE_PTT(137, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, SCTP, PAY4), - ICE_PTT(138, IP, IPV6, NOF, IP_GRENAT_MAC, IPV6, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> MAC/VLAN */ - ICE_PTT(139, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, NONE, NOF, NONE, 
PAY3), - - /* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv4 */ - ICE_PTT(140, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, FRG, NONE, PAY3), - ICE_PTT(141, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, NONE, PAY3), - ICE_PTT(142, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(143), - ICE_PTT(144, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, TCP, PAY4), - ICE_PTT(145, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, SCTP, PAY4), - ICE_PTT(146, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV4, NOF, ICMP, PAY4), - - /* IPv6 --> GRE/NAT -> MAC/VLAN --> IPv6 */ - ICE_PTT(147, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, FRG, NONE, PAY3), - ICE_PTT(148, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, NONE, PAY3), - ICE_PTT(149, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, UDP, PAY4), - ICE_PTT_UNUSED_ENTRY(150), - ICE_PTT(151, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, TCP, PAY4), - ICE_PTT(152, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, SCTP, PAY4), - ICE_PTT(153, IP, IPV6, NOF, IP_GRENAT_MAC_VLAN, IPV6, NOF, ICMP, PAY4), - - /* unused entries */ - [154 ... 1023] = { 0, 0, 0, 0, 0, 0, 0, 0, 0 } -}; - -static inline struct ice_rx_ptype_decoded ice_decode_rx_desc_ptype(u16 ptype) -{ - return ice_ptype_lkup[ptype]; -} - - #endif /* _ICE_LAN_TX_RX_H_ */ diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index a1f7c8edc22f34..f3d9c5ddef33e3 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -34,6 +34,7 @@ static const char ice_copyright[] = "Copyright (c) 2018, Intel Corporation."; MODULE_AUTHOR("Intel Corporation, "); MODULE_DESCRIPTION(DRV_SUMMARY); +MODULE_IMPORT_NS(LIBIE); MODULE_LICENSE("GPL v2"); MODULE_FIRMWARE(ICE_DDP_PKG_FILE); diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c index 7bc5aa340c7df7..3b3793428ab9ca 100644 --- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.c @@ -2,6 +2,7 @@ /* Copyright (c) 2019, Intel Corporation. */ #include +#include #include "ice_txrx_lib.h" #include "ice_eswitch.h" @@ -38,30 +39,6 @@ void ice_release_rx_desc(struct ice_rx_ring *rx_ring, u16 val) } } -/** - * ice_ptype_to_htype - get a hash type - * @ptype: the ptype value from the descriptor - * - * Returns appropriate hash type (such as PKT_HASH_TYPE_L2/L3/L4) to be used by - * skb_set_hash based on PTYPE as parsed by HW Rx pipeline and is part of - * Rx desc. 
- */ -static enum pkt_hash_types ice_ptype_to_htype(u16 ptype) -{ - struct ice_rx_ptype_decoded decoded = ice_decode_rx_desc_ptype(ptype); - - if (!decoded.known) - return PKT_HASH_TYPE_NONE; - if (decoded.payload_layer == ICE_RX_PTYPE_PAYLOAD_LAYER_PAY4) - return PKT_HASH_TYPE_L4; - if (decoded.payload_layer == ICE_RX_PTYPE_PAYLOAD_LAYER_PAY3) - return PKT_HASH_TYPE_L3; - if (decoded.outer_ip == ICE_RX_PTYPE_OUTER_L2) - return PKT_HASH_TYPE_L2; - - return PKT_HASH_TYPE_NONE; -} - /** * ice_rx_hash - set the hash value in the skb * @rx_ring: descriptor ring @@ -74,9 +51,11 @@ ice_rx_hash(struct ice_rx_ring *rx_ring, union ice_32b_rx_flex_desc *rx_desc, struct sk_buff *skb, u16 rx_ptype) { struct ice_32b_rx_flex_desc_nic *nic_mdid; + struct libie_rx_ptype_parsed parsed; u32 hash; - if (!(rx_ring->netdev->features & NETIF_F_RXHASH)) + parsed = libie_parse_rx_ptype(rx_ptype); + if (!libie_has_rx_hash(rx_ring->netdev, parsed)) return; if (rx_desc->wb.rxdid != ICE_RXDID_FLEX_NIC) @@ -84,7 +63,7 @@ ice_rx_hash(struct ice_rx_ring *rx_ring, union ice_32b_rx_flex_desc *rx_desc, nic_mdid = (struct ice_32b_rx_flex_desc_nic *)rx_desc; hash = le32_to_cpu(nic_mdid->rss_hash); - skb_set_hash(skb, hash, ice_ptype_to_htype(rx_ptype)); + libie_skb_set_hash(skb, hash, parsed); } /** @@ -92,7 +71,7 @@ ice_rx_hash(struct ice_rx_ring *rx_ring, union ice_32b_rx_flex_desc *rx_desc, * @ring: the ring we care about * @skb: skb currently being received and modified * @rx_desc: the receive descriptor - * @ptype: the packet type decoded by hardware + * @ptype: the packet type parsed by hardware * * skb->protocol must be set before this function is called */ @@ -100,34 +79,26 @@ static void ice_rx_csum(struct ice_rx_ring *ring, struct sk_buff *skb, union ice_32b_rx_flex_desc *rx_desc, u16 ptype) { - struct ice_rx_ptype_decoded decoded; + struct libie_rx_ptype_parsed parsed; u16 rx_status0, rx_status1; bool ipv4, ipv6; - rx_status0 = le16_to_cpu(rx_desc->wb.status_error0); - rx_status1 = le16_to_cpu(rx_desc->wb.status_error1); - - decoded = ice_decode_rx_desc_ptype(ptype); - /* Start with CHECKSUM_NONE and by default csum_level = 0 */ skb->ip_summed = CHECKSUM_NONE; - skb_checksum_none_assert(skb); - /* check if Rx checksum is enabled */ - if (!(ring->netdev->features & NETIF_F_RXCSUM)) + parsed = libie_parse_rx_ptype(ptype); + if (!libie_has_rx_checksum(ring->netdev, parsed)) return; - /* check if HW has decoded the packet and checksum */ - if (!(rx_status0 & BIT(ICE_RX_FLEX_DESC_STATUS0_L3L4P_S))) - return; + rx_status0 = le16_to_cpu(rx_desc->wb.status_error0); + rx_status1 = le16_to_cpu(rx_desc->wb.status_error1); - if (!(decoded.known && decoded.outer_ip)) + /* check if HW has parsed the packet and checksum */ + if (!(rx_status0 & BIT(ICE_RX_FLEX_DESC_STATUS0_L3L4P_S))) return; - ipv4 = (decoded.outer_ip == ICE_RX_PTYPE_OUTER_IP) && - (decoded.outer_ip_ver == ICE_RX_PTYPE_OUTER_IPV4); - ipv6 = (decoded.outer_ip == ICE_RX_PTYPE_OUTER_IP) && - (decoded.outer_ip_ver == ICE_RX_PTYPE_OUTER_IPV6); + ipv4 = parsed.outer_ip == LIBIE_RX_PTYPE_OUTER_IPV4; + ipv6 = parsed.outer_ip == LIBIE_RX_PTYPE_OUTER_IPV6; if (ipv4 && (rx_status0 & (BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_IPE_S) | BIT(ICE_RX_FLEX_DESC_STATUS0_XSUM_EIPE_S)))) @@ -151,19 +122,10 @@ ice_rx_csum(struct ice_rx_ring *ring, struct sk_buff *skb, * we need to bump the checksum level by 1 to reflect the fact that * we are indicating we validated the inner checksum. 
*/ - if (decoded.tunnel_type >= ICE_RX_PTYPE_TUNNEL_IP_GRENAT) + if (parsed.tunnel_type >= LIBIE_RX_PTYPE_TUNNEL_IP_GRENAT) skb->csum_level = 1; - /* Only report checksum unnecessary for TCP, UDP, or SCTP */ - switch (decoded.inner_prot) { - case ICE_RX_PTYPE_INNER_PROT_TCP: - case ICE_RX_PTYPE_INNER_PROT_UDP: - case ICE_RX_PTYPE_INNER_PROT_SCTP: - skb->ip_summed = CHECKSUM_UNNECESSARY; - break; - default: - break; - } + skb->ip_summed = CHECKSUM_UNNECESSARY; return; checksum_fail: @@ -175,7 +137,7 @@ ice_rx_csum(struct ice_rx_ring *ring, struct sk_buff *skb, * @rx_ring: Rx descriptor ring packet is being transacted on * @rx_desc: pointer to the EOP Rx descriptor * @skb: pointer to current skb being populated - * @ptype: the packet type decoded by hardware + * @ptype: the packet type parsed by hardware * * This function checks the ring, descriptor, and packet information in * order to populate the hash, checksum, VLAN, protocol, and diff --git a/drivers/net/ethernet/intel/libie/Makefile b/drivers/net/ethernet/intel/libie/Makefile new file mode 100644 index 00000000000000..95e81d09b4746c --- /dev/null +++ b/drivers/net/ethernet/intel/libie/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0-only +# Copyright(c) 2023 Intel Corporation. + +obj-$(CONFIG_LIBIE) += libie.o + +libie-objs += rx.o diff --git a/drivers/net/ethernet/intel/libie/rx.c b/drivers/net/ethernet/intel/libie/rx.c new file mode 100644 index 00000000000000..f503476d8eeff9 --- /dev/null +++ b/drivers/net/ethernet/intel/libie/rx.c @@ -0,0 +1,110 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2023 Intel Corporation. */ + +#include + +/* O(1) converting i40e/ice/iavf's 8/10-bit hardware packet type to a parsed + * bitfield struct. + */ + +#define LIBIE_RX_PTYPE(oip, ofrag, tun, tp, tefr, iprot, pl) { \ + .outer_ip = LIBIE_RX_PTYPE_OUTER_##oip, \ + .outer_frag = LIBIE_RX_PTYPE_##ofrag, \ + .tunnel_type = LIBIE_RX_PTYPE_TUNNEL_IP_##tun, \ + .tunnel_end_prot = LIBIE_RX_PTYPE_TUNNEL_END_##tp, \ + .tunnel_end_frag = LIBIE_RX_PTYPE_##tefr, \ + .inner_prot = LIBIE_RX_PTYPE_INNER_##iprot, \ + .payload_layer = LIBIE_RX_PTYPE_PAYLOAD_##pl, \ + } + +#define LIBIE_RX_PTYPE_UNUSED { } + +#define __LIBIE_RX_PTYPE_L2(iprot, pl) \ + LIBIE_RX_PTYPE(L2, NOT_FRAG, NONE, NONE, NOT_FRAG, iprot, pl) +#define LIBIE_RX_PTYPE_L2 __LIBIE_RX_PTYPE_L2(NONE, L2) +#define LIBIE_RX_PTYPE_TS __LIBIE_RX_PTYPE_L2(TIMESYNC, L2) +#define LIBIE_RX_PTYPE_L3 __LIBIE_RX_PTYPE_L2(NONE, L3) + +#define LIBIE_RX_PTYPE_IP_FRAG(oip) \ + LIBIE_RX_PTYPE(IPV##oip, FRAG, NONE, NONE, NOT_FRAG, NONE, L3) +#define LIBIE_RX_PTYPE_IP_L3(oip, tun, teprot, tefr) \ + LIBIE_RX_PTYPE(IPV##oip, NOT_FRAG, tun, teprot, tefr, NONE, L3) +#define LIBIE_RX_PTYPE_IP_L4(oip, tun, teprot, iprot) \ + LIBIE_RX_PTYPE(IPV##oip, NOT_FRAG, tun, teprot, NOT_FRAG, iprot, L4) + +#define LIBIE_RX_PTYPE_IP_NOF(oip, tun, ver) \ + LIBIE_RX_PTYPE_IP_L3(oip, tun, ver, NOT_FRAG), \ + LIBIE_RX_PTYPE_IP_L4(oip, tun, ver, UDP), \ + LIBIE_RX_PTYPE_UNUSED, \ + LIBIE_RX_PTYPE_IP_L4(oip, tun, ver, TCP), \ + LIBIE_RX_PTYPE_IP_L4(oip, tun, ver, SCTP), \ + LIBIE_RX_PTYPE_IP_L4(oip, tun, ver, ICMP) + +/* IPv oip --> tun --> IPv ver */ +#define LIBIE_RX_PTYPE_IP_TUN_VER(oip, tun, ver) \ + LIBIE_RX_PTYPE_IP_L3(oip, tun, ver, FRAG), \ + LIBIE_RX_PTYPE_IP_NOF(oip, tun, ver) + +/* Non Tunneled IPv oip */ +#define LIBIE_RX_PTYPE_IP_RAW(oip) \ + LIBIE_RX_PTYPE_IP_FRAG(oip), \ + LIBIE_RX_PTYPE_IP_NOF(oip, NONE, NONE) + +/* IPv oip --> tun --> { IPv4, IPv6 } */ +#define LIBIE_RX_PTYPE_IP_TUN(oip, tun) \ + 
LIBIE_RX_PTYPE_IP_TUN_VER(oip, tun, IPV4), \ + LIBIE_RX_PTYPE_IP_TUN_VER(oip, tun, IPV6) + +/* IPv oip --> GRE/NAT tun --> { x, IPv4, IPv6 } */ +#define LIBIE_RX_PTYPE_IP_GRE(oip, tun) \ + LIBIE_RX_PTYPE_IP_L3(oip, tun, NONE, NOT_FRAG), \ + LIBIE_RX_PTYPE_IP_TUN(oip, tun) + +/* Non Tunneled IPv oip + * IPv oip --> { IPv4, IPv6 } + * IPv oip --> GRE/NAT --> { x, IPv4, IPv6 } + * IPv oip --> GRE/NAT --> MAC --> { x, IPv4, IPv6 } + * IPv oip --> GRE/NAT --> MAC/VLAN --> { x, IPv4, IPv6 } + */ +#define LIBIE_RX_PTYPE_IP(oip) \ + LIBIE_RX_PTYPE_IP_RAW(oip), \ + LIBIE_RX_PTYPE_IP_TUN(oip, IP), \ + LIBIE_RX_PTYPE_IP_GRE(oip, GRENAT), \ + LIBIE_RX_PTYPE_IP_GRE(oip, GRENAT_MAC), \ + LIBIE_RX_PTYPE_IP_GRE(oip, GRENAT_MAC_VLAN) + +/* Lookup table mapping for O(1) parsing */ +const struct libie_rx_ptype_parsed libie_rx_ptype_lut[LIBIE_RX_PTYPE_NUM] = { + /* L2 packet types */ + LIBIE_RX_PTYPE_UNUSED, + LIBIE_RX_PTYPE_L2, + LIBIE_RX_PTYPE_TS, + LIBIE_RX_PTYPE_L2, + LIBIE_RX_PTYPE_UNUSED, + LIBIE_RX_PTYPE_UNUSED, + LIBIE_RX_PTYPE_L2, + LIBIE_RX_PTYPE_L2, + LIBIE_RX_PTYPE_UNUSED, + LIBIE_RX_PTYPE_UNUSED, + LIBIE_RX_PTYPE_L2, + LIBIE_RX_PTYPE_UNUSED, + + LIBIE_RX_PTYPE_L3, + LIBIE_RX_PTYPE_L3, + LIBIE_RX_PTYPE_L3, + LIBIE_RX_PTYPE_L3, + LIBIE_RX_PTYPE_L3, + LIBIE_RX_PTYPE_L3, + LIBIE_RX_PTYPE_L3, + LIBIE_RX_PTYPE_L3, + LIBIE_RX_PTYPE_L3, + LIBIE_RX_PTYPE_L3, + + LIBIE_RX_PTYPE_IP(4), + LIBIE_RX_PTYPE_IP(6), +}; +EXPORT_SYMBOL_NS_GPL(libie_rx_ptype_lut, LIBIE); + +MODULE_AUTHOR("Intel Corporation"); +MODULE_DESCRIPTION("Intel(R) Ethernet common library"); +MODULE_LICENSE("GPL"); diff --git a/include/linux/net/intel/libie/rx.h b/include/linux/net/intel/libie/rx.h new file mode 100644 index 00000000000000..58bd0f35d0253f --- /dev/null +++ b/include/linux/net/intel/libie/rx.h @@ -0,0 +1,128 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2023 Intel Corporation. */ + +#ifndef __LIBIE_RX_H +#define __LIBIE_RX_H + +#include + +/* O(1) converting i40e/ice/iavf's 8/10-bit hardware packet type to a parsed + * bitfield struct. 
+ */ + +struct libie_rx_ptype_parsed { + u16 outer_ip:2; + u16 outer_frag:1; + u16 tunnel_type:3; + u16 tunnel_end_prot:2; + u16 tunnel_end_frag:1; + u16 inner_prot:3; + u16 payload_layer:2; +}; + +enum libie_rx_ptype_outer_ip { + LIBIE_RX_PTYPE_OUTER_L2 = 0U, + LIBIE_RX_PTYPE_OUTER_IPV4, + LIBIE_RX_PTYPE_OUTER_IPV6, +}; + +enum libie_rx_ptype_outer_fragmented { + LIBIE_RX_PTYPE_NOT_FRAG = 0U, + LIBIE_RX_PTYPE_FRAG, +}; + +enum libie_rx_ptype_tunnel_type { + LIBIE_RX_PTYPE_TUNNEL_IP_NONE = 0U, + LIBIE_RX_PTYPE_TUNNEL_IP_IP, + LIBIE_RX_PTYPE_TUNNEL_IP_GRENAT, + LIBIE_RX_PTYPE_TUNNEL_IP_GRENAT_MAC, + LIBIE_RX_PTYPE_TUNNEL_IP_GRENAT_MAC_VLAN, +}; + +enum libie_rx_ptype_tunnel_end_prot { + LIBIE_RX_PTYPE_TUNNEL_END_NONE = 0U, + LIBIE_RX_PTYPE_TUNNEL_END_IPV4, + LIBIE_RX_PTYPE_TUNNEL_END_IPV6, +}; + +enum libie_rx_ptype_inner_prot { + LIBIE_RX_PTYPE_INNER_NONE = 0U, + LIBIE_RX_PTYPE_INNER_UDP, + LIBIE_RX_PTYPE_INNER_TCP, + LIBIE_RX_PTYPE_INNER_SCTP, + LIBIE_RX_PTYPE_INNER_ICMP, + LIBIE_RX_PTYPE_INNER_TIMESYNC, +}; + +enum libie_rx_ptype_payload_layer { + LIBIE_RX_PTYPE_PAYLOAD_NONE = PKT_HASH_TYPE_NONE, + LIBIE_RX_PTYPE_PAYLOAD_L2 = PKT_HASH_TYPE_L2, + LIBIE_RX_PTYPE_PAYLOAD_L3 = PKT_HASH_TYPE_L3, + LIBIE_RX_PTYPE_PAYLOAD_L4 = PKT_HASH_TYPE_L4, +}; + +#define LIBIE_RX_PTYPE_NUM 154 + +extern const struct libie_rx_ptype_parsed +libie_rx_ptype_lut[LIBIE_RX_PTYPE_NUM]; + +/** + * libie_parse_rx_ptype - convert HW packet type to software bitfield structure + * @ptype: 10-bit hardware packet type value from the descriptor + * + * @libie_rx_ptype_lut must be accessed only using this wrapper. + * + * Returns the parsed bitfield struct corresponding to the provided ptype. + */ +static inline struct libie_rx_ptype_parsed libie_parse_rx_ptype(u32 ptype) +{ + if (unlikely(ptype >= LIBIE_RX_PTYPE_NUM)) + ptype = 0; + + return libie_rx_ptype_lut[ptype]; +} + +/* libie_has_*() can be used to quickly check whether the HW metadata is + * available to avoid further expensive processing such as descriptor reads. + * They already check for the corresponding netdev feature to be enabled, + * thus can be used as drop-in replacements. + */ + +static inline bool libie_has_rx_checksum(const struct net_device *dev, + struct libie_rx_ptype_parsed parsed) +{ + /* _INNER_{SCTP,TCP,UDP} are possible only when _OUTER_IPV* is set, + * it is enough to check only for the L4 type. 
+ */
+	switch (parsed.inner_prot) {
+	case LIBIE_RX_PTYPE_INNER_TCP:
+	case LIBIE_RX_PTYPE_INNER_UDP:
+	case LIBIE_RX_PTYPE_INNER_SCTP:
+		return dev->features & NETIF_F_RXCSUM;
+	default:
+		return false;
+	}
+}
+
+static inline bool libie_has_rx_hash(const struct net_device *dev,
+				     struct libie_rx_ptype_parsed parsed)
+{
+	if (parsed.payload_layer < LIBIE_RX_PTYPE_PAYLOAD_L2)
+		return false;
+
+	return dev->features & NETIF_F_RXHASH;
+}
+
+/**
+ * libie_skb_set_hash - fill in the skb hash value based on the parsed ptype
+ * @skb: skb to fill the hash in
+ * @hash: 32-bit hash value from the descriptor
+ * @parsed: parsed packet type
+ */
+static inline void libie_skb_set_hash(struct sk_buff *skb, u32 hash,
+				      struct libie_rx_ptype_parsed parsed)
+{
+	skb_set_hash(skb, hash, parsed.payload_layer);
+}
+
+#endif /* __LIBIE_RX_H */

From 0e490c5d12f78f30563e8e736143ebb16acaea10 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin 
Date: Wed, 1 Feb 2023 17:45:17 +0100
Subject: [PATCH 02/32] iavf: kill "legacy-rx" for good

Ever since build_skb() became stable, the old way of allocating an skb
to store the headers separately, which are then copied over manually,
has been slower, less flexible and thus obsolete:

* it puts higher pressure on the MM layer, since it actually allocates
  new pages, which then get split and refcount-biased (NAPI page cache);
* it implies a memcpy() of the packet headers (40+ bytes per frame);
* the actual header length is calculated via eth_get_headlen(), which
  invokes the Flow Dissector and thus wastes a bunch of CPU cycles;
* XDP makes it even more awkward, since XDP has required headroom for a
  long time now and, since multi-buffer landed, tailroom as well. Take
  a look at the ice driver, which is built around work-arounds to make
  XDP work with it.

Even on some quite low-end hardware (not a common case for 100G NICs),
it performed worse.

The only advantage "legacy-rx" had is that it didn't require any
reserved headroom or tailroom. But iavf never benefited from that, as it
always splits pages into two halves of 2k, and that saving would only
matter when striding. And again, XDP effectively removes that sole pro.

There's a train of features to land in IAVF soon: Page Pool, XDP, XSk,
multi-buffer etc. Each new one would require adding more and more Danse
Macabre for absolutely no reason, while making the hotpath less and
less efficient.

Remove the "feature" with all the related code. This includes at least
one very hot branch (typically hit on each new frame), which was either
always true or always false for at least a complete NAPI bulk of 64
frames, the whole private-flags cruft, and so on.
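
To make the contrast concrete, here is a condensed, purely illustrative
sketch of the two paths (not the driver's exact code -- argument lists
and error handling are trimmed, and the helper names are made up; the
real iavf_construct_skb()/iavf_build_skb() are in the diff below, and the
usual iavf/netdev headers are assumed):

/* "legacy-rx": allocate a fresh skb, let eth_get_headlen() (Flow
 * Dissector) find the header length, then memcpy() the headers out of
 * the DMA'ed buffer.
 */
static struct sk_buff *legacy_rx_copy_headers(struct napi_struct *napi,
					      void *va, u32 size)
{
	struct sk_buff *skb;
	u32 headlen;

	skb = napi_alloc_skb(napi, IAVF_RX_HDR_SIZE);
	if (unlikely(!skb))
		return NULL;

	headlen = size;
	if (headlen > IAVF_RX_HDR_SIZE)
		headlen = eth_get_headlen(skb->dev, va, IAVF_RX_HDR_SIZE);

	memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));
	/* the remainder is then attached as a page frag */

	return skb;
}

/* build_skb path: wrap the already-filled buffer, zero copies */
static struct sk_buff *build_skb_rx(void *va, u32 size, u32 truesize)
{
	struct sk_buff *skb;

	skb = napi_build_skb(va, truesize);
	if (unlikely(!skb))
		return NULL;

	skb_reserve(skb, IAVF_SKB_PAD);
	__skb_put(skb, size);

	return skb;
}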
Some stats: Function: add/remove: 0/2 grow/shrink: 0/7 up/down: 0/-774 (-774) RO Data: add/remove: 0/1 grow/shrink: 0/0 up/down: 0/-40 (-40) Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/iavf/iavf.h | 2 +- .../net/ethernet/intel/iavf/iavf_ethtool.c | 140 ------------------ drivers/net/ethernet/intel/iavf/iavf_main.c | 10 +- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 84 +---------- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 18 +-- .../net/ethernet/intel/iavf/iavf_virtchnl.c | 3 +- 6 files changed, 8 insertions(+), 249 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 2cdce251472c08..7dbec98d2a983f 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -294,7 +294,7 @@ struct iavf_adapter { #define IAVF_FLAG_CLIENT_NEEDS_L2_PARAMS BIT(12) #define IAVF_FLAG_PROMISC_ON BIT(13) #define IAVF_FLAG_ALLMULTI_ON BIT(14) -#define IAVF_FLAG_LEGACY_RX BIT(15) +/* BIT(15) is free, was IAVF_FLAG_LEGACY_RX */ #define IAVF_FLAG_REINIT_ITR_NEEDED BIT(16) #define IAVF_FLAG_QUEUES_DISABLED BIT(17) #define IAVF_FLAG_SETUP_NETDEV_FEATURES BIT(18) diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c index 6f171d1d85b75f..de3050c02b6ffc 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c @@ -239,29 +239,6 @@ static const struct iavf_stats iavf_gstrings_stats[] = { #define IAVF_QUEUE_STATS_LEN ARRAY_SIZE(iavf_gstrings_queue_stats) -/* For now we have one and only one private flag and it is only defined - * when we have support for the SKIP_CPU_SYNC DMA attribute. Instead - * of leaving all this code sitting around empty we will strip it unless - * our one private flag is actually available. 
- */ -struct iavf_priv_flags { - char flag_string[ETH_GSTRING_LEN]; - u32 flag; - bool read_only; -}; - -#define IAVF_PRIV_FLAG(_name, _flag, _read_only) { \ - .flag_string = _name, \ - .flag = _flag, \ - .read_only = _read_only, \ -} - -static const struct iavf_priv_flags iavf_gstrings_priv_flags[] = { - IAVF_PRIV_FLAG("legacy-rx", IAVF_FLAG_LEGACY_RX, 0), -}; - -#define IAVF_PRIV_FLAGS_STR_LEN ARRAY_SIZE(iavf_gstrings_priv_flags) - /** * iavf_get_link_ksettings - Get Link Speed and Duplex settings * @netdev: network interface device structure @@ -341,8 +318,6 @@ static int iavf_get_sset_count(struct net_device *netdev, int sset) return IAVF_STATS_LEN + (IAVF_QUEUE_STATS_LEN * 2 * netdev->real_num_tx_queues); - else if (sset == ETH_SS_PRIV_FLAGS) - return IAVF_PRIV_FLAGS_STR_LEN; else return -EINVAL; } @@ -384,24 +359,6 @@ static void iavf_get_ethtool_stats(struct net_device *netdev, rcu_read_unlock(); } -/** - * iavf_get_priv_flag_strings - Get private flag strings - * @netdev: network interface device structure - * @data: buffer for string data - * - * Builds the private flags string table - **/ -static void iavf_get_priv_flag_strings(struct net_device *netdev, u8 *data) -{ - unsigned int i; - - for (i = 0; i < IAVF_PRIV_FLAGS_STR_LEN; i++) { - snprintf(data, ETH_GSTRING_LEN, "%s", - iavf_gstrings_priv_flags[i].flag_string); - data += ETH_GSTRING_LEN; - } -} - /** * iavf_get_stat_strings - Get stat strings * @netdev: network interface device structure @@ -440,105 +397,11 @@ static void iavf_get_strings(struct net_device *netdev, u32 sset, u8 *data) case ETH_SS_STATS: iavf_get_stat_strings(netdev, data); break; - case ETH_SS_PRIV_FLAGS: - iavf_get_priv_flag_strings(netdev, data); - break; default: break; } } -/** - * iavf_get_priv_flags - report device private flags - * @netdev: network interface device structure - * - * The get string set count and the string set should be matched for each - * flag returned. Add new strings for each flag to the iavf_gstrings_priv_flags - * array. - * - * Returns a u32 bitmap of flags. - **/ -static u32 iavf_get_priv_flags(struct net_device *netdev) -{ - struct iavf_adapter *adapter = netdev_priv(netdev); - u32 i, ret_flags = 0; - - for (i = 0; i < IAVF_PRIV_FLAGS_STR_LEN; i++) { - const struct iavf_priv_flags *priv_flags; - - priv_flags = &iavf_gstrings_priv_flags[i]; - - if (priv_flags->flag & adapter->flags) - ret_flags |= BIT(i); - } - - return ret_flags; -} - -/** - * iavf_set_priv_flags - set private flags - * @netdev: network interface device structure - * @flags: bit flags to be set - **/ -static int iavf_set_priv_flags(struct net_device *netdev, u32 flags) -{ - struct iavf_adapter *adapter = netdev_priv(netdev); - u32 orig_flags, new_flags, changed_flags; - u32 i; - - orig_flags = READ_ONCE(adapter->flags); - new_flags = orig_flags; - - for (i = 0; i < IAVF_PRIV_FLAGS_STR_LEN; i++) { - const struct iavf_priv_flags *priv_flags; - - priv_flags = &iavf_gstrings_priv_flags[i]; - - if (flags & BIT(i)) - new_flags |= priv_flags->flag; - else - new_flags &= ~(priv_flags->flag); - - if (priv_flags->read_only && - ((orig_flags ^ new_flags) & ~BIT(i))) - return -EOPNOTSUPP; - } - - /* Before we finalize any flag changes, any checks which we need to - * perform to determine if the new flags will be supported should go - * here... - */ - - /* Compare and exchange the new flags into place. If we failed, that - * is if cmpxchg returns anything but the old value, this means - * something else must have modified the flags variable since we - * copied it. 
We'll just punt with an error and log something in the - * message buffer. - */ - if (cmpxchg(&adapter->flags, orig_flags, new_flags) != orig_flags) { - dev_warn(&adapter->pdev->dev, - "Unable to update adapter->flags as it was modified by another thread...\n"); - return -EAGAIN; - } - - changed_flags = orig_flags ^ new_flags; - - /* Process any additional changes needed as a result of flag changes. - * The changed_flags value reflects the list of bits that were changed - * in the code above. - */ - - /* issue a reset to force legacy-rx change to take effect */ - if (changed_flags & IAVF_FLAG_LEGACY_RX) { - if (netif_running(netdev)) { - adapter->flags |= IAVF_FLAG_RESET_NEEDED; - queue_work(adapter->wq, &adapter->reset_task); - } - } - - return 0; -} - /** * iavf_get_msglevel - Get debug message level * @netdev: network interface device structure @@ -584,7 +447,6 @@ static void iavf_get_drvinfo(struct net_device *netdev, strscpy(drvinfo->driver, iavf_driver_name, 32); strscpy(drvinfo->fw_version, "N/A", 4); strscpy(drvinfo->bus_info, pci_name(adapter->pdev), 32); - drvinfo->n_priv_flags = IAVF_PRIV_FLAGS_STR_LEN; } /** @@ -1969,8 +1831,6 @@ static const struct ethtool_ops iavf_ethtool_ops = { .get_strings = iavf_get_strings, .get_ethtool_stats = iavf_get_ethtool_stats, .get_sset_count = iavf_get_sset_count, - .get_priv_flags = iavf_get_priv_flags, - .set_priv_flags = iavf_set_priv_flags, .get_msglevel = iavf_get_msglevel, .set_msglevel = iavf_set_msglevel, .get_coalesce = iavf_get_coalesce, diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 9f2e67a6cde3db..8f387fa10b8d85 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -713,9 +713,7 @@ static void iavf_configure_rx(struct iavf_adapter *adapter) struct iavf_hw *hw = &adapter->hw; int i; - /* Legacy Rx will always default to a 2048 buffer size. */ -#if (PAGE_SIZE < 8192) - if (!(adapter->flags & IAVF_FLAG_LEGACY_RX)) { + if (PAGE_SIZE < 8192) { struct net_device *netdev = adapter->netdev; /* For jumbo frames on systems with 4K pages we have to use @@ -732,16 +730,10 @@ static void iavf_configure_rx(struct iavf_adapter *adapter) (netdev->mtu <= ETH_DATA_LEN)) rx_buf_len = IAVF_RXBUFFER_1536 - NET_IP_ALIGN; } -#endif for (i = 0; i < adapter->num_active_queues; i++) { adapter->rx_rings[i].tail = hw->hw_addr + IAVF_QRX_TAIL1(i); adapter->rx_rings[i].rx_buf_len = rx_buf_len; - - if (adapter->flags & IAVF_FLAG_LEGACY_RX) - clear_ring_build_skb_enabled(&adapter->rx_rings[i]); - else - set_ring_build_skb_enabled(&adapter->rx_rings[i]); } } diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index a83b96e9b6fcf4..a7121dc5c32b3a 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -824,17 +824,6 @@ static inline void iavf_release_rx_desc(struct iavf_ring *rx_ring, u32 val) writel(val, rx_ring->tail); } -/** - * iavf_rx_offset - Return expected offset into page to access data - * @rx_ring: Ring we are requesting offset of - * - * Returns the offset value for ring into the data buffer. - */ -static inline unsigned int iavf_rx_offset(struct iavf_ring *rx_ring) -{ - return ring_uses_build_skb(rx_ring) ? 
IAVF_SKB_PAD : 0; -} - /** * iavf_alloc_mapped_page - recycle or make a new page * @rx_ring: ring to use @@ -879,7 +868,7 @@ static bool iavf_alloc_mapped_page(struct iavf_ring *rx_ring, bi->dma = dma; bi->page = page; - bi->page_offset = iavf_rx_offset(rx_ring); + bi->page_offset = IAVF_SKB_PAD; /* initialize pagecnt_bias to 1 representing we fully own page */ bi->pagecnt_bias = 1; @@ -1220,7 +1209,7 @@ static void iavf_add_rx_frag(struct iavf_ring *rx_ring, #if (PAGE_SIZE < 8192) unsigned int truesize = iavf_rx_pg_size(rx_ring) / 2; #else - unsigned int truesize = SKB_DATA_ALIGN(size + iavf_rx_offset(rx_ring)); + unsigned int truesize = SKB_DATA_ALIGN(size + IAVF_SKB_PAD); #endif if (!size) @@ -1268,71 +1257,6 @@ static struct iavf_rx_buffer *iavf_get_rx_buffer(struct iavf_ring *rx_ring, return rx_buffer; } -/** - * iavf_construct_skb - Allocate skb and populate it - * @rx_ring: rx descriptor ring to transact packets on - * @rx_buffer: rx buffer to pull data from - * @size: size of buffer to add to skb - * - * This function allocates an skb. It then populates it with the page - * data from the current receive descriptor, taking care to set up the - * skb correctly. - */ -static struct sk_buff *iavf_construct_skb(struct iavf_ring *rx_ring, - struct iavf_rx_buffer *rx_buffer, - unsigned int size) -{ - void *va; -#if (PAGE_SIZE < 8192) - unsigned int truesize = iavf_rx_pg_size(rx_ring) / 2; -#else - unsigned int truesize = SKB_DATA_ALIGN(size); -#endif - unsigned int headlen; - struct sk_buff *skb; - - if (!rx_buffer) - return NULL; - /* prefetch first cache line of first page */ - va = page_address(rx_buffer->page) + rx_buffer->page_offset; - net_prefetch(va); - - /* allocate a skb to store the frags */ - skb = __napi_alloc_skb(&rx_ring->q_vector->napi, - IAVF_RX_HDR_SIZE, - GFP_ATOMIC | __GFP_NOWARN); - if (unlikely(!skb)) - return NULL; - - /* Determine available headroom for copy */ - headlen = size; - if (headlen > IAVF_RX_HDR_SIZE) - headlen = eth_get_headlen(skb->dev, va, IAVF_RX_HDR_SIZE); - - /* align pull length to size of long to optimize memcpy performance */ - memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long))); - - /* update all of the pointers */ - size -= headlen; - if (size) { - skb_add_rx_frag(skb, 0, rx_buffer->page, - rx_buffer->page_offset + headlen, - size, truesize); - - /* buffer is used by skb, update page_offset */ -#if (PAGE_SIZE < 8192) - rx_buffer->page_offset ^= truesize; -#else - rx_buffer->page_offset += truesize; -#endif - } else { - /* buffer is unused, reset bias back to rx_buffer */ - rx_buffer->pagecnt_bias++; - } - - return skb; -} - /** * iavf_build_skb - Build skb around an existing buffer * @rx_ring: Rx descriptor ring to transact packets on @@ -1505,10 +1429,8 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) /* retrieve a buffer from the ring */ if (skb) iavf_add_rx_frag(rx_ring, rx_buffer, skb, size); - else if (ring_uses_build_skb(rx_ring)) - skb = iavf_build_skb(rx_ring, rx_buffer, size); else - skb = iavf_construct_skb(rx_ring, rx_buffer, size); + skb = iavf_build_skb(rx_ring, rx_buffer, size); /* exit if we failed to retrieve a buffer */ if (!skb) { diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index 2624bf6d009e36..234e189c198755 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -362,7 +362,8 @@ struct iavf_ring { u16 flags; #define IAVF_TXR_FLAGS_WB_ON_ITR BIT(0) -#define 
IAVF_RXR_FLAGS_BUILD_SKB_ENABLED	BIT(1)
+/* BIT(1) is free, was IAVF_RXR_FLAGS_BUILD_SKB_ENABLED */
+/* BIT(2) is free */
 #define IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1	BIT(3)
 #define IAVF_TXR_FLAGS_VLAN_TAG_LOC_L2TAG2	BIT(4)
 #define IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2	BIT(5)
@@ -393,21 +394,6 @@ struct iavf_ring {
 	 */
 } ____cacheline_internodealigned_in_smp;
 
-static inline bool ring_uses_build_skb(struct iavf_ring *ring)
-{
-	return !!(ring->flags & IAVF_RXR_FLAGS_BUILD_SKB_ENABLED);
-}
-
-static inline void set_ring_build_skb_enabled(struct iavf_ring *ring)
-{
-	ring->flags |= IAVF_RXR_FLAGS_BUILD_SKB_ENABLED;
-}
-
-static inline void clear_ring_build_skb_enabled(struct iavf_ring *ring)
-{
-	ring->flags &= ~IAVF_RXR_FLAGS_BUILD_SKB_ENABLED;
-}
-
 #define IAVF_ITR_ADAPTIVE_MIN_INC	0x0002
 #define IAVF_ITR_ADAPTIVE_MIN_USECS	0x0002
 #define IAVF_ITR_ADAPTIVE_MAX_USECS	0x007e
diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
index 4e17d006c52d46..c2e328ec5af8f0 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
+++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c
@@ -290,8 +290,7 @@ void iavf_configure_queues(struct iavf_adapter *adapter)
 		return;
 
 	/* Limit maximum frame size when jumbo frames is not enabled */
-	if (!(adapter->flags & IAVF_FLAG_LEGACY_RX) &&
-	    (adapter->netdev->mtu <= ETH_DATA_LEN))
+	if (adapter->netdev->mtu <= ETH_DATA_LEN)
 		max_frame = IAVF_RXBUFFER_1536 - NET_IP_ALIGN;
 
 	vqci->vsi_id = adapter->vsi_res->vsi_id;

From 88798b949be6225358afb87a8b45cb0dfd1ddda9 Mon Sep 17 00:00:00 2001
From: Alexander Lobakin 
Date: Thu, 2 Feb 2023 18:17:12 +0100
Subject: [PATCH 03/32] iavf: optimize Rx buffer allocation a bunch

The Rx hotpath code of IAVF is not well optimized, to be honest. Before
doing any further buffer model changes, shake it up a bit. Notably:

1. Cache more variables on the stack.
   DMA device, Rx page size, NTC -- these are the things used most
   commonly throughout the hotpath, often in loops on each iteration.
   Instead of fetching (or even calculating, as with the page size) them
   from the ring all the time, cache them on the stack at the beginning
   of the NAPI polling callback. NTC will be written back at the end,
   the rest are used read-only, so no sync is needed.
2. Don't move the recycled buffers around the ring.
   The idea of handing the page of the just-recycled buffer over to a
   different buffer -- the first one that needs to be allocated -- on
   each new frame is fundamentally wrong. It involves a bunch of
   fetches, branches and writes (and one Rx buffer struct is at least
   32 bytes) where they're completely unneeded, and gains nothing: the
   result is the same as if we recycled the buffer in place, at the same
   position where it was used. So drop this and let the main refilling
   function take care of all the buffers which were processed and now
   need to be recycled/refilled.
3. Don't allocate with %GFP_ATOMIC on ifup.
   This involved introducing the @gfp parameter to a couple of
   functions. It doesn't change anything for the Rx -> softirq path.
4. 1 budget unit == 1 descriptor, not skb.
   The budget accounting could underestimate the work done when
   receiving a lot of fragmented frames. If each of them consists of
   2 frags, we'd have processed 64 descriptors by the point where we
   pass the 32nd skb to the stack. But the driver would count that as
   only half the work, which could make NAPI re-enable interrupts
   prematurely and create unnecessary CPU load.
5. Shortcut the !size case.
It's super rare, but possible -- for example, if the last buffer of the fragmented frame contained only FCS, which was then stripped by the HW. Instead of checking for size several times when processing, quickly reuse the buffer and jump to the skb fields part. 6. Refill the ring after finishing the polling loop. Previously, the loop wasn't starting a new iteration after the 64th desc, meaning that we were always leaving 16 buffers non-refilled until the next NAPI poll. It's better to refill them while they're still hot, so do that right after exiting the loop as well. For a full cycle of 64 descs, there will be 4 refills of 16 descs from now on. Function: add/remove: 4/2 grow/shrink: 0/5 up/down: 473/-647 (-174) + up to 2% performance. Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/iavf/iavf_main.c | 2 +- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 259 +++++++++----------- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 3 +- 3 files changed, 114 insertions(+), 150 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 8f387fa10b8d85..a497acd96385de 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1237,7 +1237,7 @@ static void iavf_configure(struct iavf_adapter *adapter) for (i = 0; i < adapter->num_active_queues; i++) { struct iavf_ring *ring = &adapter->rx_rings[i]; - iavf_alloc_rx_buffers(ring, IAVF_DESC_UNUSED(ring)); + iavf_alloc_rx_buffers(ring); } } diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index a7121dc5c32b3a..fd08ce67380ee2 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -736,7 +736,6 @@ void iavf_clean_rx_ring(struct iavf_ring *rx_ring) /* Zero out the descriptor ring */ memset(rx_ring->desc, 0, rx_ring->size); - rx_ring->next_to_alloc = 0; rx_ring->next_to_clean = 0; rx_ring->next_to_use = 0; } @@ -792,7 +791,6 @@ int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring) goto err; } - rx_ring->next_to_alloc = 0; rx_ring->next_to_clean = 0; rx_ring->next_to_use = 0; @@ -812,9 +810,6 @@ static inline void iavf_release_rx_desc(struct iavf_ring *rx_ring, u32 val) { rx_ring->next_to_use = val; - /* update next to alloc since we have filled the ring */ - rx_ring->next_to_alloc = val; - /* Force memory writes to complete before letting h/w * know there are new descriptors to fetch. (Only * applicable for weak-ordered memory model archs, @@ -828,12 +823,17 @@ static inline void iavf_release_rx_desc(struct iavf_ring *rx_ring, u32 val) * iavf_alloc_mapped_page - recycle or make a new page * @rx_ring: ring to use * @bi: rx_buffer struct to modify + * @dev: device used for DMA mapping + * @order: page order to allocate + * @gfp: GFP mask to allocate page * * Returns true if the page was successfully allocated or * reused. 
**/ static bool iavf_alloc_mapped_page(struct iavf_ring *rx_ring, - struct iavf_rx_buffer *bi) + struct iavf_rx_buffer *bi, + struct device *dev, u32 order, + gfp_t gfp) { struct page *page = bi->page; dma_addr_t dma; @@ -845,23 +845,21 @@ static bool iavf_alloc_mapped_page(struct iavf_ring *rx_ring, } /* alloc new page for storage */ - page = dev_alloc_pages(iavf_rx_pg_order(rx_ring)); + page = __dev_alloc_pages(gfp, order); if (unlikely(!page)) { rx_ring->rx_stats.alloc_page_failed++; return false; } /* map page for use */ - dma = dma_map_page_attrs(rx_ring->dev, page, 0, - iavf_rx_pg_size(rx_ring), - DMA_FROM_DEVICE, - IAVF_RX_DMA_ATTR); + dma = dma_map_page_attrs(dev, page, 0, PAGE_SIZE << order, + DMA_FROM_DEVICE, IAVF_RX_DMA_ATTR); /* if mapping failed free memory back to system since * there isn't much point in holding memory we can't use */ - if (dma_mapping_error(rx_ring->dev, dma)) { - __free_pages(page, iavf_rx_pg_order(rx_ring)); + if (dma_mapping_error(dev, dma)) { + __free_pages(page, order); rx_ring->rx_stats.alloc_page_failed++; return false; } @@ -898,32 +896,36 @@ static void iavf_receive_skb(struct iavf_ring *rx_ring, } /** - * iavf_alloc_rx_buffers - Replace used receive buffers + * __iavf_alloc_rx_buffers - Replace used receive buffers * @rx_ring: ring to place buffers on - * @cleaned_count: number of buffers to replace + * @to_refill: number of buffers to replace + * @gfp: GFP mask to allocate pages * - * Returns false if all allocations were successful, true if any fail + * Returns 0 if all allocations were successful or the number of buffers left + * to refill in case of an allocation failure. **/ -bool iavf_alloc_rx_buffers(struct iavf_ring *rx_ring, u16 cleaned_count) +static u32 __iavf_alloc_rx_buffers(struct iavf_ring *rx_ring, u32 to_refill, + gfp_t gfp) { - u16 ntu = rx_ring->next_to_use; + u32 order = iavf_rx_pg_order(rx_ring); + struct device *dev = rx_ring->dev; + u32 ntu = rx_ring->next_to_use; union iavf_rx_desc *rx_desc; struct iavf_rx_buffer *bi; /* do nothing if no valid netdev defined */ - if (!rx_ring->netdev || !cleaned_count) - return false; + if (unlikely(!rx_ring->netdev || !to_refill)) + return 0; rx_desc = IAVF_RX_DESC(rx_ring, ntu); bi = &rx_ring->rx_bi[ntu]; do { - if (!iavf_alloc_mapped_page(rx_ring, bi)) - goto no_buffers; + if (!iavf_alloc_mapped_page(rx_ring, bi, dev, order, gfp)) + break; /* sync the buffer for use by the device */ - dma_sync_single_range_for_device(rx_ring->dev, bi->dma, - bi->page_offset, + dma_sync_single_range_for_device(dev, bi->dma, bi->page_offset, rx_ring->rx_buf_len, DMA_FROM_DEVICE); @@ -943,23 +945,17 @@ bool iavf_alloc_rx_buffers(struct iavf_ring *rx_ring, u16 cleaned_count) /* clear the status bits for the next_to_use descriptor */ rx_desc->wb.qword1.status_error_len = 0; - - cleaned_count--; - } while (cleaned_count); + } while (--to_refill); if (rx_ring->next_to_use != ntu) iavf_release_rx_desc(rx_ring, ntu); - return false; - -no_buffers: - if (rx_ring->next_to_use != ntu) - iavf_release_rx_desc(rx_ring, ntu); + return to_refill; +} - /* make sure to come back via polling to try again after - * allocation failure - */ - return true; +void iavf_alloc_rx_buffers(struct iavf_ring *rxr) +{ + __iavf_alloc_rx_buffers(rxr, IAVF_DESC_UNUSED(rxr), GFP_KERNEL); } /** @@ -1104,32 +1100,6 @@ static bool iavf_cleanup_headers(struct iavf_ring *rx_ring, struct sk_buff *skb) return false; } -/** - * iavf_reuse_rx_page - page flip buffer and store it back on the ring - * @rx_ring: rx descriptor ring to store buffers on - 
* @old_buff: donor buffer to have page reused - * - * Synchronizes page for reuse by the adapter - **/ -static void iavf_reuse_rx_page(struct iavf_ring *rx_ring, - struct iavf_rx_buffer *old_buff) -{ - struct iavf_rx_buffer *new_buff; - u16 nta = rx_ring->next_to_alloc; - - new_buff = &rx_ring->rx_bi[nta]; - - /* update, and store next to alloc */ - nta++; - rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0; - - /* transfer page from old buffer to new buffer */ - new_buff->dma = old_buff->dma; - new_buff->page = old_buff->page; - new_buff->page_offset = old_buff->page_offset; - new_buff->pagecnt_bias = old_buff->pagecnt_bias; -} - /** * iavf_can_reuse_rx_page - Determine if this page can be reused by * the adapter for another receive @@ -1191,30 +1161,26 @@ static bool iavf_can_reuse_rx_page(struct iavf_rx_buffer *rx_buffer) /** * iavf_add_rx_frag - Add contents of Rx buffer to sk_buff - * @rx_ring: rx descriptor ring to transact packets on - * @rx_buffer: buffer containing page to add * @skb: sk_buff to place the data into + * @rx_buffer: buffer containing page to add * @size: packet length from rx_desc + * @pg_size: Rx buffer page size * * This function will add the data contained in rx_buffer->page to the skb. * It will just attach the page as a frag to the skb. * * The function will then update the page offset. **/ -static void iavf_add_rx_frag(struct iavf_ring *rx_ring, +static void iavf_add_rx_frag(struct sk_buff *skb, struct iavf_rx_buffer *rx_buffer, - struct sk_buff *skb, - unsigned int size) + u32 size, u32 pg_size) { #if (PAGE_SIZE < 8192) - unsigned int truesize = iavf_rx_pg_size(rx_ring) / 2; + unsigned int truesize = pg_size / 2; #else unsigned int truesize = SKB_DATA_ALIGN(size + IAVF_SKB_PAD); #endif - if (!size) - return; - skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buffer->page, rx_buffer->page_offset, size, truesize); @@ -1224,63 +1190,47 @@ static void iavf_add_rx_frag(struct iavf_ring *rx_ring, #else rx_buffer->page_offset += truesize; #endif + + /* We have pulled a buffer for use, so decrement pagecnt_bias */ + rx_buffer->pagecnt_bias--; } /** - * iavf_get_rx_buffer - Fetch Rx buffer and synchronize data for use - * @rx_ring: rx descriptor ring to transact packets on - * @size: size of buffer to add to skb + * iavf_sync_rx_buffer - Synchronize received data for use + * @dev: device used for DMA mapping + * @buf: Rx buffer containing the data + * @size: size of the received data * - * This function will pull an Rx buffer from the ring and synchronize it - * for use by the CPU. + * This function will synchronize the Rx buffer for use by the CPU. 
*/ -static struct iavf_rx_buffer *iavf_get_rx_buffer(struct iavf_ring *rx_ring, - const unsigned int size) +static void iavf_sync_rx_buffer(struct device *dev, struct iavf_rx_buffer *buf, + u32 size) { - struct iavf_rx_buffer *rx_buffer; - - rx_buffer = &rx_ring->rx_bi[rx_ring->next_to_clean]; - prefetchw(rx_buffer->page); - if (!size) - return rx_buffer; - - /* we are reusing so sync this buffer for CPU use */ - dma_sync_single_range_for_cpu(rx_ring->dev, - rx_buffer->dma, - rx_buffer->page_offset, - size, + dma_sync_single_range_for_cpu(dev, buf->dma, buf->page_offset, size, DMA_FROM_DEVICE); - - /* We have pulled a buffer for use, so decrement pagecnt_bias */ - rx_buffer->pagecnt_bias--; - - return rx_buffer; } /** * iavf_build_skb - Build skb around an existing buffer - * @rx_ring: Rx descriptor ring to transact packets on - * @rx_buffer: Rx buffer to pull data from - * @size: size of buffer to add to skb + * @rx_buffer: Rx buffer with the data + * @size: size of the data + * @pg_size: size of the Rx page * * This function builds an skb around an existing Rx buffer, taking care * to set up the skb correctly and avoid any memcpy overhead. */ -static struct sk_buff *iavf_build_skb(struct iavf_ring *rx_ring, - struct iavf_rx_buffer *rx_buffer, - unsigned int size) +static struct sk_buff *iavf_build_skb(struct iavf_rx_buffer *rx_buffer, + u32 size, u32 pg_size) { void *va; #if (PAGE_SIZE < 8192) - unsigned int truesize = iavf_rx_pg_size(rx_ring) / 2; + unsigned int truesize = pg_size / 2; #else unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) + SKB_DATA_ALIGN(IAVF_SKB_PAD + size); #endif struct sk_buff *skb; - if (!rx_buffer || !size) - return NULL; /* prefetch first cache line of first page */ va = page_address(rx_buffer->page) + rx_buffer->page_offset; net_prefetch(va); @@ -1301,36 +1251,33 @@ static struct sk_buff *iavf_build_skb(struct iavf_ring *rx_ring, rx_buffer->page_offset += truesize; #endif + rx_buffer->pagecnt_bias--; + return skb; } /** - * iavf_put_rx_buffer - Clean up used buffer and either recycle or free + * iavf_put_rx_buffer - Recycle or free used buffer * @rx_ring: rx descriptor ring to transact packets on - * @rx_buffer: rx buffer to pull data from + * @dev: device used for DMA mapping + * @rx_buffer: Rx buffer to handle + * @pg_size: Rx page size * - * This function will clean up the contents of the rx_buffer. It will - * either recycle the buffer or unmap it and free the associated resources. + * Either recycle the buffer if possible or unmap and free the page. 
*/ -static void iavf_put_rx_buffer(struct iavf_ring *rx_ring, - struct iavf_rx_buffer *rx_buffer) +static void iavf_put_rx_buffer(struct iavf_ring *rx_ring, struct device *dev, + struct iavf_rx_buffer *rx_buffer, u32 pg_size) { - if (!rx_buffer) - return; - if (iavf_can_reuse_rx_page(rx_buffer)) { - /* hand second half of page back to the ring */ - iavf_reuse_rx_page(rx_ring, rx_buffer); rx_ring->rx_stats.page_reuse_count++; - } else { - /* we are not reusing the buffer so unmap it */ - dma_unmap_page_attrs(rx_ring->dev, rx_buffer->dma, - iavf_rx_pg_size(rx_ring), - DMA_FROM_DEVICE, IAVF_RX_DMA_ATTR); - __page_frag_cache_drain(rx_buffer->page, - rx_buffer->pagecnt_bias); + return; } + /* we are not reusing the buffer so unmap it */ + dma_unmap_page_attrs(dev, rx_buffer->dma, pg_size, + DMA_FROM_DEVICE, IAVF_RX_DMA_ATTR); + __page_frag_cache_drain(rx_buffer->page, rx_buffer->pagecnt_bias); + /* clear contents of buffer_info */ rx_buffer->page = NULL; } @@ -1350,14 +1297,6 @@ static bool iavf_is_non_eop(struct iavf_ring *rx_ring, union iavf_rx_desc *rx_desc, struct sk_buff *skb) { - u32 ntc = rx_ring->next_to_clean + 1; - - /* fetch, update, and store next to clean */ - ntc = (ntc < rx_ring->count) ? ntc : 0; - rx_ring->next_to_clean = ntc; - - prefetch(IAVF_RX_DESC(rx_ring, ntc)); - /* if we are the last buffer then there is nothing else to do */ #define IAVF_RXD_EOF BIT(IAVF_RX_DESC_STATUS_EOF_SHIFT) if (likely(iavf_test_staterr(rx_desc, IAVF_RXD_EOF))) @@ -1383,11 +1322,16 @@ static bool iavf_is_non_eop(struct iavf_ring *rx_ring, static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) { unsigned int total_rx_bytes = 0, total_rx_packets = 0; + const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; + u32 to_refill = IAVF_DESC_UNUSED(rx_ring); + u32 pg_size = iavf_rx_pg_size(rx_ring); struct sk_buff *skb = rx_ring->skb; - u16 cleaned_count = IAVF_DESC_UNUSED(rx_ring); - bool failure = false; + struct device *dev = rx_ring->dev; + u32 ntc = rx_ring->next_to_clean; + u32 ring_size = rx_ring->count; + u32 cleaned_count = 0; - while (likely(total_rx_packets < (unsigned int)budget)) { + while (likely(cleaned_count < budget)) { struct iavf_rx_buffer *rx_buffer; union iavf_rx_desc *rx_desc; unsigned int size; @@ -1396,13 +1340,11 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) u64 qword; /* return some buffers to hardware, one at a time is too slow */ - if (cleaned_count >= IAVF_RX_BUFFER_WRITE) { - failure = failure || - iavf_alloc_rx_buffers(rx_ring, cleaned_count); - cleaned_count = 0; - } + if (to_refill >= IAVF_RX_BUFFER_WRITE) + to_refill = __iavf_alloc_rx_buffers(rx_ring, to_refill, + gfp); - rx_desc = IAVF_RX_DESC(rx_ring, rx_ring->next_to_clean); + rx_desc = IAVF_RX_DESC(rx_ring, ntc); /* status_error_len will always be zero for unused descriptors * because it's cleared in cleanup, and overlaps with hdr_addr @@ -1424,24 +1366,38 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) IAVF_RXD_QW1_LENGTH_PBUF_SHIFT; iavf_trace(clean_rx_irq, rx_ring, rx_desc, skb); - rx_buffer = iavf_get_rx_buffer(rx_ring, size); + rx_buffer = &rx_ring->rx_bi[ntc]; + + /* Very rare, but possible case. The most common reason: + * the last fragment contained FCS only, which was then + * stripped by the HW. 
+ */
+		if (unlikely(!size))
+			goto skip_data;
+
+		iavf_sync_rx_buffer(dev, rx_buffer, size);
 
 		/* retrieve a buffer from the ring */
 		if (skb)
-			iavf_add_rx_frag(rx_ring, rx_buffer, skb, size);
+			iavf_add_rx_frag(skb, rx_buffer, size, pg_size);
 		else
-			skb = iavf_build_skb(rx_ring, rx_buffer, size);
+			skb = iavf_build_skb(rx_buffer, size, pg_size);
 
 		/* exit if we failed to retrieve a buffer */
 		if (!skb) {
 			rx_ring->rx_stats.alloc_buff_failed++;
-			if (rx_buffer && size)
-				rx_buffer->pagecnt_bias++;
 			break;
 		}
 
-		iavf_put_rx_buffer(rx_ring, rx_buffer);
+skip_data:
+		iavf_put_rx_buffer(rx_ring, dev, rx_buffer, pg_size);
+
 		cleaned_count++;
+		to_refill++;
+		if (unlikely(++ntc == ring_size))
+			ntc = 0;
+
+		prefetch(IAVF_RX_DESC(rx_ring, ntc));
 
 		if (iavf_is_non_eop(rx_ring, rx_desc, skb))
 			continue;
@@ -1488,8 +1444,18 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget)
 		total_rx_packets++;
 	}
 
+	rx_ring->next_to_clean = ntc;
 	rx_ring->skb = skb;
 
+	if (to_refill >= IAVF_RX_BUFFER_WRITE) {
+		to_refill = __iavf_alloc_rx_buffers(rx_ring, to_refill, gfp);
+		/* guarantee a trip back through this routine if there was
+		 * a failure
+		 */
+		if (unlikely(to_refill))
+			cleaned_count = budget;
+	}
+
 	u64_stats_update_begin(&rx_ring->syncp);
 	rx_ring->stats.packets += total_rx_packets;
 	rx_ring->stats.bytes += total_rx_bytes;
@@ -1497,8 +1463,7 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget)
 	rx_ring->q_vector->rx.total_packets += total_rx_packets;
 	rx_ring->q_vector->rx.total_bytes += total_rx_bytes;
 
-	/* guarantee a trip back through this routine if there was a failure */
-	return failure ? budget : (int)total_rx_packets;
+	return cleaned_count;
 }
 
 static inline u32 iavf_buildreg_itr(const int type, u16 itr)
diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h
index 234e189c198755..9c6661a6edf2f2 100644
--- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h
+++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h
@@ -383,7 +383,6 @@ struct iavf_ring {
 	struct iavf_q_vector *q_vector;	/* Backreference to associated vector */
 
 	struct rcu_head rcu;		/* to avoid race on free */
-	u16 next_to_alloc;
 	struct sk_buff *skb;		/* When iavf_clean_rx_ring_irq() must
 					 * return before it sees the EOP for
 					 * the current packet, we save that skb
@@ -426,7 +425,7 @@ static inline unsigned int iavf_rx_pg_order(struct iavf_ring *ring)
 
 #define iavf_rx_pg_size(_ring) (PAGE_SIZE << iavf_rx_pg_order(_ring))
 
-bool iavf_alloc_rx_buffers(struct iavf_ring *rxr, u16 cleaned_count);
+void iavf_alloc_rx_buffers(struct iavf_ring *rxr);
 netdev_tx_t iavf_xmit_frame(struct sk_buff *skb, struct net_device *netdev);
 void iavf_clean_tx_ring(struct iavf_ring *tx_ring);
 void iavf_clean_rx_ring(struct iavf_ring *rx_ring);

From 46eb61c5cc1cdaadc5a0fa6eee73a5afef3cdfad Mon Sep 17 00:00:00 2001
From: Alexander Lobakin 
Date: Fri, 17 Feb 2023 18:03:26 +0100
Subject: [PATCH 04/32] iavf: remove page splitting/recycling

As an intermediate step, remove all page splitting/recycling code. Just
always allocate a new page and don't touch its refcount, so that it gets
freed by the core stack later.

The change allows us to greatly simplify certain parts of the code:

Function:                            add/remove: 2/3 grow/shrink: 0/5 up/down: 543/-963 (-420)

&iavf_rx_buffer can now retire in favor of just storing an array of
pages used for Rx. Their DMA addresses can be stored in page::dma_addr
-- use Page Pool's function for that.

No surprise that perf loses up to 30% here, but that regression will go
away once PP lands.
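
For reference, a minimal sketch of the page::dma_addr idea (illustrative
names only, assuming <net/page_pool.h> and the iavf headers are
available; the real code is in the diff below):

/* Stash the DMA address in the struct page itself via Page Pool's
 * helpers instead of keeping a dedicated iavf_rx_buffer array.
 */
static struct page *rx_map_page(struct device *dev, u32 order, gfp_t gfp)
{
	struct page *page = __dev_alloc_pages(gfp, order);
	dma_addr_t dma;

	if (unlikely(!page))
		return NULL;

	dma = dma_map_page_attrs(dev, page, 0, PAGE_SIZE << order,
				 DMA_FROM_DEVICE, IAVF_RX_DMA_ATTR);
	if (dma_mapping_error(dev, dma)) {
		__free_pages(page, order);
		return NULL;
	}

	/* reuses page::dma_addr, no extra per-buffer metadata needed */
	page_pool_set_dma_addr(page, dma);

	return page;
}

/* on completion, the address comes straight back from the page */
static void rx_sync_for_cpu(struct device *dev, struct page *page, u32 size)
{
	dma_sync_single_range_for_cpu(dev, page_pool_get_dma_addr(page),
				      IAVF_SKB_PAD, size, DMA_FROM_DEVICE);
}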
Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/iavf/iavf_main.c | 2 +- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 279 ++++++-------------- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 17 +- 3 files changed, 87 insertions(+), 211 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index a497acd96385de..f7c585d10834ce 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1237,7 +1237,7 @@ static void iavf_configure(struct iavf_adapter *adapter) for (i = 0; i < adapter->num_active_queues; i++) { struct iavf_ring *ring = &adapter->rx_rings[i]; - iavf_alloc_rx_buffers(ring); + iavf_alloc_rx_pages(ring); } } diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index fd08ce67380ee2..a761f3e3d7ccce 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -690,11 +690,10 @@ int iavf_setup_tx_descriptors(struct iavf_ring *tx_ring) **/ void iavf_clean_rx_ring(struct iavf_ring *rx_ring) { - unsigned long bi_size; u16 i; /* ring already cleared, nothing to do */ - if (!rx_ring->rx_bi) + if (!rx_ring->rx_pages) return; if (rx_ring->skb) { @@ -704,38 +703,30 @@ void iavf_clean_rx_ring(struct iavf_ring *rx_ring) /* Free all the Rx ring sk_buffs */ for (i = 0; i < rx_ring->count; i++) { - struct iavf_rx_buffer *rx_bi = &rx_ring->rx_bi[i]; + struct page *page = rx_ring->rx_pages[i]; + dma_addr_t dma; - if (!rx_bi->page) + if (!page) continue; + dma = page_pool_get_dma_addr(page); + /* Invalidate cache lines that may have been written to by * device so that we avoid corrupting memory. */ - dma_sync_single_range_for_cpu(rx_ring->dev, - rx_bi->dma, - rx_bi->page_offset, + dma_sync_single_range_for_cpu(rx_ring->dev, dma, IAVF_SKB_PAD, rx_ring->rx_buf_len, DMA_FROM_DEVICE); /* free resources associated with mapping */ - dma_unmap_page_attrs(rx_ring->dev, rx_bi->dma, + dma_unmap_page_attrs(rx_ring->dev, dma, iavf_rx_pg_size(rx_ring), DMA_FROM_DEVICE, IAVF_RX_DMA_ATTR); - __page_frag_cache_drain(rx_bi->page, rx_bi->pagecnt_bias); - - rx_bi->page = NULL; - rx_bi->page_offset = 0; + __free_pages(page, iavf_rx_pg_order(rx_ring)); } - bi_size = sizeof(struct iavf_rx_buffer) * rx_ring->count; - memset(rx_ring->rx_bi, 0, bi_size); - - /* Zero out the descriptor ring */ - memset(rx_ring->desc, 0, rx_ring->size); - rx_ring->next_to_clean = 0; rx_ring->next_to_use = 0; } @@ -749,8 +740,8 @@ void iavf_clean_rx_ring(struct iavf_ring *rx_ring) void iavf_free_rx_resources(struct iavf_ring *rx_ring) { iavf_clean_rx_ring(rx_ring); - kfree(rx_ring->rx_bi); - rx_ring->rx_bi = NULL; + kfree(rx_ring->rx_pages); + rx_ring->rx_pages = NULL; if (rx_ring->desc) { dma_free_coherent(rx_ring->dev, rx_ring->size, @@ -768,14 +759,13 @@ void iavf_free_rx_resources(struct iavf_ring *rx_ring) int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring) { struct device *dev = rx_ring->dev; - int bi_size; /* warn if we are about to overwrite the pointer */ - WARN_ON(rx_ring->rx_bi); - bi_size = sizeof(struct iavf_rx_buffer) * rx_ring->count; - rx_ring->rx_bi = kzalloc(bi_size, GFP_KERNEL); - if (!rx_ring->rx_bi) - goto err; + WARN_ON(rx_ring->rx_pages); + rx_ring->rx_pages = kcalloc(rx_ring->count, sizeof(*rx_ring->rx_pages), + GFP_KERNEL); + if (!rx_ring->rx_pages) + return -ENOMEM; u64_stats_init(&rx_ring->syncp); @@ -796,8 +786,9 @@ int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring) return 0; err: - 
kfree(rx_ring->rx_bi); - rx_ring->rx_bi = NULL; + kfree(rx_ring->rx_pages); + rx_ring->rx_pages = NULL; + return -ENOMEM; } @@ -820,36 +811,23 @@ static inline void iavf_release_rx_desc(struct iavf_ring *rx_ring, u32 val) } /** - * iavf_alloc_mapped_page - recycle or make a new page - * @rx_ring: ring to use - * @bi: rx_buffer struct to modify + * iavf_alloc_mapped_page - allocate and map a new page * @dev: device used for DMA mapping * @order: page order to allocate * @gfp: GFP mask to allocate page * - * Returns true if the page was successfully allocated or - * reused. + * Returns a new &page if the it was successfully allocated, %NULL otherwise. **/ -static bool iavf_alloc_mapped_page(struct iavf_ring *rx_ring, - struct iavf_rx_buffer *bi, - struct device *dev, u32 order, - gfp_t gfp) +static struct page *iavf_alloc_mapped_page(struct device *dev, u32 order, + gfp_t gfp) { - struct page *page = bi->page; + struct page *page; dma_addr_t dma; - /* since we are recycling buffers we should seldom need to alloc */ - if (likely(page)) { - rx_ring->rx_stats.page_reuse_count++; - return true; - } - /* alloc new page for storage */ page = __dev_alloc_pages(gfp, order); - if (unlikely(!page)) { - rx_ring->rx_stats.alloc_page_failed++; - return false; - } + if (unlikely(!page)) + return NULL; /* map page for use */ dma = dma_map_page_attrs(dev, page, 0, PAGE_SIZE << order, @@ -860,18 +838,12 @@ static bool iavf_alloc_mapped_page(struct iavf_ring *rx_ring, */ if (dma_mapping_error(dev, dma)) { __free_pages(page, order); - rx_ring->rx_stats.alloc_page_failed++; - return false; + return NULL; } - bi->dma = dma; - bi->page = page; - bi->page_offset = IAVF_SKB_PAD; - - /* initialize pagecnt_bias to 1 representing we fully own page */ - bi->pagecnt_bias = 1; + page_pool_set_dma_addr(page, dma); - return true; + return page; } /** @@ -896,7 +868,7 @@ static void iavf_receive_skb(struct iavf_ring *rx_ring, } /** - * __iavf_alloc_rx_buffers - Replace used receive buffers + * __iavf_alloc_rx_pages - Replace used receive pages * @rx_ring: ring to place buffers on * @to_refill: number of buffers to replace * @gfp: GFP mask to allocate pages @@ -904,42 +876,47 @@ static void iavf_receive_skb(struct iavf_ring *rx_ring, * Returns 0 if all allocations were successful or the number of buffers left * to refill in case of an allocation failure. **/ -static u32 __iavf_alloc_rx_buffers(struct iavf_ring *rx_ring, u32 to_refill, - gfp_t gfp) +static u32 __iavf_alloc_rx_pages(struct iavf_ring *rx_ring, u32 to_refill, + gfp_t gfp) { u32 order = iavf_rx_pg_order(rx_ring); struct device *dev = rx_ring->dev; u32 ntu = rx_ring->next_to_use; union iavf_rx_desc *rx_desc; - struct iavf_rx_buffer *bi; /* do nothing if no valid netdev defined */ if (unlikely(!rx_ring->netdev || !to_refill)) return 0; rx_desc = IAVF_RX_DESC(rx_ring, ntu); - bi = &rx_ring->rx_bi[ntu]; do { - if (!iavf_alloc_mapped_page(rx_ring, bi, dev, order, gfp)) + struct page *page; + dma_addr_t dma; + + page = iavf_alloc_mapped_page(dev, order, gfp); + if (!page) { + rx_ring->rx_stats.alloc_page_failed++; break; + } + + rx_ring->rx_pages[ntu] = page; + dma = page_pool_get_dma_addr(page); /* sync the buffer for use by the device */ - dma_sync_single_range_for_device(dev, bi->dma, bi->page_offset, + dma_sync_single_range_for_device(dev, dma, IAVF_SKB_PAD, rx_ring->rx_buf_len, DMA_FROM_DEVICE); /* Refresh the desc even if buffer_addrs didn't change * because each write-back erases this info. 
*/ - rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset); + rx_desc->read.pkt_addr = cpu_to_le64(dma + IAVF_SKB_PAD); rx_desc++; - bi++; ntu++; if (unlikely(ntu == rx_ring->count)) { rx_desc = IAVF_RX_DESC(rx_ring, 0); - bi = rx_ring->rx_bi; ntu = 0; } @@ -953,9 +930,9 @@ static u32 __iavf_alloc_rx_buffers(struct iavf_ring *rx_ring, u32 to_refill, return to_refill; } -void iavf_alloc_rx_buffers(struct iavf_ring *rxr) +void iavf_alloc_rx_pages(struct iavf_ring *rxr) { - __iavf_alloc_rx_buffers(rxr, IAVF_DESC_UNUSED(rxr), GFP_KERNEL); + __iavf_alloc_rx_pages(rxr, IAVF_DESC_UNUSED(rxr), GFP_KERNEL); } /** @@ -1100,80 +1077,20 @@ static bool iavf_cleanup_headers(struct iavf_ring *rx_ring, struct sk_buff *skb) return false; } -/** - * iavf_can_reuse_rx_page - Determine if this page can be reused by - * the adapter for another receive - * - * @rx_buffer: buffer containing the page - * - * If page is reusable, rx_buffer->page_offset is adjusted to point to - * an unused region in the page. - * - * For small pages, @truesize will be a constant value, half the size - * of the memory at page. We'll attempt to alternate between high and - * low halves of the page, with one half ready for use by the hardware - * and the other half being consumed by the stack. We use the page - * ref count to determine whether the stack has finished consuming the - * portion of this page that was passed up with a previous packet. If - * the page ref count is >1, we'll assume the "other" half page is - * still busy, and this page cannot be reused. - * - * For larger pages, @truesize will be the actual space used by the - * received packet (adjusted upward to an even multiple of the cache - * line size). This will advance through the page by the amount - * actually consumed by the received packets while there is still - * space for a buffer. Each region of larger pages will be used at - * most once, after which the page will not be reused. - * - * In either case, if the page is reusable its refcount is increased. - **/ -static bool iavf_can_reuse_rx_page(struct iavf_rx_buffer *rx_buffer) -{ - unsigned int pagecnt_bias = rx_buffer->pagecnt_bias; - struct page *page = rx_buffer->page; - - /* Is any reuse possible? */ - if (!dev_page_is_reusable(page)) - return false; - -#if (PAGE_SIZE < 8192) - /* if we are only owner of page we can reuse it */ - if (unlikely((page_count(page) - pagecnt_bias) > 1)) - return false; -#else -#define IAVF_LAST_OFFSET \ - (SKB_WITH_OVERHEAD(PAGE_SIZE) - IAVF_RXBUFFER_2048) - if (rx_buffer->page_offset > IAVF_LAST_OFFSET) - return false; -#endif - - /* If we have drained the page fragment pool we need to update - * the pagecnt_bias and page count so that we fully restock the - * number of references the driver holds. - */ - if (unlikely(!pagecnt_bias)) { - page_ref_add(page, USHRT_MAX); - rx_buffer->pagecnt_bias = USHRT_MAX; - } - - return true; -} - /** * iavf_add_rx_frag - Add contents of Rx buffer to sk_buff * @skb: sk_buff to place the data into - * @rx_buffer: buffer containing page to add + * @page: page containing data to add * @size: packet length from rx_desc * @pg_size: Rx buffer page size * - * This function will add the data contained in rx_buffer->page to the skb. + * This function will add the data contained in page to the skb. * It will just attach the page as a frag to the skb. * * The function will then update the page offset. 
**/ -static void iavf_add_rx_frag(struct sk_buff *skb, - struct iavf_rx_buffer *rx_buffer, - u32 size, u32 pg_size) +static void iavf_add_rx_frag(struct sk_buff *skb, struct page *page, u32 size, + u32 pg_size) { #if (PAGE_SIZE < 8192) unsigned int truesize = pg_size / 2; @@ -1181,46 +1098,34 @@ static void iavf_add_rx_frag(struct sk_buff *skb, unsigned int truesize = SKB_DATA_ALIGN(size + IAVF_SKB_PAD); #endif - skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buffer->page, - rx_buffer->page_offset, size, truesize); - - /* page is being used so we must update the page offset */ -#if (PAGE_SIZE < 8192) - rx_buffer->page_offset ^= truesize; -#else - rx_buffer->page_offset += truesize; -#endif - - /* We have pulled a buffer for use, so decrement pagecnt_bias */ - rx_buffer->pagecnt_bias--; + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, IAVF_SKB_PAD, + size, truesize); } /** - * iavf_sync_rx_buffer - Synchronize received data for use + * iavf_sync_rx_page - Synchronize received data for use * @dev: device used for DMA mapping - * @buf: Rx buffer containing the data + * @page: Rx page containing the data * @size: size of the received data * * This function will synchronize the Rx buffer for use by the CPU. */ -static void iavf_sync_rx_buffer(struct device *dev, struct iavf_rx_buffer *buf, - u32 size) +static void iavf_sync_rx_page(struct device *dev, struct page *page, u32 size) { - dma_sync_single_range_for_cpu(dev, buf->dma, buf->page_offset, size, - DMA_FROM_DEVICE); + dma_sync_single_range_for_cpu(dev, page_pool_get_dma_addr(page), + IAVF_SKB_PAD, size, DMA_FROM_DEVICE); } /** * iavf_build_skb - Build skb around an existing buffer - * @rx_buffer: Rx buffer with the data + * @page: Rx page to with the data * @size: size of the data * @pg_size: size of the Rx page * * This function builds an skb around an existing Rx buffer, taking care * to set up the skb correctly and avoid any memcpy overhead. */ -static struct sk_buff *iavf_build_skb(struct iavf_rx_buffer *rx_buffer, - u32 size, u32 pg_size) +static struct sk_buff *iavf_build_skb(struct page *page, u32 size, u32 pg_size) { void *va; #if (PAGE_SIZE < 8192) @@ -1232,11 +1137,11 @@ static struct sk_buff *iavf_build_skb(struct iavf_rx_buffer *rx_buffer, struct sk_buff *skb; /* prefetch first cache line of first page */ - va = page_address(rx_buffer->page) + rx_buffer->page_offset; - net_prefetch(va); + va = page_address(page); + net_prefetch(va + IAVF_SKB_PAD); /* build an skb around the page buffer */ - skb = napi_build_skb(va - IAVF_SKB_PAD, truesize); + skb = napi_build_skb(va, truesize); if (unlikely(!skb)) return NULL; @@ -1244,42 +1149,21 @@ static struct sk_buff *iavf_build_skb(struct iavf_rx_buffer *rx_buffer, skb_reserve(skb, IAVF_SKB_PAD); __skb_put(skb, size); - /* buffer is used by skb, update page_offset */ -#if (PAGE_SIZE < 8192) - rx_buffer->page_offset ^= truesize; -#else - rx_buffer->page_offset += truesize; -#endif - - rx_buffer->pagecnt_bias--; - return skb; } /** - * iavf_put_rx_buffer - Recycle or free used buffer - * @rx_ring: rx descriptor ring to transact packets on + * iavf_unmap_rx_page - Unmap used page * @dev: device used for DMA mapping - * @rx_buffer: Rx buffer to handle + * @page: page to release * @pg_size: Rx page size - * - * Either recycle the buffer if possible or unmap and free the page. 
*/ -static void iavf_put_rx_buffer(struct iavf_ring *rx_ring, struct device *dev, - struct iavf_rx_buffer *rx_buffer, u32 pg_size) +static void iavf_unmap_rx_page(struct device *dev, struct page *page, + u32 pg_size) { - if (iavf_can_reuse_rx_page(rx_buffer)) { - rx_ring->rx_stats.page_reuse_count++; - return; - } - - /* we are not reusing the buffer so unmap it */ - dma_unmap_page_attrs(dev, rx_buffer->dma, pg_size, + dma_unmap_page_attrs(dev, page_pool_get_dma_addr(page), pg_size, DMA_FROM_DEVICE, IAVF_RX_DMA_ATTR); - __page_frag_cache_drain(rx_buffer->page, rx_buffer->pagecnt_bias); - - /* clear contents of buffer_info */ - rx_buffer->page = NULL; + page_pool_set_dma_addr(page, 0); } /** @@ -1332,8 +1216,8 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) u32 cleaned_count = 0; while (likely(cleaned_count < budget)) { - struct iavf_rx_buffer *rx_buffer; union iavf_rx_desc *rx_desc; + struct page *page; unsigned int size; u16 vlan_tag = 0; u8 rx_ptype; @@ -1341,8 +1225,8 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) /* return some buffers to hardware, one at a time is too slow */ if (to_refill >= IAVF_RX_BUFFER_WRITE) - to_refill = __iavf_alloc_rx_buffers(rx_ring, to_refill, - gfp); + to_refill = __iavf_alloc_rx_pages(rx_ring, to_refill, + gfp); rx_desc = IAVF_RX_DESC(rx_ring, ntc); @@ -1366,32 +1250,37 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) IAVF_RXD_QW1_LENGTH_PBUF_SHIFT; iavf_trace(clean_rx_irq, rx_ring, rx_desc, skb); - rx_buffer = &rx_ring->rx_bi[ntc]; + + page = rx_ring->rx_pages[ntc]; + rx_ring->rx_pages[ntc] = NULL; /* Very rare, but possible case. The most common reason: * the last fragment contained FCS only, which was then * stripped by the HW. */ - if (unlikely(!size)) + if (unlikely(!size)) { + iavf_unmap_rx_page(dev, page, pg_size); + __free_pages(page, get_order(pg_size)); goto skip_data; + } - iavf_sync_rx_buffer(dev, rx_buffer, size); + iavf_sync_rx_page(dev, page, size); + iavf_unmap_rx_page(dev, page, pg_size); /* retrieve a buffer from the ring */ if (skb) - iavf_add_rx_frag(skb, rx_buffer, size, pg_size); + iavf_add_rx_frag(skb, page, size, pg_size); else - skb = iavf_build_skb(rx_buffer, size, pg_size); + skb = iavf_build_skb(page, size, pg_size); /* exit if we failed to retrieve a buffer */ if (!skb) { + __free_pages(page, get_order(pg_size)); rx_ring->rx_stats.alloc_buff_failed++; break; } skip_data: - iavf_put_rx_buffer(rx_ring, dev, rx_buffer, pg_size); - cleaned_count++; to_refill++; if (unlikely(++ntc == ring_size)) @@ -1448,7 +1337,7 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) rx_ring->skb = skb; if (to_refill >= IAVF_RX_BUFFER_WRITE) { - to_refill = __iavf_alloc_rx_buffers(rx_ring, to_refill, gfp); + to_refill = __iavf_alloc_rx_pages(rx_ring, to_refill, gfp); /* guarantee a trip back through this routine if there was * a failure */ diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index 9c6661a6edf2f2..c09ac580fe84cc 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -272,17 +272,6 @@ struct iavf_tx_buffer { u32 tx_flags; }; -struct iavf_rx_buffer { - dma_addr_t dma; - struct page *page; -#if (BITS_PER_LONG > 32) || (PAGE_SIZE >= 65536) - __u32 page_offset; -#else - __u16 page_offset; -#endif - __u16 pagecnt_bias; -}; - struct iavf_queue_stats { u64 packets; u64 bytes; @@ -302,8 +291,6 @@ struct iavf_rx_queue_stats { u64 non_eop_descs; u64 
alloc_page_failed; u64 alloc_buff_failed; - u64 page_reuse_count; - u64 realloc_count; }; enum iavf_ring_state_t { @@ -331,7 +318,7 @@ struct iavf_ring { struct net_device *netdev; /* netdev ring maps to */ union { struct iavf_tx_buffer *tx_bi; - struct iavf_rx_buffer *rx_bi; + struct page **rx_pages; }; DECLARE_BITMAP(state, __IAVF_RING_STATE_NBITS); u16 queue_index; /* Queue number of ring */ @@ -425,7 +412,7 @@ static inline unsigned int iavf_rx_pg_order(struct iavf_ring *ring) #define iavf_rx_pg_size(_ring) (PAGE_SIZE << iavf_rx_pg_order(_ring)) -void iavf_alloc_rx_buffers(struct iavf_ring *rxr); +void iavf_alloc_rx_pages(struct iavf_ring *rxr); netdev_tx_t iavf_xmit_frame(struct sk_buff *skb, struct net_device *netdev); void iavf_clean_tx_ring(struct iavf_ring *tx_ring); void iavf_clean_rx_ring(struct iavf_ring *rx_ring); From 95a993b46b712af2304f5788aa7f691f07d05370 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 3 Feb 2023 17:18:57 +0100 Subject: [PATCH 05/32] iavf: always use a full order-0 page The current scheme with trying to pick the smallest buffer possible for the current MTU in order to flip/split pages is not very optimal. For example, on default MTU of 1500 it gives only 192 bytes of headroom, while XDP may require up to 258. But this also involves unnecessary code complication, which sometimes is even hard to follow. As page split is no more, always allocate order-0 pages. This optimizes performance a bit and drops some bytes off the object code. Next, always pick the maximum buffer length available for this %PAGE_SIZE to set it up in the hardware. This means it now becomes a constant value, which also has its positive impact. On x64 this means (without XDP): 4096 page 64 head, 320 tail 3712 HW buffer size 3686 max MTU w/o frags Previously, the maximum MTU w/o splitting a frame into several buffers was 3046. Increased buffer size allows us to reach the maximum frame size w/ frags supported by HW: 16382 bytes (MTU 16356). Reflect it in the netdev config as well. Relying on max single buffer size when calculating MTU was not correct. Move around a couple of fields in &iavf_ring after ::rx_buf_len removal to reduce holes and improve cache locality. Instead of providing the Rx definitions, which can and will be reused in rest of the drivers, exclusively for IAVF, do that in the libie header. Non-PP drivers could still use at least some of them and lose a couple copied lines. Function: add/remove: 0/0 grow/shrink: 3/9 up/down: 18/-265 (-247) + even reclaims a half percent of performance, nice. Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/iavf/iavf_main.c | 32 +----- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 96 +++++++--------- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 103 +----------------- drivers/net/ethernet/intel/iavf/iavf_type.h | 2 - .../net/ethernet/intel/iavf/iavf_virtchnl.c | 15 +-- include/linux/net/intel/libie/rx.h | 39 +++++++ 6 files changed, 89 insertions(+), 198 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index f7c585d10834ce..fb2bd1c423a158 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2013 - 2018 Intel Corporation. 
*/ +#include + #include "iavf.h" #include "iavf_prototype.h" #include "iavf_client.h" @@ -709,32 +711,10 @@ static void iavf_configure_tx(struct iavf_adapter *adapter) **/ static void iavf_configure_rx(struct iavf_adapter *adapter) { - unsigned int rx_buf_len = IAVF_RXBUFFER_2048; struct iavf_hw *hw = &adapter->hw; - int i; - - if (PAGE_SIZE < 8192) { - struct net_device *netdev = adapter->netdev; - /* For jumbo frames on systems with 4K pages we have to use - * an order 1 page, so we might as well increase the size - * of our Rx buffer to make better use of the available space - */ - rx_buf_len = IAVF_RXBUFFER_3072; - - /* We use a 1536 buffer size for configurations with - * standard Ethernet mtu. On x86 this gives us enough room - * for shared info and 192 bytes of padding. - */ - if (!IAVF_2K_TOO_SMALL_WITH_PADDING && - (netdev->mtu <= ETH_DATA_LEN)) - rx_buf_len = IAVF_RXBUFFER_1536 - NET_IP_ALIGN; - } - - for (i = 0; i < adapter->num_active_queues; i++) { + for (u32 i = 0; i < adapter->num_active_queues; i++) adapter->rx_rings[i].tail = hw->hw_addr + IAVF_QRX_TAIL1(i); - adapter->rx_rings[i].rx_buf_len = rx_buf_len; - } } /** @@ -2583,11 +2563,7 @@ static void iavf_init_config_adapter(struct iavf_adapter *adapter) netdev->netdev_ops = &iavf_netdev_ops; iavf_set_ethtool_ops(netdev); - netdev->watchdog_timeo = 5 * HZ; - - /* MTU range: 68 - 9710 */ - netdev->min_mtu = ETH_MIN_MTU; - netdev->max_mtu = IAVF_MAX_RXBUFFER - IAVF_PACKET_HDR_PAD; + netdev->max_mtu = LIBIE_MAX_MTU; if (!is_valid_ether_addr(adapter->hw.mac.addr)) { dev_info(&pdev->dev, "Invalid MAC address %pM, using random\n", diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index a761f3e3d7ccce..8e0e6d59cd3e10 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -301,7 +301,7 @@ static bool iavf_clean_tx_irq(struct iavf_vsi *vsi, ((j / WB_STRIDE) == 0) && (j > 0) && !test_bit(__IAVF_VSI_DOWN, vsi->state) && (IAVF_DESC_UNUSED(tx_ring) != tx_ring->count)) - tx_ring->arm_wb = true; + tx_ring->flags |= IAVF_TXRX_FLAGS_ARM_WB; } /* notify netdev of completed buffers */ @@ -714,17 +714,16 @@ void iavf_clean_rx_ring(struct iavf_ring *rx_ring) /* Invalidate cache lines that may have been written to by * device so that we avoid corrupting memory. */ - dma_sync_single_range_for_cpu(rx_ring->dev, dma, IAVF_SKB_PAD, - rx_ring->rx_buf_len, + dma_sync_single_range_for_cpu(rx_ring->dev, dma, + LIBIE_SKB_HEADROOM, + LIBIE_RX_BUF_LEN, DMA_FROM_DEVICE); /* free resources associated with mapping */ - dma_unmap_page_attrs(rx_ring->dev, dma, - iavf_rx_pg_size(rx_ring), - DMA_FROM_DEVICE, - IAVF_RX_DMA_ATTR); + dma_unmap_page_attrs(rx_ring->dev, dma, LIBIE_RX_TRUESIZE, + DMA_FROM_DEVICE, LIBIE_RX_DMA_ATTR); - __free_pages(page, iavf_rx_pg_order(rx_ring)); + __free_page(page); } rx_ring->next_to_clean = 0; @@ -813,31 +812,29 @@ static inline void iavf_release_rx_desc(struct iavf_ring *rx_ring, u32 val) /** * iavf_alloc_mapped_page - allocate and map a new page * @dev: device used for DMA mapping - * @order: page order to allocate * @gfp: GFP mask to allocate page * * Returns a new &page if the it was successfully allocated, %NULL otherwise. 
**/ -static struct page *iavf_alloc_mapped_page(struct device *dev, u32 order, - gfp_t gfp) +static struct page *iavf_alloc_mapped_page(struct device *dev, gfp_t gfp) { struct page *page; dma_addr_t dma; /* alloc new page for storage */ - page = __dev_alloc_pages(gfp, order); + page = __dev_alloc_page(gfp); if (unlikely(!page)) return NULL; /* map page for use */ - dma = dma_map_page_attrs(dev, page, 0, PAGE_SIZE << order, - DMA_FROM_DEVICE, IAVF_RX_DMA_ATTR); + dma = dma_map_page_attrs(dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE, + LIBIE_RX_DMA_ATTR); /* if mapping failed free memory back to system since * there isn't much point in holding memory we can't use */ if (dma_mapping_error(dev, dma)) { - __free_pages(page, order); + __free_page(page); return NULL; } @@ -879,7 +876,6 @@ static void iavf_receive_skb(struct iavf_ring *rx_ring, static u32 __iavf_alloc_rx_pages(struct iavf_ring *rx_ring, u32 to_refill, gfp_t gfp) { - u32 order = iavf_rx_pg_order(rx_ring); struct device *dev = rx_ring->dev; u32 ntu = rx_ring->next_to_use; union iavf_rx_desc *rx_desc; @@ -894,7 +890,7 @@ static u32 __iavf_alloc_rx_pages(struct iavf_ring *rx_ring, u32 to_refill, struct page *page; dma_addr_t dma; - page = iavf_alloc_mapped_page(dev, order, gfp); + page = iavf_alloc_mapped_page(dev, gfp); if (!page) { rx_ring->rx_stats.alloc_page_failed++; break; @@ -904,14 +900,14 @@ static u32 __iavf_alloc_rx_pages(struct iavf_ring *rx_ring, u32 to_refill, dma = page_pool_get_dma_addr(page); /* sync the buffer for use by the device */ - dma_sync_single_range_for_device(dev, dma, IAVF_SKB_PAD, - rx_ring->rx_buf_len, + dma_sync_single_range_for_device(dev, dma, LIBIE_SKB_HEADROOM, + LIBIE_RX_BUF_LEN, DMA_FROM_DEVICE); /* Refresh the desc even if buffer_addrs didn't change * because each write-back erases this info. */ - rx_desc->read.pkt_addr = cpu_to_le64(dma + IAVF_SKB_PAD); + rx_desc->read.pkt_addr = cpu_to_le64(dma + LIBIE_SKB_HEADROOM); rx_desc++; ntu++; @@ -1082,24 +1078,16 @@ static bool iavf_cleanup_headers(struct iavf_ring *rx_ring, struct sk_buff *skb) * @skb: sk_buff to place the data into * @page: page containing data to add * @size: packet length from rx_desc - * @pg_size: Rx buffer page size * * This function will add the data contained in page to the skb. * It will just attach the page as a frag to the skb. * * The function will then update the page offset. **/ -static void iavf_add_rx_frag(struct sk_buff *skb, struct page *page, u32 size, - u32 pg_size) +static void iavf_add_rx_frag(struct sk_buff *skb, struct page *page, u32 size) { -#if (PAGE_SIZE < 8192) - unsigned int truesize = pg_size / 2; -#else - unsigned int truesize = SKB_DATA_ALIGN(size + IAVF_SKB_PAD); -#endif - - skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, IAVF_SKB_PAD, - size, truesize); + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, + LIBIE_SKB_HEADROOM, size, LIBIE_RX_TRUESIZE); } /** @@ -1113,40 +1101,34 @@ static void iavf_add_rx_frag(struct sk_buff *skb, struct page *page, u32 size, static void iavf_sync_rx_page(struct device *dev, struct page *page, u32 size) { dma_sync_single_range_for_cpu(dev, page_pool_get_dma_addr(page), - IAVF_SKB_PAD, size, DMA_FROM_DEVICE); + LIBIE_SKB_HEADROOM, size, + DMA_FROM_DEVICE); } /** * iavf_build_skb - Build skb around an existing buffer * @page: Rx page to with the data * @size: size of the data - * @pg_size: size of the Rx page * * This function builds an skb around an existing Rx buffer, taking care * to set up the skb correctly and avoid any memcpy overhead. 
*/ -static struct sk_buff *iavf_build_skb(struct page *page, u32 size, u32 pg_size) +static struct sk_buff *iavf_build_skb(struct page *page, u32 size) { - void *va; -#if (PAGE_SIZE < 8192) - unsigned int truesize = pg_size / 2; -#else - unsigned int truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) + - SKB_DATA_ALIGN(IAVF_SKB_PAD + size); -#endif struct sk_buff *skb; + void *va; /* prefetch first cache line of first page */ va = page_address(page); - net_prefetch(va + IAVF_SKB_PAD); + net_prefetch(va + LIBIE_SKB_HEADROOM); /* build an skb around the page buffer */ - skb = napi_build_skb(va, truesize); + skb = napi_build_skb(va, LIBIE_RX_TRUESIZE); if (unlikely(!skb)) return NULL; /* update pointers within the skb to store the data */ - skb_reserve(skb, IAVF_SKB_PAD); + skb_reserve(skb, LIBIE_SKB_HEADROOM); __skb_put(skb, size); return skb; @@ -1156,13 +1138,12 @@ static struct sk_buff *iavf_build_skb(struct page *page, u32 size, u32 pg_size) * iavf_unmap_rx_page - Unmap used page * @dev: device used for DMA mapping * @page: page to release - * @pg_size: Rx page size */ -static void iavf_unmap_rx_page(struct device *dev, struct page *page, - u32 pg_size) +static void iavf_unmap_rx_page(struct device *dev, struct page *page) { - dma_unmap_page_attrs(dev, page_pool_get_dma_addr(page), pg_size, - DMA_FROM_DEVICE, IAVF_RX_DMA_ATTR); + dma_unmap_page_attrs(dev, page_pool_get_dma_addr(page), + LIBIE_RX_TRUESIZE, DMA_FROM_DEVICE, + LIBIE_RX_DMA_ATTR); page_pool_set_dma_addr(page, 0); } @@ -1208,7 +1189,6 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) unsigned int total_rx_bytes = 0, total_rx_packets = 0; const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; u32 to_refill = IAVF_DESC_UNUSED(rx_ring); - u32 pg_size = iavf_rx_pg_size(rx_ring); struct sk_buff *skb = rx_ring->skb; struct device *dev = rx_ring->dev; u32 ntc = rx_ring->next_to_clean; @@ -1259,23 +1239,23 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) * stripped by the HW. 
*/ if (unlikely(!size)) { - iavf_unmap_rx_page(dev, page, pg_size); - __free_pages(page, get_order(pg_size)); + iavf_unmap_rx_page(dev, page); + __free_page(page); goto skip_data; } iavf_sync_rx_page(dev, page, size); - iavf_unmap_rx_page(dev, page, pg_size); + iavf_unmap_rx_page(dev, page); /* retrieve a buffer from the ring */ if (skb) - iavf_add_rx_frag(skb, page, size, pg_size); + iavf_add_rx_frag(skb, page, size); else - skb = iavf_build_skb(page, size, pg_size); + skb = iavf_build_skb(page, size); /* exit if we failed to retrieve a buffer */ if (!skb) { - __free_pages(page, get_order(pg_size)); + __free_page(page); rx_ring->rx_stats.alloc_buff_failed++; break; } @@ -1485,8 +1465,8 @@ int iavf_napi_poll(struct napi_struct *napi, int budget) clean_complete = false; continue; } - arm_wb |= ring->arm_wb; - ring->arm_wb = false; + arm_wb |= !!(ring->flags & IAVF_TXRX_FLAGS_ARM_WB); + ring->flags &= ~IAVF_TXRX_FLAGS_ARM_WB; } /* Handle case where we are called by netpoll with a budget of 0 */ diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index c09ac580fe84cc..25459411000a66 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -81,79 +81,8 @@ enum iavf_dyn_idx_t { BIT_ULL(IAVF_FILTER_PCTYPE_NONF_UNICAST_IPV6_UDP) | \ BIT_ULL(IAVF_FILTER_PCTYPE_NONF_MULTICAST_IPV6_UDP)) -/* Supported Rx Buffer Sizes (a multiple of 128) */ -#define IAVF_RXBUFFER_256 256 -#define IAVF_RXBUFFER_1536 1536 /* 128B aligned standard Ethernet frame */ -#define IAVF_RXBUFFER_2048 2048 -#define IAVF_RXBUFFER_3072 3072 /* Used for large frames w/ padding */ -#define IAVF_MAX_RXBUFFER 9728 /* largest size for single descriptor */ - -/* NOTE: netdev_alloc_skb reserves up to 64 bytes, NET_IP_ALIGN means we - * reserve 2 more, and skb_shared_info adds an additional 384 bytes more, - * this adds up to 512 bytes of extra data meaning the smallest allocation - * we could have is 1K. - * i.e. RXBUFFER_256 --> 960 byte skb (size-1024 slab) - * i.e. RXBUFFER_512 --> 1216 byte skb (size-2048 slab) - */ -#define IAVF_RX_HDR_SIZE IAVF_RXBUFFER_256 -#define IAVF_PACKET_HDR_PAD (ETH_HLEN + ETH_FCS_LEN + (VLAN_HLEN * 2)) #define iavf_rx_desc iavf_32byte_rx_desc -#define IAVF_RX_DMA_ATTR \ - (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) - -/* Attempt to maximize the headroom available for incoming frames. We - * use a 2K buffer for receives and need 1536/1534 to store the data for - * the frame. This leaves us with 512 bytes of room. From that we need - * to deduct the space needed for the shared info and the padding needed - * to IP align the frame. - * - * Note: For cache line sizes 256 or larger this value is going to end - * up negative. In these cases we should fall back to the legacy - * receive path. - */ -#if (PAGE_SIZE < 8192) -#define IAVF_2K_TOO_SMALL_WITH_PADDING \ -((NET_SKB_PAD + IAVF_RXBUFFER_1536) > SKB_WITH_OVERHEAD(IAVF_RXBUFFER_2048)) - -static inline int iavf_compute_pad(int rx_buf_len) -{ - int page_size, pad_size; - - page_size = ALIGN(rx_buf_len, PAGE_SIZE / 2); - pad_size = SKB_WITH_OVERHEAD(page_size) - rx_buf_len; - - return pad_size; -} - -static inline int iavf_skb_pad(void) -{ - int rx_buf_len; - - /* If a 2K buffer cannot handle a standard Ethernet frame then - * optimize padding for a 3K buffer instead of a 1.5K buffer. - * - * For a 3K buffer we need to add enough padding to allow for - * tailroom due to NET_IP_ALIGN possibly shifting us out of - * cache-line alignment. 
- */ - if (IAVF_2K_TOO_SMALL_WITH_PADDING) - rx_buf_len = IAVF_RXBUFFER_3072 + SKB_DATA_ALIGN(NET_IP_ALIGN); - else - rx_buf_len = IAVF_RXBUFFER_1536; - - /* if needed make room for NET_IP_ALIGN */ - rx_buf_len -= NET_IP_ALIGN; - - return iavf_compute_pad(rx_buf_len); -} - -#define IAVF_SKB_PAD iavf_skb_pad() -#else -#define IAVF_2K_TOO_SMALL_WITH_PADDING false -#define IAVF_SKB_PAD (NET_SKB_PAD + NET_IP_ALIGN) -#endif - /** * iavf_test_staterr - tests bits in Rx descriptor status and error fields * @rx_desc: pointer to receive descriptor (in le64 format) @@ -293,12 +222,6 @@ struct iavf_rx_queue_stats { u64 alloc_buff_failed; }; -enum iavf_ring_state_t { - __IAVF_TX_FDIR_INIT_DONE, - __IAVF_TX_XPS_INIT_DONE, - __IAVF_RING_STATE_NBITS /* must be last */ -}; - /* some useful defines for virtchannel interface, which * is the only remaining user of header split */ @@ -320,10 +243,9 @@ struct iavf_ring { struct iavf_tx_buffer *tx_bi; struct page **rx_pages; }; - DECLARE_BITMAP(state, __IAVF_RING_STATE_NBITS); + u8 __iomem *tail; u16 queue_index; /* Queue number of ring */ u8 dcb_tc; /* Traffic class of ring */ - u8 __iomem *tail; /* high bit set means dynamic, use accessors routines to read/write. * hardware only supports 2us resolution for the ITR registers. @@ -332,24 +254,16 @@ struct iavf_ring { */ u16 itr_setting; - u16 count; /* Number of descriptors */ u16 reg_idx; /* HW register index of the ring */ - u16 rx_buf_len; + u16 count; /* Number of descriptors */ /* used in interrupt processing */ u16 next_to_use; u16 next_to_clean; - u8 atr_sample_rate; - u8 atr_count; - - bool ring_active; /* is ring online or not */ - bool arm_wb; /* do something to arm write back */ - u8 packet_stride; - u16 flags; #define IAVF_TXR_FLAGS_WB_ON_ITR BIT(0) -/* BIT(1) is free, was IAVF_RXR_FLAGS_BUILD_SKB_ENABLED */ +#define IAVF_TXRX_FLAGS_ARM_WB BIT(1) /* BIT(2) is free */ #define IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1 BIT(3) #define IAVF_TXR_FLAGS_VLAN_TAG_LOC_L2TAG2 BIT(4) @@ -401,17 +315,6 @@ struct iavf_ring_container { #define iavf_for_each_ring(pos, head) \ for (pos = (head).ring; pos != NULL; pos = pos->next) -static inline unsigned int iavf_rx_pg_order(struct iavf_ring *ring) -{ -#if (PAGE_SIZE < 8192) - if (ring->rx_buf_len > (PAGE_SIZE / 2)) - return 1; -#endif - return 0; -} - -#define iavf_rx_pg_size(_ring) (PAGE_SIZE << iavf_rx_pg_order(_ring)) - void iavf_alloc_rx_pages(struct iavf_ring *rxr); netdev_tx_t iavf_xmit_frame(struct sk_buff *skb, struct net_device *netdev); void iavf_clean_tx_ring(struct iavf_ring *tx_ring); diff --git a/drivers/net/ethernet/intel/iavf/iavf_type.h b/drivers/net/ethernet/intel/iavf/iavf_type.h index 3030ba33032603..bb90d8f3ad7efe 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_type.h +++ b/drivers/net/ethernet/intel/iavf/iavf_type.h @@ -10,8 +10,6 @@ #include "iavf_adminq.h" #include "iavf_devids.h" -#define IAVF_RXQ_CTX_DBUFF_SHIFT 7 - /* IAVF_MASK is a macro used on 32 bit registers */ #define IAVF_MASK(mask, shift) ((u32)(mask) << (shift)) diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index c2e328ec5af8f0..3a031d8b9685e2 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2013 - 2018 Intel Corporation. 
*/ +#include + #include "iavf.h" #include "iavf_prototype.h" #include "iavf_client.h" @@ -269,13 +271,12 @@ int iavf_get_vf_vlan_v2_caps(struct iavf_adapter *adapter) void iavf_configure_queues(struct iavf_adapter *adapter) { struct virtchnl_vsi_queue_config_info *vqci; - int i, max_frame = adapter->vf_res->max_mtu; + u32 i, max_frame = adapter->vf_res->max_mtu; int pairs = adapter->num_active_queues; struct virtchnl_queue_pair_info *vqpi; size_t len; - if (max_frame > IAVF_MAX_RXBUFFER || !max_frame) - max_frame = IAVF_MAX_RXBUFFER; + max_frame = min_not_zero(max_frame, LIBIE_MAX_RX_FRM_LEN); if (adapter->current_op != VIRTCHNL_OP_UNKNOWN) { /* bail because we already have a command pending */ @@ -289,10 +290,6 @@ void iavf_configure_queues(struct iavf_adapter *adapter) if (!vqci) return; - /* Limit maximum frame size when jumbo frames is not enabled */ - if (adapter->netdev->mtu <= ETH_DATA_LEN) - max_frame = IAVF_RXBUFFER_1536 - NET_IP_ALIGN; - vqci->vsi_id = adapter->vsi_res->vsi_id; vqci->num_queue_pairs = pairs; vqpi = vqci->qpair; @@ -309,9 +306,7 @@ void iavf_configure_queues(struct iavf_adapter *adapter) vqpi->rxq.ring_len = adapter->rx_rings[i].count; vqpi->rxq.dma_ring_addr = adapter->rx_rings[i].dma; vqpi->rxq.max_pkt_size = max_frame; - vqpi->rxq.databuffer_size = - ALIGN(adapter->rx_rings[i].rx_buf_len, - BIT_ULL(IAVF_RXQ_CTX_DBUFF_SHIFT)); + vqpi->rxq.databuffer_size = LIBIE_RX_BUF_LEN; vqpi++; } diff --git a/include/linux/net/intel/libie/rx.h b/include/linux/net/intel/libie/rx.h index 58bd0f35d0253f..9c9db68d3f3f61 100644 --- a/include/linux/net/intel/libie/rx.h +++ b/include/linux/net/intel/libie/rx.h @@ -4,6 +4,7 @@ #ifndef __LIBIE_RX_H #define __LIBIE_RX_H +#include #include /* O(1) converting i40e/ice/iavf's 8/10-bit hardware packet type to a parsed @@ -125,4 +126,42 @@ static inline void libie_skb_set_hash(struct sk_buff *skb, u32 hash, skb_set_hash(skb, hash, parsed.payload_layer); } +/* Rx MTU/buffer/truesize helpers. Mostly pure software-side; HW-defined values + * are valid for all Intel HW. + */ + +/* Space reserved in front of each frame */ +#define LIBIE_SKB_HEADROOM (NET_SKB_PAD + NET_IP_ALIGN) +/* Link layer / L2 overhead: Ethernet, 2 VLAN tags (C + S), FCS */ +#define LIBIE_RX_LL_LEN (ETH_HLEN + 2 * VLAN_HLEN + ETH_FCS_LEN) + +/* Truesize: total space wasted on each frame. Always use order-0 pages */ +#define LIBIE_RX_PAGE_ORDER 0 +#define LIBIE_RX_TRUESIZE (PAGE_SIZE << LIBIE_RX_PAGE_ORDER) +/* Rx buffer size config is a multiple of 128 */ +#define LIBIE_RX_BUF_LEN_ALIGN 128 +/* HW-writeable space in one buffer: truesize - headroom/tailroom, + * HW-aligned + */ +#define __LIBIE_RX_BUF_LEN \ + ALIGN_DOWN(SKB_MAX_ORDER(LIBIE_SKB_HEADROOM, LIBIE_RX_PAGE_ORDER), \ + LIBIE_RX_BUF_LEN_ALIGN) +/* The largest size for a single descriptor as per HW */ +#define LIBIE_MAX_RX_BUF_LEN 9728U +/* "True" HW-writeable space: minimum from SW and HW values */ +#define LIBIE_RX_BUF_LEN min_t(u32, __LIBIE_RX_BUF_LEN, \ + LIBIE_MAX_RX_BUF_LEN) + +/* The maximum frame size as per HW (S/G) */ +#define __LIBIE_MAX_RX_FRM_LEN 16382U +/* ATST, HW can chain up to 5 Rx descriptors */ +#define LIBIE_MAX_RX_FRM_LEN min_t(u32, __LIBIE_MAX_RX_FRM_LEN, \ + LIBIE_RX_BUF_LEN * 5) +/* Maximum frame size minus LL overhead */ +#define LIBIE_MAX_MTU (LIBIE_MAX_RX_FRM_LEN - LIBIE_RX_LL_LEN) + +/* DMA mapping attributes for Rx buffers: no impl. 
sync + relaxed on Sparc */ +#define LIBIE_RX_DMA_ATTR \ + (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) + #endif /* __LIBIE_RX_H */ From 49b4e5c61faf7ac8d7df33165ba5fb6bcb001f04 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 3 Feb 2023 18:15:47 +0100 Subject: [PATCH 06/32] net: page_pool: allow DMA mapping with %DMA_ATTR_WEAK_ORDERING Add a new flag, %PP_FLAG_DMA_MAP_WEAK, whill will tell PP to map pages with %DMA_ATTR_WEAK_ORDERING. To keep the code simple and optimized, map the following PP flags to DMA map attr flags: %PP_FLAG_DMA_MAP => %DMA_ATTR_SKIP_CPU_SYNC %PP_FLAG_DMA_MAP_WEAK => %DMA_ATTR_WEAK_ORDERING The first pair is done to be able to just pass it directly to dma_map_page_attrs(). When a driver wants Page Pool to maintain DMA mappings, it always sets this flag. Page Pool always skips CPU syncs when mapping to do that separately later, so having those two 1:1 avoids introducing ifs and/or bit-ors and keeps the code more compact. Signed-off-by: Alexander Lobakin --- include/net/page_pool.h | 12 +++++++++--- net/core/page_pool.c | 20 ++++++++++++++++++-- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/include/net/page_pool.h b/include/net/page_pool.h index ddfa0b32867776..dec5772e851030 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -34,10 +34,16 @@ #include #include -#define PP_FLAG_DMA_MAP BIT(0) /* Should page_pool do the DMA +#define PP_FLAG_DMA_MAP BIT(5) /* Should page_pool do the DMA * map/unmap */ -#define PP_FLAG_DMA_SYNC_DEV BIT(1) /* If set all pages that the driver gets +#define PP_FLAG_DMA_MAP_WEAK BIT(1) /* Map with %DMA_ATTR_WEAK_ORDERING */ +/* These flags correspond to the DMA map attributes to pass them directly to + * dma_map_page_attrs(), see page_pool_dma_map(). + */ +#define PP_FLAG_DMA_ATTR (PP_FLAG_DMA_MAP | \ + PP_FLAG_DMA_MAP_WEAK) +#define PP_FLAG_DMA_SYNC_DEV BIT(0) /* If set all pages that the driver gets * from page_pool will be * DMA-synced-for-device according to * the length provided by the device @@ -46,7 +52,7 @@ * device driver responsibility */ #define PP_FLAG_PAGE_FRAG BIT(2) /* for page frag feature */ -#define PP_FLAG_ALL (PP_FLAG_DMA_MAP |\ +#define PP_FLAG_ALL (PP_FLAG_DMA_ATTR |\ PP_FLAG_DMA_SYNC_DEV |\ PP_FLAG_PAGE_FRAG) diff --git a/net/core/page_pool.c b/net/core/page_pool.c index 193c1879986503..74e25b55e2f062 100644 --- a/net/core/page_pool.c +++ b/net/core/page_pool.c @@ -161,6 +161,13 @@ static int page_pool_init(struct page_pool *pool, return -EINVAL; } + /* Passing DMA mapping attributes without asking PP to map pages + * makes no sense. + */ + if ((pool->p.flags & PP_FLAG_DMA_ATTR) && + !(pool->p.flags & PP_FLAG_DMA_MAP)) + return -EINVAL; + if (pool->p.flags & PP_FLAG_DMA_SYNC_DEV) { /* In order to request DMA-sync-for-device the page * needs to be mapped @@ -308,6 +315,14 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page) { dma_addr_t dma; + /* Pages are always mapped with %DMA_ATTR_SKIP_CPU_SYNC, so its value + * corresponds to %PP_FLAG_DMA_MAP, which is always set when reaching + * this function. 
+ */ + static_assert(PP_FLAG_DMA_MAP == DMA_ATTR_SKIP_CPU_SYNC); + /* Drivers may set this for PP to map with weak ordering */ + static_assert(PP_FLAG_DMA_MAP_WEAK == DMA_ATTR_WEAK_ORDERING); + /* Setup DMA mapping: use 'struct page' area for storing DMA-addr * since dma_addr_t can be either 32 or 64 bits and does not always fit * into page private data (i.e 32bit cpu with 64bit DMA caps) @@ -315,7 +330,8 @@ static bool page_pool_dma_map(struct page_pool *pool, struct page *page) */ dma = dma_map_page_attrs(pool->p.dev, page, 0, (PAGE_SIZE << pool->p.order), - pool->p.dma_dir, DMA_ATTR_SKIP_CPU_SYNC); + pool->p.dma_dir, + pool->p.flags & PP_FLAG_DMA_ATTR); if (dma_mapping_error(pool->p.dev, dma)) return false; @@ -483,7 +499,7 @@ void page_pool_release_page(struct page_pool *pool, struct page *page) /* When page is unmapped, it cannot be returned to our pool */ dma_unmap_page_attrs(pool->p.dev, dma, PAGE_SIZE << pool->p.order, pool->p.dma_dir, - DMA_ATTR_SKIP_CPU_SYNC); + pool->p.flags & PP_FLAG_DMA_ATTR); page_pool_set_dma_addr(page, 0); skip_dma_unmap: page_pool_clear_pp_info(page); From b71da32186369685ebc909aec6fae6b2bbedfb5c Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 9 Mar 2023 21:26:57 +0100 Subject: [PATCH 07/32] net: page_pool: add DMA-sync-for-CPU inline helpers Each driver is responsible for syncing buffers written by HW for CPU before accessing them. Almost each PP-enabled driver uses the same pattern, which could be shorthanded into a static inline to make driver code a little bit more compact. Introduce a pair of such functions. The first one takes the actual size of the data written by HW and is the main one to be used on Rx. The second picks max_len from the PP params and is designed for more extreme cases when the size is unknown, but the buffer still needs to be synced. Also constify pointer arguments of page_pool_get_dma_dir() and page_pool_get_dma_addr() to give a bit more room for optimization, as both of them are read-only. Signed-off-by: Alexander Lobakin --- include/net/page_pool.h | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/include/net/page_pool.h b/include/net/page_pool.h index dec5772e851030..fb949d168e14c0 100644 --- a/include/net/page_pool.h +++ b/include/net/page_pool.h @@ -32,7 +32,7 @@ #include /* Needed by ptr_ring */ #include -#include +#include #define PP_FLAG_DMA_MAP BIT(5) /* Should page_pool do the DMA * map/unmap @@ -239,8 +239,8 @@ static inline struct page *page_pool_dev_alloc_frag(struct page_pool *pool, /* get the stored dma direction. 
A driver might decide to treat this locally and * avoid the extra cache line from page_pool to determine the direction */ -static -inline enum dma_data_direction page_pool_get_dma_dir(struct page_pool *pool) +static inline enum dma_data_direction +page_pool_get_dma_dir(const struct page_pool *pool) { return pool->p.dma_dir; } @@ -360,7 +360,7 @@ static inline void page_pool_recycle_direct(struct page_pool *pool, #define PAGE_POOL_DMA_USE_PP_FRAG_COUNT \ (sizeof(dma_addr_t) > sizeof(unsigned long)) -static inline dma_addr_t page_pool_get_dma_addr(struct page *page) +static inline dma_addr_t page_pool_get_dma_addr(const struct page *page) { dma_addr_t ret = page->dma_addr; @@ -377,6 +377,37 @@ static inline void page_pool_set_dma_addr(struct page *page, dma_addr_t addr) page->dma_addr_upper = upper_32_bits(addr); } +/** + * page_pool_dma_sync_for_cpu - sync Rx page for CPU after it's written by HW + * @pool: page_pool which this page belongs to + * @page: page to sync + * @dma_sync_size: size of the data written to the page + * + * Can be used as a shorthand to sync Rx pages before accessing them in the + * driver. The caller must ensure the pool was created with %PP_FLAG_DMA_MAP. + */ +static inline void page_pool_dma_sync_for_cpu(const struct page_pool *pool, + const struct page *page, + u32 dma_sync_size) +{ + dma_sync_single_range_for_cpu(pool->p.dev, + page_pool_get_dma_addr(page), + pool->p.offset, dma_sync_size, + page_pool_get_dma_dir(pool)); +} + +/** + * page_pool_dma_sync_for_cpu - sync full Rx page for CPU + * @pool: page_pool which this page belongs to + * @page: page to sync + */ +static inline void +page_pool_dma_sync_full_for_cpu(const struct page_pool *pool, + const struct page *page) +{ + page_pool_dma_sync_for_cpu(pool, page, pool->p.max_len); +} + static inline bool is_page_pool_compiled_in(void) { #ifdef CONFIG_PAGE_POOL From b71ce3cb9ecd183b2e8e41d53454b1bdc517ba27 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 9 Mar 2023 13:31:08 +0100 Subject: [PATCH 08/32] iavf: switch to Page Pool Now that the IAVF driver simply uses dev_alloc_page() + free_page() with no custom recycling logics and one whole page per frame, it can easily be switched to using Page Pool API instead. Introduce libie_rx_page_pool_create(), a wrapper for creating a PP with the default libie settings applicable to all Intel hardware, and replace the alloc/free calls with the corresponding PP functions, including the newly added sync-for-CPU helpers. Use skb_mark_for_recycle() to bring back the recycling and restore the initial performance. From the important object code changes, worth mentioning that __iavf_alloc_rx_pages() is now inlined due to the greatly reduced size. The resulting driver is on par with the pre-series code and 1-2% slower than the "optimized" version right before the recycling removal. But the number of locs and object code bytes slaughtered is much more important here after all, not speaking of that there's still a vast space for optimization and improvements. 
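For reference, the new Rx buffer lifecycle boils down to the sketch below. Only the page_pool_*() calls, skb helpers and libie definitions are the real API used by this series; the surrounding variables (ring, rx_desc, size and friends) are illustrative and error handling is omitted:

/* queue setup: one pool per Rx ring, PP does the DMA mapping with the
 * libie defaults (order-0 pages, LIBIE_SKB_HEADROOM offset)
 */
pool = libie_rx_page_pool_create(ring->netdev, ring->count);

/* refill path: no manual dma_map()/dma_sync_for_device() anymore */
page = page_pool_alloc_pages(pool, GFP_ATOMIC | __GFP_NOWARN);
dma = page_pool_get_dma_addr(page);
rx_desc->read.pkt_addr = cpu_to_le64(dma + LIBIE_SKB_HEADROOM);

/* completion path: sync only the area written by HW, build the skb
 * in place and let the stack return the page to the pool
 */
page_pool_dma_sync_for_cpu(pool, page, size);
skb = napi_build_skb(page_address(page), LIBIE_RX_TRUESIZE);
skb_mark_for_recycle(skb);
skb_reserve(skb, LIBIE_SKB_HEADROOM);
__skb_put(skb, size);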
Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 124 +++++--------------- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 5 +- drivers/net/ethernet/intel/libie/rx.c | 31 +++++ include/linux/net/intel/libie/rx.h | 3 + 4 files changed, 69 insertions(+), 94 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index 8e0e6d59cd3e10..5d087f9b38ed47 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -690,8 +690,6 @@ int iavf_setup_tx_descriptors(struct iavf_ring *tx_ring) **/ void iavf_clean_rx_ring(struct iavf_ring *rx_ring) { - u16 i; - /* ring already cleared, nothing to do */ if (!rx_ring->rx_pages) return; @@ -702,28 +700,17 @@ void iavf_clean_rx_ring(struct iavf_ring *rx_ring) } /* Free all the Rx ring sk_buffs */ - for (i = 0; i < rx_ring->count; i++) { + for (u32 i = 0; i < rx_ring->count; i++) { struct page *page = rx_ring->rx_pages[i]; - dma_addr_t dma; if (!page) continue; - dma = page_pool_get_dma_addr(page); - /* Invalidate cache lines that may have been written to by * device so that we avoid corrupting memory. */ - dma_sync_single_range_for_cpu(rx_ring->dev, dma, - LIBIE_SKB_HEADROOM, - LIBIE_RX_BUF_LEN, - DMA_FROM_DEVICE); - - /* free resources associated with mapping */ - dma_unmap_page_attrs(rx_ring->dev, dma, LIBIE_RX_TRUESIZE, - DMA_FROM_DEVICE, LIBIE_RX_DMA_ATTR); - - __free_page(page); + page_pool_dma_sync_full_for_cpu(rx_ring->pool, page); + page_pool_put_full_page(rx_ring->pool, page, false); } rx_ring->next_to_clean = 0; @@ -738,10 +725,15 @@ void iavf_clean_rx_ring(struct iavf_ring *rx_ring) **/ void iavf_free_rx_resources(struct iavf_ring *rx_ring) { + struct device *dev = rx_ring->pool->p.dev; + iavf_clean_rx_ring(rx_ring); kfree(rx_ring->rx_pages); rx_ring->rx_pages = NULL; + page_pool_destroy(rx_ring->pool); + rx_ring->dev = dev; + if (rx_ring->desc) { dma_free_coherent(rx_ring->dev, rx_ring->size, rx_ring->desc, rx_ring->dma); @@ -758,13 +750,15 @@ void iavf_free_rx_resources(struct iavf_ring *rx_ring) int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring) { struct device *dev = rx_ring->dev; + struct page_pool *pool; + int ret = -ENOMEM; /* warn if we are about to overwrite the pointer */ WARN_ON(rx_ring->rx_pages); rx_ring->rx_pages = kcalloc(rx_ring->count, sizeof(*rx_ring->rx_pages), GFP_KERNEL); if (!rx_ring->rx_pages) - return -ENOMEM; + return ret; u64_stats_init(&rx_ring->syncp); @@ -780,15 +774,26 @@ int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring) goto err; } + pool = libie_rx_page_pool_create(rx_ring->netdev, rx_ring->count); + if (IS_ERR(pool)) { + ret = PTR_ERR(pool); + goto err_free_dma; + } + + rx_ring->pool = pool; + rx_ring->next_to_clean = 0; rx_ring->next_to_use = 0; return 0; + +err_free_dma: + dma_free_coherent(dev, rx_ring->size, rx_ring->desc, rx_ring->dma); err: kfree(rx_ring->rx_pages); rx_ring->rx_pages = NULL; - return -ENOMEM; + return ret; } /** @@ -809,40 +814,6 @@ static inline void iavf_release_rx_desc(struct iavf_ring *rx_ring, u32 val) writel(val, rx_ring->tail); } -/** - * iavf_alloc_mapped_page - allocate and map a new page - * @dev: device used for DMA mapping - * @gfp: GFP mask to allocate page - * - * Returns a new &page if the it was successfully allocated, %NULL otherwise. 
- **/ -static struct page *iavf_alloc_mapped_page(struct device *dev, gfp_t gfp) -{ - struct page *page; - dma_addr_t dma; - - /* alloc new page for storage */ - page = __dev_alloc_page(gfp); - if (unlikely(!page)) - return NULL; - - /* map page for use */ - dma = dma_map_page_attrs(dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE, - LIBIE_RX_DMA_ATTR); - - /* if mapping failed free memory back to system since - * there isn't much point in holding memory we can't use - */ - if (dma_mapping_error(dev, dma)) { - __free_page(page); - return NULL; - } - - page_pool_set_dma_addr(page, dma); - - return page; -} - /** * iavf_receive_skb - Send a completed packet up the stack * @rx_ring: rx ring in play @@ -876,7 +847,7 @@ static void iavf_receive_skb(struct iavf_ring *rx_ring, static u32 __iavf_alloc_rx_pages(struct iavf_ring *rx_ring, u32 to_refill, gfp_t gfp) { - struct device *dev = rx_ring->dev; + struct page_pool *pool = rx_ring->pool; u32 ntu = rx_ring->next_to_use; union iavf_rx_desc *rx_desc; @@ -890,7 +861,7 @@ static u32 __iavf_alloc_rx_pages(struct iavf_ring *rx_ring, u32 to_refill, struct page *page; dma_addr_t dma; - page = iavf_alloc_mapped_page(dev, gfp); + page = page_pool_alloc_pages(pool, gfp); if (!page) { rx_ring->rx_stats.alloc_page_failed++; break; @@ -899,11 +870,6 @@ static u32 __iavf_alloc_rx_pages(struct iavf_ring *rx_ring, u32 to_refill, rx_ring->rx_pages[ntu] = page; dma = page_pool_get_dma_addr(page); - /* sync the buffer for use by the device */ - dma_sync_single_range_for_device(dev, dma, LIBIE_SKB_HEADROOM, - LIBIE_RX_BUF_LEN, - DMA_FROM_DEVICE); - /* Refresh the desc even if buffer_addrs didn't change * because each write-back erases this info. */ @@ -1090,21 +1056,6 @@ static void iavf_add_rx_frag(struct sk_buff *skb, struct page *page, u32 size) LIBIE_SKB_HEADROOM, size, LIBIE_RX_TRUESIZE); } -/** - * iavf_sync_rx_page - Synchronize received data for use - * @dev: device used for DMA mapping - * @page: Rx page containing the data - * @size: size of the received data - * - * This function will synchronize the Rx buffer for use by the CPU. 
- */ -static void iavf_sync_rx_page(struct device *dev, struct page *page, u32 size) -{ - dma_sync_single_range_for_cpu(dev, page_pool_get_dma_addr(page), - LIBIE_SKB_HEADROOM, size, - DMA_FROM_DEVICE); -} - /** * iavf_build_skb - Build skb around an existing buffer * @page: Rx page to with the data @@ -1127,6 +1078,8 @@ static struct sk_buff *iavf_build_skb(struct page *page, u32 size) if (unlikely(!skb)) return NULL; + skb_mark_for_recycle(skb); + /* update pointers within the skb to store the data */ skb_reserve(skb, LIBIE_SKB_HEADROOM); __skb_put(skb, size); @@ -1134,19 +1087,6 @@ static struct sk_buff *iavf_build_skb(struct page *page, u32 size) return skb; } -/** - * iavf_unmap_rx_page - Unmap used page - * @dev: device used for DMA mapping - * @page: page to release - */ -static void iavf_unmap_rx_page(struct device *dev, struct page *page) -{ - dma_unmap_page_attrs(dev, page_pool_get_dma_addr(page), - LIBIE_RX_TRUESIZE, DMA_FROM_DEVICE, - LIBIE_RX_DMA_ATTR); - page_pool_set_dma_addr(page, 0); -} - /** * iavf_is_non_eop - process handling of non-EOP buffers * @rx_ring: Rx ring being processed @@ -1189,8 +1129,8 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) unsigned int total_rx_bytes = 0, total_rx_packets = 0; const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; u32 to_refill = IAVF_DESC_UNUSED(rx_ring); + struct page_pool *pool = rx_ring->pool; struct sk_buff *skb = rx_ring->skb; - struct device *dev = rx_ring->dev; u32 ntc = rx_ring->next_to_clean; u32 ring_size = rx_ring->count; u32 cleaned_count = 0; @@ -1239,13 +1179,11 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) * stripped by the HW. */ if (unlikely(!size)) { - iavf_unmap_rx_page(dev, page); - __free_page(page); + page_pool_recycle_direct(pool, page); goto skip_data; } - iavf_sync_rx_page(dev, page, size); - iavf_unmap_rx_page(dev, page); + page_pool_dma_sync_for_cpu(pool, page, size); /* retrieve a buffer from the ring */ if (skb) @@ -1255,7 +1193,7 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) /* exit if we failed to retrieve a buffer */ if (!skb) { - __free_page(page); + page_pool_put_page(pool, page, size, true); rx_ring->rx_stats.alloc_buff_failed++; break; } diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index 25459411000a66..8fbe549ce6a587 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -237,7 +237,10 @@ struct iavf_rx_queue_stats { struct iavf_ring { struct iavf_ring *next; /* pointer to next ring in q_vector */ void *desc; /* Descriptor ring memory */ - struct device *dev; /* Used for DMA mapping */ + union { + struct page_pool *pool; /* Used for Rx page management */ + struct device *dev; /* Used for DMA mapping on Tx */ + }; struct net_device *netdev; /* netdev ring maps to */ union { struct iavf_tx_buffer *tx_bi; diff --git a/drivers/net/ethernet/intel/libie/rx.c b/drivers/net/ethernet/intel/libie/rx.c index f503476d8eeff9..85d024f0a88567 100644 --- a/drivers/net/ethernet/intel/libie/rx.c +++ b/drivers/net/ethernet/intel/libie/rx.c @@ -105,6 +105,37 @@ const struct libie_rx_ptype_parsed libie_rx_ptype_lut[LIBIE_RX_PTYPE_NUM] = { }; EXPORT_SYMBOL_NS_GPL(libie_rx_ptype_lut, LIBIE); +/* Page Pool */ + +/** + * libie_rx_page_pool_create - create a PP with the default libie settings + * @dev: &net_device which a PP will be created for + * @size: size of the PP, usually simply Rx queue len + * + * Returns &page_pool on success, casted 
-errno on failure. + */ +struct page_pool *libie_rx_page_pool_create(const struct net_device *dev, + u32 size) +{ + const struct page_pool_params pp = { + .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_MAP_WEAK | + PP_FLAG_DMA_SYNC_DEV, + .order = LIBIE_RX_PAGE_ORDER, + .pool_size = size, + .nid = NUMA_NO_NODE, + .dev = dev->dev.parent, + .dma_dir = DMA_FROM_DEVICE, + .max_len = LIBIE_RX_BUF_LEN, + .offset = LIBIE_SKB_HEADROOM, + }; + + static_assert((PP_FLAG_DMA_MAP | PP_FLAG_DMA_MAP_WEAK) == + LIBIE_RX_DMA_ATTR); + + return page_pool_create(&pp); +} +EXPORT_SYMBOL_NS_GPL(libie_rx_page_pool_create, LIBIE); + MODULE_AUTHOR("Intel Corporation"); MODULE_DESCRIPTION("Intel(R) Ethernet common library"); MODULE_LICENSE("GPL"); diff --git a/include/linux/net/intel/libie/rx.h b/include/linux/net/intel/libie/rx.h index 9c9db68d3f3f61..44eafbd04a7c22 100644 --- a/include/linux/net/intel/libie/rx.h +++ b/include/linux/net/intel/libie/rx.h @@ -164,4 +164,7 @@ static inline void libie_skb_set_hash(struct sk_buff *skb, u32 hash, #define LIBIE_RX_DMA_ATTR \ (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) +struct page_pool *libie_rx_page_pool_create(const struct net_device *dev, + u32 size); + #endif /* __LIBIE_RX_H */ From 3d884e36aea0e2defa3f17aee9537ac8e7d7e678 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Tue, 14 Mar 2023 17:40:57 +0100 Subject: [PATCH 09/32] libie: add common queue stats Next stop, per-queue private stats. They have only subtle differences from driver to driver and can easily be resolved. Define common structures, inline helpers and Ethtool helpers to collect, update and export the statistics. Use u64_stats_t right from the start, as well as the corresponding helpers to ensure tear-free operations. For the NAPI parts of both Rx and Tx, also define small onstack containers to update them in polling loops and then sync the actual containers once a loop ends. The drivers will be switched to use this API later on a per-driver basis, along with conversion to PP. Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/libie/Makefile | 1 + drivers/net/ethernet/intel/libie/stats.c | 119 ++++++++++++++ include/linux/net/intel/libie/stats.h | 179 ++++++++++++++++++++++ 3 files changed, 299 insertions(+) create mode 100644 drivers/net/ethernet/intel/libie/stats.c create mode 100644 include/linux/net/intel/libie/stats.h diff --git a/drivers/net/ethernet/intel/libie/Makefile b/drivers/net/ethernet/intel/libie/Makefile index 95e81d09b4746c..76f32253481b70 100644 --- a/drivers/net/ethernet/intel/libie/Makefile +++ b/drivers/net/ethernet/intel/libie/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_LIBIE) += libie.o libie-objs += rx.o +libie-objs += stats.o diff --git a/drivers/net/ethernet/intel/libie/stats.c b/drivers/net/ethernet/intel/libie/stats.c new file mode 100644 index 00000000000000..61456842a36211 --- /dev/null +++ b/drivers/net/ethernet/intel/libie/stats.c @@ -0,0 +1,119 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright(c) 2023 Intel Corporation. */ + +#include +#include + +/* Rx per-queue stats */ + +static const char * const libie_rq_stats_str[] = { +#define act(s) __stringify(s), + DECLARE_LIBIE_RQ_STATS(act) +#undef act +}; + +#define LIBIE_RQ_STATS_NUM ARRAY_SIZE(libie_rq_stats_str) + +/** + * libie_rq_stats_get_sset_count - get the number of Ethtool RQ stats provided + * + * Returns the number of per-queue Rx stats supported by the library. 
+ */ +u32 libie_rq_stats_get_sset_count(void) +{ + return LIBIE_RQ_STATS_NUM; +} +EXPORT_SYMBOL_NS_GPL(libie_rq_stats_get_sset_count, LIBIE); + +/** + * libie_rq_stats_get_strings - get the name strings of Ethtool RQ stats + * @data: reference to the cursor pointing to the output buffer + * @qid: RQ number to print in the prefix + */ +void libie_rq_stats_get_strings(u8 **data, u32 qid) +{ + for (u32 i = 0; i < LIBIE_RQ_STATS_NUM; i++) + ethtool_sprintf(data, "rq%u_%s", qid, libie_rq_stats_str[i]); +} +EXPORT_SYMBOL_NS_GPL(libie_rq_stats_get_strings, LIBIE); + +/** + * libie_rq_stats_get_data - get the RQ stats in Ethtool format + * @data: reference to the cursor pointing to the output array + * @stats: RQ stats container from the queue + */ +void libie_rq_stats_get_data(u64 **data, const struct libie_rq_stats *stats) +{ + u64 sarr[LIBIE_RQ_STATS_NUM]; + u32 start; + + do { + start = u64_stats_fetch_begin(&stats->syncp); + + for (u32 i = 0; i < LIBIE_RQ_STATS_NUM; i++) + sarr[i] = u64_stats_read(&stats->raw[i]); + } while (u64_stats_fetch_retry(&stats->syncp, start)); + + for (u32 i = 0; i < LIBIE_RQ_STATS_NUM; i++) + (*data)[i] += sarr[i]; + + *data += LIBIE_RQ_STATS_NUM; +} +EXPORT_SYMBOL_NS_GPL(libie_rq_stats_get_data, LIBIE); + +/* Tx per-queue stats */ + +static const char * const libie_sq_stats_str[] = { +#define act(s) __stringify(s), + DECLARE_LIBIE_SQ_STATS(act) +#undef act +}; + +#define LIBIE_SQ_STATS_NUM ARRAY_SIZE(libie_sq_stats_str) + +/** + * libie_sq_stats_get_sset_count - get the number of Ethtool SQ stats provided + * + * Returns the number of per-queue Tx stats supported by the library. + */ +u32 libie_sq_stats_get_sset_count(void) +{ + return LIBIE_SQ_STATS_NUM; +} +EXPORT_SYMBOL_NS_GPL(libie_sq_stats_get_sset_count, LIBIE); + +/** + * libie_sq_stats_get_strings - get the name strings of Ethtool SQ stats + * @data: reference to the cursor pointing to the output buffer + * @qid: SQ number to print in the prefix + */ +void libie_sq_stats_get_strings(u8 **data, u32 qid) +{ + for (u32 i = 0; i < LIBIE_SQ_STATS_NUM; i++) + ethtool_sprintf(data, "sq%u_%s", qid, libie_sq_stats_str[i]); +} +EXPORT_SYMBOL_NS_GPL(libie_sq_stats_get_strings, LIBIE); + +/** + * libie_sq_stats_get_data - get the SQ stats in Ethtool format + * @data: reference to the cursor pointing to the output array + * @stats: SQ stats container from the queue + */ +void libie_sq_stats_get_data(u64 **data, const struct libie_sq_stats *stats) +{ + u64 sarr[LIBIE_SQ_STATS_NUM]; + u32 start; + + do { + start = u64_stats_fetch_begin(&stats->syncp); + + for (u32 i = 0; i < LIBIE_SQ_STATS_NUM; i++) + sarr[i] = u64_stats_read(&stats->raw[i]); + } while (u64_stats_fetch_retry(&stats->syncp, start)); + + for (u32 i = 0; i < LIBIE_SQ_STATS_NUM; i++) + (*data)[i] += sarr[i]; + + *data += LIBIE_SQ_STATS_NUM; +} +EXPORT_SYMBOL_NS_GPL(libie_sq_stats_get_data, LIBIE); diff --git a/include/linux/net/intel/libie/stats.h b/include/linux/net/intel/libie/stats.h new file mode 100644 index 00000000000000..dbbc98bbd3a70c --- /dev/null +++ b/include/linux/net/intel/libie/stats.h @@ -0,0 +1,179 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* Copyright(c) 2023 Intel Corporation. 
*/ + +#ifndef __LIBIE_STATS_H +#define __LIBIE_STATS_H + +#include + +/* Common */ + +/* Use 32-byte alignment to reduce false sharing */ +#define __libie_stats_aligned __aligned(4 * sizeof(u64_stats_t)) + +/** + * libie_stats_add - update one structure counter from a local struct + * @qs: queue stats structure to update (&libie_rq_stats or &libie_sq_stats) + * @ss: local/onstack stats structure + * @f: name of the field to update + * + * If a local/onstack stats structure is used to collect statistics during + * hotpath loops, this macro can be used to shorthand updates, given that + * the fields have the same name. + * Must be guarded with u64_stats_update_{begin,end}(). + */ +#define libie_stats_add(qs, ss, f) \ + u64_stats_add(&(qs)->f, (ss)->f) + +/** + * __libie_stats_inc_one - safely increment one stats structure counter + * @s: queue stats structure to update (&libie_rq_stats or &libie_sq_stats) + * @f: name of the field to increment + * @n: name of the temporary variable, result of __UNIQUE_ID() + * + * To be used on exception or slow paths -- allocation fails, queue stops etc. + */ +#define __libie_stats_inc_one(s, f, n) ({ \ + typeof(*(s)) *n = (s); \ + \ + u64_stats_update_begin(&n->syncp); \ + u64_stats_inc(&n->f); \ + u64_stats_update_end(&n->syncp); \ +}) +#define libie_stats_inc_one(s, f) \ + __libie_stats_inc_one(s, f, __UNIQUE_ID(qs_)) + +/* Rx per-queue stats: + * packets: packets received on this queue + * bytes: bytes received on this queue + * fragments: number of processed descriptors carrying only a fragment + * alloc_page_fail: number of Rx page allocation fails + * build_skb_fail: number of build_skb() fails + */ + +#define DECLARE_LIBIE_RQ_NAPI_STATS(act) \ + act(packets) \ + act(bytes) \ + act(fragments) + +#define DECLARE_LIBIE_RQ_FAIL_STATS(act) \ + act(alloc_page_fail) \ + act(build_skb_fail) + +#define DECLARE_LIBIE_RQ_STATS(act) \ + DECLARE_LIBIE_RQ_NAPI_STATS(act) \ + DECLARE_LIBIE_RQ_FAIL_STATS(act) + +struct libie_rq_stats { + struct u64_stats_sync syncp; + + union { + struct { +#define act(s) u64_stats_t s; + DECLARE_LIBIE_RQ_NAPI_STATS(act); + DECLARE_LIBIE_RQ_FAIL_STATS(act); +#undef act + }; + DECLARE_FLEX_ARRAY(u64_stats_t, raw); + }; +} __libie_stats_aligned; + +/* Rx stats being modified frequently during the NAPI polling, to sync them + * with the queue stats once after the loop is finished. 
+ */ +struct libie_rq_onstack_stats { + union { + struct { +#define act(s) u32 s; + DECLARE_LIBIE_RQ_NAPI_STATS(act); +#undef act + }; + DECLARE_FLEX_ARRAY(u32, raw); + }; +}; + +/** + * libie_rq_napi_stats_add - add onstack Rx stats to the queue container + * @qs: Rx queue stats structure to update + * @ss: onstack structure to get the values from, updated during the NAPI loop + */ +static inline void +libie_rq_napi_stats_add(struct libie_rq_stats *qs, + const struct libie_rq_onstack_stats *ss) +{ + u64_stats_update_begin(&qs->syncp); + libie_stats_add(qs, ss, packets); + libie_stats_add(qs, ss, bytes); + libie_stats_add(qs, ss, fragments); + u64_stats_update_end(&qs->syncp); +} + +u32 libie_rq_stats_get_sset_count(void); +void libie_rq_stats_get_strings(u8 **data, u32 qid); +void libie_rq_stats_get_data(u64 **data, const struct libie_rq_stats *stats); + +/* Tx per-queue stats: + * packets: packets sent from this queue + * bytes: bytes sent from this queue + * busy: number of xmit failures due to the ring being full + * stops: number times the ring was stopped from the driver + * restarts: number times it was started after being stopped + * linearized: number of skbs linearized due to HW limits + */ + +#define DECLARE_LIBIE_SQ_NAPI_STATS(act) \ + act(packets) \ + act(bytes) + +#define DECLARE_LIBIE_SQ_XMIT_STATS(act) \ + act(busy) \ + act(stops) \ + act(restarts) \ + act(linearized) + +#define DECLARE_LIBIE_SQ_STATS(act) \ + DECLARE_LIBIE_SQ_NAPI_STATS(act) \ + DECLARE_LIBIE_SQ_XMIT_STATS(act) + +struct libie_sq_stats { + struct u64_stats_sync syncp; + + union { + struct { +#define act(s) u64_stats_t s; + DECLARE_LIBIE_SQ_STATS(act); +#undef act + }; + DECLARE_FLEX_ARRAY(u64_stats_t, raw); + }; +} __libie_stats_aligned; + +struct libie_sq_onstack_stats { +#define act(s) u32 s; + DECLARE_LIBIE_SQ_NAPI_STATS(act); +#undef act +}; + +/** + * libie_sq_napi_stats_add - add onstack Tx stats to the queue container + * @qs: Tx queue stats structure to update + * @ss: onstack structure to get the values from, updated during the NAPI loop + */ +static inline void +libie_sq_napi_stats_add(struct libie_sq_stats *qs, + const struct libie_sq_onstack_stats *ss) +{ + if (unlikely(!ss->packets)) + return; + + u64_stats_update_begin(&qs->syncp); + libie_stats_add(qs, ss, packets); + libie_stats_add(qs, ss, bytes); + u64_stats_update_end(&qs->syncp); +} + +u32 libie_sq_stats_get_sset_count(void); +void libie_sq_stats_get_strings(u8 **data, u32 qid); +void libie_sq_stats_get_data(u64 **data, const struct libie_sq_stats *stats); + +#endif /* __LIBIE_STATS_H */ From 16d126cbb22422dada94398928c76bb2e1571d75 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 16 Mar 2023 20:12:55 +0100 Subject: [PATCH 10/32] libie: add per-queue Page Pool stats Expand the libie generic per-queue stats with the generic Page Pool stats provided by the API itself, when CONFIG_PAGE_POOL is enable. When it's not, there'll be no such fields in the stats structure, so no space wasted. They are also a bit special in terms of how they are obtained. One &page_pool accumulates statistics until it's destroyed obviously, which happens on ifdown. So, in order to not lose any statistics, get the stats and store in the queue container before destroying a pool. This container survives ifups/downs, so it basically stores the statistics accumulated since the very first pool was allocated on this queue. 
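As a minimal sketch of the teardown path just described (assuming a driver ring structure that carries both the &page_pool pointer and the libie RQ stats container; the structure and function names here are illustrative only, not taken from the tree):

	struct example_rx_ring {
		struct page_pool	*pool;
		struct libie_rq_stats	rq_stats;
	};

	static void example_rx_ring_free(struct example_rx_ring *ring)
	{
		/* Fold the pool's accumulated counters into the persistent
		 * container, then destroy the pool itself.
		 */
		libie_rx_page_pool_destroy(ring->pool, &ring->rq_stats);
		ring->pool = NULL;
	}
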
When it's needed to export the stats, first get the numbers from this container and then add the "live" numbers -- the ones that the current active pool returns. The result values will always represent the actual device-lifetime* stats. There's a cast from &page_pool_stats to `u64 *` in a couple functions, but they are guarded with stats asserts to make sure it's safe to do. FWIW it saves a lot of object code. Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/libie/internal.h | 23 +++++++ drivers/net/ethernet/intel/libie/rx.c | 20 ++++++ drivers/net/ethernet/intel/libie/stats.c | 72 ++++++++++++++++++++- include/linux/net/intel/libie/rx.h | 4 ++ include/linux/net/intel/libie/stats.h | 39 ++++++++++- 5 files changed, 155 insertions(+), 3 deletions(-) create mode 100644 drivers/net/ethernet/intel/libie/internal.h diff --git a/drivers/net/ethernet/intel/libie/internal.h b/drivers/net/ethernet/intel/libie/internal.h new file mode 100644 index 00000000000000..083398dc37c63d --- /dev/null +++ b/drivers/net/ethernet/intel/libie/internal.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* libie internal declarations not to be used in drivers. + * + * Copyright(c) 2023 Intel Corporation. + */ + +#ifndef __LIBIE_INTERNAL_H +#define __LIBIE_INTERNAL_H + +struct libie_rq_stats; +struct page_pool; + +#ifdef CONFIG_PAGE_POOL_STATS +void libie_rq_stats_sync_pp(struct libie_rq_stats *stats, + struct page_pool *pool); +#else +static inline void libie_rq_stats_sync_pp(struct libie_rq_stats *stats, + struct page_pool *pool) +{ +} +#endif + +#endif /* __LIBIE_INTERNAL_H */ diff --git a/drivers/net/ethernet/intel/libie/rx.c b/drivers/net/ethernet/intel/libie/rx.c index 85d024f0a88567..10ef8741326ad2 100644 --- a/drivers/net/ethernet/intel/libie/rx.c +++ b/drivers/net/ethernet/intel/libie/rx.c @@ -3,6 +3,8 @@ #include +#include "internal.h" + /* O(1) converting i40e/ice/iavf's 8/10-bit hardware packet type to a parsed * bitfield struct. */ @@ -136,6 +138,24 @@ struct page_pool *libie_rx_page_pool_create(const struct net_device *dev, } EXPORT_SYMBOL_NS_GPL(libie_rx_page_pool_create, LIBIE); +/** + * libie_rx_page_pool_destroy - destroy a &page_pool created by libie + * @pool: pool to destroy + * @stats: RQ stats from the ring (or %NULL to skip updating PP stats) + * + * As the stats usually has the same lifetime as the device, but PP is usually + * created/destroyed on ifup/ifdown, in order to not lose the stats accumulated + * during the last ifup, the PP stats need to be added to the driver stats + * container. Then the PP gets destroyed. 
+ */ +void libie_rx_page_pool_destroy(struct page_pool *pool, + struct libie_rq_stats *stats) +{ + libie_rq_stats_sync_pp(stats, pool); + page_pool_destroy(pool); +} +EXPORT_SYMBOL_NS_GPL(libie_rx_page_pool_destroy, LIBIE); + MODULE_AUTHOR("Intel Corporation"); MODULE_DESCRIPTION("Intel(R) Ethernet common library"); MODULE_LICENSE("GPL"); diff --git a/drivers/net/ethernet/intel/libie/stats.c b/drivers/net/ethernet/intel/libie/stats.c index 61456842a36211..95bbb38c39e348 100644 --- a/drivers/net/ethernet/intel/libie/stats.c +++ b/drivers/net/ethernet/intel/libie/stats.c @@ -4,6 +4,8 @@ #include #include +#include "internal.h" + /* Rx per-queue stats */ static const char * const libie_rq_stats_str[] = { @@ -14,6 +16,70 @@ static const char * const libie_rq_stats_str[] = { #define LIBIE_RQ_STATS_NUM ARRAY_SIZE(libie_rq_stats_str) +#ifdef CONFIG_PAGE_POOL_STATS +/** + * libie_rq_stats_get_pp - get the current stats from a &page_pool + * @sarr: local array to add stats to + * @pool: pool to get the stats from + * + * Adds the current "live" stats from an online PP to the stats read from + * the RQ container, so that the actual totals will be returned. + */ +static void libie_rq_stats_get_pp(u64 *sarr, struct page_pool *pool) +{ + struct page_pool_stats *pps; + /* Used only to calculate pos below */ + struct libie_rq_stats tmp; + u32 pos; + + /* Validate the libie PP stats array can be casted <-> PP struct */ + static_assert(sizeof(tmp.pp) == sizeof(*pps)); + + if (!pool) + return; + + /* Position of the first Page Pool stats field */ + pos = (u64_stats_t *)&tmp.pp - tmp.raw; + pps = (typeof(pps))&sarr[pos]; + + page_pool_get_stats(pool, pps); +} + +/** + * libie_rq_stats_sync_pp - add the current PP stats to the RQ stats container + * @stats: stats structure to update + * @pool: pool to read the stats + * + * Called by libie_rx_page_pool_destroy() to save the stats before destroying + * the pool. 
+ */ +void libie_rq_stats_sync_pp(struct libie_rq_stats *stats, + struct page_pool *pool) +{ + u64_stats_t *qarr = (u64_stats_t *)&stats->pp; + struct page_pool_stats pps = { }; + u64 *sarr = (u64 *)&pps; + + if (!stats) + return; + + page_pool_get_stats(pool, &pps); + + u64_stats_update_begin(&stats->syncp); + + for (u32 i = 0; i < sizeof(pps) / sizeof(*sarr); i++) + u64_stats_add(&qarr[i], sarr[i]); + + u64_stats_update_end(&stats->syncp); +} +#else +static inline void libie_rq_stats_get_pp(u64 *sarr, struct page_pool *pool) +{ +} + +/* static inline void libie_rq_stats_sync_pp() is declared in "internal.h" */ +#endif + /** * libie_rq_stats_get_sset_count - get the number of Ethtool RQ stats provided * @@ -41,8 +107,10 @@ EXPORT_SYMBOL_NS_GPL(libie_rq_stats_get_strings, LIBIE); * libie_rq_stats_get_data - get the RQ stats in Ethtool format * @data: reference to the cursor pointing to the output array * @stats: RQ stats container from the queue + * @pool: &page_pool from the queue (%NULL to ignore PP "live" stats) */ -void libie_rq_stats_get_data(u64 **data, const struct libie_rq_stats *stats) +void libie_rq_stats_get_data(u64 **data, const struct libie_rq_stats *stats, + struct page_pool *pool) { u64 sarr[LIBIE_RQ_STATS_NUM]; u32 start; @@ -54,6 +122,8 @@ void libie_rq_stats_get_data(u64 **data, const struct libie_rq_stats *stats) sarr[i] = u64_stats_read(&stats->raw[i]); } while (u64_stats_fetch_retry(&stats->syncp, start)); + libie_rq_stats_get_pp(sarr, pool); + for (u32 i = 0; i < LIBIE_RQ_STATS_NUM; i++) (*data)[i] += sarr[i]; diff --git a/include/linux/net/intel/libie/rx.h b/include/linux/net/intel/libie/rx.h index 44eafbd04a7c22..f063a30f182ecb 100644 --- a/include/linux/net/intel/libie/rx.h +++ b/include/linux/net/intel/libie/rx.h @@ -164,7 +164,11 @@ static inline void libie_skb_set_hash(struct sk_buff *skb, u32 hash, #define LIBIE_RX_DMA_ATTR \ (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING) +struct libie_rq_stats; + struct page_pool *libie_rx_page_pool_create(const struct net_device *dev, u32 size); +void libie_rx_page_pool_destroy(struct page_pool *pool, + struct libie_rq_stats *stats); #endif /* __LIBIE_RX_H */ diff --git a/include/linux/net/intel/libie/stats.h b/include/linux/net/intel/libie/stats.h index dbbc98bbd3a70c..23ca0079a90586 100644 --- a/include/linux/net/intel/libie/stats.h +++ b/include/linux/net/intel/libie/stats.h @@ -49,6 +49,17 @@ * fragments: number of processed descriptors carrying only a fragment * alloc_page_fail: number of Rx page allocation fails * build_skb_fail: number of build_skb() fails + * pp_alloc_fast: pages taken from the cache or ring + * pp_alloc_slow: actual page allocations + * pp_alloc_slow_ho: non-order-0 page allocations + * pp_alloc_empty: number of times the pool was empty + * pp_alloc_refill: number of cache refills + * pp_alloc_waive: NUMA node mismatches during recycling + * pp_recycle_cached: direct recyclings into the cache + * pp_recycle_cache_full: number of times the cache was full + * pp_recycle_ring: recyclings into the ring + * pp_recycle_ring_full: number of times the ring was full + * pp_recycle_released_ref: pages released due to elevated refcnt */ #define DECLARE_LIBIE_RQ_NAPI_STATS(act) \ @@ -60,9 +71,29 @@ act(alloc_page_fail) \ act(build_skb_fail) +#ifdef CONFIG_PAGE_POOL_STATS +#define DECLARE_LIBIE_RQ_PP_STATS(act) \ + act(pp_alloc_fast) \ + act(pp_alloc_slow) \ + act(pp_alloc_slow_ho) \ + act(pp_alloc_empty) \ + act(pp_alloc_refill) \ + act(pp_alloc_waive) \ + act(pp_recycle_cached) \ + 
act(pp_recycle_cache_full) \ + act(pp_recycle_ring) \ + act(pp_recycle_ring_full) \ + act(pp_recycle_released_ref) +#else +#define DECLARE_LIBIE_RQ_PP_STATS(act) +#endif + #define DECLARE_LIBIE_RQ_STATS(act) \ DECLARE_LIBIE_RQ_NAPI_STATS(act) \ - DECLARE_LIBIE_RQ_FAIL_STATS(act) + DECLARE_LIBIE_RQ_FAIL_STATS(act) \ + DECLARE_LIBIE_RQ_PP_STATS(act) + +struct page_pool; struct libie_rq_stats { struct u64_stats_sync syncp; @@ -72,6 +103,9 @@ struct libie_rq_stats { #define act(s) u64_stats_t s; DECLARE_LIBIE_RQ_NAPI_STATS(act); DECLARE_LIBIE_RQ_FAIL_STATS(act); + struct_group(pp, + DECLARE_LIBIE_RQ_PP_STATS(act); + ); #undef act }; DECLARE_FLEX_ARRAY(u64_stats_t, raw); @@ -110,7 +144,8 @@ libie_rq_napi_stats_add(struct libie_rq_stats *qs, u32 libie_rq_stats_get_sset_count(void); void libie_rq_stats_get_strings(u8 **data, u32 qid); -void libie_rq_stats_get_data(u64 **data, const struct libie_rq_stats *stats); +void libie_rq_stats_get_data(u64 **data, const struct libie_rq_stats *stats, + struct page_pool *pool); /* Tx per-queue stats: * packets: packets sent from this queue From 1f934b6e9745525e418c394d2741d2cce434f152 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 15 Mar 2023 17:50:17 +0100 Subject: [PATCH 11/32] iavf: switch queue stats to libie iavf is pretty much ready for using the generic libie stats, so drop all the custom code and just use generic definitions. The only thing is that it previously lacked the counter of Tx queue stops. It's present in the other drivers, so add it here as well. The rest is straightforward. There were two fields in the Tx stats struct, which didn't belong there. The first one has never been used, wipe it; and move the other to the queue structure. Plus move around a couple fields in &iavf_ring to account stats structs' alignment. Signed-off-by: Alexander Lobakin --- .../net/ethernet/intel/iavf/iavf_ethtool.c | 87 ++++------------ drivers/net/ethernet/intel/iavf/iavf_main.c | 2 + drivers/net/ethernet/intel/iavf/iavf_txrx.c | 98 ++++++++++--------- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 47 +++------ 4 files changed, 87 insertions(+), 147 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c index de3050c02b6ffc..0dcf50d75f8614 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_ethtool.c +++ b/drivers/net/ethernet/intel/iavf/iavf_ethtool.c @@ -46,16 +46,6 @@ struct iavf_stats { .stat_offset = offsetof(_type, _stat) \ } -/* Helper macro for defining some statistics related to queues */ -#define IAVF_QUEUE_STAT(_name, _stat) \ - IAVF_STAT(struct iavf_ring, _name, _stat) - -/* Stats associated with a Tx or Rx ring */ -static const struct iavf_stats iavf_gstrings_queue_stats[] = { - IAVF_QUEUE_STAT("%s-%u.packets", stats.packets), - IAVF_QUEUE_STAT("%s-%u.bytes", stats.bytes), -}; - /** * iavf_add_one_ethtool_stat - copy the stat into the supplied buffer * @data: location to store the stat value @@ -141,43 +131,6 @@ __iavf_add_ethtool_stats(u64 **data, void *pointer, #define iavf_add_ethtool_stats(data, pointer, stats) \ __iavf_add_ethtool_stats(data, pointer, stats, ARRAY_SIZE(stats)) -/** - * iavf_add_queue_stats - copy queue statistics into supplied buffer - * @data: ethtool stats buffer - * @ring: the ring to copy - * - * Queue statistics must be copied while protected by - * u64_stats_fetch_begin, so we can't directly use iavf_add_ethtool_stats. - * Assumes that queue stats are defined in iavf_gstrings_queue_stats. 
If the - * ring pointer is null, zero out the queue stat values and update the data - * pointer. Otherwise safely copy the stats from the ring into the supplied - * buffer and update the data pointer when finished. - * - * This function expects to be called while under rcu_read_lock(). - **/ -static void -iavf_add_queue_stats(u64 **data, struct iavf_ring *ring) -{ - const unsigned int size = ARRAY_SIZE(iavf_gstrings_queue_stats); - const struct iavf_stats *stats = iavf_gstrings_queue_stats; - unsigned int start; - unsigned int i; - - /* To avoid invalid statistics values, ensure that we keep retrying - * the copy until we get a consistent value according to - * u64_stats_fetch_retry. But first, make sure our ring is - * non-null before attempting to access its syncp. - */ - do { - start = !ring ? 0 : u64_stats_fetch_begin(&ring->syncp); - for (i = 0; i < size; i++) - iavf_add_one_ethtool_stat(&(*data)[i], ring, &stats[i]); - } while (ring && u64_stats_fetch_retry(&ring->syncp, start)); - - /* Once we successfully copy the stats in, update the data pointer */ - *data += size; -} - /** * __iavf_add_stat_strings - copy stat strings into ethtool buffer * @p: ethtool supplied buffer @@ -237,8 +190,6 @@ static const struct iavf_stats iavf_gstrings_stats[] = { #define IAVF_STATS_LEN ARRAY_SIZE(iavf_gstrings_stats) -#define IAVF_QUEUE_STATS_LEN ARRAY_SIZE(iavf_gstrings_queue_stats) - /** * iavf_get_link_ksettings - Get Link Speed and Duplex settings * @netdev: network interface device structure @@ -308,18 +259,22 @@ static int iavf_get_link_ksettings(struct net_device *netdev, **/ static int iavf_get_sset_count(struct net_device *netdev, int sset) { - /* Report the maximum number queues, even if not every queue is - * currently configured. Since allocation of queues is in pairs, - * use netdev->real_num_tx_queues * 2. The real_num_tx_queues is set - * at device creation and never changes. - */ + u32 num; - if (sset == ETH_SS_STATS) - return IAVF_STATS_LEN + - (IAVF_QUEUE_STATS_LEN * 2 * - netdev->real_num_tx_queues); - else + switch (sset) { + case ETH_SS_STATS: + /* Per-queue */ + num = libie_rq_stats_get_sset_count(); + num += libie_sq_stats_get_sset_count(); + num *= netdev->real_num_tx_queues; + + /* Global */ + num += IAVF_STATS_LEN; + + return num; + default: return -EINVAL; + } } /** @@ -346,15 +301,15 @@ static void iavf_get_ethtool_stats(struct net_device *netdev, * it to iterate over rings' stats. */ for (i = 0; i < adapter->num_active_queues; i++) { - struct iavf_ring *ring; + const struct iavf_ring *ring; /* Tx rings stats */ - ring = &adapter->tx_rings[i]; - iavf_add_queue_stats(&data, ring); + libie_sq_stats_get_data(&data, &adapter->tx_rings[i].sq_stats); /* Rx rings stats */ ring = &adapter->rx_rings[i]; - iavf_add_queue_stats(&data, ring); + libie_rq_stats_get_data(&data, &ring->rq_stats, + ring->rx_pages ? ring->pool : NULL); } rcu_read_unlock(); } @@ -376,10 +331,8 @@ static void iavf_get_stat_strings(struct net_device *netdev, u8 *data) * real_num_tx_queues for both Tx and Rx queues. 
*/ for (i = 0; i < netdev->real_num_tx_queues; i++) { - iavf_add_stat_strings(&data, iavf_gstrings_queue_stats, - "tx", i); - iavf_add_stat_strings(&data, iavf_gstrings_queue_stats, - "rx", i); + libie_sq_stats_get_strings(&data, i); + libie_rq_stats_get_strings(&data, i); } } diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index fb2bd1c423a158..60463b3edfacf2 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1587,6 +1587,7 @@ static int iavf_alloc_queues(struct iavf_adapter *adapter) tx_ring->itr_setting = IAVF_ITR_TX_DEF; if (adapter->flags & IAVF_FLAG_WB_ON_ITR_CAPABLE) tx_ring->flags |= IAVF_TXR_FLAGS_WB_ON_ITR; + u64_stats_init(&tx_ring->sq_stats.syncp); rx_ring = &adapter->rx_rings[i]; rx_ring->queue_index = i; @@ -1594,6 +1595,7 @@ static int iavf_alloc_queues(struct iavf_adapter *adapter) rx_ring->dev = &adapter->pdev->dev; rx_ring->count = adapter->rx_desc_count; rx_ring->itr_setting = IAVF_ITR_RX_DEF; + u64_stats_init(&rx_ring->rq_stats.syncp); } adapter->num_active_queues = num_active_queues; diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index 5d087f9b38ed47..ab4863f86a3c3e 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -158,6 +158,9 @@ void iavf_detect_recover_hung(struct iavf_vsi *vsi) for (i = 0; i < vsi->back->num_active_queues; i++) { tx_ring = &vsi->back->tx_rings[i]; if (tx_ring && tx_ring->desc) { + const struct libie_sq_stats *st = &tx_ring->sq_stats; + u32 start; + /* If packet counter has not changed the queue is * likely stalled, so force an interrupt for this * queue. @@ -165,8 +168,13 @@ void iavf_detect_recover_hung(struct iavf_vsi *vsi) * prev_pkt_ctr would be negative if there was no * pending work. */ - packets = tx_ring->stats.packets & INT_MAX; - if (tx_ring->tx_stats.prev_pkt_ctr == packets) { + do { + start = u64_stats_fetch_begin(&st->syncp); + packets = u64_stats_read(&st->packets) & + INT_MAX; + } while (u64_stats_fetch_retry(&st->syncp, start)); + + if (tx_ring->prev_pkt_ctr == packets) { iavf_force_wb(vsi, tx_ring->q_vector); continue; } @@ -175,7 +183,7 @@ void iavf_detect_recover_hung(struct iavf_vsi *vsi) * to iavf_get_tx_pending() */ smp_rmb(); - tx_ring->tx_stats.prev_pkt_ctr = + tx_ring->prev_pkt_ctr = iavf_get_tx_pending(tx_ring, true) ? 
packets : -1; } } @@ -194,10 +202,10 @@ void iavf_detect_recover_hung(struct iavf_vsi *vsi) static bool iavf_clean_tx_irq(struct iavf_vsi *vsi, struct iavf_ring *tx_ring, int napi_budget) { + struct libie_sq_onstack_stats stats = { }; int i = tx_ring->next_to_clean; struct iavf_tx_buffer *tx_buf; struct iavf_tx_desc *tx_desc; - unsigned int total_bytes = 0, total_packets = 0; unsigned int budget = IAVF_DEFAULT_IRQ_WORK; tx_buf = &tx_ring->tx_bi[i]; @@ -224,8 +232,8 @@ static bool iavf_clean_tx_irq(struct iavf_vsi *vsi, tx_buf->next_to_watch = NULL; /* update the statistics for this packet */ - total_bytes += tx_buf->bytecount; - total_packets += tx_buf->gso_segs; + stats.bytes += tx_buf->bytecount; + stats.packets += tx_buf->gso_segs; /* free the skb */ napi_consume_skb(tx_buf->skb, napi_budget); @@ -282,12 +290,9 @@ static bool iavf_clean_tx_irq(struct iavf_vsi *vsi, i += tx_ring->count; tx_ring->next_to_clean = i; - u64_stats_update_begin(&tx_ring->syncp); - tx_ring->stats.bytes += total_bytes; - tx_ring->stats.packets += total_packets; - u64_stats_update_end(&tx_ring->syncp); - tx_ring->q_vector->tx.total_bytes += total_bytes; - tx_ring->q_vector->tx.total_packets += total_packets; + libie_sq_napi_stats_add(&tx_ring->sq_stats, &stats); + tx_ring->q_vector->tx.total_bytes += stats.bytes; + tx_ring->q_vector->tx.total_packets += stats.packets; if (tx_ring->flags & IAVF_TXR_FLAGS_WB_ON_ITR) { /* check to see if there are < 4 descriptors @@ -306,10 +311,10 @@ static bool iavf_clean_tx_irq(struct iavf_vsi *vsi, /* notify netdev of completed buffers */ netdev_tx_completed_queue(txring_txq(tx_ring), - total_packets, total_bytes); + stats.packets, stats.bytes); #define TX_WAKE_THRESHOLD ((s16)(DESC_NEEDED * 2)) - if (unlikely(total_packets && netif_carrier_ok(tx_ring->netdev) && + if (unlikely(stats.packets && netif_carrier_ok(tx_ring->netdev) && (IAVF_DESC_UNUSED(tx_ring) >= TX_WAKE_THRESHOLD))) { /* Make sure that anybody stopping the queue after this * sees the new next_to_clean. 
@@ -320,7 +325,7 @@ static bool iavf_clean_tx_irq(struct iavf_vsi *vsi, !test_bit(__IAVF_VSI_DOWN, vsi->state)) { netif_wake_subqueue(tx_ring->netdev, tx_ring->queue_index); - ++tx_ring->tx_stats.restart_queue; + libie_stats_inc_one(&tx_ring->sq_stats, restarts); } } @@ -675,7 +680,7 @@ int iavf_setup_tx_descriptors(struct iavf_ring *tx_ring) tx_ring->next_to_use = 0; tx_ring->next_to_clean = 0; - tx_ring->tx_stats.prev_pkt_ctr = -1; + tx_ring->prev_pkt_ctr = -1; return 0; err: @@ -731,7 +736,7 @@ void iavf_free_rx_resources(struct iavf_ring *rx_ring) kfree(rx_ring->rx_pages); rx_ring->rx_pages = NULL; - page_pool_destroy(rx_ring->pool); + libie_rx_page_pool_destroy(rx_ring->pool, &rx_ring->rq_stats); rx_ring->dev = dev; if (rx_ring->desc) { @@ -760,8 +765,6 @@ int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring) if (!rx_ring->rx_pages) return ret; - u64_stats_init(&rx_ring->syncp); - /* Round up to nearest 4K */ rx_ring->size = rx_ring->count * sizeof(union iavf_32byte_rx_desc); rx_ring->size = ALIGN(rx_ring->size, 4096); @@ -862,10 +865,8 @@ static u32 __iavf_alloc_rx_pages(struct iavf_ring *rx_ring, u32 to_refill, dma_addr_t dma; page = page_pool_alloc_pages(pool, gfp); - if (!page) { - rx_ring->rx_stats.alloc_page_failed++; + if (!page) break; - } rx_ring->rx_pages[ntu] = page; dma = page_pool_get_dma_addr(page); @@ -1089,25 +1090,23 @@ static struct sk_buff *iavf_build_skb(struct page *page, u32 size) /** * iavf_is_non_eop - process handling of non-EOP buffers - * @rx_ring: Rx ring being processed * @rx_desc: Rx descriptor for current buffer - * @skb: Current socket buffer containing buffer in progress + * @stats: NAPI poll local stats to update * * This function updates next to clean. If the buffer is an EOP buffer * this function exits returning false, otherwise it will place the * sk_buff in the next buffer to be chained and return true indicating * that this is in fact a non-EOP buffer. 
**/ -static bool iavf_is_non_eop(struct iavf_ring *rx_ring, - union iavf_rx_desc *rx_desc, - struct sk_buff *skb) +static bool iavf_is_non_eop(union iavf_rx_desc *rx_desc, + struct libie_rq_onstack_stats *stats) { /* if we are the last buffer then there is nothing else to do */ #define IAVF_RXD_EOF BIT(IAVF_RX_DESC_STATUS_EOF_SHIFT) if (likely(iavf_test_staterr(rx_desc, IAVF_RXD_EOF))) return false; - rx_ring->rx_stats.non_eop_descs++; + stats->fragments++; return true; } @@ -1126,8 +1125,8 @@ static bool iavf_is_non_eop(struct iavf_ring *rx_ring, **/ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) { - unsigned int total_rx_bytes = 0, total_rx_packets = 0; const gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN; + struct libie_rq_onstack_stats stats = { }; u32 to_refill = IAVF_DESC_UNUSED(rx_ring); struct page_pool *pool = rx_ring->pool; struct sk_buff *skb = rx_ring->skb; @@ -1144,9 +1143,13 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) u64 qword; /* return some buffers to hardware, one at a time is too slow */ - if (to_refill >= IAVF_RX_BUFFER_WRITE) + if (to_refill >= IAVF_RX_BUFFER_WRITE) { to_refill = __iavf_alloc_rx_pages(rx_ring, to_refill, gfp); + if (unlikely(to_refill)) + libie_stats_inc_one(&rx_ring->rq_stats, + alloc_page_fail); + } rx_desc = IAVF_RX_DESC(rx_ring, ntc); @@ -1194,7 +1197,8 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) /* exit if we failed to retrieve a buffer */ if (!skb) { page_pool_put_page(pool, page, size, true); - rx_ring->rx_stats.alloc_buff_failed++; + libie_stats_inc_one(&rx_ring->rq_stats, + build_skb_fail); break; } @@ -1206,7 +1210,7 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) prefetch(IAVF_RX_DESC(rx_ring, ntc)); - if (iavf_is_non_eop(rx_ring, rx_desc, skb)) + if (iavf_is_non_eop(rx_desc, &stats)) continue; /* ERR_MASK will only have valid bits if EOP set, and @@ -1226,7 +1230,7 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) } /* probably a little skewed due to removing CRC */ - total_rx_bytes += skb->len; + stats.bytes += skb->len; qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); rx_ptype = (qword & IAVF_RXD_QW1_PTYPE_MASK) >> @@ -1248,7 +1252,7 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) skb = NULL; /* update budget accounting */ - total_rx_packets++; + stats.packets++; } rx_ring->next_to_clean = ntc; @@ -1259,16 +1263,16 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) /* guarantee a trip back through this routine if there was * a failure */ - if (unlikely(to_refill)) + if (unlikely(to_refill)) { + libie_stats_inc_one(&rx_ring->rq_stats, + alloc_page_fail); cleaned_count = budget; + } } - u64_stats_update_begin(&rx_ring->syncp); - rx_ring->stats.packets += total_rx_packets; - rx_ring->stats.bytes += total_rx_bytes; - u64_stats_update_end(&rx_ring->syncp); - rx_ring->q_vector->rx.total_packets += total_rx_packets; - rx_ring->q_vector->rx.total_bytes += total_rx_bytes; + libie_rq_napi_stats_add(&rx_ring->rq_stats, &stats); + rx_ring->q_vector->rx.total_packets += stats.packets; + rx_ring->q_vector->rx.total_bytes += stats.bytes; return cleaned_count; } @@ -1447,10 +1451,8 @@ int iavf_napi_poll(struct napi_struct *napi, int budget) return budget - 1; } tx_only: - if (arm_wb) { - q_vector->tx.ring[0].tx_stats.tx_force_wb++; + if (arm_wb) iavf_enable_wb_on_itr(vsi, q_vector); - } return budget; } @@ -1909,6 +1911,7 @@ bool __iavf_chk_linearize(struct sk_buff *skb) int 
__iavf_maybe_stop_tx(struct iavf_ring *tx_ring, int size) { netif_stop_subqueue(tx_ring->netdev, tx_ring->queue_index); + libie_stats_inc_one(&tx_ring->sq_stats, stops); /* Memory barrier before checking head and tail */ smp_mb(); @@ -1918,7 +1921,8 @@ int __iavf_maybe_stop_tx(struct iavf_ring *tx_ring, int size) /* A reprieve! - use start_queue because it doesn't call schedule */ netif_start_subqueue(tx_ring->netdev, tx_ring->queue_index); - ++tx_ring->tx_stats.restart_queue; + libie_stats_inc_one(&tx_ring->sq_stats, restarts); + return 0; } @@ -2099,7 +2103,7 @@ static netdev_tx_t iavf_xmit_frame_ring(struct sk_buff *skb, return NETDEV_TX_OK; } count = iavf_txd_use_count(skb->len); - tx_ring->tx_stats.tx_linearize++; + libie_stats_inc_one(&tx_ring->sq_stats, linearized); } /* need: 1 descriptor per page * PAGE_SIZE/IAVF_MAX_DATA_PER_TXD, @@ -2109,7 +2113,7 @@ static netdev_tx_t iavf_xmit_frame_ring(struct sk_buff *skb, * otherwise try next time */ if (iavf_maybe_stop_tx(tx_ring, count + 4 + 1)) { - tx_ring->tx_stats.tx_busy++; + libie_stats_inc_one(&tx_ring->sq_stats, busy); return NETDEV_TX_BUSY; } diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index 8fbe549ce6a587..64c93d6fa54d09 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -4,6 +4,8 @@ #ifndef _IAVF_TXRX_H_ #define _IAVF_TXRX_H_ +#include + /* Interrupt Throttling and Rate Limiting Goodies */ #define IAVF_DEFAULT_IRQ_WORK 256 @@ -201,27 +203,6 @@ struct iavf_tx_buffer { u32 tx_flags; }; -struct iavf_queue_stats { - u64 packets; - u64 bytes; -}; - -struct iavf_tx_queue_stats { - u64 restart_queue; - u64 tx_busy; - u64 tx_done_old; - u64 tx_linearize; - u64 tx_force_wb; - int prev_pkt_ctr; - u64 tx_lost_interrupt; -}; - -struct iavf_rx_queue_stats { - u64 non_eop_descs; - u64 alloc_page_failed; - u64 alloc_buff_failed; -}; - /* some useful defines for virtchannel interface, which * is the only remaining user of header split */ @@ -272,21 +253,9 @@ struct iavf_ring { #define IAVF_TXR_FLAGS_VLAN_TAG_LOC_L2TAG2 BIT(4) #define IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2 BIT(5) - /* stats structs */ - struct iavf_queue_stats stats; - struct u64_stats_sync syncp; - union { - struct iavf_tx_queue_stats tx_stats; - struct iavf_rx_queue_stats rx_stats; - }; - - unsigned int size; /* length of descriptor ring in bytes */ - dma_addr_t dma; /* physical address of ring */ - struct iavf_vsi *vsi; /* Backreference to associated VSI */ struct iavf_q_vector *q_vector; /* Backreference to associated vector */ - struct rcu_head rcu; /* to avoid race on free */ struct sk_buff *skb; /* When iavf_clean_rx_ring_irq() must * return before it sees the EOP for * the current packet, we save that skb @@ -295,6 +264,18 @@ struct iavf_ring { * iavf_clean_rx_ring_irq() is called * for this ring. 
*/ + + /* stats structs */ + union { + struct libie_sq_stats sq_stats; + struct libie_rq_stats rq_stats; + }; + + int prev_pkt_ctr; /* For stall detection */ + unsigned int size; /* length of descriptor ring in bytes */ + dma_addr_t dma; /* physical address of ring */ + + struct rcu_head rcu; /* to avoid race on free */ } ____cacheline_internodealigned_in_smp; #define IAVF_ITR_ADAPTIVE_MIN_INC 0x0002 From 4e602363494a5f98dd3da00f7e5fba2a5a7d2ea9 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Mon, 13 Mar 2023 19:24:05 +0100 Subject: [PATCH 12/32] selftests/bpf: robustify test_xdp_do_redirect with more payload magics Currently, the test relies on that only dropped ("xmitted") frames will be recycled and if a frame became an skb, it will be freed later by the stack and never come back to its page_pool. So, it easily gets broken by trying to recycle skbs: test_xdp_do_redirect:PASS:pkt_count_xdp 0 nsec test_xdp_do_redirect:FAIL:pkt_count_zero unexpected pkt_count_zero: actual 9936 != expected 2 test_xdp_do_redirect:PASS:pkt_count_tc 0 nsec That huge mismatch happened because after the TC ingress hook zeroes the magic, the page gets recycled when skb is freed, not returned to the MM layer. "Live frames" mode initializes only new pages and keeps the recycled ones as is by design, so they appear with zeroed magic on the Rx path again. Expand the possible magic values from two: 0 (was "xmitted"/dropped or did hit the TC hook) and 0x42 (hit the input XDP prog) to three: the new one will mark frames hit the TC hook, so that they will elide both @pkt_count_zero and @pkt_count_xdp. They can then be recycled to their page_pool or returned to the page allocator, this won't affect the counters anyhow. Just make sure to mark them as "input" (0x42) when they appear on the Rx path again. Also make an enum from those magics, so that they will be always visible and can be changed in just one place anytime. This also eases adding any new marks later on. Signed-off-by: Alexander Lobakin --- .../bpf/progs/test_xdp_do_redirect.c | 36 +++++++++++++------ 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/test_xdp_do_redirect.c b/tools/testing/selftests/bpf/progs/test_xdp_do_redirect.c index 77a123071940ae..cd2d4e3258b899 100644 --- a/tools/testing/selftests/bpf/progs/test_xdp_do_redirect.c +++ b/tools/testing/selftests/bpf/progs/test_xdp_do_redirect.c @@ -4,6 +4,19 @@ #define ETH_ALEN 6 #define HDR_SZ (sizeof(struct ethhdr) + sizeof(struct ipv6hdr) + sizeof(struct udphdr)) + +/** + * enum frame_mark - magics to distinguish page/packet paths + * @MARK_XMIT: page was recycled due to the frame being "xmitted" by the NIC. + * @MARK_IN: frame is being processed by the input XDP prog. + * @MARK_SKB: frame did hit the TC ingress hook as an skb. 
+ */ +enum frame_mark { + MARK_XMIT = 0U, + MARK_IN = 0x42, + MARK_SKB = 0x45, +}; + const volatile int ifindex_out; const volatile int ifindex_in; const volatile __u8 expect_dst[ETH_ALEN]; @@ -34,10 +47,10 @@ int xdp_redirect(struct xdp_md *xdp) if (*metadata != 0x42) return XDP_ABORTED; - if (*payload == 0) { - *payload = 0x42; + if (*payload == MARK_XMIT) pkts_seen_zero++; - } + + *payload = MARK_IN; if (bpf_xdp_adjust_meta(xdp, 4)) return XDP_ABORTED; @@ -51,7 +64,7 @@ int xdp_redirect(struct xdp_md *xdp) return ret; } -static bool check_pkt(void *data, void *data_end) +static bool check_pkt(void *data, void *data_end, const __u32 mark) { struct ipv6hdr *iph = data + sizeof(struct ethhdr); __u8 *payload = data + HDR_SZ; @@ -59,13 +72,13 @@ static bool check_pkt(void *data, void *data_end) if (payload + 1 > data_end) return false; - if (iph->nexthdr != IPPROTO_UDP || *payload != 0x42) + if (iph->nexthdr != IPPROTO_UDP || *payload != MARK_IN) return false; /* reset the payload so the same packet doesn't get counted twice when * it cycles back through the kernel path and out the dst veth */ - *payload = 0; + *payload = mark; return true; } @@ -75,11 +88,11 @@ int xdp_count_pkts(struct xdp_md *xdp) void *data = (void *)(long)xdp->data; void *data_end = (void *)(long)xdp->data_end; - if (check_pkt(data, data_end)) + if (check_pkt(data, data_end, MARK_XMIT)) pkts_seen_xdp++; - /* Return XDP_DROP to make sure the data page is recycled, like when it - * exits a physical NIC. Recycled pages will be counted in the + /* Return %XDP_DROP to recycle the data page with %MARK_XMIT, like + * it exited a physical NIC. Those pages will be counted in the * pkts_seen_zero counter above. */ return XDP_DROP; @@ -91,9 +104,12 @@ int tc_count_pkts(struct __sk_buff *skb) void *data = (void *)(long)skb->data; void *data_end = (void *)(long)skb->data_end; - if (check_pkt(data, data_end)) + if (check_pkt(data, data_end, MARK_SKB)) pkts_seen_tc++; + /* Will be either recycled or freed, %MARK_SKB makes sure it won't + * hit any of the counters above. + */ return 0; } From 2e28dffa0f454f96f40e25c39e6e5c8c49017ec5 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 3 Mar 2023 13:25:11 +0100 Subject: [PATCH 13/32] net: page_pool, skbuff: make skb_mark_for_recycle() always available skb_mark_for_recycle() is guarded with CONFIG_PAGE_POOL, this creates unneeded complication when using it in the generic code. For now, it's only used in the drivers always selecting Page Pool, so this works. Move the guards so that preprocessor will cut out only the operation itself and the function will still be a noop on !PAGE_POOL systems, but available there as well. No functional changes. 
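A short sketch of what this buys generic code (the caller below is assumed, not taken from the tree): the call site no longer needs its own CONFIG_PAGE_POOL guard, since the no-op fallback now lives inside the helper itself.

	#include <linux/skbuff.h>

	/* Hypothetical generic-code path finishing an Rx skb whose head
	 * page may come from a page_pool.
	 */
	static void example_finish_rx_skb(struct sk_buff *skb, bool pp_backed)
	{
		if (pp_backed)
			skb_mark_for_recycle(skb); /* no-op when !CONFIG_PAGE_POOL */
	}
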
Reported-by: kernel test robot Link: https://lore.kernel.org/oe-kbuild-all/202303020342.Wi2PRFFH-lkp@intel.com Signed-off-by: Alexander Lobakin --- include/linux/skbuff.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index fe661011644b8f..3f3a2a82a86b30 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -5069,12 +5069,12 @@ static inline u64 skb_get_kcov_handle(struct sk_buff *skb) #endif } -#ifdef CONFIG_PAGE_POOL static inline void skb_mark_for_recycle(struct sk_buff *skb) { +#ifdef CONFIG_PAGE_POOL skb->pp_recycle = 1; -} #endif +} #endif /* __KERNEL__ */ #endif /* _LINUX_SKBUFF_H */ From df09c63ee0dbd772647bddbbb475050ddbeabad2 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 1 Mar 2023 16:21:58 +0100 Subject: [PATCH 14/32] xdp: recycle Page Pool backed skbs built from XDP frames __xdp_build_skb_from_frame() state(d): /* Until page_pool get SKB return path, release DMA here */ Page Pool got skb pages recycling in April 2021, but missed this function. xdp_release_frame() is relevant only for Page Pool backed frames and it detaches the page from the corresponding page_pool in order to make it freeable via page_frag_free(). It can instead just mark the output skb as eligible for recycling if the frame is backed by a pp. No change for other memory model types (the same condition check as before). cpumap redirect and veth on Page Pool drivers now become zero-alloc (or almost). Signed-off-by: Alexander Lobakin --- net/core/xdp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/core/xdp.c b/net/core/xdp.c index 528d4b37983df8..f9b9ffb6beb140 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -658,8 +658,8 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf, * - RX ring dev queue index (skb_record_rx_queue) */ - /* Until page_pool get SKB return path, release DMA here */ - xdp_release_frame(xdpf); + if (xdpf->mem.type == MEM_TYPE_PAGE_POOL) + skb_mark_for_recycle(skb); /* Allow SKB to reuse area used by xdp_frame */ xdp_scrub_frame(xdpf); From 4fcbfcde0dd707a786ebf0c0c587f72f996a8935 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 1 Mar 2023 16:29:40 +0100 Subject: [PATCH 15/32] xdp: remove unused {__,}xdp_release_frame() __xdp_build_skb_from_frame() was the last user of {__,}xdp_release_frame(), which detaches pages from the page_pool. All the consumers now recycle Page Pool skbs and page, except mlx5, stmmac and tsnep drivers, which use page_pool_release_page() directly (might change one day). It's safe to assume this functionality is not needed anymore and can be removed (in favor of recycling). Signed-off-by: Alexander Lobakin --- include/net/xdp.h | 29 ----------------------------- net/core/xdp.c | 15 --------------- 2 files changed, 44 deletions(-) diff --git a/include/net/xdp.h b/include/net/xdp.h index 41c57b8b167147..383b25b426a482 100644 --- a/include/net/xdp.h +++ b/include/net/xdp.h @@ -317,35 +317,6 @@ void xdp_flush_frame_bulk(struct xdp_frame_bulk *bq); void xdp_return_frame_bulk(struct xdp_frame *xdpf, struct xdp_frame_bulk *bq); -/* When sending xdp_frame into the network stack, then there is no - * return point callback, which is needed to release e.g. DMA-mapping - * resources with page_pool. Thus, have explicit function to release - * frame resources. 
- */ -void __xdp_release_frame(void *data, struct xdp_mem_info *mem); -static inline void xdp_release_frame(struct xdp_frame *xdpf) -{ - struct xdp_mem_info *mem = &xdpf->mem; - struct skb_shared_info *sinfo; - int i; - - /* Curr only page_pool needs this */ - if (mem->type != MEM_TYPE_PAGE_POOL) - return; - - if (likely(!xdp_frame_has_frags(xdpf))) - goto out; - - sinfo = xdp_get_shared_info_from_frame(xdpf); - for (i = 0; i < sinfo->nr_frags; i++) { - struct page *page = skb_frag_page(&sinfo->frags[i]); - - __xdp_release_frame(page_address(page), mem); - } -out: - __xdp_release_frame(xdpf->data, mem); -} - static __always_inline unsigned int xdp_get_frame_len(struct xdp_frame *xdpf) { struct skb_shared_info *sinfo; diff --git a/net/core/xdp.c b/net/core/xdp.c index f9b9ffb6beb140..018e0fe4e71405 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -531,21 +531,6 @@ void xdp_return_buff(struct xdp_buff *xdp) } EXPORT_SYMBOL_GPL(xdp_return_buff); -/* Only called for MEM_TYPE_PAGE_POOL see xdp.h */ -void __xdp_release_frame(void *data, struct xdp_mem_info *mem) -{ - struct xdp_mem_allocator *xa; - struct page *page; - - rcu_read_lock(); - xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); - page = virt_to_head_page(data); - if (xa) - page_pool_release_page(xa->page_pool, page); - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(__xdp_release_frame); - void xdp_attachment_setup(struct xdp_attachment_info *info, struct netdev_bpf *bpf) { From 3916621255d368632919dc96a7e5654fc3accfde Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Fri, 17 Mar 2023 15:12:33 +0100 Subject: [PATCH 16/32] iavf: optimize Rx hotpath a bunch -- vol. 2 Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 100 ++++++-------------- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 10 +- 2 files changed, 33 insertions(+), 77 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index ab4863f86a3c3e..d6093f4608db1e 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2013 - 2018 Intel Corporation. 
*/ +#include #include #include @@ -902,21 +903,16 @@ void iavf_alloc_rx_pages(struct iavf_ring *rxr) * iavf_rx_checksum - Indicate in skb if hw indicated a good cksum * @vsi: the VSI we care about * @skb: skb currently being received and modified - * @rx_desc: the receive descriptor + * @qword: `wb.qword1.status_error_len` from the descriptor **/ static inline void iavf_rx_checksum(struct iavf_vsi *vsi, struct sk_buff *skb, - union iavf_rx_desc *rx_desc) + u64 qword) { struct libie_rx_ptype_parsed parsed; - u32 rx_error, rx_status; + u32 ptype, rx_error, rx_status; bool ipv4, ipv6; - u8 ptype; - u64 qword; - skb->ip_summed = CHECKSUM_NONE; - - qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); ptype = (qword & IAVF_RXD_QW1_PTYPE_MASK) >> IAVF_RXD_QW1_PTYPE_SHIFT; parsed = libie_parse_rx_ptype(ptype); @@ -969,24 +965,25 @@ static inline void iavf_rx_checksum(struct iavf_vsi *vsi, * @ring: descriptor ring * @rx_desc: specific descriptor * @skb: skb currently being received and modified - * @rx_ptype: Rx packet type + * @qword: `wb.qword1.status_error_len` from the descriptor **/ static inline void iavf_rx_hash(struct iavf_ring *ring, union iavf_rx_desc *rx_desc, struct sk_buff *skb, - u8 rx_ptype) + u64 qword) { + const u64 rss_mask = (u64)IAVF_RX_DESC_FLTSTAT_RSS_HASH << + IAVF_RX_DESC_STATUS_FLTSTAT_SHIFT; struct libie_rx_ptype_parsed parsed; - u32 hash; - const __le64 rss_mask = - cpu_to_le64((u64)IAVF_RX_DESC_FLTSTAT_RSS_HASH << - IAVF_RX_DESC_STATUS_FLTSTAT_SHIFT); + u32 rx_ptype, hash; + + rx_ptype = FIELD_GET(IAVF_RXD_QW1_PTYPE_MASK, qword); parsed = libie_parse_rx_ptype(rx_ptype); if (!libie_has_rx_hash(ring->netdev, parsed)) return; - if ((rx_desc->wb.qword1.status_error_len & rss_mask) == rss_mask) { + if ((qword & rss_mask) == rss_mask) { hash = le32_to_cpu(rx_desc->wb.qword0.hi_dword.rss); libie_skb_set_hash(skb, hash, parsed); } @@ -997,7 +994,7 @@ static inline void iavf_rx_hash(struct iavf_ring *ring, * @rx_ring: rx descriptor ring packet is being transacted on * @rx_desc: pointer to the EOP Rx descriptor * @skb: pointer to current skb being populated - * @rx_ptype: the packet type decoded by hardware + * @qword: `wb.qword1.status_error_len` from the descriptor * * This function checks the ring, descriptor, and packet information in * order to populate the hash, checksum, VLAN, protocol, and @@ -1006,11 +1003,11 @@ static inline void iavf_rx_hash(struct iavf_ring *ring, static inline void iavf_process_skb_fields(struct iavf_ring *rx_ring, union iavf_rx_desc *rx_desc, struct sk_buff *skb, - u8 rx_ptype) + u64 qword) { - iavf_rx_hash(rx_ring, rx_desc, skb, rx_ptype); + iavf_rx_hash(rx_ring, rx_desc, skb, qword); - iavf_rx_checksum(rx_ring->vsi, skb, rx_desc); + iavf_rx_checksum(rx_ring->vsi, skb, qword); skb_record_rx_queue(skb, rx_ring->queue_index); @@ -1018,28 +1015,6 @@ void iavf_process_skb_fields(struct iavf_ring *rx_ring, skb->protocol = eth_type_trans(skb, rx_ring->netdev); } -/** - * iavf_cleanup_headers - Correct empty headers - * @rx_ring: rx descriptor ring packet is being transacted on - * @skb: pointer to current skb being fixed - * - * Also address the case where we are pulling data in on pages only - * and as such no data is present in the skb header. - * - * In addition if skb is not at least 60 bytes we need to pad it so that - * it is large enough to qualify as a valid Ethernet frame. - * - * Returns true if an error was encountered and skb was freed. 
- **/ -static bool iavf_cleanup_headers(struct iavf_ring *rx_ring, struct sk_buff *skb) -{ - /* if eth_skb_pad returns an error the skb was freed */ - if (eth_skb_pad(skb)) - return true; - - return false; -} - /** * iavf_add_rx_frag - Add contents of Rx buffer to sk_buff * @skb: sk_buff to place the data into @@ -1089,21 +1064,14 @@ static struct sk_buff *iavf_build_skb(struct page *page, u32 size) } /** - * iavf_is_non_eop - process handling of non-EOP buffers - * @rx_desc: Rx descriptor for current buffer + * iavf_is_non_eop - check whether a buffer is non-EOP + * @qword: `wb.qword1.status_error_len` from the descriptor * @stats: NAPI poll local stats to update - * - * This function updates next to clean. If the buffer is an EOP buffer - * this function exits returning false, otherwise it will place the - * sk_buff in the next buffer to be chained and return true indicating - * that this is in fact a non-EOP buffer. **/ -static bool iavf_is_non_eop(union iavf_rx_desc *rx_desc, - struct libie_rq_onstack_stats *stats) +static bool iavf_is_non_eop(u64 qword, struct libie_rq_onstack_stats *stats) { /* if we are the last buffer then there is nothing else to do */ -#define IAVF_RXD_EOF BIT(IAVF_RX_DESC_STATUS_EOF_SHIFT) - if (likely(iavf_test_staterr(rx_desc, IAVF_RXD_EOF))) + if (likely(iavf_test_staterr(qword, IAVF_RX_DESC_STATUS_EOF_SHIFT))) return false; stats->fragments++; @@ -1139,7 +1107,6 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) struct page *page; unsigned int size; u16 vlan_tag = 0; - u8 rx_ptype; u64 qword; /* return some buffers to hardware, one at a time is too slow */ @@ -1159,15 +1126,14 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) * hardware wrote DD then the length will be non-zero */ qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); + if (!iavf_test_staterr(qword, IAVF_RX_DESC_STATUS_DD_SHIFT)) + break; /* This memory barrier is needed to keep us from reading * any other fields out of the rx_desc until we have * verified the descriptor has been written back. 
*/ dma_rmb(); -#define IAVF_RXD_DD BIT(IAVF_RX_DESC_STATUS_DD_SHIFT) - if (!iavf_test_staterr(rx_desc, IAVF_RXD_DD)) - break; size = (qword & IAVF_RXD_QW1_LENGTH_PBUF_MASK) >> IAVF_RXD_QW1_LENGTH_PBUF_SHIFT; @@ -1208,23 +1174,19 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) if (unlikely(++ntc == ring_size)) ntc = 0; - prefetch(IAVF_RX_DESC(rx_ring, ntc)); - - if (iavf_is_non_eop(rx_desc, &stats)) + if (iavf_is_non_eop(qword, &stats)) continue; + prefetch(rx_desc); + /* ERR_MASK will only have valid bits if EOP set, and * what we are doing here is actually checking * IAVF_RX_DESC_ERROR_RXE_SHIFT, since it is the zeroth bit in * the error field */ - if (unlikely(iavf_test_staterr(rx_desc, BIT(IAVF_RXD_QW1_ERROR_SHIFT)))) { - dev_kfree_skb_any(skb); - skb = NULL; - continue; - } - - if (iavf_cleanup_headers(rx_ring, skb)) { + if (unlikely(iavf_test_staterr(qword, + IAVF_RXD_QW1_ERROR_SHIFT))) { + dev_kfree_skb(skb); skb = NULL; continue; } @@ -1232,12 +1194,8 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) /* probably a little skewed due to removing CRC */ stats.bytes += skb->len; - qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); - rx_ptype = (qword & IAVF_RXD_QW1_PTYPE_MASK) >> - IAVF_RXD_QW1_PTYPE_SHIFT; - /* populate checksum, VLAN, and protocol */ - iavf_process_skb_fields(rx_ring, rx_desc, skb, rx_ptype); + iavf_process_skb_fields(rx_ring, rx_desc, skb, qword); if (qword & BIT(IAVF_RX_DESC_STATUS_L2TAG1P_SHIFT) && rx_ring->flags & IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1) diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index 64c93d6fa54d09..764b0ada0e6833 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -87,19 +87,17 @@ enum iavf_dyn_idx_t { /** * iavf_test_staterr - tests bits in Rx descriptor status and error fields - * @rx_desc: pointer to receive descriptor (in le64 format) - * @stat_err_bits: value to mask + * @qword: `wb.qword1.status_error_len` from the descriptor + * @stat_err: bit number to mask * * This function does some fast chicanery in order to return the * value of the mask which is really only used for boolean tests. * The status_error_len doesn't need to be shifted because it begins * at offset zero. */ -static inline bool iavf_test_staterr(union iavf_rx_desc *rx_desc, - const u64 stat_err_bits) +static inline bool iavf_test_staterr(u64 qword, const u64 stat_err) { - return !!(rx_desc->wb.qword1.status_error_len & - cpu_to_le64(stat_err_bits)); + return !!(qword & BIT_ULL(stat_err)); } /* How many Rx Buffers do we bundle into one write to the hardware ? */ From c24d2cddd182cd8e82b55fff263175101029718c Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Fri, 7 Oct 2022 09:27:27 -0400 Subject: [PATCH 17/32] i40e: Unify handling of zero ring length in 'configure queue' The current version of Intel 'ice' driver allows for using zero for the ring lenghth in 'configure queue' VIRTCHNL message. Such a value indicates the ring should not be configured. Implement the same handling in i40e driver. Instead of returning an 'invalid parameter' error for zero-sized rings, just skip that ring during queue pair configuration. That unified handling is needed for AF_XDP implementation for 'iavf' driver. In that use case we sometimes need to configure Tx ring only for a given queue pair. 
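To illustrate that use case (a sketch with an assumed helper name, not code from this patch): once both PFs skip zero-length rings, the VF can request a Tx-only reconfiguration of a queue pair simply by leaving the Rx half of the virtchnl descriptor zeroed.

	/* Fill a queue pair config entry so that only the Tx ring gets
	 * (re)configured; rxq.ring_len == 0 makes the PF skip the Rx ring.
	 */
	static void example_fill_tx_only_qpi(struct virtchnl_queue_pair_info *vqpi,
					     const struct iavf_ring *txq,
					     u16 vsi_id)
	{
		memset(vqpi, 0, sizeof(*vqpi));

		vqpi->txq.vsi_id = vsi_id;
		vqpi->txq.queue_id = txq->queue_index;
		vqpi->txq.ring_len = txq->count;
		vqpi->txq.dma_ring_addr = txq->dma;
		/* vqpi->rxq left zeroed -> ring_len == 0 -> skipped by the PF */
	}
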
Signed-off-by: Michal Kubiak --- drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index 8a4587585acde7..ee2a1e682a1c9b 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -612,6 +612,9 @@ static int i40e_config_vsi_tx_queue(struct i40e_vf *vf, u16 vsi_id, u32 qtx_ctl; int ret = 0; + if (info->ring_len == 0) + return 0; + if (!i40e_vc_isvalid_vsi_id(vf, info->vsi_id)) { ret = -ENOENT; goto error_context; @@ -688,6 +691,9 @@ static int i40e_config_vsi_rx_queue(struct i40e_vf *vf, u16 vsi_id, struct i40e_hmc_obj_rxq rx_ctx; int ret = 0; + if (info->ring_len == 0) + return 0; + /* clear the context structure first */ memset(&rx_ctx, 0, sizeof(struct i40e_hmc_obj_rxq)); From ffae47843288803ed8fd7cab12406e45a9b666ad Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Tue, 29 Nov 2022 08:41:22 -0500 Subject: [PATCH 18/32] iavf: Prepare VIRTCHNL functions to support XDP The XDP and AF_XDP feature is initialized using .ndo functions. Those functions are always synchronous and may require some serious queues reconfiguration including changing the number of queues. Performing such a reconfiguration implies sending a bunch of VIRTCHNL messages to the PF in order to disable queues, re-enable and re-configure them, or update the RSS LUT. By definition, those VIRTCHNL messages are sent asynchronously, so the result of each VIRTCHNL operation can be received from the PF via admin queue after some time. Moreover, the previous implementation of some VIRTCHNL functions (e.g. 'iavf_disable_queues()' or 'iavf_enable_queues()' does not allow to call them selectively for specific queues only. In order to addres those problems and cover all scenarios of XDP and AF_XDP initialization, implement a polling mechanism with a timeout for blocking the execution of XDP .ndo functions until the result of VIRTCHNL operation on PF is known to the driver. Also, refactor the existing VIRTCHNL API by adding functions for selective queue enabling, disabling and configuration. 
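A minimal usage sketch of the request/poll pairing described above (the caller and the 500 ms timeout are assumptions for illustration):

	/* Synchronously reconfigure one queue pair from an .ndo callback:
	 * flush any pending VIRTCHNL op, send the request, then poll the
	 * admin queue for the PF's answer with a timeout.
	 */
	static int example_reconfig_qp_sync(struct iavf_adapter *adapter,
					    u32 qp_idx)
	{
		int err;

		err = iavf_process_pending_pf_msg(adapter, 500);
		if (err)
			return err;

		iavf_configure_selected_queues(adapter, BIT(qp_idx));

		return iavf_get_configure_queues_result(adapter, 500);
	}
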
Signed-off-by: Michal Kubiak --- drivers/net/ethernet/intel/iavf/iavf.h | 15 + .../net/ethernet/intel/iavf/iavf_virtchnl.c | 334 ++++++++++++++++-- 2 files changed, 321 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 7dbec98d2a983f..5b3eb353ff79c3 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -537,11 +537,22 @@ int iavf_send_vf_offload_vlan_v2_msg(struct iavf_adapter *adapter); void iavf_set_queue_vlan_tag_loc(struct iavf_adapter *adapter); u16 iavf_get_num_vlans_added(struct iavf_adapter *adapter); void iavf_irq_enable(struct iavf_adapter *adapter, bool flush); +void iavf_configure_selected_queues(struct iavf_adapter *adapter, u32 qp_mask); void iavf_configure_queues(struct iavf_adapter *adapter); +int iavf_get_configure_queues_result(struct iavf_adapter *adapter, + unsigned int msecs); void iavf_deconfigure_queues(struct iavf_adapter *adapter); void iavf_enable_queues(struct iavf_adapter *adapter); void iavf_disable_queues(struct iavf_adapter *adapter); +void iavf_enable_selected_queues(struct iavf_adapter *adapter, u32 rx_queues, + u32 tx_queues); +void iavf_disable_selected_queues(struct iavf_adapter *adapter, u32 rx_queues, + u32 tx_queues); +int iavf_get_queue_enable_result(struct iavf_adapter *adapter, unsigned int msecs); +int iavf_get_queue_disable_result(struct iavf_adapter *adapter, unsigned int msecs); void iavf_map_queues(struct iavf_adapter *adapter); +int iavf_get_map_queues_result(struct iavf_adapter *adapter, + unsigned int msecs); int iavf_request_queues(struct iavf_adapter *adapter, int num); void iavf_add_ether_addrs(struct iavf_adapter *adapter); void iavf_del_ether_addrs(struct iavf_adapter *adapter); @@ -554,11 +565,15 @@ void iavf_get_hena(struct iavf_adapter *adapter); void iavf_set_hena(struct iavf_adapter *adapter); void iavf_set_rss_key(struct iavf_adapter *adapter); void iavf_set_rss_lut(struct iavf_adapter *adapter); +int iavf_get_setting_rss_lut_result(struct iavf_adapter *adapter, + unsigned int msecs); void iavf_enable_vlan_stripping(struct iavf_adapter *adapter); void iavf_disable_vlan_stripping(struct iavf_adapter *adapter); void iavf_virtchnl_completion(struct iavf_adapter *adapter, enum virtchnl_ops v_opcode, enum iavf_status v_retval, u8 *msg, u16 msglen); +int iavf_process_pending_pf_msg(struct iavf_adapter *adapter, + unsigned int timeout_msecs); int iavf_config_rss(struct iavf_adapter *adapter); int iavf_lan_add_device(struct iavf_adapter *adapter); int iavf_lan_del_device(struct iavf_adapter *adapter); diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 3a031d8b9685e2..18c27ae2d2ee54 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -52,6 +52,59 @@ int iavf_send_api_ver(struct iavf_adapter *adapter) sizeof(vvi)); } +/** + * iavf_poll_virtchnl_msg_timeout + * @hw: HW configuration structure + * @event: event to populate on success + * @op_to_poll: requested virtchnl op to poll for + * @msecs: timeout in milliseconds + * + * Initialize poll for virtchnl msg matching the requested_op. Returns 0 + * if a message of the correct opcode is in the queue or an error code + * if no message matching the op code is waiting and other failures + * (including timeout). In case of timeout -EBUSY error is returned. 
+ */ +static int +iavf_poll_virtchnl_msg_timeout(struct iavf_hw *hw, + struct iavf_arq_event_info *event, + enum virtchnl_ops op_to_poll, + unsigned int msecs) +{ + unsigned int wait, delay = 10; + enum virtchnl_ops received_op; + enum iavf_status status; + u32 v_retval; + + for (wait = 0; wait < msecs; wait += delay) { + /* When the AQ is empty, iavf_clean_arq_element will be + * nonzero and after some delay this loop will check again + * if any message is added to the AQ. + */ + status = iavf_clean_arq_element(hw, event, NULL); + if (status == IAVF_ERR_ADMIN_QUEUE_NO_WORK) + goto wait_for_msg; + else if (status != IAVF_SUCCESS) + break; + received_op = + (enum virtchnl_ops)le32_to_cpu(event->desc.cookie_high); + if (op_to_poll == received_op) + break; +wait_for_msg: + msleep(delay); + status = IAVF_ERR_NOT_READY; + } + + if (status == IAVF_SUCCESS) { + v_retval = le32_to_cpu(event->desc.cookie_low); + v_retval = virtchnl_status_to_errno((enum virtchnl_status_code) + v_retval); + } else { + v_retval = iavf_status_to_errno(status); + } + + return v_retval; +} + /** * iavf_poll_virtchnl_msg * @hw: HW configuration structure @@ -87,6 +140,83 @@ iavf_poll_virtchnl_msg(struct iavf_hw *hw, struct iavf_arq_event_info *event, return virtchnl_status_to_errno((enum virtchnl_status_code)v_retval); } +/** + * iavf_process_pending_pf_msg + * @adapter: adapter structure + * @timeout_msec: timeout in milliseconds + * + * Check if any VIRTCHNL message is currently pending and process it + * if needed. + * Poll the admin queue for the PF response and process it using + * a standard handler. + * If no PF response has been received within a given timeout, exit + * with an error. + */ +int +iavf_process_pending_pf_msg(struct iavf_adapter *adapter, + unsigned int timeout_msecs) +{ + enum virtchnl_ops current_op = adapter->current_op; + struct iavf_hw *hw = &adapter->hw; + struct iavf_arq_event_info event; + enum virtchnl_ops v_op; + enum iavf_status v_ret; + int err; + + if (current_op == VIRTCHNL_OP_UNKNOWN) + return 0; + + event.buf_len = IAVF_MAX_AQ_BUF_SIZE; + event.msg_buf = kzalloc(IAVF_MAX_AQ_BUF_SIZE, GFP_KERNEL); + if (!event.msg_buf) + return -ENOMEM; + + err = iavf_poll_virtchnl_msg_timeout(hw, &event, current_op, + timeout_msecs); + if (err) + goto free_exit; + + v_op = (enum virtchnl_ops)le32_to_cpu(event.desc.cookie_high); + v_ret = (enum iavf_status)le32_to_cpu(event.desc.cookie_low); + + iavf_virtchnl_completion(adapter, v_op, v_ret, event.msg_buf, + event.msg_len); + +free_exit: + kfree(event.msg_buf); + + return err; +} + +/** + * iavf_get_vf_op_result + * @adapter: adapter structure + * @op: virtchnl operation + * @msecs: timeout in milliseconds + * + * Return a result of a given operation returned by PF + * or exit with timeout. 
+ **/ +static int iavf_get_vf_op_result(struct iavf_adapter *adapter, + enum virtchnl_ops op, + unsigned int msecs) +{ + struct iavf_hw *hw = &adapter->hw; + struct iavf_arq_event_info event; + int err; + + event.buf_len = IAVF_MAX_AQ_BUF_SIZE; + event.msg_buf = kzalloc(IAVF_MAX_AQ_BUF_SIZE, GFP_KERNEL); + if (!event.msg_buf) + return -ENOMEM; + + err = iavf_poll_virtchnl_msg_timeout(hw, &event, op, msecs); + kfree(event.msg_buf); + adapter->current_op = VIRTCHNL_OP_UNKNOWN; + + return err; +} + /** * iavf_verify_api_ver * @adapter: adapter structure @@ -263,16 +393,62 @@ int iavf_get_vf_vlan_v2_caps(struct iavf_adapter *adapter) } /** - * iavf_configure_queues + * iavf_set_qp_config_info + * @vqpi: virtchannel structure for queue pair configuration * @adapter: adapter structure + * @queue_index: index of queue pair in the adapter structure + * @max_frame: maximal frame size supported by the adapter * - * Request that the PF set up our (previously allocated) queues. + * Fill virtchannel queue pair configuration structure + * with data for the Rx and Tx queues of a given index. **/ -void iavf_configure_queues(struct iavf_adapter *adapter) +static void iavf_set_qp_config_info(struct virtchnl_queue_pair_info *vqpi, + struct iavf_adapter *adapter, + int queue_index, int max_frame) { + struct iavf_ring *txq = &adapter->tx_rings[queue_index]; + struct iavf_ring *rxq = &adapter->rx_rings[queue_index]; + + vqpi->txq.vsi_id = adapter->vsi_res->vsi_id; + vqpi->txq.queue_id = queue_index; + vqpi->txq.ring_len = txq->count; + vqpi->txq.dma_ring_addr = txq->dma; + + vqpi->rxq.vsi_id = adapter->vsi_res->vsi_id; + vqpi->rxq.queue_id = queue_index; + vqpi->rxq.ring_len = rxq->count; + vqpi->rxq.dma_ring_addr = rxq->dma; + vqpi->rxq.max_pkt_size = max_frame; + vqpi->rxq.databuffer_size = LIBIE_RX_BUF_LEN; +} + +/** + * iavf_get_configure_queues_result + * @adapter: adapter structure + * @msecs: timeout in milliseconds + * + * Return a result of CONFIG_VSI_QUEUES command returned by PF + * or exit with timeout. + **/ +int iavf_get_configure_queues_result(struct iavf_adapter *adapter, + unsigned int msecs) +{ + return iavf_get_vf_op_result(adapter, VIRTCHNL_OP_CONFIG_VSI_QUEUES, + msecs); +} + +/** + * iavf_configure_selected_queues + * @adapter: adapter structure + * @qp_mask: mask of queue pairs to configure + * + * Request that the PF set up our selected (previously allocated) queues. + **/ +void iavf_configure_selected_queues(struct iavf_adapter *adapter, u32 qp_mask) +{ + unsigned long num_qps_to_config, mask = qp_mask; + u32 idx, max_frame = adapter->vf_res->max_mtu; struct virtchnl_vsi_queue_config_info *vqci; - u32 i, max_frame = adapter->vf_res->max_mtu; - int pairs = adapter->num_active_queues; struct virtchnl_queue_pair_info *vqpi; size_t len; @@ -284,29 +460,21 @@ void iavf_configure_queues(struct iavf_adapter *adapter) adapter->current_op); return; } + num_qps_to_config = hweight_long(mask); adapter->current_op = VIRTCHNL_OP_CONFIG_VSI_QUEUES; - len = struct_size(vqci, qpair, pairs); + len = struct_size(vqci, qpair, num_qps_to_config); vqci = kzalloc(len, GFP_KERNEL); if (!vqci) return; vqci->vsi_id = adapter->vsi_res->vsi_id; - vqci->num_queue_pairs = pairs; + vqci->num_queue_pairs = num_qps_to_config; vqpi = vqci->qpair; /* Size check is not needed here - HW max is 16 queue pairs, and we * can fit info for 31 of them into the AQ buffer before it overflows. 
*/ - for (i = 0; i < pairs; i++) { - vqpi->txq.vsi_id = vqci->vsi_id; - vqpi->txq.queue_id = i; - vqpi->txq.ring_len = adapter->tx_rings[i].count; - vqpi->txq.dma_ring_addr = adapter->tx_rings[i].dma; - vqpi->rxq.vsi_id = vqci->vsi_id; - vqpi->rxq.queue_id = i; - vqpi->rxq.ring_len = adapter->rx_rings[i].count; - vqpi->rxq.dma_ring_addr = adapter->rx_rings[i].dma; - vqpi->rxq.max_pkt_size = max_frame; - vqpi->rxq.databuffer_size = LIBIE_RX_BUF_LEN; + for_each_set_bit(idx, &mask, adapter->num_active_queues) { + iavf_set_qp_config_info(vqpi, adapter, idx, max_frame); vqpi++; } @@ -317,12 +485,59 @@ void iavf_configure_queues(struct iavf_adapter *adapter) } /** - * iavf_enable_queues + * iavf_configure_queues * @adapter: adapter structure * - * Request that the PF enable all of our queues. + * Send a request to PF to set up all allocated queues. **/ -void iavf_enable_queues(struct iavf_adapter *adapter) +void iavf_configure_queues(struct iavf_adapter *adapter) +{ + int pairs = adapter->num_active_queues; + u32 qpair_mask = BIT(pairs) - 1; + + iavf_configure_selected_queues(adapter, qpair_mask); +} + +/** + * iavf_get_queue_disable_result + * @adapter: adapter structure + * @msecs: timeout in milliseconds + * + * Return a result DISABLE_QUEUES command result returned by PF or exit + * with timeout. + **/ +int iavf_get_queue_disable_result(struct iavf_adapter *adapter, + unsigned int msecs) +{ + return iavf_get_vf_op_result(adapter, VIRTCHNL_OP_DISABLE_QUEUES, + msecs); +} + +/** + * iavf_get_queue_enable_result + * @adapter: adapter structure + * @msecs: timeout in milliseconds + * + * Return a result ENABLE_QUEUES command result returned by PF or exit + * with timeout. + **/ +int iavf_get_queue_enable_result(struct iavf_adapter *adapter, + unsigned int msecs) +{ + return iavf_get_vf_op_result(adapter, VIRTCHNL_OP_ENABLE_QUEUES, + msecs); +} + +/** + * iavf_enable_selected_queues + * @adapter: adapter structure + * @rx_queues: mask of Rx queues + * @tx_queues: mask of Tx queues + * + * Send a request to PF to enable selected queues. + **/ +void iavf_enable_selected_queues(struct iavf_adapter *adapter, u32 rx_queues, + u32 tx_queues) { struct virtchnl_queue_select vqs; @@ -334,20 +549,23 @@ void iavf_enable_queues(struct iavf_adapter *adapter) } adapter->current_op = VIRTCHNL_OP_ENABLE_QUEUES; vqs.vsi_id = adapter->vsi_res->vsi_id; - vqs.tx_queues = BIT(adapter->num_active_queues) - 1; - vqs.rx_queues = vqs.tx_queues; + vqs.tx_queues = tx_queues; + vqs.rx_queues = rx_queues; adapter->aq_required &= ~IAVF_FLAG_AQ_ENABLE_QUEUES; iavf_send_pf_msg(adapter, VIRTCHNL_OP_ENABLE_QUEUES, (u8 *)&vqs, sizeof(vqs)); } /** - * iavf_disable_queues + * iavf_disable_selected_queues * @adapter: adapter structure + * @rx_queues: mask of Rx queues + * @tx_queues: mask of Tx queues * - * Request that the PF disable all of our queues. + * Send a request to PF to disable selected queues. 
**/ -void iavf_disable_queues(struct iavf_adapter *adapter) +void iavf_disable_selected_queues(struct iavf_adapter *adapter, u32 rx_queues, + u32 tx_queues) { struct virtchnl_queue_select vqs; @@ -359,13 +577,58 @@ void iavf_disable_queues(struct iavf_adapter *adapter) } adapter->current_op = VIRTCHNL_OP_DISABLE_QUEUES; vqs.vsi_id = adapter->vsi_res->vsi_id; - vqs.tx_queues = BIT(adapter->num_active_queues) - 1; - vqs.rx_queues = vqs.tx_queues; + vqs.tx_queues = tx_queues; + vqs.rx_queues = rx_queues; adapter->aq_required &= ~IAVF_FLAG_AQ_DISABLE_QUEUES; iavf_send_pf_msg(adapter, VIRTCHNL_OP_DISABLE_QUEUES, (u8 *)&vqs, sizeof(vqs)); } +/** + * iavf_enable_queues + * @adapter: adapter structure + * + * Send a request to PF to enable all allocated queues. + **/ +void iavf_enable_queues(struct iavf_adapter *adapter) +{ + u32 num_tx_queues = adapter->num_active_queues; + u32 rx_queues = BIT(adapter->num_active_queues) - 1; + u32 tx_queues = BIT(num_tx_queues) - 1; + + iavf_enable_selected_queues(adapter, rx_queues, tx_queues); +} + +/** + * iavf_disable_queues + * @adapter: adapter structure + * + * Send a request to PF to disable all allocated queues. + **/ +void iavf_disable_queues(struct iavf_adapter *adapter) +{ + u32 num_tx_queues = adapter->num_active_queues; + u32 rx_queues = BIT(adapter->num_active_queues) - 1; + u32 tx_queues = BIT(num_tx_queues) - 1; + + iavf_disable_selected_queues(adapter, rx_queues, tx_queues); +} + +/** + * iavf_get_map_queues_result + * @adapter: adapter structure + * @msecs: timeout in milliseconds + * + * Return a result CONFIG_VSI_QUEUES command result returned by PF + * or exit with timeout. + **/ +int iavf_get_map_queues_result(struct iavf_adapter *adapter, + unsigned int msecs) +{ + return iavf_get_vf_op_result(adapter, VIRTCHNL_OP_CONFIG_IRQ_MAP, + msecs); +} + /** * iavf_map_queues * @adapter: adapter structure @@ -1083,6 +1346,21 @@ void iavf_set_rss_key(struct iavf_adapter *adapter) kfree(vrk); } +/** + * iavf_get_setting_rss_lut_result + * @adapter: adapter structure + * @msecs: timeout in milliseconds + * + * Return a result of CONFIG_VSI_QUEUES command returned by PF + * or exit with timeout. + **/ +int iavf_get_setting_rss_lut_result(struct iavf_adapter *adapter, + unsigned int msecs) +{ + return iavf_get_vf_op_result(adapter, VIRTCHNL_OP_CONFIG_RSS_LUT, + msecs); +} + /** * iavf_set_rss_lut * @adapter: adapter structure From 421edaaba55d7f0fe9eb7ada9fcc2ab8afad909c Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Tue, 29 Nov 2022 16:32:05 +0100 Subject: [PATCH 19/32] iavf: Refactor ring initialization functions to handle XDP Introduce modular functions to allocate and initialize Rx and Tx rings in order to prepare the initialization procedure to easily fit the XDP setup. 
Signed-off-by: Michal Kubiak --- drivers/net/ethernet/intel/iavf/iavf.h | 6 +- drivers/net/ethernet/intel/iavf/iavf_main.c | 236 ++++++++++++-------- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 8 +- 3 files changed, 149 insertions(+), 101 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 5b3eb353ff79c3..43017aca25cc20 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -263,8 +263,8 @@ struct iavf_adapter { /* Lock to protect accesses to MAC and VLAN lists */ spinlock_t mac_vlan_list_lock; char misc_vector_name[IFNAMSIZ + 9]; - int num_active_queues; - int num_req_queues; + u32 num_active_queues; + u32 num_req_queues; /* TX */ struct iavf_ring *tx_rings; @@ -574,6 +574,8 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, enum iavf_status v_retval, u8 *msg, u16 msglen); int iavf_process_pending_pf_msg(struct iavf_adapter *adapter, unsigned int timeout_msecs); +void iavf_configure_rx_ring(struct iavf_adapter *adapter, + struct iavf_ring *rx_ring); int iavf_config_rss(struct iavf_adapter *adapter); int iavf_lan_add_device(struct iavf_adapter *adapter); int iavf_lan_del_device(struct iavf_adapter *adapter); diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 60463b3edfacf2..2fff901227e78c 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -703,6 +703,22 @@ static void iavf_configure_tx(struct iavf_adapter *adapter) adapter->tx_rings[i].tail = hw->hw_addr + IAVF_QTX_TAIL1(i); } +/** + * iavf_configure_rx_ring - Configure a single Rx ring + * @adapter: board private structure + * @rx_ring: Rx ring to be configured + * @rx_buf_len: buffer length that shall be used for the given Rx ring. + * + **/ +void iavf_configure_rx_ring(struct iavf_adapter *adapter, + struct iavf_ring *rx_ring) +{ + u32 queue_idx = rx_ring->queue_index; + + rx_ring->tail = adapter->hw.hw_addr + IAVF_QRX_TAIL1(queue_idx); + iavf_alloc_rx_pages(rx_ring); +} + /** * iavf_configure_rx - Configure Receive Unit after Reset * @adapter: board private structure @@ -711,10 +727,8 @@ static void iavf_configure_tx(struct iavf_adapter *adapter) **/ static void iavf_configure_rx(struct iavf_adapter *adapter) { - struct iavf_hw *hw = &adapter->hw; - for (u32 i = 0; i < adapter->num_active_queues; i++) - adapter->rx_rings[i].tail = hw->hw_addr + IAVF_QRX_TAIL1(i); + iavf_configure_rx_ring(adapter, &adapter->rx_rings[i]); } /** @@ -1206,19 +1220,12 @@ static void iavf_napi_disable_all(struct iavf_adapter *adapter) static void iavf_configure(struct iavf_adapter *adapter) { struct net_device *netdev = adapter->netdev; - int i; iavf_set_rx_mode(netdev); iavf_configure_tx(adapter); iavf_configure_rx(adapter); adapter->aq_required |= IAVF_FLAG_AQ_CONFIGURE_QUEUES; - - for (i = 0; i < adapter->num_active_queues; i++) { - struct iavf_ring *ring = &adapter->rx_rings[i]; - - iavf_alloc_rx_pages(ring); - } } /** @@ -1457,6 +1464,72 @@ static void iavf_free_queues(struct iavf_adapter *adapter) adapter->rx_rings = NULL; } +/** + * iavf_set_rx_queue_vlan_tag_loc - set location for VLAN tag offload in Rx + * @adapter: board private structure + * @rx_ring: Rx ring where VLAN tag offload for VLAN will be set + * + * Helper function for setting VLAN tag offload location in a given Rx ring. 
+ */ +static void iavf_set_rx_queue_vlan_tag_loc(struct iavf_adapter *adapter, + struct iavf_ring *rx_ring) +{ + struct virtchnl_vlan_supported_caps *caps; + + /* prevent multiple L2TAG bits being set after VFR */ + rx_ring->flags &= + ~(IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1 | + IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2); + + if (VLAN_ALLOWED(adapter)) { + rx_ring->flags |= IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1; + return; + } + + if (!VLAN_V2_ALLOWED(adapter)) + return; + + caps = &adapter->vlan_v2_caps.offloads.stripping_support; + + if ((caps->outer | caps->inner) & VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1) + rx_ring->flags |= IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1; + else if ((caps->outer | caps->inner) & VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2) + rx_ring->flags |= IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2; +} + +/** + * iavf_set_tx_queue_vlan_tag_loc - set location for VLAN tag offload in Tx + * @adapter: board private structure + * @tx_ring: Tx ring where VLAN tag offload for VLAN will be set + * + * Helper function for setting VLAN tag offload location in a given Tx ring. + */ +static void iavf_set_tx_queue_vlan_tag_loc(struct iavf_adapter *adapter, + struct iavf_ring *tx_ring) +{ + struct virtchnl_vlan_supported_caps *caps; + + /* prevent multiple L2TAG bits being set after VFR */ + tx_ring->flags &= + ~(IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1 | + IAVF_TXR_FLAGS_VLAN_TAG_LOC_L2TAG2); + + if (VLAN_ALLOWED(adapter)) { + tx_ring->flags |= IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1; + return; + } + + if (!VLAN_V2_ALLOWED(adapter)) + return; + + caps = &adapter->vlan_v2_caps.offloads.insertion_support; + + if ((caps->outer | caps->inner) & VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1) + tx_ring->flags |= IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1; + else if ((caps->outer | caps->inner) & VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2) + tx_ring->flags |= IAVF_TXR_FLAGS_VLAN_TAG_LOC_L2TAG2; +} + /** * iavf_set_queue_vlan_tag_loc - set location for VLAN tag offload * @adapter: board private structure @@ -1471,72 +1544,58 @@ void iavf_set_queue_vlan_tag_loc(struct iavf_adapter *adapter) int i; for (i = 0; i < adapter->num_active_queues; i++) { - struct iavf_ring *tx_ring = &adapter->tx_rings[i]; - struct iavf_ring *rx_ring = &adapter->rx_rings[i]; - - /* prevent multiple L2TAG bits being set after VFR */ - tx_ring->flags &= - ~(IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1 | - IAVF_TXR_FLAGS_VLAN_TAG_LOC_L2TAG2); - rx_ring->flags &= - ~(IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1 | - IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2); - - if (VLAN_ALLOWED(adapter)) { - tx_ring->flags |= IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1; - rx_ring->flags |= IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1; - } else if (VLAN_V2_ALLOWED(adapter)) { - struct virtchnl_vlan_supported_caps *stripping_support; - struct virtchnl_vlan_supported_caps *insertion_support; - - stripping_support = - &adapter->vlan_v2_caps.offloads.stripping_support; - insertion_support = - &adapter->vlan_v2_caps.offloads.insertion_support; - - if (stripping_support->outer) { - if (stripping_support->outer & - VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1) - rx_ring->flags |= - IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1; - else if (stripping_support->outer & - VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2) - rx_ring->flags |= - IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2; - } else if (stripping_support->inner) { - if (stripping_support->inner & - VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1) - rx_ring->flags |= - IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1; - else if (stripping_support->inner & - VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2_2) - rx_ring->flags |= - IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2; - } - - if 
(insertion_support->outer) { - if (insertion_support->outer & - VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1) - tx_ring->flags |= - IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1; - else if (insertion_support->outer & - VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2) - tx_ring->flags |= - IAVF_TXR_FLAGS_VLAN_TAG_LOC_L2TAG2; - } else if (insertion_support->inner) { - if (insertion_support->inner & - VIRTCHNL_VLAN_TAG_LOCATION_L2TAG1) - tx_ring->flags |= - IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1; - else if (insertion_support->inner & - VIRTCHNL_VLAN_TAG_LOCATION_L2TAG2) - tx_ring->flags |= - IAVF_TXR_FLAGS_VLAN_TAG_LOC_L2TAG2; - } - } + iavf_set_rx_queue_vlan_tag_loc(adapter, &adapter->rx_rings[i]); + iavf_set_tx_queue_vlan_tag_loc(adapter, &adapter->tx_rings[i]); } } +/** + * iavf_init_rx_ring - Init pointers and flags for a given Rx ring + * @adapter: board private structure to initialize + * @ring_index: index of the ring to be initialized + * + * Init all basic pointers and flags in a newly allocated Rx ring. + **/ +static void iavf_init_rx_ring(struct iavf_adapter *adapter, + int ring_index) +{ + struct iavf_ring *rx_ring = &adapter->rx_rings[ring_index]; + + rx_ring->vsi = &adapter->vsi; + rx_ring->queue_index = ring_index; + rx_ring->netdev = adapter->netdev; + rx_ring->dev = &adapter->pdev->dev; + rx_ring->count = adapter->rx_desc_count; + rx_ring->itr_setting = IAVF_ITR_RX_DEF; +} + +/** + * iavf_init_tx_ring - Init pointers and flags for a given Tx ring + * @adapter: board private structure to initialize + * @ring_index: index of the ring to be initialized + * @xdp_ring: set to true if the ring is XDP Tx queue + * + * Init all basic pointers and flags in a newly allocated Tx ring. + **/ +static void iavf_init_tx_ring(struct iavf_adapter *adapter, int ring_index) +{ + struct iavf_ring *tx_ring = &adapter->tx_rings[ring_index]; + + tx_ring->vsi = &adapter->vsi; + tx_ring->queue_index = ring_index; + tx_ring->netdev = adapter->netdev; + tx_ring->dev = &adapter->pdev->dev; + tx_ring->count = adapter->tx_desc_count; + tx_ring->itr_setting = IAVF_ITR_TX_DEF; + + tx_ring->flags = 0; + + if (adapter->flags & IAVF_FLAG_WB_ON_ITR_CAPABLE) + tx_ring->flags |= IAVF_TXR_FLAGS_WB_ON_ITR; + + u64_stats_init(&tx_ring->sq_stats.syncp); +} + /** * iavf_alloc_queues - Allocate memory for all rings * @adapter: board private structure to initialize @@ -1547,7 +1606,8 @@ void iavf_set_queue_vlan_tag_loc(struct iavf_adapter *adapter) **/ static int iavf_alloc_queues(struct iavf_adapter *adapter) { - int i, num_active_queues; + u32 num_active_queues; + int i; /* If we're in reset reallocating queues we don't actually know yet for * certain the PF gave us the number of queues we asked for but we'll @@ -1564,7 +1624,6 @@ static int iavf_alloc_queues(struct iavf_adapter *adapter) adapter->vsi_res->num_queue_pairs, (int)(num_online_cpus())); - adapter->tx_rings = kcalloc(num_active_queues, sizeof(struct iavf_ring), GFP_KERNEL); if (!adapter->tx_rings) @@ -1574,32 +1633,13 @@ static int iavf_alloc_queues(struct iavf_adapter *adapter) if (!adapter->rx_rings) goto err_out; - for (i = 0; i < num_active_queues; i++) { - struct iavf_ring *tx_ring; - struct iavf_ring *rx_ring; - - tx_ring = &adapter->tx_rings[i]; - - tx_ring->queue_index = i; - tx_ring->netdev = adapter->netdev; - tx_ring->dev = &adapter->pdev->dev; - tx_ring->count = adapter->tx_desc_count; - tx_ring->itr_setting = IAVF_ITR_TX_DEF; - if (adapter->flags & IAVF_FLAG_WB_ON_ITR_CAPABLE) - tx_ring->flags |= IAVF_TXR_FLAGS_WB_ON_ITR; - u64_stats_init(&tx_ring->sq_stats.syncp); + 
adapter->num_active_queues = num_active_queues; - rx_ring = &adapter->rx_rings[i]; - rx_ring->queue_index = i; - rx_ring->netdev = adapter->netdev; - rx_ring->dev = &adapter->pdev->dev; - rx_ring->count = adapter->rx_desc_count; - rx_ring->itr_setting = IAVF_ITR_RX_DEF; - u64_stats_init(&rx_ring->rq_stats.syncp); + for (i = 0; i < num_active_queues; i++) { + iavf_init_tx_ring(adapter, i); + iavf_init_rx_ring(adapter, i); } - adapter->num_active_queues = num_active_queues; - iavf_set_queue_vlan_tag_loc(adapter); return 0; diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index d6093f4608db1e..06060fa685e913 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -656,7 +656,8 @@ static void iavf_update_itr(struct iavf_q_vector *q_vector, int iavf_setup_tx_descriptors(struct iavf_ring *tx_ring) { struct device *dev = tx_ring->dev; - int bi_size; + struct iavf_tx_desc *tx_desc; + int bi_size, j; if (!dev) return -ENOMEM; @@ -682,6 +683,11 @@ int iavf_setup_tx_descriptors(struct iavf_ring *tx_ring) tx_ring->next_to_use = 0; tx_ring->next_to_clean = 0; tx_ring->prev_pkt_ctr = -1; + for (j = 0; j < tx_ring->count; j++) { + tx_desc = IAVF_TX_DESC(tx_ring, j); + tx_desc->cmd_type_offset_bsz = 0; + } + return 0; err: From d5d0aa217d5af330d8516a77d50f9f9f1ee5ea2e Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Fri, 2 Dec 2022 14:34:16 +0100 Subject: [PATCH 20/32] iavf: Prepare rings to support XDP Extend basic structures of the driver (e.g. 'iavf_adapter', 'iavf_ring') by adding members necessary to support XDP. Register those members using required functions from BPF API. Implement a support for XDP_TX and XDP_REDIRECT actions by adding additional XDP Tx queues to transmit packets without interferring a regular Tx traffic. Finally, add required XDP setup and release calls to queue allocation and deallocation functions respectively. Signed-off-by: Michal Kubiak Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/iavf/iavf.h | 16 +++ drivers/net/ethernet/intel/iavf/iavf_main.c | 133 ++++++++++++++++-- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 14 +- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 11 +- .../net/ethernet/intel/iavf/iavf_virtchnl.c | 41 +++++- 5 files changed, 189 insertions(+), 26 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 43017aca25cc20..567d49caf7272e 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -4,6 +4,7 @@ #ifndef _IAVF_H_ #define _IAVF_H_ +#include #include #include #include @@ -33,6 +34,7 @@ #include #include #include +#include #include "iavf_type.h" #include @@ -264,10 +266,13 @@ struct iavf_adapter { spinlock_t mac_vlan_list_lock; char misc_vector_name[IFNAMSIZ + 9]; u32 num_active_queues; + u32 num_xdp_tx_queues; u32 num_req_queues; + struct bpf_prog *xdp_prog; /* TX */ struct iavf_ring *tx_rings; + struct iavf_ring *xdp_rings; u32 tx_timeout_count; u32 tx_desc_count; @@ -510,6 +515,17 @@ static inline void iavf_change_state(struct iavf_adapter *adapter, iavf_state_str(adapter->state)); } +/** + * iavf_adapter_xdp_active - Determine if XDP program is loaded + * @adapter: board private structure + * + * Returns true if XDP program is loaded on a given adapter. 
+ **/ +static inline bool iavf_adapter_xdp_active(struct iavf_adapter *adapter) +{ + return !!READ_ONCE(adapter->xdp_prog); +} + int iavf_up(struct iavf_adapter *adapter); void iavf_down(struct iavf_adapter *adapter); int iavf_process_config(struct iavf_adapter *adapter); diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 2fff901227e78c..f7c6549086ae0b 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -466,12 +466,15 @@ iavf_map_vector_to_rxq(struct iavf_adapter *adapter, int v_idx, int r_idx) * @adapter: board private structure * @v_idx: interrupt number * @t_idx: queue number + * @xdpq: set to true if Tx queue is XDP Tx queue **/ static void -iavf_map_vector_to_txq(struct iavf_adapter *adapter, int v_idx, int t_idx) +iavf_map_vector_to_txq(struct iavf_adapter *adapter, int v_idx, int t_idx, + bool xdpq) { + struct iavf_ring *tx_ring = xdpq ? &adapter->xdp_rings[t_idx] + : &adapter->tx_rings[t_idx]; struct iavf_q_vector *q_vector = &adapter->q_vectors[v_idx]; - struct iavf_ring *tx_ring = &adapter->tx_rings[t_idx]; struct iavf_hw *hw = &adapter->hw; tx_ring->q_vector = q_vector; @@ -507,7 +510,9 @@ static void iavf_map_rings_to_vectors(struct iavf_adapter *adapter) for (; ridx < rings_remaining; ridx++) { iavf_map_vector_to_rxq(adapter, vidx, ridx); - iavf_map_vector_to_txq(adapter, vidx, ridx); + iavf_map_vector_to_txq(adapter, vidx, ridx, false); + if (iavf_adapter_xdp_active(adapter)) + iavf_map_vector_to_txq(adapter, vidx, ridx, true); /* In the case where we have more queues than vectors, continue * round-robin on vectors until all queues are mapped. @@ -697,10 +702,13 @@ static void iavf_free_misc_irq(struct iavf_adapter *adapter) static void iavf_configure_tx(struct iavf_adapter *adapter) { struct iavf_hw *hw = &adapter->hw; - int i; + int i, j; - for (i = 0; i < adapter->num_active_queues; i++) - adapter->tx_rings[i].tail = hw->hw_addr + IAVF_QTX_TAIL1(i); + for (i = 0, j = 0; i < adapter->num_active_queues; i++, j++) + adapter->tx_rings[i].tail = hw->hw_addr + IAVF_QTX_TAIL1(j); + + for (i = 0; i < adapter->num_xdp_tx_queues; i++, j++) + adapter->xdp_rings[i].tail = hw->hw_addr + IAVF_QTX_TAIL1(j); } /** @@ -714,8 +722,22 @@ void iavf_configure_rx_ring(struct iavf_adapter *adapter, struct iavf_ring *rx_ring) { u32 queue_idx = rx_ring->queue_index; + int err; rx_ring->tail = adapter->hw.hw_addr + IAVF_QRX_TAIL1(queue_idx); + + if (!xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) + err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, + rx_ring->queue_index, + rx_ring->q_vector->napi.napi_id); + + err = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, MEM_TYPE_PAGE_POOL, + rx_ring->pool); + if (err) + netdev_err(adapter->netdev, "Could not register XDP memory model for RX queue %u, error: %d\n", + queue_idx, err); + + RCU_INIT_POINTER(rx_ring->xdp_prog, adapter->xdp_prog); iavf_alloc_rx_pages(rx_ring); } @@ -1447,6 +1469,19 @@ iavf_acquire_msix_vectors(struct iavf_adapter *adapter, int vectors) return 0; } +/** + * iavf_free_xdp_queues - Free memory for XDP rings + * @adapter: board private structure to update + * + * Free all of the memory associated with XDP queues. 
+ **/ +static void iavf_free_xdp_queues(struct iavf_adapter *adapter) +{ + adapter->num_xdp_tx_queues = 0; + kfree(adapter->xdp_rings); + adapter->xdp_rings = NULL; +} + /** * iavf_free_queues - Free memory for all rings * @adapter: board private structure to initialize @@ -1455,13 +1490,12 @@ iavf_acquire_msix_vectors(struct iavf_adapter *adapter, int vectors) **/ static void iavf_free_queues(struct iavf_adapter *adapter) { - if (!adapter->vsi_res) - return; adapter->num_active_queues = 0; kfree(adapter->tx_rings); adapter->tx_rings = NULL; kfree(adapter->rx_rings); adapter->rx_rings = NULL; + iavf_free_xdp_queues(adapter); } /** @@ -1530,6 +1564,20 @@ static void iavf_set_tx_queue_vlan_tag_loc(struct iavf_adapter *adapter, tx_ring->flags |= IAVF_TXR_FLAGS_VLAN_TAG_LOC_L2TAG2; } +/** + * iavf_set_xdp_queue_vlan_tag_loc - set location for VLAN tag on XDP ring + * @adapter: board private structure + * + * Variation of iavf_set_queue_vlan_tag_loc, which configures XDP rings only. + */ +static void iavf_set_xdp_queue_vlan_tag_loc(struct iavf_adapter *adapter) +{ + int i; + + for (i = 0; i < adapter->num_xdp_tx_queues; i++) + iavf_set_tx_queue_vlan_tag_loc(adapter, &adapter->xdp_rings[i]); +} + /** * iavf_set_queue_vlan_tag_loc - set location for VLAN tag offload * @adapter: board private structure @@ -1547,6 +1595,8 @@ void iavf_set_queue_vlan_tag_loc(struct iavf_adapter *adapter) iavf_set_rx_queue_vlan_tag_loc(adapter, &adapter->rx_rings[i]); iavf_set_tx_queue_vlan_tag_loc(adapter, &adapter->tx_rings[i]); } + + iavf_set_xdp_queue_vlan_tag_loc(adapter); } /** @@ -1577,9 +1627,12 @@ static void iavf_init_rx_ring(struct iavf_adapter *adapter, * * Init all basic pointers and flags in a newly allocated Tx ring. **/ -static void iavf_init_tx_ring(struct iavf_adapter *adapter, int ring_index) +static void iavf_init_tx_ring(struct iavf_adapter *adapter, + int ring_index, + bool xdp_ring) { - struct iavf_ring *tx_ring = &adapter->tx_rings[ring_index]; + struct iavf_ring *tx_ring = xdp_ring ? &adapter->xdp_rings[ring_index] + : &adapter->tx_rings[ring_index]; tx_ring->vsi = &adapter->vsi; tx_ring->queue_index = ring_index; @@ -1594,6 +1647,40 @@ static void iavf_init_tx_ring(struct iavf_adapter *adapter, int ring_index) tx_ring->flags |= IAVF_TXR_FLAGS_WB_ON_ITR; u64_stats_init(&tx_ring->sq_stats.syncp); + + if (xdp_ring) { + tx_ring->queue_index += adapter->num_active_queues; + tx_ring->flags |= IAVF_TXRX_FLAGS_XDP; + } +} + +/** + * iavf_alloc_xdp_queues - Allocate memory for XDP rings + * @adapter: board private structure to initialize + * + * Variation of iavf_alloc_queues(), which configures XDP queues only. + **/ +static int iavf_alloc_xdp_queues(struct iavf_adapter *adapter, u32 num_active_queues) +{ + int i; + + adapter->num_xdp_tx_queues = iavf_adapter_xdp_active(adapter) ? 
+ num_active_queues : 0; + if (!adapter->num_xdp_tx_queues) + return 0; + + adapter->xdp_rings = kcalloc(adapter->num_xdp_tx_queues, + sizeof(struct iavf_ring), GFP_KERNEL); + if (!adapter->xdp_rings) + return -ENOMEM; + + /* Setup extra XDP Tx queues if there are any */ + for (i = 0; i < adapter->num_xdp_tx_queues; i++) { + iavf_init_tx_ring(adapter, i, true); + adapter->rx_rings[i].xdp_ring = &adapter->xdp_rings[i]; + } + + return 0; } /** @@ -1636,10 +1723,13 @@ static int iavf_alloc_queues(struct iavf_adapter *adapter) adapter->num_active_queues = num_active_queues; for (i = 0; i < num_active_queues; i++) { - iavf_init_tx_ring(adapter, i); + iavf_init_tx_ring(adapter, i, false); iavf_init_rx_ring(adapter, i); } + if (iavf_alloc_xdp_queues(adapter, num_active_queues)) + goto err_out; + iavf_set_queue_vlan_tag_loc(adapter); return 0; @@ -3368,6 +3458,10 @@ void iavf_free_all_tx_resources(struct iavf_adapter *adapter) for (i = 0; i < adapter->num_active_queues; i++) if (adapter->tx_rings[i].desc) iavf_free_tx_resources(&adapter->tx_rings[i]); + + for (i = 0; i < adapter->num_xdp_tx_queues; i++) + if (adapter->xdp_rings[i].desc) + iavf_free_tx_resources(&adapter->xdp_rings[i]); } /** @@ -3382,11 +3476,13 @@ void iavf_free_all_tx_resources(struct iavf_adapter *adapter) **/ static int iavf_setup_all_tx_resources(struct iavf_adapter *adapter) { + struct iavf_ring *ring; int i, err = 0; for (i = 0; i < adapter->num_active_queues; i++) { - adapter->tx_rings[i].count = adapter->tx_desc_count; - err = iavf_setup_tx_descriptors(&adapter->tx_rings[i]); + ring = &adapter->tx_rings[i]; + ring->count = adapter->tx_desc_count; + err = iavf_setup_tx_descriptors(ring); if (!err) continue; dev_err(&adapter->pdev->dev, @@ -3394,6 +3490,17 @@ static int iavf_setup_all_tx_resources(struct iavf_adapter *adapter) break; } + for (i = 0; i < adapter->num_xdp_tx_queues; i++) { + ring = &adapter->xdp_rings[i]; + ring->count = adapter->tx_desc_count; + err = iavf_setup_tx_descriptors(ring); + if (!err) + continue; + dev_err(&adapter->pdev->dev, + "Allocation for XDP Queue %u failed\n", i); + break; + } + return err; } diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index 06060fa685e913..2e7cf2dc014780 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -32,6 +32,8 @@ static void iavf_unmap_and_free_tx_resource(struct iavf_ring *ring, if (tx_buffer->skb) { if (tx_buffer->tx_flags & IAVF_TX_FLAGS_FD_SB) kfree(tx_buffer->raw_buf); + else if (ring->flags & IAVF_TXRX_FLAGS_XDP) + page_frag_free(tx_buffer->raw_buf); else dev_kfree_skb_any(tx_buffer->skb); if (dma_unmap_len(tx_buffer, len)) @@ -82,7 +84,8 @@ void iavf_clean_tx_ring(struct iavf_ring *tx_ring) return; /* cleanup Tx queue statistics */ - netdev_tx_reset_queue(txring_txq(tx_ring)); + if (!(tx_ring->flags & IAVF_TXRX_FLAGS_XDP)) + netdev_tx_reset_queue(txring_txq(tx_ring)); } /** @@ -311,8 +314,9 @@ static bool iavf_clean_tx_irq(struct iavf_vsi *vsi, } /* notify netdev of completed buffers */ - netdev_tx_completed_queue(txring_txq(tx_ring), - stats.packets, stats.bytes); + if (!(tx_ring->flags & IAVF_TXRX_FLAGS_XDP)) + netdev_tx_completed_queue(txring_txq(tx_ring), + stats.packets, stats.bytes); #define TX_WAKE_THRESHOLD ((s16)(DESC_NEEDED * 2)) if (unlikely(stats.packets && netif_carrier_ok(tx_ring->netdev) && @@ -743,6 +747,10 @@ void iavf_free_rx_resources(struct iavf_ring *rx_ring) kfree(rx_ring->rx_pages); rx_ring->rx_pages = NULL; + /* This 
also unregisters memory model */ + if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) + xdp_rxq_info_unreg(&rx_ring->xdp_rxq); + libie_rx_page_pool_destroy(rx_ring->pool, &rx_ring->rq_stats); rx_ring->dev = dev; diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index 764b0ada0e6833..d3a62fdceb558a 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -246,14 +246,13 @@ struct iavf_ring { u16 flags; #define IAVF_TXR_FLAGS_WB_ON_ITR BIT(0) #define IAVF_TXRX_FLAGS_ARM_WB BIT(1) -/* BIT(2) is free */ +#define IAVF_TXRX_FLAGS_XDP BIT(2) #define IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1 BIT(3) #define IAVF_TXR_FLAGS_VLAN_TAG_LOC_L2TAG2 BIT(4) #define IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2 BIT(5) - struct iavf_vsi *vsi; /* Backreference to associated VSI */ - struct iavf_q_vector *q_vector; /* Backreference to associated vector */ - + struct bpf_prog __rcu *xdp_prog; + struct iavf_ring *xdp_ring; struct sk_buff *skb; /* When iavf_clean_rx_ring_irq() must * return before it sees the EOP for * the current packet, we save that skb @@ -269,11 +268,15 @@ struct iavf_ring { struct libie_rq_stats rq_stats; }; + struct iavf_vsi *vsi; /* Backreference to associated VSI */ + struct iavf_q_vector *q_vector; /* Backreference to associated vector */ + int prev_pkt_ctr; /* For stall detection */ unsigned int size; /* length of descriptor ring in bytes */ dma_addr_t dma; /* physical address of ring */ struct rcu_head rcu; /* to avoid race on free */ + struct xdp_rxq_info xdp_rxq; } ____cacheline_internodealigned_in_smp; #define IAVF_ITR_ADAPTIVE_MIN_INC 0x0002 diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 18c27ae2d2ee54..060fe36954e621 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -398,17 +398,30 @@ int iavf_get_vf_vlan_v2_caps(struct iavf_adapter *adapter) * @adapter: adapter structure * @queue_index: index of queue pair in the adapter structure * @max_frame: maximal frame size supported by the adapter + * @xdp_pair: true if the queue pair is assigned to XDP queues * * Fill virtchannel queue pair configuration structure * with data for the Rx and Tx queues of a given index. + * To handle XDP queues, only Tx part of vqpi structure is filled + * with data. Because of virtchnl protocol can operate on queue pairs only, + * associate each extra Tx queue with an empty Rx queue + * (with zero length). 
**/ static void iavf_set_qp_config_info(struct virtchnl_queue_pair_info *vqpi, struct iavf_adapter *adapter, - int queue_index, int max_frame) + int queue_index, int max_frame, + bool xdp_pair) { - struct iavf_ring *txq = &adapter->tx_rings[queue_index]; struct iavf_ring *rxq = &adapter->rx_rings[queue_index]; + struct iavf_ring *txq; + int xdpq_idx; + if (xdp_pair) { + xdpq_idx = queue_index - adapter->num_xdp_tx_queues; + txq = &adapter->xdp_rings[xdpq_idx]; + } else { + txq = &adapter->tx_rings[queue_index]; + } vqpi->txq.vsi_id = adapter->vsi_res->vsi_id; vqpi->txq.queue_id = queue_index; vqpi->txq.ring_len = txq->count; @@ -416,6 +429,11 @@ static void iavf_set_qp_config_info(struct virtchnl_queue_pair_info *vqpi, vqpi->rxq.vsi_id = adapter->vsi_res->vsi_id; vqpi->rxq.queue_id = queue_index; + if (xdp_pair) { + vqpi->rxq.ring_len = 0; + return; + } + vqpi->rxq.ring_len = rxq->count; vqpi->rxq.dma_ring_addr = rxq->dma; vqpi->rxq.max_pkt_size = max_frame; @@ -446,6 +464,7 @@ int iavf_get_configure_queues_result(struct iavf_adapter *adapter, **/ void iavf_configure_selected_queues(struct iavf_adapter *adapter, u32 qp_mask) { + int pairs = adapter->num_active_queues + adapter->num_xdp_tx_queues; unsigned long num_qps_to_config, mask = qp_mask; u32 idx, max_frame = adapter->vf_res->max_mtu; struct virtchnl_vsi_queue_config_info *vqci; @@ -474,7 +493,13 @@ void iavf_configure_selected_queues(struct iavf_adapter *adapter, u32 qp_mask) * can fit info for 31 of them into the AQ buffer before it overflows. */ for_each_set_bit(idx, &mask, adapter->num_active_queues) { - iavf_set_qp_config_info(vqpi, adapter, idx, max_frame); + iavf_set_qp_config_info(vqpi, adapter, idx, max_frame, false); + vqpi++; + } + + /* Set configuration info for XDP Tx queues. */ + for_each_set_bit_from(idx, &mask, pairs) { + iavf_set_qp_config_info(vqpi, adapter, idx, max_frame, true); vqpi++; } @@ -492,7 +517,7 @@ void iavf_configure_selected_queues(struct iavf_adapter *adapter, u32 qp_mask) **/ void iavf_configure_queues(struct iavf_adapter *adapter) { - int pairs = adapter->num_active_queues; + int pairs = adapter->num_active_queues + adapter->num_xdp_tx_queues; u32 qpair_mask = BIT(pairs) - 1; iavf_configure_selected_queues(adapter, qpair_mask); @@ -592,7 +617,9 @@ void iavf_disable_selected_queues(struct iavf_adapter *adapter, u32 rx_queues, **/ void iavf_enable_queues(struct iavf_adapter *adapter) { - u32 num_tx_queues = adapter->num_active_queues; + u32 num_tx_queues = adapter->num_active_queues + + adapter->num_xdp_tx_queues; + u32 rx_queues = BIT(adapter->num_active_queues) - 1; u32 tx_queues = BIT(num_tx_queues) - 1; @@ -607,7 +634,9 @@ void iavf_enable_queues(struct iavf_adapter *adapter) **/ void iavf_disable_queues(struct iavf_adapter *adapter) { - u32 num_tx_queues = adapter->num_active_queues; + u32 num_tx_queues = adapter->num_active_queues + + adapter->num_xdp_tx_queues; + u32 rx_queues = BIT(adapter->num_active_queues) - 1; u32 tx_queues = BIT(num_tx_queues) - 1; From 8bda0df374ae69b01bc68eba12d75343ab197ad6 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 22 Feb 2023 13:00:48 +0100 Subject: [PATCH 21/32] iavf: don't hardcode DMA direction, headroom and buffer len on Rx Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 27 ++++++++++--------- .../net/ethernet/intel/iavf/iavf_virtchnl.c | 9 ++++--- drivers/net/ethernet/intel/libie/rx.c | 24 +++++++++++++++-- include/linux/net/intel/libie/rx.h | 15 ++++++----- 4 files changed, 51 insertions(+), 24 deletions(-) 
diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index 2e7cf2dc014780..6ccd343bf09c9e 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -868,6 +868,7 @@ static u32 __iavf_alloc_rx_pages(struct iavf_ring *rx_ring, u32 to_refill, struct page_pool *pool = rx_ring->pool; u32 ntu = rx_ring->next_to_use; union iavf_rx_desc *rx_desc; + u32 hr = pool->p.offset; /* do nothing if no valid netdev defined */ if (unlikely(!rx_ring->netdev || !to_refill)) @@ -889,7 +890,7 @@ static u32 __iavf_alloc_rx_pages(struct iavf_ring *rx_ring, u32 to_refill, /* Refresh the desc even if buffer_addrs didn't change * because each write-back erases this info. */ - rx_desc->read.pkt_addr = cpu_to_le64(dma + LIBIE_SKB_HEADROOM); + rx_desc->read.pkt_addr = cpu_to_le64(dma + hr); rx_desc++; ntu++; @@ -1033,35 +1034,36 @@ void iavf_process_skb_fields(struct iavf_ring *rx_ring, * iavf_add_rx_frag - Add contents of Rx buffer to sk_buff * @skb: sk_buff to place the data into * @page: page containing data to add + * @hr: headroom in front of the data * @size: packet length from rx_desc * * This function will add the data contained in page to the skb. * It will just attach the page as a frag to the skb. - * - * The function will then update the page offset. - **/ -static void iavf_add_rx_frag(struct sk_buff *skb, struct page *page, u32 size) + */ +static void iavf_add_rx_frag(struct sk_buff *skb, struct page *page, u32 hr, + u32 size) { - skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, - LIBIE_SKB_HEADROOM, size, LIBIE_RX_TRUESIZE); + skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, hr, size, + LIBIE_RX_TRUESIZE); } /** * iavf_build_skb - Build skb around an existing buffer * @page: Rx page to with the data + * @hr: headroom in front of the data * @size: size of the data * * This function builds an skb around an existing Rx buffer, taking care * to set up the skb correctly and avoid any memcpy overhead. 
*/ -static struct sk_buff *iavf_build_skb(struct page *page, u32 size) +static struct sk_buff *iavf_build_skb(struct page *page, u32 hr, u32 size) { struct sk_buff *skb; void *va; /* prefetch first cache line of first page */ va = page_address(page); - net_prefetch(va + LIBIE_SKB_HEADROOM); + net_prefetch(va + hr); /* build an skb around the page buffer */ skb = napi_build_skb(va, LIBIE_RX_TRUESIZE); @@ -1071,7 +1073,7 @@ static struct sk_buff *iavf_build_skb(struct page *page, u32 size) skb_mark_for_recycle(skb); /* update pointers within the skb to store the data */ - skb_reserve(skb, LIBIE_SKB_HEADROOM); + skb_reserve(skb, hr); __skb_put(skb, size); return skb; @@ -1114,6 +1116,7 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) struct sk_buff *skb = rx_ring->skb; u32 ntc = rx_ring->next_to_clean; u32 ring_size = rx_ring->count; + u32 hr = pool->p.offset; u32 cleaned_count = 0; while (likely(cleaned_count < budget)) { @@ -1170,9 +1173,9 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) /* retrieve a buffer from the ring */ if (skb) - iavf_add_rx_frag(skb, page, size); + iavf_add_rx_frag(skb, page, hr, size); else - skb = iavf_build_skb(page, size); + skb = iavf_build_skb(page, hr, size); /* exit if we failed to retrieve a buffer */ if (!skb) { diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index 060fe36954e621..d7d37e2e7ff4b3 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -409,10 +409,11 @@ int iavf_get_vf_vlan_v2_caps(struct iavf_adapter *adapter) **/ static void iavf_set_qp_config_info(struct virtchnl_queue_pair_info *vqpi, struct iavf_adapter *adapter, - int queue_index, int max_frame, + int queue_index, u32 max_frame, bool xdp_pair) { struct iavf_ring *rxq = &adapter->rx_rings[queue_index]; + const struct page_pool_params *pp = &rxq->pool->p; struct iavf_ring *txq; int xdpq_idx; @@ -434,10 +435,12 @@ static void iavf_set_qp_config_info(struct virtchnl_queue_pair_info *vqpi, return; } + max_frame = min_not_zero(max_frame, LIBIE_MAX_RX_FRM_LEN(pp->offset)); + vqpi->rxq.ring_len = rxq->count; vqpi->rxq.dma_ring_addr = rxq->dma; vqpi->rxq.max_pkt_size = max_frame; - vqpi->rxq.databuffer_size = LIBIE_RX_BUF_LEN; + vqpi->rxq.databuffer_size = pp->max_len; } /** @@ -471,8 +474,6 @@ void iavf_configure_selected_queues(struct iavf_adapter *adapter, u32 qp_mask) struct virtchnl_queue_pair_info *vqpi; size_t len; - max_frame = min_not_zero(max_frame, LIBIE_MAX_RX_FRM_LEN); - if (adapter->current_op != VIRTCHNL_OP_UNKNOWN) { /* bail because we already have a command pending */ dev_err(&adapter->pdev->dev, "Cannot configure queues, command %d pending\n", diff --git a/drivers/net/ethernet/intel/libie/rx.c b/drivers/net/ethernet/intel/libie/rx.c index 10ef8741326ad2..293c2cc19a0ec3 100644 --- a/drivers/net/ethernet/intel/libie/rx.c +++ b/drivers/net/ethernet/intel/libie/rx.c @@ -109,6 +109,25 @@ EXPORT_SYMBOL_NS_GPL(libie_rx_ptype_lut, LIBIE); /* Page Pool */ +/** + * libie_rx_sync_len - get the actual buffer size to be synced and passed to HW + * @dev: &net_device to calculate the size for + * @hr: headroom in front of each frame + * + * Returns the buffer size to pass it to HW and use for DMA synchronization + * for the MTU the @dev has. 
+ */ +static u32 libie_rx_sync_len(const struct net_device *dev, u32 hr) +{ + u32 len; + + len = READ_ONCE(dev->mtu) + LIBIE_RX_LL_LEN; + len = ALIGN(len, LIBIE_RX_BUF_LEN_ALIGN); + len = min(len, LIBIE_RX_BUF_LEN(hr)); + + return len; +} + /** * libie_rx_page_pool_create - create a PP with the default libie settings * @dev: &net_device which a PP will be created for @@ -119,6 +138,7 @@ EXPORT_SYMBOL_NS_GPL(libie_rx_ptype_lut, LIBIE); struct page_pool *libie_rx_page_pool_create(const struct net_device *dev, u32 size) { + u32 hr = LIBIE_SKB_HEADROOM; const struct page_pool_params pp = { .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_MAP_WEAK | PP_FLAG_DMA_SYNC_DEV, @@ -127,8 +147,8 @@ struct page_pool *libie_rx_page_pool_create(const struct net_device *dev, .nid = NUMA_NO_NODE, .dev = dev->dev.parent, .dma_dir = DMA_FROM_DEVICE, - .max_len = LIBIE_RX_BUF_LEN, - .offset = LIBIE_SKB_HEADROOM, + .max_len = libie_rx_sync_len(dev, hr), + .offset = hr, }; static_assert((PP_FLAG_DMA_MAP | PP_FLAG_DMA_MAP_WEAK) == diff --git a/include/linux/net/intel/libie/rx.h b/include/linux/net/intel/libie/rx.h index f063a30f182ecb..ca601d8e4b8f8f 100644 --- a/include/linux/net/intel/libie/rx.h +++ b/include/linux/net/intel/libie/rx.h @@ -132,6 +132,8 @@ static inline void libie_skb_set_hash(struct sk_buff *skb, u32 hash, /* Space reserved in front of each frame */ #define LIBIE_SKB_HEADROOM (NET_SKB_PAD + NET_IP_ALIGN) +/* Maximum headroom to calculate max MTU below */ +#define LIBIE_MAX_HEADROOM LIBIE_SKB_HEADROOM /* Link layer / L2 overhead: Ethernet, 2 VLAN tags (C + S), FCS */ #define LIBIE_RX_LL_LEN (ETH_HLEN + 2 * VLAN_HLEN + ETH_FCS_LEN) @@ -143,22 +145,23 @@ static inline void libie_skb_set_hash(struct sk_buff *skb, u32 hash, /* HW-writeable space in one buffer: truesize - headroom/tailroom, * HW-aligned */ -#define __LIBIE_RX_BUF_LEN \ - ALIGN_DOWN(SKB_MAX_ORDER(LIBIE_SKB_HEADROOM, LIBIE_RX_PAGE_ORDER), \ +#define __LIBIE_RX_BUF_LEN(hr) \ + ALIGN_DOWN(SKB_MAX_ORDER(hr, LIBIE_RX_PAGE_ORDER), \ LIBIE_RX_BUF_LEN_ALIGN) /* The largest size for a single descriptor as per HW */ #define LIBIE_MAX_RX_BUF_LEN 9728U /* "True" HW-writeable space: minimum from SW and HW values */ -#define LIBIE_RX_BUF_LEN min_t(u32, __LIBIE_RX_BUF_LEN, \ +#define LIBIE_RX_BUF_LEN(hr) min_t(u32, __LIBIE_RX_BUF_LEN(hr), \ LIBIE_MAX_RX_BUF_LEN) /* The maximum frame size as per HW (S/G) */ #define __LIBIE_MAX_RX_FRM_LEN 16382U /* ATST, HW can chain up to 5 Rx descriptors */ -#define LIBIE_MAX_RX_FRM_LEN min_t(u32, __LIBIE_MAX_RX_FRM_LEN, \ - LIBIE_RX_BUF_LEN * 5) +#define LIBIE_MAX_RX_FRM_LEN(hr) \ + min_t(u32, __LIBIE_MAX_RX_FRM_LEN, LIBIE_RX_BUF_LEN(hr) * 5) /* Maximum frame size minus LL overhead */ -#define LIBIE_MAX_MTU (LIBIE_MAX_RX_FRM_LEN - LIBIE_RX_LL_LEN) +#define LIBIE_MAX_MTU (LIBIE_MAX_RX_FRM_LEN(LIBIE_MAX_HEADROOM) - \ + LIBIE_RX_LL_LEN) /* DMA mapping attributes for Rx buffers: no impl. sync + relaxed on Sparc */ #define LIBIE_RX_DMA_ATTR \ From 6b96d3cf6b42362d5e8b2f615847da3d443ec7b9 Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Fri, 2 Dec 2022 07:17:18 -0500 Subject: [PATCH 22/32] iavf: Handle XDP_SETUP_PROG command in .ndo_bpf Add .ndo_bpf function to handle XDP_SETUP_PROG command. In order to avoid synchronization issues, implement functions dedicated to re-initialize only those parts of the interface which are really necessary to setup the XDP program. 
Such an approach is much lighter than performing a full reset of the driver and thanks to it we can immediately know the result of traffic initialization comparing to the reset task which triggers some asynchronous events (e.g. link speed negotiation). Signed-off-by: Michal Kubiak Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/iavf/iavf_main.c | 361 ++++++++++++++++++++ include/linux/net/intel/libie/rx.h | 5 +- 2 files changed, 365 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index f7c6549086ae0b..cfb6d1bf45fdba 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -524,6 +524,53 @@ static void iavf_map_rings_to_vectors(struct iavf_adapter *adapter) adapter->aq_required |= IAVF_FLAG_AQ_MAP_VECTORS; } +/** + * iavf_unmap_rings_from_vectors - Clear existing mapping for queues and vectors + * @adapter: board private structure + * + **/ +static void iavf_unmap_rings_from_vectors(struct iavf_adapter *adapter) +{ + struct iavf_ring *rx_ring, *tx_ring; + struct iavf_q_vector *q_vector; + int num_q_vectors, i; + + num_q_vectors = adapter->num_msix_vectors - NONQ_VECS; + for (i = 0; i < num_q_vectors; i++) { + q_vector = &adapter->q_vectors[i]; + q_vector->tx.ring = NULL; + q_vector->tx.count = 0; + q_vector->tx.next_update = 0; + q_vector->tx.target_itr = 0; + q_vector->tx.current_itr = 0; + q_vector->num_ringpairs = 0; + + q_vector->rx.ring = NULL; + q_vector->rx.count = 0; + q_vector->rx.next_update = 0; + q_vector->rx.target_itr = 0; + q_vector->rx.current_itr = 0; + q_vector->ring_mask = 0; + } + + for (i = 0; i < adapter->num_active_queues; i++) { + rx_ring = &adapter->rx_rings[i]; + tx_ring = &adapter->tx_rings[i]; + + rx_ring->q_vector = NULL; + rx_ring->next = NULL; + tx_ring->q_vector = NULL; + tx_ring->next = NULL; + } + + for (i = 0; i < adapter->num_xdp_tx_queues; i++) { + tx_ring = &adapter->xdp_rings[i]; + + tx_ring->q_vector = NULL; + tx_ring->next = NULL; + } +} + /** * iavf_irq_affinity_notify - Callback for affinity changes * @notify: context as to what irq was changed @@ -2972,6 +3019,36 @@ static void iavf_watchdog_task(struct work_struct *work) HZ * 2); } +/** + * iavf_xchg_xdp_prog - set new prog and get an old one + * @adapter: board private structure + * @prog: new XDP program + * + * Returns pointer to the old XDP program. + * adapter->xdp_prog is not used in packet processing, so it can be + * safely set kinda like a flag before resource re-configuration (reset) + **/ +static struct bpf_prog *iavf_xchg_xdp_prog(struct iavf_adapter *adapter, + struct bpf_prog *prog) +{ + return xchg(&adapter->xdp_prog, prog); +} + +/** + * iavf_free_xdp_prog - Release XDP program, if present + * @adapter: board private structure + * + * Should be used when adapter is being removed. 
+ **/ +static void iavf_free_xdp_prog(struct iavf_adapter *adapter) +{ + struct bpf_prog *old_xdp_prog; + + old_xdp_prog = iavf_xchg_xdp_prog(adapter, NULL); + if (old_xdp_prog) + bpf_prog_put(old_xdp_prog); +} + /** * iavf_disable_vf - disable VF * @adapter: board private structure @@ -3003,6 +3080,8 @@ static void iavf_disable_vf(struct iavf_adapter *adapter) iavf_free_all_rx_resources(adapter); } + iavf_free_xdp_prog(adapter); + spin_lock_bh(&adapter->mac_vlan_list_lock); /* Delete all of the filters */ @@ -4793,6 +4872,285 @@ static netdev_features_t iavf_fix_features(struct net_device *netdev, return iavf_fix_netdev_vlan_features(adapter, features); } +/** + * iavf_copy_xdp_prog_to_rings - update XDP prog references in rings + * @adapter: board private structure + * + * If program change also requires XDP resources reconfiguration, + * schedule a reset instead + **/ +static void iavf_copy_xdp_prog_to_rings(const struct iavf_adapter *adapter) +{ + for (u32 i = 0; i < adapter->num_active_queues; i++) + rcu_assign_pointer(adapter->rx_rings[i].xdp_prog, + adapter->xdp_prog); + + /* No queue changes are needed, but running RX processing must finish */ + synchronize_net(); +} + +/** + * iavf_assign_bpf_prog - Assign a given BPF program to adapter + * @adapter: board private structure + * @prog: BPF program to be assigned to adapter + * + * Returns 0 on success, negative on failure + **/ +static void iavf_assign_bpf_prog(struct iavf_adapter *adapter, + struct bpf_prog *prog) +{ + struct bpf_prog *old_prog; + + old_prog = iavf_xchg_xdp_prog(adapter, prog); + if (old_prog) + bpf_prog_put(old_prog); +} + +#define IAVF_XDP_MSG_TIMEOUT_MS 300 +#define IAVF_XDP_LOCK_TIMEOUT_MS 5000 + +/** + * iavf_xdp_pause_if - Pause a working network interface for XDP setup + * @adapter: board private structure + * + * Returns 0 on success, negative on failure + **/ +static int iavf_xdp_pause_if(struct iavf_adapter *adapter) +{ + int err; + + netif_carrier_off(adapter->netdev); + netif_tx_disable(adapter->netdev); + + iavf_napi_disable_all(adapter); + + iavf_irq_disable(adapter); + iavf_misc_irq_disable(adapter); + + iavf_disable_queues(adapter); + err = iavf_get_queue_disable_result(adapter, IAVF_XDP_MSG_TIMEOUT_MS); + if (err) { + dev_err(&adapter->pdev->dev, "cannot disable queues for XDP setup, error: %d\n", err); + return err; + } + + iavf_free_all_tx_resources(adapter); + iavf_free_all_rx_resources(adapter); + + iavf_free_traffic_irqs(adapter); + + return 0; +} + +/** + * iavf_xdp_resume_if - Resume a stopped network interface after XDP setup + * @adapter: board private structure + * + * Returns 0 on success, negative on failure + **/ +static int iavf_xdp_resume_if(struct iavf_adapter *adapter) +{ + int err; + + err = iavf_setup_all_tx_resources(adapter); + if (err) { + dev_err(&adapter->pdev->dev, "cannot setup Tx resources, error: %d\n", err); + goto err_setup_tx_resources; + } + + err = iavf_setup_all_rx_resources(adapter); + if (err) { + dev_err(&adapter->pdev->dev, "cannot setup Rx resources, error: %d\n", err); + goto err_setup_rx_resources; + } + + iavf_request_traffic_irqs(adapter, adapter->netdev->name); + + iavf_configure_tx(adapter); + iavf_configure_rx(adapter); + + iavf_configure_queues(adapter); + err = iavf_get_configure_queues_result(adapter, + IAVF_XDP_MSG_TIMEOUT_MS); + if (err) { + dev_err(&adapter->pdev->dev, "cannot configure queues in PF, error: %d\n", err); + goto err_setup_rx_resources; + } + + iavf_enable_queues(adapter); + err = iavf_get_queue_enable_result(adapter, 
IAVF_XDP_MSG_TIMEOUT_MS); + if (err) { + dev_err(&adapter->pdev->dev, "cannot enable queues in PF, error: %d\n", err); + goto err_setup_rx_resources; + } + + iavf_napi_enable_all(adapter); + iavf_irq_enable(adapter, true); + + netif_tx_start_all_queues(adapter->netdev); + netif_carrier_on(adapter->netdev); + + return 0; + +err_setup_rx_resources: + iavf_free_all_rx_resources(adapter); +err_setup_tx_resources: + iavf_free_all_tx_resources(adapter); + + return err; +} + +/** + * iavf_prepare_xdp_rings - add XDP program to adapter and setup XDP rings + * to handle that program. + * @adapter: board private structure + * @prog: XDP program + **/ +static int iavf_prepare_xdp_rings(struct iavf_adapter *adapter, + struct bpf_prog *prog) +{ + int err; + + iavf_unmap_rings_from_vectors(adapter); + iavf_free_xdp_queues(adapter); + + iavf_assign_bpf_prog(adapter, prog); + + err = iavf_alloc_xdp_queues(adapter, adapter->num_active_queues); + if (err) { + dev_err(&adapter->pdev->dev, + "cannot allocate memory for queues, error: %d\n", err); + goto err_alloc_queues; + } + + iavf_set_xdp_queue_vlan_tag_loc(adapter); + + iavf_map_rings_to_vectors(adapter); + + return 0; + +err_alloc_queues: + return err; +} + +/** + * iavf_xdp_can_create_queues - check if queue number is appropriate for XDP + * @adapter: board private structure + * @extack: netlink extended ack + **/ +static bool iavf_xdp_can_create_queues(struct iavf_adapter *adapter, + struct netlink_ext_ack *extack) +{ + u32 max_qp_num = adapter->vsi_res->num_queue_pairs; + u32 num_active_queues = adapter->num_active_queues; + + if (num_active_queues * 2 <= max_qp_num) + return true; + + netdev_warn(adapter->netdev, + "Current number of queue pairs (%u) set on adapter is too high to enable XDP, please configure queue number through ethtool to be no bigger than %u", + num_active_queues, max_qp_num); + + NL_SET_ERR_MSG_MOD(extack, + "XDP cannot be enabled due to configured queue number being too large, please check dmesg for more info"); + + return false; +} + +/** + * iavf_setup_xdp - handle xdp program change + * @adapter: board private structure + * @prog: XDP program + * @extack: netlink extended ack + **/ +static int iavf_setup_xdp(struct iavf_adapter *adapter, struct bpf_prog *prog, + struct netlink_ext_ack *extack) +{ + u32 frame_size = READ_ONCE(adapter->netdev->mtu) + LIBIE_RX_LL_LEN; + bool needs_reconfig = !!prog != iavf_adapter_xdp_active(adapter); + bool was_running = netif_running(adapter->netdev); + int err; + + if (prog && frame_size > LIBIE_RX_BUF_LEN(LIBIE_XDP_HEADROOM)) { + NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP"); + return -EOPNOTSUPP; + } + + if (needs_reconfig) { + if (iavf_lock_timeout(&adapter->crit_lock, + IAVF_XDP_LOCK_TIMEOUT_MS)) { + dev_err(&adapter->pdev->dev, + "failed to acquire crit_lock in %s\n", __FUNCTION__); + return -EBUSY; + } + err = iavf_process_pending_pf_msg(adapter, + IAVF_XDP_LOCK_TIMEOUT_MS); + if (err) + goto err_pause_resume_if; + + if (!iavf_xdp_can_create_queues(adapter, extack)) { + err = -EOPNOTSUPP; + goto err_pause_resume_if; + } + + if (was_running) { + err = iavf_xdp_pause_if(adapter); + if (err) { + dev_err(&adapter->pdev->dev, + "cannot pause the interface to setup XDP, error: %d\n", + err); + goto err_pause_resume_if; + } + } + + err = iavf_prepare_xdp_rings(adapter, prog); + if (err) { + dev_err(&adapter->pdev->dev, + "cannot prepare rings to support XDP, error: %d\n", + err); + goto err_pause_resume_if; + } + + if (was_running) { + err = iavf_xdp_resume_if(adapter); + if (err) { 
+ dev_err(&adapter->pdev->dev, + "cannot resume the interface after XDP setup, error: %d\n", + err); + goto err_pause_resume_if; + } + } + mutex_unlock(&adapter->crit_lock); + } else { + iavf_assign_bpf_prog(adapter, prog); + iavf_copy_xdp_prog_to_rings(adapter); + } + + return 0; + +err_pause_resume_if: + mutex_unlock(&adapter->crit_lock); + + return err; +} + +/** + * iavf_xdp - XDP command handler + * @netdev: netdevice + * @xdp: XDP command + */ +static int iavf_xdp(struct net_device *netdev, struct netdev_bpf *xdp) +{ + struct iavf_adapter *adapter = netdev_priv(netdev); + + switch (xdp->command) { + case XDP_SETUP_PROG: + return iavf_setup_xdp(adapter, xdp->prog, xdp->extack); + default: + return -EINVAL; + } +} + static const struct net_device_ops iavf_netdev_ops = { .ndo_open = iavf_open, .ndo_stop = iavf_close, @@ -4808,6 +5166,7 @@ static const struct net_device_ops iavf_netdev_ops = { .ndo_fix_features = iavf_fix_features, .ndo_set_features = iavf_set_features, .ndo_setup_tc = iavf_setup_tc, + .ndo_bpf = iavf_xdp, }; /** @@ -5242,6 +5601,8 @@ static void iavf_remove(struct pci_dev *pdev) iavf_free_all_rx_resources(adapter); iavf_free_misc_irq(adapter); + iavf_free_xdp_prog(adapter); + iavf_reset_interrupt_capability(adapter); iavf_free_q_vectors(adapter); diff --git a/include/linux/net/intel/libie/rx.h b/include/linux/net/intel/libie/rx.h index ca601d8e4b8f8f..72dd85f789fce1 100644 --- a/include/linux/net/intel/libie/rx.h +++ b/include/linux/net/intel/libie/rx.h @@ -6,6 +6,7 @@ #include #include +#include /* O(1) converting i40e/ice/iavf's 8/10-bit hardware packet type to a parsed * bitfield struct. @@ -132,8 +133,10 @@ static inline void libie_skb_set_hash(struct sk_buff *skb, u32 hash, /* Space reserved in front of each frame */ #define LIBIE_SKB_HEADROOM (NET_SKB_PAD + NET_IP_ALIGN) +#define LIBIE_XDP_HEADROOM (max(XDP_PACKET_HEADROOM, NET_SKB_PAD) + \ + NET_IP_ALIGN) /* Maximum headroom to calculate max MTU below */ -#define LIBIE_MAX_HEADROOM LIBIE_SKB_HEADROOM +#define LIBIE_MAX_HEADROOM LIBIE_XDP_HEADROOM /* Link layer / L2 overhead: Ethernet, 2 VLAN tags (C + S), FCS */ #define LIBIE_RX_LL_LEN (ETH_HLEN + 2 * VLAN_HLEN + ETH_FCS_LEN) From 01d958490f861fc28703e01c487b5c15be1cd3a6 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Tue, 29 Nov 2022 17:25:21 +0100 Subject: [PATCH 23/32] iavf: Add XDP_PASS and XDP_DROP support Implement basic XDP program setup, refactor data path to use xdp_buff, implement XDP_PASS and XDP_DROP actions. Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/iavf/iavf.h | 1 + drivers/net/ethernet/intel/iavf/iavf_txrx.c | 113 ++++++++++++++++---- drivers/net/ethernet/intel/libie/rx.c | 7 +- include/linux/net/intel/libie/rx.h | 2 +- 4 files changed, 101 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 567d49caf7272e..171a9b8efd12ce 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index 6ccd343bf09c9e..8a8175d177fae7 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -2,6 +2,8 @@ /* Copyright(c) 2013 - 2018 Intel Corporation.
*/ #include +#include +#include #include #include @@ -761,6 +763,17 @@ void iavf_free_rx_resources(struct iavf_ring *rx_ring) } } +/** + * iavf_is_xdp_enabled - Check if XDP is enabled on the RX ring + * @rx_ring: Rx descriptor ring + * + * Returns true, if the ring has been configured for XDP. + */ +static bool iavf_is_xdp_enabled(const struct iavf_ring *rx_ring) +{ + return !!rcu_access_pointer(rx_ring->xdp_prog); +} + /** * iavf_setup_rx_descriptors - Allocate Rx descriptors * @rx_ring: Rx descriptor ring (for a specific queue) to setup @@ -792,7 +805,8 @@ int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring) goto err; } - pool = libie_rx_page_pool_create(rx_ring->netdev, rx_ring->count); + pool = libie_rx_page_pool_create(rx_ring->netdev, rx_ring->count, + iavf_is_xdp_enabled(rx_ring)); if (IS_ERR(pool)) { ret = PTR_ERR(pool); goto err_free_dma; @@ -1049,32 +1063,32 @@ static void iavf_add_rx_frag(struct sk_buff *skb, struct page *page, u32 hr, /** * iavf_build_skb - Build skb around an existing buffer - * @page: Rx page to with the data - * @hr: headroom in front of the data - * @size: size of the data + * @xdp: initialized XDP buffer * * This function builds an skb around an existing Rx buffer, taking care * to set up the skb correctly and avoid any memcpy overhead. */ -static struct sk_buff *iavf_build_skb(struct page *page, u32 hr, u32 size) +static struct sk_buff *iavf_build_skb(const struct xdp_buff *xdp) { struct sk_buff *skb; - void *va; + u32 metasize; - /* prefetch first cache line of first page */ - va = page_address(page); - net_prefetch(va + hr); + net_prefetch(xdp->data_meta); /* build an skb around the page buffer */ - skb = napi_build_skb(va, LIBIE_RX_TRUESIZE); + skb = napi_build_skb(xdp->data_hard_start, LIBIE_RX_TRUESIZE); if (unlikely(!skb)) return NULL; skb_mark_for_recycle(skb); /* update pointers within the skb to store the data */ - skb_reserve(skb, hr); - __skb_put(skb, size); + skb_reserve(skb, xdp->data - xdp->data_hard_start); + __skb_put(skb, xdp->data_end - xdp->data); + + metasize = xdp->data - xdp->data_meta; + if (metasize) + skb_metadata_set(skb, metasize); return skb; } @@ -1095,6 +1109,39 @@ static bool iavf_is_non_eop(u64 qword, struct libie_rq_onstack_stats *stats) return true; } +/** + * iavf_run_xdp - Run XDP program and perform resulting action + * @rx_ring: RX descriptor ring to transact packets on + * @xdp: a prepared XDP buffer + * @xdp_prog: an XDP program assigned to the interface + * + * Returns resulting XDP action. 
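+ * XDP_PASS and XDP_DROP are returned as is; any unrecognized action is + * reported via bpf_warn_invalid_xdp_action() and, like XDP_ABORTED, is + * recorded as an exception and converted to XDP_DROP.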
+ **/ +static unsigned int +iavf_run_xdp(struct iavf_ring *rx_ring, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog) +{ + unsigned int xdp_act; + + xdp_act = bpf_prog_run_xdp(xdp_prog, xdp); + + switch (xdp_act) { + case XDP_PASS: + case XDP_DROP: + break; + default: + bpf_warn_invalid_xdp_action(rx_ring->netdev, xdp_prog, xdp_act); + + fallthrough; + case XDP_ABORTED: + trace_xdp_exception(rx_ring->netdev, xdp_prog, xdp_act); + + return XDP_DROP; + } + + return xdp_act; +} + /** * iavf_clean_rx_irq - Clean completed descriptors from Rx ring - bounce buf * @rx_ring: rx descriptor ring to transact packets on @@ -1116,13 +1163,19 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) struct sk_buff *skb = rx_ring->skb; u32 ntc = rx_ring->next_to_clean; u32 ring_size = rx_ring->count; + struct bpf_prog *xdp_prog; u32 hr = pool->p.offset; u32 cleaned_count = 0; + unsigned int xdp_act; + struct xdp_buff xdp; + + xdp_prog = rcu_dereference(rx_ring->xdp_prog); + xdp_init_buff(&xdp, PAGE_SIZE, &rx_ring->xdp_rxq); while (likely(cleaned_count < budget)) { union iavf_rx_desc *rx_desc; + u32 size, put_size; struct page *page; - unsigned int size; u16 vlan_tag = 0; u64 qword; @@ -1166,32 +1219,52 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) */ if (unlikely(!size)) { page_pool_recycle_direct(pool, page); - goto skip_data; + goto no_skb; } page_pool_dma_sync_for_cpu(pool, page, size); + put_size = size; + + xdp_prepare_buff(&xdp, page_address(page), hr, size, true); + if (!xdp_prog) + goto construct_skb; + xdp_act = iavf_run_xdp(rx_ring, &xdp, xdp_prog); + put_size = max_t(u32, xdp.data_end - xdp.data_hard_start - hr, + put_size); + + if (xdp_act != XDP_PASS) { + page_pool_put_page(pool, page, put_size, true); + + stats.bytes += size; + stats.packets++; + + skb = NULL; + goto no_skb; + } + +construct_skb: /* retrieve a buffer from the ring */ if (skb) iavf_add_rx_frag(skb, page, hr, size); else - skb = iavf_build_skb(page, hr, size); + skb = iavf_build_skb(&xdp); /* exit if we failed to retrieve a buffer */ if (!skb) { - page_pool_put_page(pool, page, size, true); + page_pool_put_page(pool, page, put_size, true); libie_stats_inc_one(&rx_ring->rq_stats, build_skb_fail); break; } -skip_data: +no_skb: cleaned_count++; to_refill++; if (unlikely(++ntc == ring_size)) ntc = 0; - if (iavf_is_non_eop(qword, &stats)) + if (iavf_is_non_eop(qword, &stats) || !skb) continue; prefetch(rx_desc); @@ -1395,6 +1468,8 @@ int iavf_napi_poll(struct napi_struct *napi, int budget) */ budget_per_ring = max(budget/q_vector->num_ringpairs, 1); + rcu_read_lock(); + iavf_for_each_ring(ring, q_vector->rx) { int cleaned = iavf_clean_rx_irq(ring, budget_per_ring); @@ -1404,6 +1479,8 @@ int iavf_napi_poll(struct napi_struct *napi, int budget) clean_complete = false; } + rcu_read_unlock(); + /* If work not completed, return budget and polling will return */ if (!clean_complete) { int cpu_id = smp_processor_id(); diff --git a/drivers/net/ethernet/intel/libie/rx.c b/drivers/net/ethernet/intel/libie/rx.c index 293c2cc19a0ec3..65475bf6d2d27f 100644 --- a/drivers/net/ethernet/intel/libie/rx.c +++ b/drivers/net/ethernet/intel/libie/rx.c @@ -132,13 +132,14 @@ static u32 libie_rx_sync_len(const struct net_device *dev, u32 hr) * libie_rx_page_pool_create - create a PP with the default libie settings * @dev: &net_device which a PP will be created for * @size: size of the PP, usually simply Rx queue len + * @xdp: whether XDP is enabled on the device * * Returns &page_pool on success, casted -errno on 
failure. */ struct page_pool *libie_rx_page_pool_create(const struct net_device *dev, - u32 size) + u32 size, bool xdp) { - u32 hr = LIBIE_SKB_HEADROOM; + u32 hr = xdp ? LIBIE_XDP_HEADROOM : LIBIE_SKB_HEADROOM; const struct page_pool_params pp = { .flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_MAP_WEAK | PP_FLAG_DMA_SYNC_DEV, @@ -146,7 +147,7 @@ struct page_pool *libie_rx_page_pool_create(const struct net_device *dev, .pool_size = size, .nid = NUMA_NO_NODE, .dev = dev->dev.parent, - .dma_dir = DMA_FROM_DEVICE, + .dma_dir = xdp ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE, .max_len = libie_rx_sync_len(dev, hr), .offset = hr, }; diff --git a/include/linux/net/intel/libie/rx.h b/include/linux/net/intel/libie/rx.h index 72dd85f789fce1..d73efd721ffc6f 100644 --- a/include/linux/net/intel/libie/rx.h +++ b/include/linux/net/intel/libie/rx.h @@ -173,7 +173,7 @@ static inline void libie_skb_set_hash(struct sk_buff *skb, u32 hash, struct libie_rq_stats; struct page_pool *libie_rx_page_pool_create(const struct net_device *dev, - u32 size); + u32 size, bool xdp); void libie_rx_page_pool_destroy(struct page_pool *pool, struct libie_rq_stats *stats); From ddc7ac8719c8fe1f859d3d73f5eb3ffd07572db3 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Wed, 30 Nov 2022 14:42:21 +0100 Subject: [PATCH 24/32] iavf: Implement XDP_TX action Implement sending the packet from an XDP ring. XDP path functions are separate from the general TX routines, because this allows to simplify and therefore speedup the process. It also makes code more friendly to future XDP-specific optimizations. Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 174 ++++++++++++++++++-- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 56 +++++++ 2 files changed, 219 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index 8a8175d177fae7..db2c29bccaefd3 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -11,6 +11,10 @@ #include "iavf_trace.h" #include "iavf_prototype.h" +static int iavf_xmit_xdp_buff(const struct xdp_buff *xdp, + struct iavf_ring *xdp_ring, + bool map); + static inline __le64 build_ctob(u32 td_cmd, u32 td_offset, unsigned int size, u32 td_tag) { @@ -81,6 +85,8 @@ void iavf_clean_tx_ring(struct iavf_ring *tx_ring) tx_ring->next_to_use = 0; tx_ring->next_to_clean = 0; + tx_ring->next_rs = IAVF_RING_QUARTER(tx_ring) - 1; + tx_ring->next_dd = IAVF_RING_QUARTER(tx_ring) - 1; if (!tx_ring->netdev) return; @@ -296,9 +302,7 @@ static bool iavf_clean_tx_irq(struct iavf_vsi *vsi, i += tx_ring->count; tx_ring->next_to_clean = i; - libie_sq_napi_stats_add(&tx_ring->sq_stats, &stats); - tx_ring->q_vector->tx.total_bytes += stats.bytes; - tx_ring->q_vector->tx.total_packets += stats.packets; + iavf_update_tx_ring_stats(tx_ring, &stats); if (tx_ring->flags & IAVF_TXR_FLAGS_WB_ON_ITR) { /* check to see if there are < 4 descriptors @@ -688,6 +692,8 @@ int iavf_setup_tx_descriptors(struct iavf_ring *tx_ring) tx_ring->next_to_use = 0; tx_ring->next_to_clean = 0; + tx_ring->next_rs = IAVF_RING_QUARTER(tx_ring) - 1; + tx_ring->next_dd = IAVF_RING_QUARTER(tx_ring) - 1; tx_ring->prev_pkt_ctr = -1; for (j = 0; j < tx_ring->count; j++) { tx_desc = IAVF_TX_DESC(tx_ring, j); @@ -1114,12 +1120,15 @@ static bool iavf_is_non_eop(u64 qword, struct libie_rq_onstack_stats *stats) * @rx_ring: RX descriptor ring to transact packets on * @xdp: a prepared XDP buffer * @xdp_prog: an XDP program assigned to the 
interface + * @xdp_ring: XDP TX queue assigned to the RX ring + * @rxq_xdp_act: Logical OR of flags of XDP actions that require finalization * * Returns resulting XDP action. **/ static unsigned int iavf_run_xdp(struct iavf_ring *rx_ring, struct xdp_buff *xdp, - struct bpf_prog *xdp_prog) + struct bpf_prog *xdp_prog, struct iavf_ring *xdp_ring, + u32 *rxq_xdp_act) { unsigned int xdp_act; @@ -1129,11 +1138,18 @@ iavf_run_xdp(struct iavf_ring *rx_ring, struct xdp_buff *xdp, case XDP_PASS: case XDP_DROP: break; + case XDP_TX: + if (unlikely(iavf_xmit_xdp_buff(xdp, xdp_ring, false))) + goto xdp_err; + + *rxq_xdp_act |= IAVF_RXQ_XDP_ACT_FINALIZE_TX; + break; default: bpf_warn_invalid_xdp_action(rx_ring->netdev, xdp_prog, xdp_act); fallthrough; case XDP_ABORTED: +xdp_err: trace_xdp_exception(rx_ring->netdev, xdp_prog, xdp_act); return XDP_DROP; @@ -1163,13 +1179,17 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) struct sk_buff *skb = rx_ring->skb; u32 ntc = rx_ring->next_to_clean; u32 ring_size = rx_ring->count; + struct iavf_ring *xdp_ring; struct bpf_prog *xdp_prog; u32 hr = pool->p.offset; u32 cleaned_count = 0; unsigned int xdp_act; struct xdp_buff xdp; + u32 rxq_xdp_act = 0; xdp_prog = rcu_dereference(rx_ring->xdp_prog); + if (xdp_prog) + xdp_ring = rx_ring->xdp_ring; xdp_init_buff(&xdp, PAGE_SIZE, &rx_ring->xdp_rxq); while (likely(cleaned_count < budget)) { @@ -1229,19 +1249,21 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) if (!xdp_prog) goto construct_skb; - xdp_act = iavf_run_xdp(rx_ring, &xdp, xdp_prog); + xdp_act = iavf_run_xdp(rx_ring, &xdp, xdp_prog, xdp_ring, + &rxq_xdp_act); put_size = max_t(u32, xdp.data_end - xdp.data_hard_start - hr, put_size); - if (xdp_act != XDP_PASS) { + if (xdp_act == XDP_PASS) + goto construct_skb; + else if (xdp_act == XDP_DROP) page_pool_put_page(pool, page, put_size, true); - stats.bytes += size; - stats.packets++; + stats.bytes += size; + stats.packets++; - skb = NULL; - goto no_skb; - } + skb = NULL; + goto no_skb; construct_skb: /* retrieve a buffer from the ring */ @@ -1306,6 +1328,8 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) rx_ring->next_to_clean = ntc; rx_ring->skb = skb; + iavf_finalize_xdp_rx(xdp_ring, rxq_xdp_act); + if (to_refill >= IAVF_RX_BUFFER_WRITE) { to_refill = __iavf_alloc_rx_pages(rx_ring, to_refill, gfp); /* guarantee a trip back through this routine if there was @@ -2248,3 +2272,131 @@ netdev_tx_t iavf_xmit_frame(struct sk_buff *skb, struct net_device *netdev) return iavf_xmit_frame_ring(skb, tx_ring); } + +/** + * iavf_clean_xdp_irq - Reclaim a batch of TX resources from completed XDP_TX + * @xdp_ring: XDP Tx ring + * + * Returns number of cleaned descriptors. 
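+ * + * Completions are tracked one ring quarter at a time: the DD bit is checked + * on the descriptor at ->next_dd and, once it is set, a full quarter of + * buffers is unmapped and freed before ->next_dd is advanced by that amount.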
+ */ +static u32 iavf_clean_xdp_irq(struct iavf_ring *xdp_ring) +{ + u32 batch_sz = IAVF_RING_QUARTER(xdp_ring); + struct libie_sq_onstack_stats stats = { }; + struct iavf_tx_desc *next_dd_desc; + u32 ntc = xdp_ring->next_to_clean; + u32 next_dd = xdp_ring->next_dd; + u32 i; + + next_dd_desc = IAVF_TX_DESC(xdp_ring, next_dd); + if (!(next_dd_desc->cmd_type_offset_bsz & + cpu_to_le64(IAVF_TX_DESC_DTYPE_DESC_DONE))) + return 0; + + for (i = 0; i < batch_sz; i++) { + struct iavf_tx_buffer *tx_buf = &xdp_ring->tx_bi[ntc]; + + stats.bytes += tx_buf->bytecount; + /* normally tx_buf->gso_segs was taken but at this point + * it's always 1 for us + */ + stats.packets++; + + dma_unmap_single(xdp_ring->dev, dma_unmap_addr(tx_buf, dma), + dma_unmap_len(tx_buf, len), DMA_TO_DEVICE); + dma_unmap_len_set(tx_buf, len, 0); + page_frag_free(tx_buf->raw_buf); + tx_buf->raw_buf = NULL; + + ntc++; + if (ntc >= xdp_ring->count) + ntc = 0; + } + + next_dd_desc->cmd_type_offset_bsz = 0; + xdp_ring->next_dd = xdp_ring->next_dd + batch_sz; + if (xdp_ring->next_dd > xdp_ring->count) + xdp_ring->next_dd = batch_sz - 1; + + xdp_ring->next_to_clean = ntc; + iavf_update_tx_ring_stats(xdp_ring, &stats); + + return i; +} + +/** + * iavf_xmit_xdp_buff - submit single buffer to XDP ring for transmission + * @xdp: XDP buffer pointer + * @xdp_ring: XDP ring for transmission + * @map: whether to map the buffer + */ +static int iavf_xmit_xdp_buff(const struct xdp_buff *xdp, + struct iavf_ring *xdp_ring, + bool map) +{ + u32 batch_sz = IAVF_RING_QUARTER(xdp_ring); + u32 size = xdp->data_end - xdp->data; + u32 ntu = xdp_ring->next_to_use; + struct iavf_tx_buffer *tx_buff; + struct iavf_tx_desc *tx_desc; + void *data = xdp->data; + dma_addr_t dma; + u32 free; + + free = IAVF_DESC_UNUSED(xdp_ring); + if (unlikely(free < batch_sz)) + free += iavf_clean_xdp_irq(xdp_ring); + if (unlikely(!free)) { + libie_stats_inc_one(&xdp_ring->sq_stats, busy); + return -EBUSY; + } + + if (map) { + dma = dma_map_single(xdp_ring->dev, data, size, DMA_TO_DEVICE); + if (dma_mapping_error(xdp_ring->dev, dma)) + return -ENOMEM; + } else { + struct page *page = virt_to_page(data); + u32 hr = data - xdp->data_hard_start; + + dma = page_pool_get_dma_addr(page) + hr; + dma_sync_single_for_device(xdp_ring->dev, dma, size, + DMA_BIDIRECTIONAL); + } + + tx_buff = &xdp_ring->tx_bi[ntu]; + tx_buff->bytecount = size; + tx_buff->gso_segs = 1; + /* TODO: set type to XDP_TX or XDP_XMIT depending on @map and assign + * either ->data_hard_start (which is pointer to xdp_frame) or @page + * above. 
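+ * For now @map tells the two cases apart: frames passed with @map set are + * dma_map_single()'d, while XDP_TX buffers reuse the page pool DMA address + * and are only synced for device.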
+ */ + tx_buff->raw_buf = data; + + /* record length, and DMA address */ + dma_unmap_len_set(tx_buff, len, size); + dma_unmap_addr_set(tx_buff, dma, dma); + + tx_desc = IAVF_TX_DESC(xdp_ring, ntu); + tx_desc->buffer_addr = cpu_to_le64(dma); + tx_desc->cmd_type_offset_bsz = build_ctob(IAVF_TX_DESC_CMD_EOP, 0, + size, 0); + + ntu++; + if (ntu > xdp_ring->next_rs) { + tx_desc = IAVF_TX_DESC(xdp_ring, xdp_ring->next_rs); + tx_desc->cmd_type_offset_bsz |= + cpu_to_le64(IAVF_TX_DESC_CMD_RS << + IAVF_TXD_QW1_CMD_SHIFT); + xdp_ring->next_rs += batch_sz; + } + + if (ntu == xdp_ring->count) { + ntu = 0; + xdp_ring->next_rs = batch_sz - 1; + } + + xdp_ring->next_to_use = ntu; + + return 0; +} diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index d3a62fdceb558a..ea63dd905d8ca1 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -243,6 +243,10 @@ struct iavf_ring { u16 next_to_use; u16 next_to_clean; + /* used for XDP rings only */ + u16 next_dd; + u16 next_rs; + u16 flags; #define IAVF_TXR_FLAGS_WB_ON_ITR BIT(0) #define IAVF_TXRX_FLAGS_ARM_WB BIT(1) @@ -279,6 +283,8 @@ struct iavf_ring { struct xdp_rxq_info xdp_rxq; } ____cacheline_internodealigned_in_smp; +#define IAVF_RING_QUARTER(R) ((R)->count >> 2) + #define IAVF_ITR_ADAPTIVE_MIN_INC 0x0002 #define IAVF_ITR_ADAPTIVE_MIN_USECS 0x0002 #define IAVF_ITR_ADAPTIVE_MAX_USECS 0x007e @@ -384,4 +390,54 @@ static inline struct netdev_queue *txring_txq(const struct iavf_ring *ring) { return netdev_get_tx_queue(ring->netdev, ring->queue_index); } + +/** + * iavf_xdp_ring_update_tail - Updates the XDP Tx ring tail register + * @xdp_ring: XDP Tx ring + * + * Notify hardware the new descriptor is ready to be transmitted + */ +static inline void iavf_xdp_ring_update_tail(const struct iavf_ring *xdp_ring) +{ + /* Force memory writes to complete before letting h/w + * know there are new descriptors to fetch. + */ + wmb(); + writel_relaxed(xdp_ring->next_to_use, xdp_ring->tail); +} + +/** + * iavf_update_tx_ring_stats - Update TX ring stats after transmit completes + * @tx_ring: TX descriptor ring + * @tc: TODO + * @total_pkts: Number of packets transmitted since the last update + * @total_bytes: Number of bytes transmitted since the last update + **/ +static inline void +__iavf_update_tx_ring_stats(struct iavf_ring *tx_ring, + struct iavf_ring_container *tc, + const struct libie_sq_onstack_stats *stats) +{ + libie_sq_napi_stats_add(&tx_ring->sq_stats, stats); + tc->total_bytes += stats->bytes; + tc->total_packets += stats->packets; +} + +#define iavf_update_tx_ring_stats(r, s) \ + __iavf_update_tx_ring_stats(r, &(r)->q_vector->tx, s) + +#define IAVF_RXQ_XDP_ACT_FINALIZE_TX BIT(0) + +/** + * iavf_finalize_xdp_rx - Finalize XDP actions once per RX ring clean + * @xdp_ring: XDP TX queue assigned to a given RX ring + * @rxq_xdp_act: Logical OR of flags of XDP actions that require finalization + **/ +static inline void iavf_finalize_xdp_rx(struct iavf_ring *xdp_ring, + u32 rxq_xdp_act) +{ + if (rxq_xdp_act & IAVF_RXQ_XDP_ACT_FINALIZE_TX) + iavf_xdp_ring_update_tail(xdp_ring); +} + #endif /* _IAVF_TXRX_H_ */ From c08c702edf8fcbbcbf796e9aa8d064d67a17441c Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Wed, 30 Nov 2022 09:10:04 +0100 Subject: [PATCH 25/32] iavf: Implement XDP redirect path Implement XDP_REDIRECT action and ndo_xdp_xmit() callback. For now, packets redirected from CPU with index greater than XDP queues number are just dropped with an error. 
This is a rather common situation, especially when VF is configured to run on host and will be addressed in later patches. Patch also refactors RX XDP handling to use switch statement due to increased number of actions. Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/iavf/iavf_main.c | 1 + drivers/net/ethernet/intel/iavf/iavf_txrx.c | 47 +++++++++++++++++++++ drivers/net/ethernet/intel/iavf/iavf_txrx.h | 6 +++ 3 files changed, 54 insertions(+) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index cfb6d1bf45fdba..6a17037456b555 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -5167,6 +5167,7 @@ static const struct net_device_ops iavf_netdev_ops = { .ndo_set_features = iavf_set_features, .ndo_setup_tc = iavf_setup_tc, .ndo_bpf = iavf_xdp, + .ndo_xdp_xmit = iavf_xdp_xmit, }; /** diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index db2c29bccaefd3..2d9bbadfd2e591 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -1144,6 +1144,12 @@ iavf_run_xdp(struct iavf_ring *rx_ring, struct xdp_buff *xdp, *rxq_xdp_act |= IAVF_RXQ_XDP_ACT_FINALIZE_TX; break; + case XDP_REDIRECT: + if (unlikely(xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog))) + goto xdp_err; + + *rxq_xdp_act |= IAVF_RXQ_XDP_ACT_FINALIZE_REDIR; + break; default: bpf_warn_invalid_xdp_action(rx_ring->netdev, xdp_prog, xdp_act); @@ -2400,3 +2406,44 @@ static int iavf_xmit_xdp_buff(const struct xdp_buff *xdp, return 0; } + +int iavf_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, + u32 flags) +{ + struct iavf_adapter *adapter = netdev_priv(dev); + struct iavf_ring *xdp_ring; + u32 queue_index, nxmit = 0; + int err = 0; + + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) + return -EINVAL; + + if (unlikely(adapter->state == __IAVF_DOWN)) + return -ENETDOWN; + + if (!iavf_adapter_xdp_active(adapter)) + return -ENXIO; + + queue_index = smp_processor_id(); + if (queue_index >= adapter->num_active_queues) + return -ENXIO; + + xdp_ring = &adapter->xdp_rings[queue_index]; + + for (u32 i = 0; i < n; i++) { + struct xdp_frame *xdpf = frames[i]; + struct xdp_buff xdp; + + xdp_convert_frame_to_buff(xdpf, &xdp); + err = iavf_xmit_xdp_buff(&xdp, xdp_ring, true); + if (unlikely(err)) + break; + + nxmit++; + } + + if (flags & XDP_XMIT_FLUSH) + iavf_xdp_ring_update_tail(xdp_ring); + + return nxmit; +} diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index ea63dd905d8ca1..3cb6b712a99500 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -321,6 +321,9 @@ void iavf_detect_recover_hung(struct iavf_vsi *vsi); int __iavf_maybe_stop_tx(struct iavf_ring *tx_ring, int size); bool __iavf_chk_linearize(struct sk_buff *skb); +int iavf_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, + u32 flags); + /** * iavf_xmit_descriptor_count - calculate number of Tx descriptors needed * @skb: send buffer @@ -427,6 +430,7 @@ __iavf_update_tx_ring_stats(struct iavf_ring *tx_ring, __iavf_update_tx_ring_stats(r, &(r)->q_vector->tx, s) #define IAVF_RXQ_XDP_ACT_FINALIZE_TX BIT(0) +#define IAVF_RXQ_XDP_ACT_FINALIZE_REDIR BIT(1) /** * iavf_finalize_xdp_rx - Finalize XDP actions once per RX ring clean @@ -436,6 +440,8 @@ __iavf_update_tx_ring_stats(struct iavf_ring *tx_ring, static inline void 
iavf_finalize_xdp_rx(struct iavf_ring *xdp_ring, u32 rxq_xdp_act) { + if (rxq_xdp_act & IAVF_RXQ_XDP_ACT_FINALIZE_REDIR) + xdp_do_flush(); if (rxq_xdp_act & IAVF_RXQ_XDP_ACT_FINALIZE_TX) iavf_xdp_ring_update_tail(xdp_ring); } From 442650e3fd29b67929a6f2327b7f375dcbca2790 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Wed, 30 Nov 2022 15:12:21 +0100 Subject: [PATCH 26/32] iavf: Allow XDP TxQ sharing Port of commit 22bf877 ("ice: introduce XDP_TX fallback path"). The patch handles the case, when queue number is not sufficient for the current number of CPUs. To avoid dropping some packets redirected from other interfaces, XDP TxQs are allowed to be shared between CPUs, which imposes the locking requirement. Static key approach has little to none performance penalties when sharing is not needed. This mechanism is much more applicable when dealing with VFs. In fact, maximum number of queue pairs that ice PF can give to an iavf VF is 16, which allows up to 8 XDP TxQs, so without XDP TxQ sharing, some redirected packets can be dropped even on a 10 CPU system. Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/iavf/iavf_main.c | 27 ++++++++++++++++ drivers/net/ethernet/intel/iavf/iavf_txrx.c | 34 ++++++++++++++++++--- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 15 +++++++-- 3 files changed, 70 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 6a17037456b555..4de89810c12f38 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -1698,9 +1698,31 @@ static void iavf_init_tx_ring(struct iavf_adapter *adapter, if (xdp_ring) { tx_ring->queue_index += adapter->num_active_queues; tx_ring->flags |= IAVF_TXRX_FLAGS_XDP; + spin_lock_init(&tx_ring->tx_lock); } } +/** + * iavf_xdp_cfg_tx_sharing - Enable XDP TxQ sharing, if needed + * @adapter: board private structure + * + * If there is more CPUs than rings, sharing XDP TxQ allows us + * to handle XDP_REDIRECT from other interfaces. 
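+ * + * Sharing is gated by the iavf_xdp_locking_key static key: once the key is + * enabled, XDP Tx ring accesses are serialized with the per-ring tx_lock.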
+ **/ +static void iavf_xdp_cfg_tx_sharing(struct iavf_adapter *adapter) +{ + u32 num_active_queues = adapter->num_active_queues; + u32 num_cpus = num_online_cpus(); + + if (!iavf_adapter_xdp_active(adapter) || num_active_queues >= num_cpus) + return; + + netdev_warn(adapter->netdev, + "System has %u CPUs, but only %u XDP queues can be configured, entering XDP TxQ sharing mode, performance is decreased\n", + num_cpus, num_active_queues); + static_branch_inc(&iavf_xdp_locking_key); +} + /** * iavf_alloc_xdp_queues - Allocate memory for XDP rings * @adapter: board private structure to initialize @@ -1727,6 +1749,8 @@ static int iavf_alloc_xdp_queues(struct iavf_adapter *adapter, u32 num_active_qu adapter->rx_rings[i].xdp_ring = &adapter->xdp_rings[i]; } + iavf_xdp_cfg_tx_sharing(adapter); + return 0; } @@ -3534,6 +3558,9 @@ void iavf_free_all_tx_resources(struct iavf_adapter *adapter) if (!adapter->tx_rings) return; + if (static_key_enabled(&iavf_xdp_locking_key)) + static_branch_dec(&iavf_xdp_locking_key); + for (i = 0; i < adapter->num_active_queues; i++) if (adapter->tx_rings[i].desc) iavf_free_tx_resources(&adapter->tx_rings[i]); diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index 2d9bbadfd2e591..4ae512a1bccca2 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -11,9 +11,10 @@ #include "iavf_trace.h" #include "iavf_prototype.h" -static int iavf_xmit_xdp_buff(const struct xdp_buff *xdp, - struct iavf_ring *xdp_ring, - bool map); +DEFINE_STATIC_KEY_FALSE(iavf_xdp_locking_key); + +static bool iavf_xdp_xmit_back(const struct xdp_buff *buff, + struct iavf_ring *xdp_ring); static inline __le64 build_ctob(u32 td_cmd, u32 td_offset, unsigned int size, u32 td_tag) @@ -1139,7 +1140,7 @@ iavf_run_xdp(struct iavf_ring *rx_ring, struct xdp_buff *xdp, case XDP_DROP: break; case XDP_TX: - if (unlikely(iavf_xmit_xdp_buff(xdp, xdp_ring, false))) + if (unlikely(!iavf_xdp_xmit_back(xdp, xdp_ring))) goto xdp_err; *rxq_xdp_act |= IAVF_RXQ_XDP_ACT_FINALIZE_TX; @@ -2407,6 +2408,23 @@ static int iavf_xmit_xdp_buff(const struct xdp_buff *xdp, return 0; } +static bool iavf_xdp_xmit_back(const struct xdp_buff *buff, + struct iavf_ring *xdp_ring) +{ + bool ret; + + if (static_branch_unlikely(&iavf_xdp_locking_key)) + spin_lock(&xdp_ring->tx_lock); + + /* TODO: improve XDP_TX by batching */ + ret = !iavf_xmit_xdp_buff(buff, xdp_ring, false); + + if (static_branch_unlikely(&iavf_xdp_locking_key)) + spin_unlock(&xdp_ring->tx_lock); + + return ret; +} + int iavf_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags) { @@ -2425,11 +2443,16 @@ int iavf_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, return -ENXIO; queue_index = smp_processor_id(); + if (static_branch_unlikely(&iavf_xdp_locking_key)) + queue_index %= adapter->num_xdp_tx_queues; if (queue_index >= adapter->num_active_queues) return -ENXIO; xdp_ring = &adapter->xdp_rings[queue_index]; + if (static_branch_unlikely(&iavf_xdp_locking_key)) + spin_lock(&xdp_ring->tx_lock); + for (u32 i = 0; i < n; i++) { struct xdp_frame *xdpf = frames[i]; struct xdp_buff xdp; @@ -2445,5 +2468,8 @@ int iavf_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, if (flags & XDP_XMIT_FLUSH) iavf_xdp_ring_update_tail(xdp_ring); + if (static_branch_unlikely(&iavf_xdp_locking_key)) + spin_unlock(&xdp_ring->tx_lock); + return nxmit; } diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h 
b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index 3cb6b712a99500..8bfd7b07df46fd 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -257,7 +257,8 @@ struct iavf_ring { struct bpf_prog __rcu *xdp_prog; struct iavf_ring *xdp_ring; - struct sk_buff *skb; /* When iavf_clean_rx_ring_irq() must + union { + struct sk_buff *skb; /* When iavf_clean_rx_ring_irq() must * return before it sees the EOP for * the current packet, we save that skb * here and resume receiving this @@ -265,6 +266,8 @@ struct iavf_ring { * iavf_clean_rx_ring_irq() is called * for this ring. */ + spinlock_t tx_lock; /* Protect XDP TX ring, when shared */ + }; /* stats structs */ union { @@ -321,6 +324,8 @@ void iavf_detect_recover_hung(struct iavf_vsi *vsi); int __iavf_maybe_stop_tx(struct iavf_ring *tx_ring, int size); bool __iavf_chk_linearize(struct sk_buff *skb); +DECLARE_STATIC_KEY_FALSE(iavf_xdp_locking_key); + int iavf_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags); @@ -442,8 +447,14 @@ static inline void iavf_finalize_xdp_rx(struct iavf_ring *xdp_ring, { if (rxq_xdp_act & IAVF_RXQ_XDP_ACT_FINALIZE_REDIR) xdp_do_flush(); - if (rxq_xdp_act & IAVF_RXQ_XDP_ACT_FINALIZE_TX) + + if (rxq_xdp_act & IAVF_RXQ_XDP_ACT_FINALIZE_TX) { + if (static_branch_unlikely(&iavf_xdp_locking_key)) + spin_lock(&xdp_ring->tx_lock); iavf_xdp_ring_update_tail(xdp_ring); + if (static_branch_unlikely(&iavf_xdp_locking_key)) + spin_unlock(&xdp_ring->tx_lock); + } } #endif /* _IAVF_TXRX_H_ */ From 8074b51b10987e953b5d63a53cbe73940f19cb6f Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Wed, 30 Nov 2022 12:01:26 +0100 Subject: [PATCH 27/32] iavf: Add AF_XDP initialization Add necessary functions and data structures to support AF_XDP feature. Implement handling of 'XDP_SETUP_XSK_POOL' in .ndo_bpf(). Also, implement functions for selectively stopping only those queues which take part in XDP socket creation. 
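For illustration only (not part of the driver code): the per-queue zero-copy
bookkeeping introduced here boils down to a bitmap keyed by queue id, which the
patch keeps in adapter->af_xdp_zc_qps and flips with set_bit()/clear_bit(). A
minimal stand-alone C sketch of that idea, with made-up names, could look like:

#include <stdbool.h>
#include <stdio.h>

#define MAX_QPS 64	/* illustrative upper bound, not a driver constant */

struct zc_state {
	unsigned long long qps;	/* bit i set => queue pair i runs AF_XDP zero-copy */
};

/* Mirrors the "validate qid, then mark the pair as zero-copy" flow. */
static bool zc_pool_enable(struct zc_state *s, unsigned int qid,
			   unsigned int num_queues)
{
	if (qid >= num_queues || qid >= MAX_QPS)
		return false;
	s->qps |= 1ULL << qid;
	return true;
}

static void zc_pool_disable(struct zc_state *s, unsigned int qid)
{
	if (qid < MAX_QPS)
		s->qps &= ~(1ULL << qid);
}

int main(void)
{
	struct zc_state s = { 0 };

	printf("enable qid 3: %d\n", zc_pool_enable(&s, 3, 8));
	printf("qid 3 zero-copy: %d\n", !!(s.qps & (1ULL << 3)));
	zc_pool_disable(&s, 3);
	printf("qid 3 zero-copy: %d\n", !!(s.qps & (1ULL << 3)));
	return 0;
}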
Signed-off-by: Michal Kubiak --- drivers/net/ethernet/intel/iavf/Makefile | 3 +- drivers/net/ethernet/intel/iavf/iavf.h | 3 + drivers/net/ethernet/intel/iavf/iavf_main.c | 16 +- drivers/net/ethernet/intel/iavf/iavf_xsk.c | 365 ++++++++++++++++++++ drivers/net/ethernet/intel/iavf/iavf_xsk.h | 15 + 5 files changed, 400 insertions(+), 2 deletions(-) create mode 100644 drivers/net/ethernet/intel/iavf/iavf_xsk.c create mode 100644 drivers/net/ethernet/intel/iavf/iavf_xsk.h diff --git a/drivers/net/ethernet/intel/iavf/Makefile b/drivers/net/ethernet/intel/iavf/Makefile index 9c3e45c54d0133..19eb29005e7a06 100644 --- a/drivers/net/ethernet/intel/iavf/Makefile +++ b/drivers/net/ethernet/intel/iavf/Makefile @@ -13,4 +13,5 @@ obj-$(CONFIG_IAVF) += iavf.o iavf-objs := iavf_main.o iavf_ethtool.o iavf_virtchnl.o iavf_fdir.o \ iavf_adv_rss.o \ - iavf_txrx.o iavf_common.o iavf_adminq.o iavf_client.o + iavf_txrx.o iavf_common.o iavf_adminq.o iavf_client.o \ + iavf_xsk.o diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h index 171a9b8efd12ce..f522f5a6166b25 100644 --- a/drivers/net/ethernet/intel/iavf/iavf.h +++ b/drivers/net/ethernet/intel/iavf/iavf.h @@ -36,7 +36,9 @@ #include #include #include +#include +#include "iavf_xsk.h" #include "iavf_type.h" #include #include "iavf_txrx.h" @@ -270,6 +272,7 @@ struct iavf_adapter { u32 num_xdp_tx_queues; u32 num_req_queues; struct bpf_prog *xdp_prog; + unsigned long *af_xdp_zc_qps; /* TX */ struct iavf_ring *tx_rings; diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 4de89810c12f38..8a064e981efba9 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -2824,6 +2824,11 @@ static void iavf_init_config_adapter(struct iavf_adapter *adapter) set_bit(__IAVF_VSI_DOWN, adapter->vsi.state); rtnl_unlock(); + adapter->af_xdp_zc_qps = bitmap_zalloc(adapter->num_active_queues, + GFP_KERNEL); + if (!adapter->af_xdp_zc_qps) + goto err_zc_qps; + iavf_misc_irq_enable(adapter); wake_up(&adapter->down_waitqueue); @@ -2845,6 +2850,8 @@ static void iavf_init_config_adapter(struct iavf_adapter *adapter) return; err_mem: iavf_free_rss(adapter); +err_zc_qps: + bitmap_free(adapter->af_xdp_zc_qps); err_register: iavf_free_misc_irq(adapter); err_sw_init: @@ -3105,6 +3112,7 @@ static void iavf_disable_vf(struct iavf_adapter *adapter) } iavf_free_xdp_prog(adapter); + bitmap_free(adapter->af_xdp_zc_qps); spin_lock_bh(&adapter->mac_vlan_list_lock); @@ -5036,7 +5044,7 @@ static int iavf_xdp_resume_if(struct iavf_adapter *adapter) static int iavf_prepare_xdp_rings(struct iavf_adapter *adapter, struct bpf_prog *prog) { - int err; + int i, err; iavf_unmap_rings_from_vectors(adapter); iavf_free_xdp_queues(adapter); @@ -5054,6 +5062,9 @@ static int iavf_prepare_xdp_rings(struct iavf_adapter *adapter, iavf_map_rings_to_vectors(adapter); + for_each_set_bit(i, adapter->af_xdp_zc_qps, adapter->num_active_queues) + napi_schedule(&adapter->rx_rings[i].q_vector->napi); + return 0; err_alloc_queues: @@ -5173,6 +5184,9 @@ static int iavf_xdp(struct net_device *netdev, struct netdev_bpf *xdp) switch (xdp->command) { case XDP_SETUP_PROG: return iavf_setup_xdp(adapter, xdp->prog, xdp->extack); + case XDP_SETUP_XSK_POOL: + return iavf_xsk_pool_setup(adapter, xdp->xsk.pool, + xdp->xsk.queue_id); default: return -EINVAL; } diff --git a/drivers/net/ethernet/intel/iavf/iavf_xsk.c b/drivers/net/ethernet/intel/iavf/iavf_xsk.c new file mode 100644 index 
00000000000000..adf154840260c6 --- /dev/null +++ b/drivers/net/ethernet/intel/iavf/iavf_xsk.c @@ -0,0 +1,365 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright(c) 2022 Intel Corporation. */ + +#include +#include +#include "iavf.h" +#include "iavf_xsk.h" + +#define IAVF_PF_REQ_TIMEOUT_MS 300 +#define IAVF_CRIT_LOCK_WAIT_TIMEOUT_MS 1000 +#define IAVF_VC_MSG_TIMEOUT_MS 3000 + +/** + * iavf_max_xdp_queues_count - Returns the maximal number of XDP queues + * that can be created for current configuration + * of a given adapter. + * @adapter: adapter where XDP socket will be set up + */ +static u32 +iavf_max_xdp_queues_count(struct iavf_adapter *adapter) +{ + u32 max_qp_num = adapter->vsi_res->num_queue_pairs; + u32 num_active_queues = adapter->num_active_queues; + + return num_active_queues * 2 > max_qp_num ? max_qp_num / 2 : + num_active_queues; +} + +/** + * iavf_qp_reset_stats - Resets all stats for rings of given index + * @adapter: adapter that contains rings of interest + * @q_idx: ring index in array + */ +static void +iavf_qp_reset_stats(struct iavf_adapter *adapter, u16 q_idx) +{ + memset(&adapter->rx_rings[q_idx].stats, 0, + sizeof(adapter->rx_rings[q_idx].stats)); + memset(&adapter->tx_rings[q_idx].stats, 0, + sizeof(adapter->tx_rings[q_idx].stats)); + if (iavf_adapter_xdp_active(adapter)) + memset(&adapter->xdp_rings[q_idx].stats, 0, + sizeof(adapter->xdp_rings[q_idx].stats)); +} + +/** + * iavf_qp_clean_rings - Cleans all the rings of a given index + * @adapter: adapter that contains rings of interest + * @q_idx: ring index in array + */ +static void +iavf_qp_clean_rings(struct iavf_adapter *adapter, u16 q_idx) +{ + iavf_clean_tx_ring(&adapter->tx_rings[q_idx]); + if (iavf_adapter_xdp_active(adapter)) { + synchronize_rcu(); + iavf_clean_tx_ring(&adapter->xdp_rings[q_idx]); + } + iavf_clean_rx_ring(&adapter->rx_rings[q_idx]); +} + +/** + * iavf_qvec_toggle_napi - Enables/disables NAPI for a given q_vector + * @adapter: adapter that has netdev + * @q_vector: q_vector that has NAPI context + * @enable: true for enable, false for disable + */ +static void +iavf_qvec_toggle_napi(struct iavf_adapter *adapter, + struct iavf_q_vector *q_vector, bool enable) +{ + if (!adapter->vsi.netdev || !q_vector) + return; + + if (enable) + napi_enable(&q_vector->napi); + else + napi_disable(&q_vector->napi); +} + +/** + * iavf_qvec_dis_irq - Mask off queue interrupt generation on given ring + * @adapter: the adapter that contains queue vector being un-configured + * @q_vector: queue vector + */ +static void +iavf_qvec_dis_irq(struct iavf_adapter *adapter, struct iavf_q_vector *q_vector) +{ + int base = adapter->vsi.base_vector; + struct iavf_hw *hw = &adapter->hw; + u16 reg = q_vector->reg_idx; + + wr32(hw, IAVF_VFINT_DYN_CTLN1(reg), 0); + synchronize_irq(adapter->msix_entries[reg + base].vector); + iavf_flush(hw); +} + +/** + * iavf_qvec_ena_irq - Enable IRQ for given queue vector + * @adapter: the adapter that contains queue vector + * @q_vector: queue vector + */ +static void +iavf_qvec_ena_irq(struct iavf_adapter *adapter, struct iavf_q_vector *q_vector) +{ + struct iavf_hw *hw = &adapter->hw; + + if (adapter) + if (adapter->state == __IAVF_DOWN) + return; + + wr32(hw, IAVF_VFINT_DYN_CTLN1(q_vector->reg_idx), + IAVF_VFINT_DYN_CTLN1_INTENA_MASK | + IAVF_VFINT_DYN_CTLN1_ITR_INDX_MASK); + + iavf_flush(hw); +} + +/** + * iavf_cfg_qp_in_pf - Configure selected queue pairs in PF. 
+ * @adapter: adapter of interest + * @qp_mask: mask of queue pairs that shall be configured + * + * Returns 0 on success, negative on failure or timeout. + */ +static int +iavf_cfg_qp_in_pf(struct iavf_adapter *adapter, u32 qp_mask) +{ + iavf_configure_selected_queues(adapter, qp_mask); + return iavf_get_configure_queues_result(adapter, + IAVF_PF_REQ_TIMEOUT_MS); +} + +/** + * iavf_ena_queues_in_pf - Enable selected queues in PF. + * @adapter: adapter of interest + * @rxq_mask: mask of Rx queues that shall be enabled + * @txq_mask: mask of Tx queues that shall be enabled + * + * Returns 0 on success, negative on failure or timeout. + */ +static int +iavf_ena_queues_in_pf(struct iavf_adapter *adapter, u32 rxq_mask, u32 txq_mask) +{ + iavf_enable_selected_queues(adapter, rxq_mask, txq_mask); + return iavf_get_queue_enable_result(adapter, IAVF_PF_REQ_TIMEOUT_MS); +} +/** + * iavf_dis_queues_in_pf - Disable selected queues in PF. + * @adapter: adapter of interest + * @rxq_mask: mask of Rx queues that shall be disabled + * @txq_mask: mask of Tx queues that shall be disabled + * + * Returns 0 on success, negative on failure or timeout. + */ + +static int +iavf_dis_queues_in_pf(struct iavf_adapter *adapter, u32 rxq_mask, u32 txq_mask) +{ + iavf_disable_selected_queues(adapter, rxq_mask, txq_mask); + return iavf_get_queue_disable_result(adapter, IAVF_PF_REQ_TIMEOUT_MS); +} + +/** + * iavf_qp_dis - Disables a queue pair + * @adapter: adapter of interest + * @q_idx: ring index in array + * + * Returns 0 on success, negative on failure. + */ +static int iavf_qp_dis(struct iavf_adapter *adapter, u16 q_idx) +{ + struct iavf_vsi *vsi = &adapter->vsi; + struct iavf_q_vector *q_vector; + struct iavf_ring *rx_ring; + u32 rx_queues, tx_queues; + int err; + + if (q_idx >= adapter->num_active_queues) + return -EINVAL; + + rx_ring = &adapter->rx_rings[q_idx]; + q_vector = rx_ring->q_vector; + + rx_queues = BIT(q_idx); + tx_queues = rx_queues; + + netif_tx_stop_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); + + iavf_qvec_toggle_napi(adapter, q_vector, false); + iavf_qvec_dis_irq(adapter, q_vector); + + if (iavf_adapter_xdp_active(adapter)) + tx_queues |= BIT(q_idx + adapter->num_active_queues); + + err = iavf_dis_queues_in_pf(adapter, rx_queues, tx_queues); + if (err) + goto dis_exit; + + iavf_qp_clean_rings(adapter, q_idx); + iavf_qp_reset_stats(adapter, q_idx); +dis_exit: + return err; +} + +/** + * iavf_qp_ena - Enables a queue pair + * @adapter: adapter of interest + * @q_idx: ring index in array + * + * Returns 0 on success, negative on failure. + */ +static int iavf_qp_ena(struct iavf_adapter *adapter, u16 q_idx) +{ + struct iavf_vsi *vsi = &adapter->vsi; + struct iavf_q_vector *q_vector; + struct iavf_ring *rx_ring; + u32 rx_queues, tx_queues; + int err = 0; + + if (q_idx >= adapter->num_active_queues) + return -EINVAL; + + rx_ring = &adapter->rx_rings[q_idx]; + q_vector = rx_ring->q_vector; + + rx_queues = BIT(q_idx); + tx_queues = rx_queues; + + if (iavf_adapter_xdp_active(adapter)) + tx_queues |= BIT(q_idx + adapter->num_active_queues); + + /* Use 'tx_queues' mask as a queue pair mask to configure + * also an extra XDP Tx queue. 
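+ * The extra XDP Tx queue for pair @q_idx sits right after the regular Tx + * queues, hence the BIT(q_idx + num_active_queues) offset in the mask.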
+ */ + err = iavf_cfg_qp_in_pf(adapter, tx_queues); + if (err) + goto ena_exit; + + iavf_configure_rx_ring(adapter, rx_ring); + + err = iavf_ena_queues_in_pf(adapter, rx_queues, tx_queues); + if (err) + goto ena_exit; + + iavf_qvec_toggle_napi(adapter, q_vector, true); + iavf_qvec_ena_irq(adapter, q_vector); + + netif_tx_start_queue(netdev_get_tx_queue(vsi->netdev, q_idx)); +ena_exit: + return err; +} + +/** + * iavf_xsk_pool_disable - disable a buffer pool region + * @adapter: Current adapter + * @qid: queue ID + * + * Returns 0 on success, negative on failure + */ +static int iavf_xsk_pool_disable(struct iavf_adapter *adapter, u16 qid) +{ + struct xsk_buff_pool *pool = xsk_get_pool_from_qid(adapter->vsi.netdev, + qid); + if (!pool) + return -EINVAL; + + clear_bit(qid, adapter->af_xdp_zc_qps); + xsk_pool_dma_unmap(pool, IAVF_RX_DMA_ATTR); + + return 0; +} + +/** + * iavf_xsk_pool_enable - enable a buffer pool region + * @adapter: Current adapter + * @pool: pointer to a requested buffer pool region + * @qid: queue ID + * + * Returns 0 on success, negative on failure + */ +static int +iavf_xsk_pool_enable(struct iavf_adapter *adapter, struct xsk_buff_pool *pool, + u16 qid) +{ + struct iavf_vsi *vsi = &adapter->vsi; + int err; + + if (qid >= vsi->netdev->real_num_rx_queues || + qid >= vsi->netdev->real_num_tx_queues) + return -EINVAL; + + err = xsk_pool_dma_map(pool, &adapter->pdev->dev, IAVF_RX_DMA_ATTR); + if (err) + return err; + + set_bit(qid, adapter->af_xdp_zc_qps); + + return 0; +} + +/** + * iavf_xsk_pool_setup - enable/disable a buffer pool region depending + * on its state + * @adapter: Current adapter + * @pool: buffer pool to enable/associate to a ring, NULL to disable + * @qid: queue ID + * + * Returns 0 on success, negative on failure + */ +int iavf_xsk_pool_setup(struct iavf_adapter *adapter, + struct xsk_buff_pool *pool, u32 qid) +{ + bool if_running, pool_present = !!pool; + struct iavf_vsi *vsi = &adapter->vsi; + int ret = 0, pool_failure = 0; + + if (qid >= iavf_max_xdp_queues_count(adapter)) { + netdev_err(vsi->netdev, "Wrong queue index for XDP.\n"); + pool_failure = -EINVAL; + goto failure; + } + + if_running = netif_running(vsi->netdev) && + iavf_adapter_xdp_active(adapter); + + if (if_running) { + if (iavf_lock_timeout(&adapter->crit_lock, + IAVF_CRIT_LOCK_WAIT_TIMEOUT_MS)) + return -EBUSY; + + ret = iavf_process_pending_pf_msg(adapter, + IAVF_VC_MSG_TIMEOUT_MS); + if (ret) + goto xsk_pool_if_up; + + ret = iavf_qp_dis(adapter, qid); + if (ret) { + netdev_err(vsi->netdev, "iavf_qp_dis error = %d\n", ret); + goto xsk_pool_if_up; + } + } + + pool_failure = pool_present ? iavf_xsk_pool_enable(adapter, pool, qid) : + iavf_xsk_pool_disable(adapter, qid); + +xsk_pool_if_up: + if (if_running) { + ret = iavf_qp_ena(adapter, qid); + mutex_unlock(&adapter->crit_lock); + if (!ret && pool_present) + napi_schedule(&adapter->rx_rings[qid].q_vector->napi); + else if (ret) + netdev_err(vsi->netdev, "iavf_qp_ena error = %d\n", ret); + } + +failure: + if (pool_failure) { + netdev_err(vsi->netdev, "Could not %sable buffer pool, error = %d\n", + pool_present ? "en" : "dis", pool_failure); + return pool_failure; + } + + return ret; +} diff --git a/drivers/net/ethernet/intel/iavf/iavf_xsk.h b/drivers/net/ethernet/intel/iavf/iavf_xsk.h new file mode 100644 index 00000000000000..c09cde98e36bc1 --- /dev/null +++ b/drivers/net/ethernet/intel/iavf/iavf_xsk.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2022 Intel Corporation. 
*/ + +#ifndef _IAVF_XSK_H_ +#define _IAVF_XSK_H_ + +#include + +struct iavf_adapter; +struct xsk_buff_pool; + +int iavf_xsk_pool_setup(struct iavf_adapter *adapter, + struct xsk_buff_pool *pool, u32 qid); + +#endif /* !_IAVF_XSK_H_ */ From bdfea1ad88e8cd8085b0e6fe3d79890519478492 Mon Sep 17 00:00:00 2001 From: Michal Kubiak Date: Wed, 30 Nov 2022 12:45:53 +0100 Subject: [PATCH 28/32] iavf: Implement Tx path for AF_XDP Implement Tx handling for AF_XDP feature in zero-copy mode. Add '.ndo_xdp_xmit()' and '.ndo_xsk_wakeup()' implementations to support AF_XDP Tx path. Also, add Tx interrupt handling function for zero-copy mode. Signed-off-by: Michal Kubiak --- drivers/net/ethernet/intel/iavf/iavf_main.c | 1 + drivers/net/ethernet/intel/iavf/iavf_txrx.c | 50 +-- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 17 +- drivers/net/ethernet/intel/iavf/iavf_xsk.c | 334 ++++++++++++++++++++ drivers/net/ethernet/intel/iavf/iavf_xsk.h | 18 ++ 5 files changed, 397 insertions(+), 23 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 8a064e981efba9..8ff342d5581728 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -5209,6 +5209,7 @@ static const struct net_device_ops iavf_netdev_ops = { .ndo_setup_tc = iavf_setup_tc, .ndo_bpf = iavf_xdp, .ndo_xdp_xmit = iavf_xdp_xmit, + .ndo_xsk_wakeup = iavf_xsk_wakeup }; /** diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index 4ae512a1bccca2..1f1579351dc5fa 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -16,16 +16,6 @@ DEFINE_STATIC_KEY_FALSE(iavf_xdp_locking_key); static bool iavf_xdp_xmit_back(const struct xdp_buff *buff, struct iavf_ring *xdp_ring); -static inline __le64 build_ctob(u32 td_cmd, u32 td_offset, unsigned int size, - u32 td_tag) -{ - return cpu_to_le64(IAVF_TX_DESC_DTYPE_DATA | - ((u64)td_cmd << IAVF_TXD_QW1_CMD_SHIFT) | - ((u64)td_offset << IAVF_TXD_QW1_OFFSET_SHIFT) | - ((u64)size << IAVF_TXD_QW1_TX_BUF_SZ_SHIFT) | - ((u64)td_tag << IAVF_TXD_QW1_L2TAG1_SHIFT)); -} - #define IAVF_TXD_CMD (IAVF_TX_DESC_CMD_EOP | IAVF_TX_DESC_CMD_RS) /** @@ -68,15 +58,19 @@ static void iavf_unmap_and_free_tx_resource(struct iavf_ring *ring, void iavf_clean_tx_ring(struct iavf_ring *tx_ring) { unsigned long bi_size; - u16 i; /* ring already cleared, nothing to do */ if (!tx_ring->tx_bi) return; - /* Free all the Tx ring sk_buffs */ - for (i = 0; i < tx_ring->count; i++) - iavf_unmap_and_free_tx_resource(tx_ring, &tx_ring->tx_bi[i]); + if (tx_ring->flags & IAVF_TXRX_FLAGS_XSK) { + iavf_xsk_clean_xdp_ring(tx_ring); + } else { + /* Free all the Tx ring sk_buffs */ + for (u32 i = 0; i < tx_ring->count; i++) + iavf_unmap_and_free_tx_resource(tx_ring, + &tx_ring->tx_bi[i]); + } bi_size = sizeof(struct iavf_tx_buffer) * tx_ring->count; memset(tx_ring->tx_bi, 0, bi_size); @@ -701,6 +695,8 @@ int iavf_setup_tx_descriptors(struct iavf_ring *tx_ring) tx_desc->cmd_type_offset_bsz = 0; } + iavf_xsk_setup_xdp_ring(tx_ring); + return 0; err: @@ -1482,7 +1478,16 @@ int iavf_napi_poll(struct napi_struct *napi, int budget) * budget and be more aggressive about cleaning up the Tx descriptors. 
*/ iavf_for_each_ring(ring, q_vector->tx) { - if (!iavf_clean_tx_irq(vsi, ring, budget)) { + bool wd; + + if (ring->flags & IAVF_TXRX_FLAGS_XSK) + wd = iavf_xmit_zc(ring); + else if (ring->flags & IAVF_TXRX_FLAGS_XDP) + wd = true; + else + wd = iavf_clean_tx_irq(vsi, ring, budget); + + if (!wd) { clean_complete = false; continue; } @@ -2061,8 +2066,8 @@ static inline void iavf_tx_map(struct iavf_ring *tx_ring, struct sk_buff *skb, while (unlikely(size > IAVF_MAX_DATA_PER_TXD)) { tx_desc->cmd_type_offset_bsz = - build_ctob(td_cmd, td_offset, - max_data, td_tag); + iavf_build_ctob(td_cmd, td_offset, + max_data, td_tag); tx_desc++; i++; @@ -2082,8 +2087,9 @@ static inline void iavf_tx_map(struct iavf_ring *tx_ring, struct sk_buff *skb, if (likely(!data_len)) break; - tx_desc->cmd_type_offset_bsz = build_ctob(td_cmd, td_offset, - size, td_tag); + tx_desc->cmd_type_offset_bsz = iavf_build_ctob(td_cmd, + td_offset, + size, td_tag); tx_desc++; i++; @@ -2115,7 +2121,7 @@ static inline void iavf_tx_map(struct iavf_ring *tx_ring, struct sk_buff *skb, /* write last descriptor with RS and EOP bits */ td_cmd |= IAVF_TXD_CMD; tx_desc->cmd_type_offset_bsz = - build_ctob(td_cmd, td_offset, size, td_tag); + iavf_build_ctob(td_cmd, td_offset, size, td_tag); skb_tx_timestamp(skb); @@ -2386,8 +2392,8 @@ static int iavf_xmit_xdp_buff(const struct xdp_buff *xdp, tx_desc = IAVF_TX_DESC(xdp_ring, ntu); tx_desc->buffer_addr = cpu_to_le64(dma); - tx_desc->cmd_type_offset_bsz = build_ctob(IAVF_TX_DESC_CMD_EOP, 0, - size, 0); + tx_desc->cmd_type_offset_bsz = iavf_build_ctob(IAVF_TX_DESC_CMD_EOP, 0, + size, 0); ntu++; if (ntu > xdp_ring->next_rs) { diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index 8bfd7b07df46fd..5ff0a83229016d 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -217,6 +217,7 @@ struct iavf_ring { struct iavf_ring *next; /* pointer to next ring in q_vector */ void *desc; /* Descriptor ring memory */ union { + struct xsk_buff_pool *xsk_pool; /* Used on XSk queue pairs */ struct page_pool *pool; /* Used for Rx page management */ struct device *dev; /* Used for DMA mapping on Tx */ }; @@ -254,8 +255,12 @@ struct iavf_ring { #define IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1 BIT(3) #define IAVF_TXR_FLAGS_VLAN_TAG_LOC_L2TAG2 BIT(4) #define IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2 BIT(5) +#define IAVF_TXRX_FLAGS_XSK BIT(6) - struct bpf_prog __rcu *xdp_prog; + union { + struct bpf_prog __rcu *xdp_prog; + u32 xdp_tx_active; /* TODO: comment */ + }; struct iavf_ring *xdp_ring; union { struct sk_buff *skb; /* When iavf_clean_rx_ring_irq() must @@ -329,6 +334,16 @@ DECLARE_STATIC_KEY_FALSE(iavf_xdp_locking_key); int iavf_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags); +static inline __le64 iavf_build_ctob(u32 td_cmd, u32 td_offset, + unsigned int size, u32 td_tag) +{ + return cpu_to_le64(IAVF_TX_DESC_DTYPE_DATA | + ((u64)td_cmd << IAVF_TXD_QW1_CMD_SHIFT) | + ((u64)td_offset << IAVF_TXD_QW1_OFFSET_SHIFT) | + ((u64)size << IAVF_TXD_QW1_TX_BUF_SZ_SHIFT) | + ((u64)td_tag << IAVF_TXD_QW1_L2TAG1_SHIFT)); +} + /** * iavf_xmit_descriptor_count - calculate number of Tx descriptors needed * @skb: send buffer diff --git a/drivers/net/ethernet/intel/iavf/iavf_xsk.c b/drivers/net/ethernet/intel/iavf/iavf_xsk.c index adf154840260c6..232ae432c1ee95 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_xsk.c +++ b/drivers/net/ethernet/intel/iavf/iavf_xsk.c @@ -78,6 +78,26 @@ 
iavf_qvec_toggle_napi(struct iavf_adapter *adapter, napi_disable(&q_vector->napi); } +/** + * iavf_trigger_sw_intr - trigger a software interrupt + * @adapter: adapter of interest + * @q_vector: interrupt vector to trigger the software interrupt for + */ +static void +iavf_trigger_sw_intr(struct iavf_adapter *adapter, + struct iavf_q_vector *q_vector) +{ + struct iavf_hw *hw = &adapter->hw; + + wr32(hw, IAVF_VFINT_DYN_CTLN1(q_vector->reg_idx), + (IAVF_VFINT_DYN_CTLN1_INTENA_MASK | + IAVF_VFINT_DYN_CTLN1_ITR_INDX_MASK | + IAVF_VFINT_DYN_CTLN1_SWINT_TRIG_MASK | + IAVF_VFINT_DYN_CTLN1_SW_ITR_INDX_ENA_MASK)); + + iavf_flush(hw); +} + /** * iavf_qvec_dis_irq - Mask off queue interrupt generation on given ring * @adapter: the adapter that contains queue vector being un-configured @@ -363,3 +383,317 @@ int iavf_xsk_pool_setup(struct iavf_adapter *adapter, return ret; } + +/** + * iavf_clean_xdp_tx_buf - Free and unmap XDP Tx buffer + * @xdp_ring: XDP Tx ring + * @tx_buf: Tx buffer to clean + */ +static void +iavf_clean_xdp_tx_buf(struct iavf_ring *xdp_ring, struct iavf_tx_buffer *tx_buf) +{ + xdp_return_frame((struct xdp_frame *)tx_buf->raw_buf); + xdp_ring->xdp_tx_active--; + dma_unmap_single(xdp_ring->xsk_pool->dev, dma_unmap_addr(tx_buf, dma), + dma_unmap_len(tx_buf, len), DMA_TO_DEVICE); + dma_unmap_len_set(tx_buf, len, 0); +} + +/** + * iavf_clean_xdp_irq_zc - produce AF_XDP descriptors to CQ + * @xdp_ring: XDP Tx ring + */ +static void iavf_clean_xdp_irq_zc(struct iavf_ring *xdp_ring) +{ + u16 ntc = xdp_ring->next_to_clean; + struct iavf_tx_buffer *tx_buf; + struct iavf_tx_desc *tx_desc; + u16 cnt = xdp_ring->count; + u16 xsk_frames = 0; + u16 last_rs; + int i; + + last_rs = xdp_ring->next_to_use ? xdp_ring->next_to_use - 1 : cnt - 1; + tx_desc = IAVF_TX_DESC(xdp_ring, last_rs); + if ((tx_desc->cmd_type_offset_bsz & + cpu_to_le64(IAVF_TX_DESC_DTYPE_DESC_DONE))) { + if (last_rs >= ntc) + xsk_frames = last_rs - ntc + 1; + else + xsk_frames = last_rs + cnt - ntc + 1; + } + + if (!xsk_frames) + return; + + if (likely(!xdp_ring->xdp_tx_active)) + goto skip; + + ntc = xdp_ring->next_to_clean; + for (i = 0; i < xsk_frames; i++) { + tx_buf = &xdp_ring->tx_bi[ntc]; + + if (tx_buf->raw_buf) { + iavf_clean_xdp_tx_buf(xdp_ring, tx_buf); + tx_buf->raw_buf = NULL; + } else { + xsk_frames++; + } + + ntc++; + if (ntc >= xdp_ring->count) + ntc = 0; + } +skip: + tx_desc->cmd_type_offset_bsz = 0; + xdp_ring->next_to_clean += xsk_frames; + if (xdp_ring->next_to_clean >= cnt) + xdp_ring->next_to_clean -= cnt; + if (xsk_frames) + xsk_tx_completed(xdp_ring->xsk_pool, xsk_frames); +} + +/** + * iavf_xmit_pkt - produce a single HW Tx descriptor out of AF_XDP descriptor + * @xdp_ring: XDP ring to produce the HW Tx descriptor on + * @desc: AF_XDP descriptor to pull the DMA address and length from + * @total_bytes: bytes accumulator that will be used for stats update + */ +static void iavf_xmit_pkt(struct iavf_ring *xdp_ring, struct xdp_desc *desc, + unsigned int *total_bytes) +{ + struct iavf_tx_desc *tx_desc; + dma_addr_t dma; + + dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, desc->addr); + xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, desc->len); + + tx_desc = IAVF_TX_DESC(xdp_ring, xdp_ring->next_to_use++); + tx_desc->buffer_addr = cpu_to_le64(dma); + tx_desc->cmd_type_offset_bsz = iavf_build_ctob(IAVF_TX_DESC_CMD_EOP, + 0, desc->len, 0); + + *total_bytes += desc->len; +} + +/** + * iavf_xmit_pkt_batch - produce a batch of HW Tx descriptors out + * of AF_XDP descriptors + * @xdp_ring: XDP ring to 
produce the HW Tx descriptors on + * @descs: AF_XDP descriptors to pull the DMA addresses and lengths from + * @total_bytes: bytes accumulator that will be used for stats update + */ +static void iavf_xmit_pkt_batch(struct iavf_ring *xdp_ring, + struct xdp_desc *descs, + unsigned int *total_bytes) +{ + u16 ntu = xdp_ring->next_to_use; + struct iavf_tx_desc *tx_desc; + u32 i; + + loop_unrolled_for(i = 0; i < PKTS_PER_BATCH; i++) { + dma_addr_t dma; + + dma = xsk_buff_raw_get_dma(xdp_ring->xsk_pool, descs[i].addr); + xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, + descs[i].len); + + tx_desc = IAVF_TX_DESC(xdp_ring, ntu++); + tx_desc->buffer_addr = cpu_to_le64(dma); + tx_desc->cmd_type_offset_bsz = + iavf_build_ctob(IAVF_TX_DESC_CMD_EOP, 0, + descs[i].len, 0); + + *total_bytes += descs[i].len; + } + + xdp_ring->next_to_use = ntu; +} + +/** + * iavf_set_rs_bit - set RS bit on last produced descriptor (one behind current NTU) + * @xdp_ring: XDP ring to produce the HW Tx descriptors on + */ +static void iavf_set_rs_bit(struct iavf_ring *xdp_ring) +{ + u16 ntu = xdp_ring->next_to_use ? xdp_ring->next_to_use - 1 : + xdp_ring->count - 1; + struct iavf_tx_desc *tx_desc; + + tx_desc = IAVF_TX_DESC(xdp_ring, ntu); + tx_desc->cmd_type_offset_bsz |= + cpu_to_le64(IAVF_TX_DESC_CMD_RS << IAVF_TXD_QW1_CMD_SHIFT); +} + +/** + * iavf_fill_tx_hw_ring - produce the number of Tx descriptors onto ring + * @xdp_ring: XDP ring to produce the HW Tx descriptors on + * @descs: AF_XDP descriptors to pull the DMA addresses and lengths from + * @nb_pkts: count of packets to be send + * @total_bytes: bytes accumulator that will be used for stats update + */ +static void iavf_fill_tx_hw_ring(struct iavf_ring *xdp_ring, + struct xdp_desc *descs, u32 nb_pkts, + unsigned int *total_bytes) +{ + u16 tx_thresh = IAVF_RING_QUARTER(xdp_ring); + u32 batched, leftover, i; + + batched = ALIGN_DOWN(nb_pkts, PKTS_PER_BATCH); + leftover = nb_pkts & (PKTS_PER_BATCH - 1); + + for (i = 0; i < batched; i += PKTS_PER_BATCH) + iavf_xmit_pkt_batch(xdp_ring, &descs[i], total_bytes); + for (; i < batched + leftover; i++) + iavf_xmit_pkt(xdp_ring, &descs[i], total_bytes); + + if (xdp_ring->next_to_use > xdp_ring->next_rs) { + struct iavf_tx_desc *tx_desc; + + tx_desc = IAVF_TX_DESC(xdp_ring, xdp_ring->next_rs); + tx_desc->cmd_type_offset_bsz |= + cpu_to_le64(IAVF_TX_DESC_CMD_RS << + IAVF_TXD_QW1_CMD_SHIFT); + xdp_ring->next_rs += tx_thresh; + } +} + +/** + * iavf_xmit_zc - take entries from XSK Tx ring and place them onto HW Tx ring + * @xdp_ring: XDP ring to produce the HW Tx descriptors on + * + * Returns true if there is no more work that needs to be done, false otherwise + */ +bool iavf_xmit_zc(struct iavf_ring *xdp_ring) +{ + struct xdp_desc *descs = xdp_ring->xsk_pool->tx_descs; + u32 nb_pkts, nb_processed = 0; + unsigned int total_bytes = 0; + int budget; + + iavf_clean_xdp_irq_zc(xdp_ring); + + budget = IAVF_DESC_UNUSED(xdp_ring); + budget = min_t(u16, budget, IAVF_RING_QUARTER(xdp_ring)); + + nb_pkts = xsk_tx_peek_release_desc_batch(xdp_ring->xsk_pool, budget); + if (!nb_pkts) + return true; + + if (xdp_ring->next_to_use + nb_pkts >= xdp_ring->count) { + nb_processed = xdp_ring->count - xdp_ring->next_to_use; + iavf_fill_tx_hw_ring(xdp_ring, descs, nb_processed, + &total_bytes); + xdp_ring->next_to_use = 0; + } + + iavf_fill_tx_hw_ring(xdp_ring, &descs[nb_processed], + nb_pkts - nb_processed, &total_bytes); + + iavf_set_rs_bit(xdp_ring); + iavf_xdp_ring_update_tail(xdp_ring); + iavf_update_tx_ring_stats(xdp_ring, nb_pkts, 
total_bytes); + + if (xsk_uses_need_wakeup(xdp_ring->xsk_pool)) + xsk_set_tx_need_wakeup(xdp_ring->xsk_pool); + + return nb_pkts < budget; +} + +/** + * iavf_xsk_wakeup - Implements ndo_xsk_wakeup + * @netdev: net_device + * @queue_id: queue to wake up + * @flags: ignored in our case, since we have Rx and Tx in the same NAPI + * + * Returns negative on error, zero otherwise. + */ +int iavf_xsk_wakeup(struct net_device *netdev, u32 queue_id, u32 flags) +{ + struct iavf_adapter *adapter = netdev_priv(netdev); + struct iavf_q_vector *q_vector; + struct iavf_ring *ring; + + if (adapter->state == __IAVF_DOWN) + return -ENETDOWN; + + if (!iavf_adapter_xdp_active(adapter)) + return -EINVAL; + + if (queue_id >= adapter->num_active_queues) + return -EINVAL; + + ring = &adapter->rx_rings[queue_id]; + + if (!(ring->xdp_ring->flags & IAVF_TXRX_FLAGS_XSK)) + return -EINVAL; + + q_vector = ring->q_vector; + if (!napi_if_scheduled_mark_missed(&q_vector->napi)) + iavf_trigger_sw_intr(adapter, q_vector); + + return 0; +} + +static u32 iavf_get_xdp_tx_qid(struct iavf_ring *ring) +{ + struct iavf_adapter *adapter = ring->vsi->back; + + return ring->queue_index - adapter->num_active_queues; +} + +static struct xsk_buff_pool *iavf_tx_xsk_pool(struct iavf_ring *ring) +{ + struct iavf_adapter *adapter = ring->vsi->back; + u32 qid; + + if (!iavf_adapter_xdp_active(adapter) || + !(ring->flags & IAVF_TXRX_FLAGS_XDP)) + return NULL; + + qid = iavf_get_xdp_tx_qid(ring); + if (!test_bit(qid, adapter->af_xdp_zc_qps)) + return NULL; + + return xsk_get_pool_from_qid(adapter->netdev, qid); +} + +void iavf_xsk_setup_xdp_ring(struct iavf_ring *xdp_ring) +{ + struct xsk_buff_pool *pool; + + pool = iavf_tx_xsk_pool(xdp_ring); + if (!pool) + return; + + xdp_ring->xsk_pool = pool; + xdp_ring->flags |= IAVF_TXRX_FLAGS_XSK; +} + +/** + * iavf_xsk_clean_xdp_ring - Clean the XDP Tx ring and its buffer pool queues + * @xdp_ring: XDP_Tx ring + */ +void iavf_xsk_clean_xdp_ring(struct iavf_ring *xdp_ring) +{ + u16 ntc = xdp_ring->next_to_clean, ntu = xdp_ring->next_to_use; + u32 xsk_frames = 0; + + while (ntc != ntu) { + struct iavf_tx_buffer *tx_buf = &xdp_ring->tx_bi[ntc]; + + if (tx_buf->raw_buf) + iavf_clean_xdp_tx_buf(xdp_ring, tx_buf); + else + xsk_frames++; + + tx_buf->raw_buf = NULL; + + ntc++; + if (ntc >= xdp_ring->count) + ntc = 0; + } + + if (xsk_frames) + xsk_tx_completed(xdp_ring->xsk_pool, xsk_frames); +} diff --git a/drivers/net/ethernet/intel/iavf/iavf_xsk.h b/drivers/net/ethernet/intel/iavf/iavf_xsk.h index c09cde98e36bc1..2c3c103ddd7781 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_xsk.h +++ b/drivers/net/ethernet/intel/iavf/iavf_xsk.h @@ -6,10 +6,28 @@ #include +#define PKTS_PER_BATCH 8 + +#ifdef __clang__ +#define loop_unrolled_for _Pragma("clang loop unroll_count(8)") for +#elif __GNUC__ >= 8 +#define loop_unrolled_for _Pragma("GCC unroll 8") for +#else +#define loop_unrolled_for for +#endif + struct iavf_adapter; +struct iavf_ring; +struct net_device; struct xsk_buff_pool; int iavf_xsk_pool_setup(struct iavf_adapter *adapter, struct xsk_buff_pool *pool, u32 qid); +int iavf_xsk_wakeup(struct net_device *netdev, u32 queue_id, u32 flags); +bool iavf_xmit_zc(struct iavf_ring *xdp_ring); +void iavf_xsk_clean_xdp_ring(struct iavf_ring *xdp_ring); + +void iavf_xsk_setup_xdp_ring(struct iavf_ring *xdp_ring); + #endif /* !_IAVF_XSK_H_ */ From d5b13b46f4904ad224e8b47747e3a13307d47ef5 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Wed, 30 Nov 2022 13:18:30 +0100 Subject: [PATCH 29/32] iavf: Implement AF_XDP RX 
processing Implement RX packet processing specific to AF_XDP ZC. All actions except XDP_PASS are supported, the skb path will be implemented in later patches. Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/iavf/iavf_main.c | 27 +- drivers/net/ethernet/intel/iavf/iavf_trace.h | 8 + drivers/net/ethernet/intel/iavf/iavf_txrx.c | 82 ++-- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 40 ++ .../net/ethernet/intel/iavf/iavf_virtchnl.c | 14 +- drivers/net/ethernet/intel/iavf/iavf_xsk.c | 433 +++++++++++++++++- drivers/net/ethernet/intel/iavf/iavf_xsk.h | 6 + 7 files changed, 562 insertions(+), 48 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c index 8ff342d5581728..c3267f6968130f 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_main.c +++ b/drivers/net/ethernet/intel/iavf/iavf_main.c @@ -778,14 +778,29 @@ void iavf_configure_rx_ring(struct iavf_adapter *adapter, rx_ring->queue_index, rx_ring->q_vector->napi.napi_id); - err = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, MEM_TYPE_PAGE_POOL, - rx_ring->pool); - if (err) - netdev_err(adapter->netdev, "Could not register XDP memory model for RX queue %u, error: %d\n", - queue_idx, err); + if (rx_ring->flags & IAVF_TXRX_FLAGS_XSK) { + err = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, + MEM_TYPE_XSK_BUFF_POOL, + NULL); + if (err) + netdev_err(adapter->netdev, "xdp_rxq_info_reg_mem_model returned %d\n", + err); + + xsk_pool_set_rxq_info(rx_ring->xsk_pool, &rx_ring->xdp_rxq); + + iavf_check_alloc_rx_buffers_zc(adapter, rx_ring); + } else { + err = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq, + MEM_TYPE_PAGE_POOL, + rx_ring->pool); + if (err) + netdev_err(adapter->netdev, "Could not register XDP memory model for RX queue %u, error: %d\n", + queue_idx, err); + + iavf_alloc_rx_pages(rx_ring); + } RCU_INIT_POINTER(rx_ring->xdp_prog, adapter->xdp_prog); - iavf_alloc_rx_pages(rx_ring); } /** diff --git a/drivers/net/ethernet/intel/iavf/iavf_trace.h b/drivers/net/ethernet/intel/iavf/iavf_trace.h index 82fda6f5abf043..ac46fbe55bd2e5 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_trace.h +++ b/drivers/net/ethernet/intel/iavf/iavf_trace.h @@ -145,6 +145,14 @@ DEFINE_EVENT( TP_ARGS(ring, desc, skb)); +DEFINE_EVENT( + iavf_rx_template, iavf_clean_rx_irq_zc, + TP_PROTO(struct iavf_ring *ring, + union iavf_32byte_rx_desc *desc, + struct sk_buff *skb), + + TP_ARGS(ring, desc, skb)); + DEFINE_EVENT( iavf_rx_template, iavf_clean_rx_irq_rx, TP_PROTO(struct iavf_ring *ring, diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index 1f1579351dc5fa..56ef30cfa38bf5 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -103,6 +103,11 @@ void iavf_free_tx_resources(struct iavf_ring *tx_ring) kfree(tx_ring->tx_bi); tx_ring->tx_bi = NULL; + if (tx_ring->flags & IAVF_TXRX_FLAGS_XSK) { + tx_ring->dev = tx_ring->xsk_pool->dev; + tx_ring->flags &= ~IAVF_TXRX_FLAGS_XSK; + } + if (tx_ring->desc) { dma_free_coherent(tx_ring->dev, tx_ring->size, tx_ring->desc, tx_ring->dma); @@ -705,6 +710,22 @@ int iavf_setup_tx_descriptors(struct iavf_ring *tx_ring) return -ENOMEM; } +static void iavf_clean_rx_pages(struct iavf_ring *rx_ring) +{ + for (u32 i = 0; i < rx_ring->count; i++) { + struct page *page = rx_ring->rx_pages[i]; + + if (!page) + continue; + + /* Invalidate cache lines that may have been written to by + * device so that we avoid corrupting memory. 
+ */ + page_pool_dma_sync_full_for_cpu(rx_ring->pool, page); + page_pool_put_full_page(rx_ring->pool, page, false); + } +} + /** * iavf_clean_rx_ring - Free Rx buffers * @rx_ring: ring to be cleaned @@ -720,19 +741,10 @@ void iavf_clean_rx_ring(struct iavf_ring *rx_ring) rx_ring->skb = NULL; } - /* Free all the Rx ring sk_buffs */ - for (u32 i = 0; i < rx_ring->count; i++) { - struct page *page = rx_ring->rx_pages[i]; - - if (!page) - continue; - - /* Invalidate cache lines that may have been written to by - * device so that we avoid corrupting memory. - */ - page_pool_dma_sync_full_for_cpu(rx_ring->pool, page); - page_pool_put_full_page(rx_ring->pool, page, false); - } + if (rx_ring->flags & IAVF_TXRX_FLAGS_XSK) + iavf_xsk_clean_rx_ring(rx_ring); + else + iavf_clean_rx_pages(rx_ring); rx_ring->next_to_clean = 0; rx_ring->next_to_use = 0; @@ -746,7 +758,7 @@ void iavf_clean_rx_ring(struct iavf_ring *rx_ring) **/ void iavf_free_rx_resources(struct iavf_ring *rx_ring) { - struct device *dev = rx_ring->pool->p.dev; + struct device *dev; iavf_clean_rx_ring(rx_ring); kfree(rx_ring->rx_pages); @@ -756,7 +768,14 @@ void iavf_free_rx_resources(struct iavf_ring *rx_ring) if (xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) xdp_rxq_info_unreg(&rx_ring->xdp_rxq); - libie_rx_page_pool_destroy(rx_ring->pool, &rx_ring->rq_stats); + if (rx_ring->flags & IAVF_TXRX_FLAGS_XSK) { + dev = rx_ring->xsk_pool->dev; + rx_ring->flags &= ~IAVF_TXRX_FLAGS_XSK; + } else { + dev = rx_ring->pool->p.dev; + libie_rx_page_pool_destroy(rx_ring->pool, &rx_ring->rq_stats); + } + rx_ring->dev = dev; if (rx_ring->desc) { @@ -791,6 +810,8 @@ int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring) /* warn if we are about to overwrite the pointer */ WARN_ON(rx_ring->rx_pages); + + /* Both iavf_ring::rx_pages and ::xdp_buff are arrays of pointers */ rx_ring->rx_pages = kcalloc(rx_ring->count, sizeof(*rx_ring->rx_pages), GFP_KERNEL); if (!rx_ring->rx_pages) @@ -808,6 +829,10 @@ int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring) goto err; } + iavf_xsk_setup_rx_ring(rx_ring); + if (rx_ring->flags & IAVF_TXRX_FLAGS_XSK) + goto finish; + pool = libie_rx_page_pool_create(rx_ring->netdev, rx_ring->count, iavf_is_xdp_enabled(rx_ring)); if (IS_ERR(pool)) { @@ -817,6 +842,7 @@ int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring) rx_ring->pool = pool; +finish: rx_ring->next_to_clean = 0; rx_ring->next_to_use = 0; @@ -831,24 +857,6 @@ int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring) return ret; } -/** - * iavf_release_rx_desc - Store the new tail and head values - * @rx_ring: ring to bump - * @val: new head index - **/ -static inline void iavf_release_rx_desc(struct iavf_ring *rx_ring, u32 val) -{ - rx_ring->next_to_use = val; - - /* Force memory writes to complete before letting h/w - * know there are new descriptors to fetch. (Only - * applicable for weak-ordered memory model archs, - * such as IA-64). 
- */ - wmb(); - writel(val, rx_ring->tail); -} - /** * iavf_receive_skb - Send a completed packet up the stack * @rx_ring: rx ring in play @@ -1345,9 +1353,7 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) } } - libie_rq_napi_stats_add(&rx_ring->rq_stats, &stats); - rx_ring->q_vector->rx.total_packets += stats.packets; - rx_ring->q_vector->rx.total_bytes += stats.bytes; + iavf_update_rx_ring_stats(rx_ring, &stats); return cleaned_count; } @@ -1507,7 +1513,9 @@ int iavf_napi_poll(struct napi_struct *napi, int budget) rcu_read_lock(); iavf_for_each_ring(ring, q_vector->rx) { - int cleaned = iavf_clean_rx_irq(ring, budget_per_ring); + int cleaned = !!(ring->flags & IAVF_TXRX_FLAGS_XSK) ? + iavf_clean_rx_irq_zc(ring, budget_per_ring) : + iavf_clean_rx_irq(ring, budget_per_ring); work_done += cleaned; /* if we clean as many as budgeted, we must not be done */ diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index 5ff0a83229016d..0ff15aefce230c 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -224,6 +224,7 @@ struct iavf_ring { struct net_device *netdev; /* netdev ring maps to */ union { struct iavf_tx_buffer *tx_bi; + struct xdp_buff **xdp_buff; struct page **rx_pages; }; u8 __iomem *tail; @@ -449,8 +450,47 @@ __iavf_update_tx_ring_stats(struct iavf_ring *tx_ring, #define iavf_update_tx_ring_stats(r, s) \ __iavf_update_tx_ring_stats(r, &(r)->q_vector->tx, s) +/** + * iavf_update_rx_ring_stats - Update RX ring stats + * @rx_ring: ring to bump + * @rc: TODO + * @rx_bytes: number of bytes processed since last update + * @rx_packets: number of packets processed since last update + **/ +static inline void +__iavf_update_rx_ring_stats(struct iavf_ring *rx_ring, + struct iavf_ring_container *rc, + const struct libie_rq_onstack_stats *stats) +{ + libie_rq_napi_stats_add(&rx_ring->rq_stats, stats); + rc->total_packets += stats->packets; + rc->total_bytes += stats->bytes; +} + +#define iavf_update_rx_ring_stats(r, s) \ + __iavf_update_rx_ring_stats(r, &(r)->q_vector->rx, s) + +/** + * iavf_release_rx_desc - Store the new tail and head values + * @rx_ring: ring to bump + * @val: new head index + **/ +static inline void iavf_release_rx_desc(struct iavf_ring *rx_ring, u32 val) +{ + rx_ring->next_to_use = val; + + /* Force memory writes to complete before letting h/w + * know there are new descriptors to fetch. (Only + * applicable for weak-ordered memory model archs, + * such as IA-64). 
+ */ + wmb(); + writel(val, rx_ring->tail); +} + #define IAVF_RXQ_XDP_ACT_FINALIZE_TX BIT(0) #define IAVF_RXQ_XDP_ACT_FINALIZE_REDIR BIT(1) +#define IAVF_RXQ_XDP_ACT_STOP_NOW BIT(2) /** * iavf_finalize_xdp_rx - Finalize XDP actions once per RX ring clean diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c index d7d37e2e7ff4b3..b84cfa95728cca 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c @@ -413,8 +413,8 @@ static void iavf_set_qp_config_info(struct virtchnl_queue_pair_info *vqpi, bool xdp_pair) { struct iavf_ring *rxq = &adapter->rx_rings[queue_index]; - const struct page_pool_params *pp = &rxq->pool->p; struct iavf_ring *txq; + u32 hr, max_len; int xdpq_idx; if (xdp_pair) { @@ -435,12 +435,20 @@ static void iavf_set_qp_config_info(struct virtchnl_queue_pair_info *vqpi, return; } - max_frame = min_not_zero(max_frame, LIBIE_MAX_RX_FRM_LEN(pp->offset)); + if (rxq->flags & IAVF_TXRX_FLAGS_XSK) { + hr = xsk_pool_get_headroom(rxq->xsk_pool); + max_len = xsk_pool_get_rx_frame_size(rxq->xsk_pool); + } else { + hr = rxq->pool->p.offset; + max_len = rxq->pool->p.max_len; + } + + max_frame = min_not_zero(max_frame, LIBIE_MAX_RX_FRM_LEN(hr)); vqpi->rxq.ring_len = rxq->count; vqpi->rxq.dma_ring_addr = rxq->dma; vqpi->rxq.max_pkt_size = max_frame; - vqpi->rxq.databuffer_size = pp->max_len; + vqpi->rxq.databuffer_size = max_len; } /** diff --git a/drivers/net/ethernet/intel/iavf/iavf_xsk.c b/drivers/net/ethernet/intel/iavf/iavf_xsk.c index 232ae432c1ee95..7b9f3fff1c5f81 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_xsk.c +++ b/drivers/net/ethernet/intel/iavf/iavf_xsk.c @@ -1,9 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright(c) 2022 Intel Corporation. */ +#include +#include +#include #include #include #include "iavf.h" +#include "iavf_trace.h" #include "iavf_xsk.h" #define IAVF_PF_REQ_TIMEOUT_MS 300 @@ -286,7 +290,7 @@ static int iavf_xsk_pool_disable(struct iavf_adapter *adapter, u16 qid) return -EINVAL; clear_bit(qid, adapter->af_xdp_zc_qps); - xsk_pool_dma_unmap(pool, IAVF_RX_DMA_ATTR); + xsk_pool_dma_unmap(pool, LIBIE_RX_DMA_ATTR); return 0; } @@ -310,7 +314,7 @@ iavf_xsk_pool_enable(struct iavf_adapter *adapter, struct xsk_buff_pool *pool, qid >= vsi->netdev->real_num_tx_queues) return -EINVAL; - err = xsk_pool_dma_map(pool, &adapter->pdev->dev, IAVF_RX_DMA_ATTR); + err = xsk_pool_dma_map(pool, &adapter->pdev->dev, LIBIE_RX_DMA_ATTR); if (err) return err; @@ -345,6 +349,8 @@ int iavf_xsk_pool_setup(struct iavf_adapter *adapter, iavf_adapter_xdp_active(adapter); if (if_running) { + struct iavf_ring *rx_ring = &adapter->rx_rings[qid]; + if (iavf_lock_timeout(&adapter->crit_lock, IAVF_CRIT_LOCK_WAIT_TIMEOUT_MS)) return -EBUSY; @@ -359,6 +365,15 @@ int iavf_xsk_pool_setup(struct iavf_adapter *adapter, netdev_err(vsi->netdev, "iavf_qp_dis error = %d\n", ret); goto xsk_pool_if_up; } + + iavf_free_rx_resources(rx_ring); + + ret = iavf_setup_rx_descriptors(rx_ring); + if (ret) { + netdev_err(vsi->netdev, + "iavf rx re-allocation error = %d\n", ret); + goto xsk_pool_if_up; + } } pool_failure = pool_present ? 
iavf_xsk_pool_enable(adapter, pool, qid) : @@ -697,3 +712,417 @@ void iavf_xsk_clean_xdp_ring(struct iavf_ring *xdp_ring) if (xsk_frames) xsk_tx_completed(xdp_ring->xsk_pool, xsk_frames); } + +/** + * iavf_init_rx_descs_zc - pick buffers from XSK buffer pool and use it + * @pool: XSK Buffer pool to pull the buffers from + * @xdp: SW ring of xdp_buff that will hold the buffers + * @rx_desc: Pointer to Rx descriptors that will be filled + * @count: The number of buffers to allocate + * + * This function allocates a number of Rx buffers from the fill ring + * or the internal recycle mechanism and places them on the Rx ring. + * + * Note that ring wrap should be handled by caller of this function. + * + * Returns the amount of allocated Rx descriptors + */ +static u16 iavf_init_rx_descs_zc(struct xsk_buff_pool *pool, + struct xdp_buff **xdp, + union iavf_rx_desc *rx_desc, u16 count) +{ + dma_addr_t dma; + u16 num_buffs; + u16 i; + + num_buffs = xsk_buff_alloc_batch(pool, xdp, count); + for (i = 0; i < num_buffs; i++) { + dma = xsk_buff_xdp_get_dma(*xdp); + rx_desc->read.pkt_addr = cpu_to_le64(dma); + rx_desc->wb.qword1.status_error_len = 0; + + rx_desc++; + xdp++; + } + + return num_buffs; +} + +static struct xdp_buff **iavf_get_xdp_buff(struct iavf_ring *ring, u32 idx) +{ + return &ring->xdp_buff[idx]; +} + +/** + * __iavf_alloc_rx_buffers_zc - allocate a number of Rx buffers + * @rx_ring: Rx ring + * @count: The number of buffers to allocate + * + * Place the @count of descriptors onto Rx ring. Handle the ring wrap + * for case where space from next_to_use up to the end of ring is less + * than @count. Finally do a tail bump. + * + * Returns true if all allocations were successful, false if any fail. + */ +static bool __iavf_alloc_rx_buffers_zc(struct iavf_ring *rx_ring, u16 count) +{ + u32 nb_buffs_extra = 0, nb_buffs = 0; + u16 ntu = rx_ring->next_to_use; + union iavf_rx_desc *rx_desc; + u16 total_count = count; + struct xdp_buff **xdp; + + rx_desc = IAVF_RX_DESC(rx_ring, ntu); + xdp = iavf_get_xdp_buff(rx_ring, ntu); + + if (ntu + count >= rx_ring->count) { + nb_buffs_extra = iavf_init_rx_descs_zc(rx_ring->xsk_pool, xdp, + rx_desc, + rx_ring->count - ntu); + if (nb_buffs_extra != rx_ring->count - ntu) { + ntu += nb_buffs_extra; + goto exit; + } + rx_desc = IAVF_RX_DESC(rx_ring, 0); + xdp = iavf_get_xdp_buff(rx_ring, 0); + ntu = 0; + count -= nb_buffs_extra; + iavf_release_rx_desc(rx_ring, 0); + + if (!count) + goto exit; + } + + nb_buffs = iavf_init_rx_descs_zc(rx_ring->xsk_pool, xdp, rx_desc, count); + + ntu += nb_buffs; + if (ntu == rx_ring->count) + ntu = 0; + +exit: + if (rx_ring->next_to_use != ntu) + iavf_release_rx_desc(rx_ring, ntu); + + return total_count == (nb_buffs_extra + nb_buffs); +} + +/** + * iavf_alloc_rx_buffers_zc - allocate a number of Rx buffers + * @rx_ring: Rx ring + * @count: The number of buffers to allocate + * + * Wrapper for internal allocation routine; figure out how many tail + * bumps should take place based on the given threshold + * + * Returns true if all calls to internal alloc routine succeeded + */ +static bool iavf_alloc_rx_buffers_zc(struct iavf_ring *rx_ring, u16 count) +{ + u16 rx_thresh = IAVF_RING_QUARTER(rx_ring); + u16 leftover, i, tail_bumps; + + tail_bumps = count / rx_thresh; + leftover = count - (tail_bumps * rx_thresh); + + for (i = 0; i < tail_bumps; i++) + if (!__iavf_alloc_rx_buffers_zc(rx_ring, rx_thresh)) + return false; + return __iavf_alloc_rx_buffers_zc(rx_ring, leftover); +} + +/** + * iavf_check_alloc_rx_buffers_zc - allocate a 
number of Rx buffers with logs + * @adapter: board private structure + * @rx_ring: Rx ring + * + * Wrapper for internal allocation routine; Prints out logs, if allocation + * did not go as expected + */ +void iavf_check_alloc_rx_buffers_zc(struct iavf_adapter *adapter, + struct iavf_ring *rx_ring) +{ + u32 count = IAVF_DESC_UNUSED(rx_ring); + + if (!xsk_buff_can_alloc(rx_ring->xsk_pool, count)) { + netdev_warn(adapter->netdev, + "XSK buffer pool does not provide enough addresses to fill %d buffers on Rx ring %d\n", + count, rx_ring->queue_index); + netdev_warn(adapter->netdev, + "Change Rx ring/fill queue size to avoid performance issues\n"); + } + + if (!iavf_alloc_rx_buffers_zc(rx_ring, count)) + netdev_warn(adapter->netdev, + "Failed to allocate some buffers on XSK buffer pool enabled Rx ring %d\n", + rx_ring->queue_index); +} + +/** + * iavf_rx_xsk_pool - Get a valid xsk pool for RX ring + * @ring: Rx ring being configured + * + * Do not return a xsk pool, if socket is TX-only + **/ +static struct xsk_buff_pool *iavf_rx_xsk_pool(struct iavf_ring *ring) +{ + struct iavf_adapter *adapter = ring->vsi->back; + u16 qid = ring->queue_index; + struct xsk_buff_pool *pool; + + if (!iavf_adapter_xdp_active(adapter) || + !test_bit(qid, adapter->af_xdp_zc_qps)) + return NULL; + + pool = xsk_get_pool_from_qid(adapter->netdev, qid); + if (!pool || !xsk_buff_can_alloc(pool, 1)) + return NULL; + + return pool; +} + +void iavf_xsk_setup_rx_ring(struct iavf_ring *rx_ring) +{ + struct xsk_buff_pool *pool; + + pool = iavf_rx_xsk_pool(rx_ring); + if (!pool) + return; + + rx_ring->xsk_pool = pool; + rx_ring->flags |= IAVF_TXRX_FLAGS_XSK; +} + +/** + * iavf_xsk_clean_rx_ring - clean buffer pool queues connected to a given Rx ring + * @rx_ring: ring to be cleaned + */ +void iavf_xsk_clean_rx_ring(struct iavf_ring *rx_ring) +{ + u16 ntc = rx_ring->next_to_clean; + u16 ntu = rx_ring->next_to_use; + + while (ntc != ntu) { + struct xdp_buff *xdp = *iavf_get_xdp_buff(rx_ring, ntc); + + xsk_buff_free(xdp); + ntc++; + if (ntc >= rx_ring->count) + ntc = 0; + } +} + +static int iavf_xmit_xdp_buff_zc(struct xdp_buff *xdp, + struct iavf_ring *xdp_ring) +{ + u32 batch_sz = IAVF_RING_QUARTER(xdp_ring); + u32 size = xdp->data_end - xdp->data; + u32 ntu = xdp_ring->next_to_use; + struct iavf_tx_buffer *tx_buff; + struct iavf_tx_desc *tx_desc; + void *data = xdp->data; + dma_addr_t dma; + + if (unlikely(IAVF_DESC_UNUSED(xdp_ring) < batch_sz)) + iavf_clean_xdp_irq_zc(xdp_ring); + + if (unlikely(!IAVF_DESC_UNUSED(xdp_ring))) { + xdp_ring->tx_stats.tx_busy++; + return -EBUSY; + } + + dma = xsk_buff_xdp_get_dma(xdp); + xsk_buff_raw_dma_sync_for_device(xdp_ring->xsk_pool, dma, size); + + tx_buff = &xdp_ring->tx_bi[ntu]; + tx_buff->bytecount = size; + tx_buff->gso_segs = 1; + /* TODO: set type to XSK_TX or XDP_XMIT depending on @map and assign + * @xdp here. 
+ */ + tx_buff->raw_buf = data; + + /* record length, and DMA address */ + dma_unmap_len_set(tx_buff, len, size); + dma_unmap_addr_set(tx_buff, dma, dma); + + tx_desc = IAVF_TX_DESC(xdp_ring, ntu); + tx_desc->buffer_addr = cpu_to_le64(dma); + tx_desc->cmd_type_offset_bsz = iavf_build_ctob(IAVF_TX_DESC_CMD_EOP, 0, + size, 0); + + ntu++; + if (ntu > xdp_ring->next_rs) { + tx_desc = IAVF_TX_DESC(xdp_ring, xdp_ring->next_rs); + tx_desc->cmd_type_offset_bsz |= + cpu_to_le64(IAVF_TX_DESC_CMD_RS << + IAVF_TXD_QW1_CMD_SHIFT); + xdp_ring->next_rs += batch_sz; + } + + if (ntu == xdp_ring->count) { + ntu = 0; + xdp_ring->next_rs = batch_sz - 1; + } + + xdp_ring->next_to_use = ntu; + xdp_ring->xdp_tx_active++; + + return 0; +} + +/** + * iavf_run_xdp_zc - Run XDP program and perform resulting action for ZC + * @rx_ring: RX descriptor ring to transact packets on + * @xdp: a prepared XDP buffer + * @xdp_prog: an XDP program assigned to the interface + * @xdp_ring: XDP TX queue assigned to the RX ring + * @rxq_xdp_act: Logical OR of flags of XDP actions that require finalization + * + * Returns resulting XDP action. + **/ +static unsigned int +iavf_run_xdp_zc(struct iavf_ring *rx_ring, struct xdp_buff *xdp, + struct bpf_prog *xdp_prog, struct iavf_ring *xdp_ring, + u32 *rxq_xdp_act) +{ + unsigned int xdp_act; + int err; + + xdp_act = bpf_prog_run_xdp(xdp_prog, xdp); + + if (likely(xdp_act == XDP_REDIRECT)) { + err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog); + if (likely(!err)) { + *rxq_xdp_act |= IAVF_RXQ_XDP_ACT_FINALIZE_REDIR; + return XDP_REDIRECT; + } + + if (xsk_uses_need_wakeup(rx_ring->xsk_pool) && err == -ENOBUFS) + *rxq_xdp_act |= IAVF_RXQ_XDP_ACT_STOP_NOW; + + goto xdp_err; + } + + switch (xdp_act) { + case XDP_TX: + err = iavf_xmit_xdp_buff_zc(xdp, xdp_ring); + if (unlikely(err)) + goto xdp_err; + + *rxq_xdp_act |= IAVF_RXQ_XDP_ACT_FINALIZE_TX; + break; + default: + bpf_warn_invalid_xdp_action(rx_ring->netdev, xdp_prog, xdp_act); + + fallthrough; + case XDP_ABORTED: +xdp_err: + trace_xdp_exception(rx_ring->netdev, xdp_prog, xdp_act); + + fallthrough; + case XDP_DROP: + xsk_buff_free(xdp); + + return XDP_DROP; + } + + return xdp_act; +} + +/** + * iavf_clean_rx_irq_zc - consumes packets from the hardware ring + * @rx_ring: AF_XDP Rx ring + * @budget: NAPI budget + * + * Returns number of processed packets on success, remaining budget on failure. + */ +int iavf_clean_rx_irq_zc(struct iavf_ring *rx_ring, int budget) +{ + unsigned int total_rx_bytes = 0, total_rx_packets = 0; + u32 ntc = rx_ring->next_to_clean; + u32 ring_size = rx_ring->count; + struct iavf_ring *xdp_ring; + struct bpf_prog *xdp_prog; + u32 cleaned_count = 0; + bool failure = false; + u32 rxq_xdp_act = 0; + u32 to_refill; + + xdp_prog = rcu_dereference(rx_ring->xdp_prog); + xdp_ring = rx_ring->xdp_ring; + + while (likely(cleaned_count < budget)) { + union iavf_rx_desc *rx_desc; + struct xdp_buff *xdp; + unsigned int size; + u64 qword; + + rx_desc = IAVF_RX_DESC(rx_ring, ntc); + + /* status_error_len will always be zero for unused descriptors + * because it's cleared in cleanup, and overlaps with hdr_addr + * which is always zero because packet split isn't used, if the + * hardware wrote DD then the length will be non-zero + */ + qword = le64_to_cpu(rx_desc->wb.qword1.status_error_len); + if (!iavf_test_staterr(qword, IAVF_RX_DESC_STATUS_DD_SHIFT)) + break; + + /* This memory barrier is needed to keep us from reading + * any other fields out of the rx_desc until we have + * verified the descriptor has been written back. 
+ */ + dma_rmb(); + + size = (qword & IAVF_RXD_QW1_LENGTH_PBUF_MASK) >> + IAVF_RXD_QW1_LENGTH_PBUF_SHIFT; + + xdp = *iavf_get_xdp_buff(rx_ring, ntc); + iavf_trace(clean_rx_irq_zc, rx_ring, rx_desc, NULL); + + if (unlikely(!size)) { + xsk_buff_free(xdp); + goto next; + } + + xsk_buff_set_size(xdp, size); + xsk_buff_dma_sync_for_cpu(xdp, rx_ring->xsk_pool); + + iavf_run_xdp_zc(rx_ring, xdp, xdp_prog, xdp_ring, + &rxq_xdp_act); + + if (unlikely(rxq_xdp_act & IAVF_RXQ_XDP_ACT_STOP_NOW)) { + failure = true; + break; + } + + total_rx_bytes += size; + total_rx_packets++; + +next: + cleaned_count++; + if (unlikely(++ntc == ring_size)) + ntc = 0; + } + + rx_ring->next_to_clean = ntc; + + iavf_finalize_xdp_rx(xdp_ring, rxq_xdp_act); + + to_refill = IAVF_DESC_UNUSED(rx_ring); + if (to_refill > IAVF_RING_QUARTER(rx_ring)) + failure |= !iavf_alloc_rx_buffers_zc(rx_ring, to_refill); + + iavf_update_rx_ring_stats(rx_ring, total_rx_bytes, total_rx_packets); + + if (xsk_uses_need_wakeup(rx_ring->xsk_pool)) { + if (failure || rx_ring->next_to_clean == rx_ring->next_to_use) + xsk_set_rx_need_wakeup(rx_ring->xsk_pool); + else + xsk_clear_rx_need_wakeup(rx_ring->xsk_pool); + + return cleaned_count; + } + + return unlikely(failure) ? budget : cleaned_count; +} diff --git a/drivers/net/ethernet/intel/iavf/iavf_xsk.h b/drivers/net/ethernet/intel/iavf/iavf_xsk.h index 2c3c103ddd7781..65aae299db4c0a 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_xsk.h +++ b/drivers/net/ethernet/intel/iavf/iavf_xsk.h @@ -28,6 +28,12 @@ int iavf_xsk_wakeup(struct net_device *netdev, u32 queue_id, u32 flags); bool iavf_xmit_zc(struct iavf_ring *xdp_ring); void iavf_xsk_clean_xdp_ring(struct iavf_ring *xdp_ring); +void iavf_xsk_clean_rx_ring(struct iavf_ring *rx_ring); +int iavf_clean_rx_irq_zc(struct iavf_ring *rx_ring, int budget); +void iavf_check_alloc_rx_buffers_zc(struct iavf_adapter *adapter, + struct iavf_ring *rx_ring); + void iavf_xsk_setup_xdp_ring(struct iavf_ring *xdp_ring); +void iavf_xsk_setup_rx_ring(struct iavf_ring *rx_ring); #endif /* !_IAVF_XSK_H_ */ From 43294b7d0a674e80ffde85b5973c8c52e3484e2e Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 23 Feb 2023 19:02:22 +0100 Subject: [PATCH 30/32] iavf: consolidate skb fields processing For now, filling the skb fields on Rx is a bit scattered across RQ polling function. This makes it harder to reuse the code on XSk Rx path and also sometimes costs some CPU (e.g. doing a lookup for the decoded packet type two times). Make it consistent and do everything in iavf_process_skb_fields(). First of all, get the packet type and decode it. Then, move to hash, csum and VLAN, which is moved here too. iavf_receive_skb() becomes then the classic eth_type_trans() + napi_gro_receive() pair. Finally, make the fields processing function global and the skb receive function static inline in order to call them from a different file later on. 
Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 103 +++++++++----------- drivers/net/ethernet/intel/iavf/iavf_txrx.h | 3 + 2 files changed, 49 insertions(+), 57 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index 56ef30cfa38bf5..6d5f7b2a332077 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -857,27 +857,6 @@ int iavf_setup_rx_descriptors(struct iavf_ring *rx_ring) return ret; } -/** - * iavf_receive_skb - Send a completed packet up the stack - * @rx_ring: rx ring in play - * @skb: packet to send up - * @vlan_tag: vlan tag for packet - **/ -static void iavf_receive_skb(struct iavf_ring *rx_ring, - struct sk_buff *skb, u16 vlan_tag) -{ - struct iavf_q_vector *q_vector = rx_ring->q_vector; - - if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) && - (vlan_tag & VLAN_VID_MASK)) - __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag); - else if ((rx_ring->netdev->features & NETIF_F_HW_VLAN_STAG_RX) && - vlan_tag & VLAN_VID_MASK) - __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021AD), vlan_tag); - - napi_gro_receive(&q_vector->napi, skb); -} - /** * __iavf_alloc_rx_pages - Replace used receive pages * @rx_ring: ring to place buffers on @@ -944,18 +923,14 @@ void iavf_alloc_rx_pages(struct iavf_ring *rxr) * @vsi: the VSI we care about * @skb: skb currently being received and modified * @qword: `wb.qword1.status_error_len` from the descriptor + * @parsed: TODO **/ -static inline void iavf_rx_checksum(struct iavf_vsi *vsi, - struct sk_buff *skb, - u64 qword) +static void iavf_rx_checksum(struct iavf_vsi *vsi, struct sk_buff *skb, + u64 qword, struct libie_rx_ptype_parsed parsed) { - struct libie_rx_ptype_parsed parsed; - u32 ptype, rx_error, rx_status; + u32 rx_error, rx_status; bool ipv4, ipv6; - ptype = (qword & IAVF_RXD_QW1_PTYPE_MASK) >> IAVF_RXD_QW1_PTYPE_SHIFT; - - parsed = libie_parse_rx_ptype(ptype); if (!libie_has_rx_checksum(vsi->netdev, parsed)) return; @@ -1006,20 +981,17 @@ static inline void iavf_rx_checksum(struct iavf_vsi *vsi, * @rx_desc: specific descriptor * @skb: skb currently being received and modified * @qword: `wb.qword1.status_error_len` from the descriptor + * @parsed: TODO **/ -static inline void iavf_rx_hash(struct iavf_ring *ring, - union iavf_rx_desc *rx_desc, - struct sk_buff *skb, - u64 qword) +static void iavf_rx_hash(const struct iavf_ring *ring, + const union iavf_rx_desc *rx_desc, + struct sk_buff *skb, u64 qword, + struct libie_rx_ptype_parsed parsed) { const u64 rss_mask = (u64)IAVF_RX_DESC_FLTSTAT_RSS_HASH << IAVF_RX_DESC_STATUS_FLTSTAT_SHIFT; - struct libie_rx_ptype_parsed parsed; - u32 rx_ptype, hash; - - rx_ptype = FIELD_GET(IAVF_RXD_QW1_PTYPE_MASK, qword); + u32 hash; - parsed = libie_parse_rx_ptype(rx_ptype); if (!libie_has_rx_hash(ring->netdev, parsed)) return; @@ -1029,6 +1001,29 @@ static inline void iavf_rx_hash(struct iavf_ring *ring, } } +static void iavf_rx_vlan(const struct iavf_ring *rx_ring, + const union iavf_rx_desc *rx_desc, + struct sk_buff *skb, u64 qword) +{ + u16 vlan_tag = 0; + + if ((qword & BIT(IAVF_RX_DESC_STATUS_L2TAG1P_SHIFT)) && + (rx_ring->flags & IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1)) + vlan_tag = le16_to_cpu(rx_desc->wb.qword0.lo_dword.l2tag1); + if ((rx_desc->wb.qword2.ext_status & + cpu_to_le16(BIT(IAVF_RX_DESC_EXT_STATUS_L2TAG2P_SHIFT))) && + (rx_ring->flags & IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2)) + vlan_tag = le16_to_cpu(rx_desc->wb.qword2.l2tag2_2); + + 
if (!(vlan_tag & VLAN_VID_MASK)) + return; + + if (rx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag); + else if (rx_ring->netdev->features & NETIF_F_HW_VLAN_STAG_RX) + __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021AD), vlan_tag); +} + /** * iavf_process_skb_fields - Populate skb header fields from Rx descriptor * @rx_ring: rx descriptor ring packet is being transacted on @@ -1040,19 +1035,21 @@ static inline void iavf_rx_hash(struct iavf_ring *ring, * order to populate the hash, checksum, VLAN, protocol, and * other fields within the skb. **/ -static inline -void iavf_process_skb_fields(struct iavf_ring *rx_ring, - union iavf_rx_desc *rx_desc, struct sk_buff *skb, - u64 qword) +void iavf_process_skb_fields(const struct iavf_ring *rx_ring, + const union iavf_rx_desc *rx_desc, + struct sk_buff *skb, u64 qword) { - iavf_rx_hash(rx_ring, rx_desc, skb, qword); + struct libie_rx_ptype_parsed parsed; + u32 ptype; - iavf_rx_checksum(rx_ring->vsi, skb, qword); + ptype = FIELD_GET(IAVF_RXD_QW1_PTYPE_MASK, qword); + parsed = libie_parse_rx_ptype(ptype); - skb_record_rx_queue(skb, rx_ring->queue_index); + iavf_rx_hash(rx_ring, rx_desc, skb, qword, parsed); + iavf_rx_checksum(rx_ring->vsi, skb, qword, parsed); + iavf_rx_vlan(rx_ring, rx_desc, skb, qword); - /* modifies the skb - consumes the enet header */ - skb->protocol = eth_type_trans(skb, rx_ring->netdev); + skb_record_rx_queue(skb, rx_ring->queue_index); } /** @@ -1207,7 +1204,6 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) union iavf_rx_desc *rx_desc; u32 size, put_size; struct page *page; - u16 vlan_tag = 0; u64 qword; /* return some buffers to hardware, one at a time is too slow */ @@ -1320,16 +1316,9 @@ static int iavf_clean_rx_irq(struct iavf_ring *rx_ring, int budget) /* populate checksum, VLAN, and protocol */ iavf_process_skb_fields(rx_ring, rx_desc, skb, qword); - if (qword & BIT(IAVF_RX_DESC_STATUS_L2TAG1P_SHIFT) && - rx_ring->flags & IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1) - vlan_tag = le16_to_cpu(rx_desc->wb.qword0.lo_dword.l2tag1); - if (rx_desc->wb.qword2.ext_status & - cpu_to_le16(BIT(IAVF_RX_DESC_EXT_STATUS_L2TAG2P_SHIFT)) && - rx_ring->flags & IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2) - vlan_tag = le16_to_cpu(rx_desc->wb.qword2.l2tag2_2); - iavf_trace(clean_rx_irq_rx, rx_ring, rx_desc, skb); - iavf_receive_skb(rx_ring, skb, vlan_tag); + skb->protocol = eth_type_trans(skb, rx_ring->netdev); + napi_gro_receive(&rx_ring->q_vector->napi, skb); skb = NULL; /* update budget accounting */ diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.h b/drivers/net/ethernet/intel/iavf/iavf_txrx.h index 0ff15aefce230c..28c31f5f6e11f1 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.h +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.h @@ -332,6 +332,9 @@ bool __iavf_chk_linearize(struct sk_buff *skb); DECLARE_STATIC_KEY_FALSE(iavf_xdp_locking_key); +void iavf_process_skb_fields(const struct iavf_ring *rx_ring, + const union iavf_rx_desc *rx_desc, + struct sk_buff *skb, u64 qword); int iavf_xdp_xmit(struct net_device *dev, int n, struct xdp_frame **frames, u32 flags); From 112110e416cb50a128e24bfabd2a8cbe87f4bb89 Mon Sep 17 00:00:00 2001 From: Larysa Zaremba Date: Wed, 30 Nov 2022 13:52:19 +0100 Subject: [PATCH 31/32] iavf: Implement XDP_PASS path in AF_XDP processing Construct skb and fill in its fields, when AF_XDP is enabled on the ring, if XDP program returns XDP_PASS. (will be fixed up). 
Signed-off-by: Larysa Zaremba --- drivers/net/ethernet/intel/iavf/iavf_trace.h | 8 +++ drivers/net/ethernet/intel/iavf/iavf_xsk.c | 73 +++++++++++++++++++- 2 files changed, 79 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_trace.h b/drivers/net/ethernet/intel/iavf/iavf_trace.h index ac46fbe55bd2e5..383a5375392a20 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_trace.h +++ b/drivers/net/ethernet/intel/iavf/iavf_trace.h @@ -161,6 +161,14 @@ DEFINE_EVENT( TP_ARGS(ring, desc, skb)); +DEFINE_EVENT( + iavf_rx_template, iavf_clean_rx_irq_zc_rx, + TP_PROTO(struct iavf_ring *ring, + union iavf_32byte_rx_desc *desc, + struct sk_buff *skb), + + TP_ARGS(ring, desc, skb)); + DECLARE_EVENT_CLASS( iavf_xmit_template, diff --git a/drivers/net/ethernet/intel/iavf/iavf_xsk.c b/drivers/net/ethernet/intel/iavf/iavf_xsk.c index 7b9f3fff1c5f81..133f2c4a74d969 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_xsk.c +++ b/drivers/net/ethernet/intel/iavf/iavf_xsk.c @@ -1004,6 +1004,8 @@ iavf_run_xdp_zc(struct iavf_ring *rx_ring, struct xdp_buff *xdp, } switch (xdp_act) { + case XDP_PASS: + break; case XDP_TX: err = iavf_xmit_xdp_buff_zc(xdp, xdp_ring); if (unlikely(err)) @@ -1029,6 +1031,42 @@ iavf_run_xdp_zc(struct iavf_ring *rx_ring, struct xdp_buff *xdp, return xdp_act; } +/** + * iavf_construct_skb_zc - Create an sk_buff from zero-copy buffer + * @rx_ring: Rx ring + * @xdp: Pointer to XDP buffer + * + * This function allocates a new skb from a zero-copy Rx buffer. + * + * Returns the skb on success, NULL on failure. + */ +static struct sk_buff * +iavf_construct_skb_zc(struct iavf_ring *rx_ring, struct xdp_buff *xdp) +{ + unsigned int totalsize = xdp->data_end - xdp->data_meta; + unsigned int metasize = xdp->data - xdp->data_meta; + struct sk_buff *skb; + + net_prefetch(xdp->data_meta); + + skb = __napi_alloc_skb(&rx_ring->q_vector->napi, totalsize, + GFP_ATOMIC | __GFP_NOWARN); + if (unlikely(!skb)) + return NULL; + + memcpy(__skb_put(skb, totalsize), xdp->data_meta, + ALIGN(totalsize, sizeof(long))); + + if (metasize) { + skb_metadata_set(skb, metasize); + __skb_pull(skb, metasize); + } + + xsk_buff_free(xdp); + + return skb; +} + /** * iavf_clean_rx_irq_zc - consumes packets from the hardware ring * @rx_ring: AF_XDP Rx ring @@ -1054,6 +1092,8 @@ int iavf_clean_rx_irq_zc(struct iavf_ring *rx_ring, int budget) while (likely(cleaned_count < budget)) { union iavf_rx_desc *rx_desc; struct xdp_buff *xdp; + unsigned int xdp_act; + struct sk_buff *skb; unsigned int size; u64 qword; @@ -1088,8 +1128,10 @@ int iavf_clean_rx_irq_zc(struct iavf_ring *rx_ring, int budget) xsk_buff_set_size(xdp, size); xsk_buff_dma_sync_for_cpu(xdp, rx_ring->xsk_pool); - iavf_run_xdp_zc(rx_ring, xdp, xdp_prog, xdp_ring, - &rxq_xdp_act); + xdp_act = iavf_run_xdp_zc(rx_ring, xdp, xdp_prog, xdp_ring, + &rxq_xdp_act); + if (xdp_act == XDP_PASS) + goto construct_skb; if (unlikely(rxq_xdp_act & IAVF_RXQ_XDP_ACT_STOP_NOW)) { failure = true; @@ -1103,6 +1145,33 @@ int iavf_clean_rx_irq_zc(struct iavf_ring *rx_ring, int budget) cleaned_count++; if (unlikely(++ntc == ring_size)) ntc = 0; + + continue; + +construct_skb: + skb = iavf_construct_skb_zc(rx_ring, xdp); + if (!skb) { + rx_ring->rx_stats.alloc_buff_failed++; + break; + } + + cleaned_count++; + if (unlikely(++ntc == ring_size)) + ntc = 0; + + prefetch(rx_desc); + + /* probably a little skewed due to removing CRC */ + total_rx_bytes += skb->len; + + /* populate checksum, VLAN, and protocol */ + iavf_process_skb_fields(rx_ring, rx_desc, skb, qword); + + 
iavf_trace(clean_rx_irq_zc_rx, rx_ring, rx_desc, skb); + skb->protocol = eth_type_trans(skb, rx_ring->netdev); + napi_gro_receive(&rx_ring->q_vector->napi, skb); + + total_rx_packets++; } rx_ring->next_to_clean = ntc; From 0ec8e9a495f0ba5d1bff47983aa1d39ba703833b Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Thu, 23 Mar 2023 16:03:52 +0100 Subject: [PATCH 32/32] iavf: fixup for optimize vol. 2 Signed-off-by: Alexander Lobakin --- drivers/net/ethernet/intel/iavf/iavf_txrx.c | 50 ++++++++++----------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/drivers/net/ethernet/intel/iavf/iavf_txrx.c b/drivers/net/ethernet/intel/iavf/iavf_txrx.c index 6d5f7b2a332077..77d100298f8190 100644 --- a/drivers/net/ethernet/intel/iavf/iavf_txrx.c +++ b/drivers/net/ethernet/intel/iavf/iavf_txrx.c @@ -929,13 +929,10 @@ static void iavf_rx_checksum(struct iavf_vsi *vsi, struct sk_buff *skb, u64 qword, struct libie_rx_ptype_parsed parsed) { u32 rx_error, rx_status; - bool ipv4, ipv6; if (!libie_has_rx_checksum(vsi->netdev, parsed)) return; - rx_error = (qword & IAVF_RXD_QW1_ERROR_MASK) >> - IAVF_RXD_QW1_ERROR_SHIFT; rx_status = (qword & IAVF_RXD_QW1_STATUS_MASK) >> IAVF_RXD_QW1_STATUS_SHIFT; @@ -943,17 +940,16 @@ static void iavf_rx_checksum(struct iavf_vsi *vsi, struct sk_buff *skb, if (!(rx_status & BIT(IAVF_RX_DESC_STATUS_L3L4P_SHIFT))) return; - ipv4 = parsed.outer_ip == LIBIE_RX_PTYPE_OUTER_IPV4; - ipv6 = parsed.outer_ip == LIBIE_RX_PTYPE_OUTER_IPV6; + rx_error = (qword & IAVF_RXD_QW1_ERROR_MASK) >> + IAVF_RXD_QW1_ERROR_SHIFT; - if (ipv4 && + if (parsed.outer_ip == LIBIE_RX_PTYPE_OUTER_IPV4 && (rx_error & (BIT(IAVF_RX_DESC_ERROR_IPE_SHIFT) | BIT(IAVF_RX_DESC_ERROR_EIPE_SHIFT)))) goto checksum_fail; - /* likely incorrect csum if alternate IP extension headers found */ - if (ipv6 && - rx_status & BIT(IAVF_RX_DESC_STATUS_IPV6EXADD_SHIFT)) + else if (parsed.outer_ip == LIBIE_RX_PTYPE_OUTER_IPV6 && + (rx_status & BIT(IAVF_RX_DESC_STATUS_IPV6EXADD_SHIFT))) /* don't increment checksum err here, non-fatal err */ return; @@ -992,36 +988,40 @@ static void iavf_rx_hash(const struct iavf_ring *ring, IAVF_RX_DESC_STATUS_FLTSTAT_SHIFT; u32 hash; - if (!libie_has_rx_hash(ring->netdev, parsed)) + if (!libie_has_rx_hash(ring->netdev, parsed) || + (qword & rss_mask) != rss_mask) return; - if ((qword & rss_mask) == rss_mask) { - hash = le32_to_cpu(rx_desc->wb.qword0.hi_dword.rss); - libie_skb_set_hash(skb, hash, parsed); - } + hash = le32_to_cpu(rx_desc->wb.qword0.hi_dword.rss); + libie_skb_set_hash(skb, hash, parsed); } static void iavf_rx_vlan(const struct iavf_ring *rx_ring, const union iavf_rx_desc *rx_desc, struct sk_buff *skb, u64 qword) { - u16 vlan_tag = 0; + u16 vlan_tag; + __be16 prot; + + if (rx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) + prot = htons(ETH_P_8021Q); + else if (rx_ring->netdev->features & NETIF_F_HW_VLAN_STAG_RX) + prot = htons(ETH_P_8021AD); + else + return; if ((qword & BIT(IAVF_RX_DESC_STATUS_L2TAG1P_SHIFT)) && (rx_ring->flags & IAVF_TXRX_FLAGS_VLAN_TAG_LOC_L2TAG1)) vlan_tag = le16_to_cpu(rx_desc->wb.qword0.lo_dword.l2tag1); - if ((rx_desc->wb.qword2.ext_status & - cpu_to_le16(BIT(IAVF_RX_DESC_EXT_STATUS_L2TAG2P_SHIFT))) && - (rx_ring->flags & IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2)) + else if ((rx_ring->flags & IAVF_RXR_FLAGS_VLAN_TAG_LOC_L2TAG2_2) && + (rx_desc->wb.qword2.ext_status & + cpu_to_le16(BIT(IAVF_RX_DESC_EXT_STATUS_L2TAG2P_SHIFT)))) vlan_tag = le16_to_cpu(rx_desc->wb.qword2.l2tag2_2); + else + vlan_tag = 0; - if (!(vlan_tag & VLAN_VID_MASK)) - return; - - 
if (rx_ring->netdev->features & NETIF_F_HW_VLAN_CTAG_RX) - __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag); - else if (rx_ring->netdev->features & NETIF_F_HW_VLAN_STAG_RX) - __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021AD), vlan_tag); + if (vlan_tag & VLAN_VID_MASK) + __vlan_hwaccel_put_tag(skb, prot, vlan_tag); } /**