From 1e5d1f69d9fb8ea0679f9e85915e8e7fdacfbe7a Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 29 Mar 2021 20:59:52 -0700 Subject: [PATCH 01/36] ethtool: support FEC settings over netlink Add FEC API to netlink. This is not a 1-to-1 conversion. FEC settings already depend on link modes to tell user which modes are supported. Take this further an use link modes for manual configuration. Old struct ethtool_fecparam is still used to talk to the drivers, so we need to translate back and forth. We can revisit the internal API if number of FEC encodings starts to grow. Enforce only one active FEC bit (by using a bit position rather than another mask). Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- Documentation/networking/ethtool-netlink.rst | 62 ++++- include/uapi/linux/ethtool_netlink.h | 17 ++ net/ethtool/Makefile | 2 +- net/ethtool/fec.c | 238 +++++++++++++++++++ net/ethtool/netlink.c | 19 ++ net/ethtool/netlink.h | 4 + 6 files changed, 339 insertions(+), 3 deletions(-) create mode 100644 net/ethtool/fec.c diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 05073482db055..4bdb4298f1783 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -208,6 +208,8 @@ Userspace to kernel: ``ETHTOOL_MSG_CABLE_TEST_ACT`` action start cable test ``ETHTOOL_MSG_CABLE_TEST_TDR_ACT`` action start raw TDR cable test ``ETHTOOL_MSG_TUNNEL_INFO_GET`` get tunnel offload info + ``ETHTOOL_MSG_FEC_GET`` get FEC settings + ``ETHTOOL_MSG_FEC_SET`` set FEC settings ===================================== ================================ Kernel to userspace: @@ -242,6 +244,8 @@ Kernel to userspace: ``ETHTOOL_MSG_CABLE_TEST_NTF`` Cable test results ``ETHTOOL_MSG_CABLE_TEST_TDR_NTF`` Cable test TDR results ``ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY`` tunnel offload info + ``ETHTOOL_MSG_FEC_GET_REPLY`` FEC settings + ``ETHTOOL_MSG_FEC_NTF`` FEC settings ===================================== ================================= ``GET`` requests are sent by userspace applications to retrieve device @@ -1280,6 +1284,60 @@ Kernel response contents: For UDP tunnel table empty ``ETHTOOL_A_TUNNEL_UDP_TABLE_TYPES`` indicates that the table contains static entries, hard-coded by the NIC. +FEC_GET +======= + +Gets FEC configuration and state like ``ETHTOOL_GFECPARAM`` ioctl request. + +Request contents: + + ===================================== ====== ========================== + ``ETHTOOL_A_FEC_HEADER`` nested request header + ===================================== ====== ========================== + +Kernel response contents: + + ===================================== ====== ========================== + ``ETHTOOL_A_FEC_HEADER`` nested request header + ``ETHTOOL_A_FEC_MODES`` bitset configured modes + ``ETHTOOL_A_FEC_AUTO`` bool FEC mode auto selection + ``ETHTOOL_A_FEC_ACTIVE`` u32 index of active FEC mode + ===================================== ====== ========================== + +``ETHTOOL_A_FEC_ACTIVE`` is the bit index of the FEC link mode currently +active on the interface. This attribute may not be present if device does +not support FEC. + +``ETHTOOL_A_FEC_MODES`` and ``ETHTOOL_A_FEC_AUTO`` are only meaningful when +autonegotiation is disabled. If ``ETHTOOL_A_FEC_AUTO`` is non-zero driver will +select the FEC mode automatically based on the parameters of the SFP module. +This is equivalent to the ``ETHTOOL_FEC_AUTO`` bit of the ioctl interface. +``ETHTOOL_A_FEC_MODES`` carry the current FEC configuration using link mode +bits (rather than old ``ETHTOOL_FEC_*`` bits). + +FEC_SET +======= + +Sets FEC parameters like ``ETHTOOL_SFECPARAM`` ioctl request. + +Request contents: + + ===================================== ====== ========================== + ``ETHTOOL_A_FEC_HEADER`` nested request header + ``ETHTOOL_A_FEC_MODES`` bitset configured modes + ``ETHTOOL_A_FEC_AUTO`` bool FEC mode auto selection + ===================================== ====== ========================== + +``FEC_SET`` is only meaningful when autonegotiation is disabled. Otherwise +FEC mode is selected as part of autonegotiation. + +``ETHTOOL_A_FEC_MODES`` selects which FEC mode should be used. It's recommended +to set only one bit, if multiple bits are set driver may choose between them +in an implementation specific way. + +``ETHTOOL_A_FEC_AUTO`` requests the driver to choose FEC mode based on SFP +module parameters. This does not mean autonegotiation. + Request translation =================== @@ -1373,8 +1431,8 @@ are netlink only. ``ETHTOOL_MSG_LINKMODES_SET`` ``ETHTOOL_PHY_GTUNABLE`` n/a ``ETHTOOL_PHY_STUNABLE`` n/a - ``ETHTOOL_GFECPARAM`` n/a - ``ETHTOOL_SFECPARAM`` n/a + ``ETHTOOL_GFECPARAM`` ``ETHTOOL_MSG_FEC_GET`` + ``ETHTOOL_SFECPARAM`` ``ETHTOOL_MSG_FEC_SET`` n/a ''ETHTOOL_MSG_CABLE_TEST_ACT'' n/a ''ETHTOOL_MSG_CABLE_TEST_TDR_ACT'' n/a ``ETHTOOL_MSG_TUNNEL_INFO_GET`` diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index a286635ac9b8f..7f1bdb5b31ba5 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -42,6 +42,8 @@ enum { ETHTOOL_MSG_CABLE_TEST_ACT, ETHTOOL_MSG_CABLE_TEST_TDR_ACT, ETHTOOL_MSG_TUNNEL_INFO_GET, + ETHTOOL_MSG_FEC_GET, + ETHTOOL_MSG_FEC_SET, /* add new constants above here */ __ETHTOOL_MSG_USER_CNT, @@ -80,6 +82,8 @@ enum { ETHTOOL_MSG_CABLE_TEST_NTF, ETHTOOL_MSG_CABLE_TEST_TDR_NTF, ETHTOOL_MSG_TUNNEL_INFO_GET_REPLY, + ETHTOOL_MSG_FEC_GET_REPLY, + ETHTOOL_MSG_FEC_NTF, /* add new constants above here */ __ETHTOOL_MSG_KERNEL_CNT, @@ -629,6 +633,19 @@ enum { ETHTOOL_A_TUNNEL_INFO_MAX = (__ETHTOOL_A_TUNNEL_INFO_CNT - 1) }; +/* FEC */ + +enum { + ETHTOOL_A_FEC_UNSPEC, + ETHTOOL_A_FEC_HEADER, /* nest - _A_HEADER_* */ + ETHTOOL_A_FEC_MODES, /* bitset */ + ETHTOOL_A_FEC_AUTO, /* u8 */ + ETHTOOL_A_FEC_ACTIVE, /* u32 */ + + __ETHTOOL_A_FEC_CNT, + ETHTOOL_A_FEC_MAX = (__ETHTOOL_A_FEC_CNT - 1) +}; + /* generic netlink info */ #define ETHTOOL_GENL_NAME "ethtool" #define ETHTOOL_GENL_VERSION 1 diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index 7a849ff22dada..c2dc9033a8f7d 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -7,4 +7,4 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ - tunnels.o + tunnels.o fec.o diff --git a/net/ethtool/fec.c b/net/ethtool/fec.c new file mode 100644 index 0000000000000..31454b9188bdb --- /dev/null +++ b/net/ethtool/fec.c @@ -0,0 +1,238 @@ +// SPDX-License-Identifier: GPL-2.0-only + +#include "netlink.h" +#include "common.h" +#include "bitset.h" + +struct fec_req_info { + struct ethnl_req_info base; +}; + +struct fec_reply_data { + struct ethnl_reply_data base; + __ETHTOOL_DECLARE_LINK_MODE_MASK(fec_link_modes); + u32 active_fec; + u8 fec_auto; +}; + +#define FEC_REPDATA(__reply_base) \ + container_of(__reply_base, struct fec_reply_data, base) + +#define ETHTOOL_FEC_MASK ((ETHTOOL_FEC_LLRS << 1) - 1) + +const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1] = { + [ETHTOOL_A_FEC_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), +}; + +static void +ethtool_fec_to_link_modes(u32 fec, unsigned long *link_modes, u8 *fec_auto) +{ + if (fec_auto) + *fec_auto = !!(fec & ETHTOOL_FEC_AUTO); + + if (fec & ETHTOOL_FEC_OFF) + __set_bit(ETHTOOL_LINK_MODE_FEC_NONE_BIT, link_modes); + if (fec & ETHTOOL_FEC_RS) + __set_bit(ETHTOOL_LINK_MODE_FEC_RS_BIT, link_modes); + if (fec & ETHTOOL_FEC_BASER) + __set_bit(ETHTOOL_LINK_MODE_FEC_BASER_BIT, link_modes); + if (fec & ETHTOOL_FEC_LLRS) + __set_bit(ETHTOOL_LINK_MODE_FEC_LLRS_BIT, link_modes); +} + +static int +ethtool_link_modes_to_fecparam(struct ethtool_fecparam *fec, + unsigned long *link_modes, u8 fec_auto) +{ + memset(fec, 0, sizeof(*fec)); + + if (fec_auto) + fec->fec |= ETHTOOL_FEC_AUTO; + + if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_NONE_BIT, link_modes)) + fec->fec |= ETHTOOL_FEC_OFF; + if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_RS_BIT, link_modes)) + fec->fec |= ETHTOOL_FEC_RS; + if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_BASER_BIT, link_modes)) + fec->fec |= ETHTOOL_FEC_BASER; + if (__test_and_clear_bit(ETHTOOL_LINK_MODE_FEC_LLRS_BIT, link_modes)) + fec->fec |= ETHTOOL_FEC_LLRS; + + if (!bitmap_empty(link_modes, __ETHTOOL_LINK_MODE_MASK_NBITS)) + return -EINVAL; + + return 0; +} + +static int fec_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + __ETHTOOL_DECLARE_LINK_MODE_MASK(active_fec_modes) = {}; + struct fec_reply_data *data = FEC_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + struct ethtool_fecparam fec = {}; + int ret; + + if (!dev->ethtool_ops->get_fecparam) + return -EOPNOTSUPP; + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + ret = dev->ethtool_ops->get_fecparam(dev, &fec); + ethnl_ops_complete(dev); + if (ret) + return ret; + + WARN_ON_ONCE(fec.reserved); + + ethtool_fec_to_link_modes(fec.fec, data->fec_link_modes, + &data->fec_auto); + + ethtool_fec_to_link_modes(fec.active_fec, active_fec_modes, NULL); + data->active_fec = find_first_bit(active_fec_modes, + __ETHTOOL_LINK_MODE_MASK_NBITS); + /* Don't report attr if no FEC mode set. Note that + * ethtool_fecparam_to_link_modes() ignores NONE and AUTO. + */ + if (data->active_fec == __ETHTOOL_LINK_MODE_MASK_NBITS) + data->active_fec = 0; + + return 0; +} + +static int fec_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct fec_reply_data *data = FEC_REPDATA(reply_base); + int len = 0; + int ret; + + ret = ethnl_bitset_size(data->fec_link_modes, NULL, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_mode_names, compact); + if (ret < 0) + return ret; + len += ret; + + len += nla_total_size(sizeof(u8)) + /* _FEC_AUTO */ + nla_total_size(sizeof(u32)); /* _FEC_ACTIVE */ + + return len; +} + +static int fec_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + bool compact = req_base->flags & ETHTOOL_FLAG_COMPACT_BITSETS; + const struct fec_reply_data *data = FEC_REPDATA(reply_base); + int ret; + + ret = ethnl_put_bitset(skb, ETHTOOL_A_FEC_MODES, + data->fec_link_modes, NULL, + __ETHTOOL_LINK_MODE_MASK_NBITS, + link_mode_names, compact); + if (ret < 0) + return ret; + + if (nla_put_u8(skb, ETHTOOL_A_FEC_AUTO, data->fec_auto) || + (data->active_fec && + nla_put_u32(skb, ETHTOOL_A_FEC_ACTIVE, data->active_fec))) + return -EMSGSIZE; + + return 0; +} + +const struct ethnl_request_ops ethnl_fec_request_ops = { + .request_cmd = ETHTOOL_MSG_FEC_GET, + .reply_cmd = ETHTOOL_MSG_FEC_GET_REPLY, + .hdr_attr = ETHTOOL_A_FEC_HEADER, + .req_info_size = sizeof(struct fec_req_info), + .reply_data_size = sizeof(struct fec_reply_data), + + .prepare_data = fec_prepare_data, + .reply_size = fec_reply_size, + .fill_reply = fec_fill_reply, +}; + +/* FEC_SET */ + +const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1] = { + [ETHTOOL_A_FEC_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_FEC_MODES] = { .type = NLA_NESTED }, + [ETHTOOL_A_FEC_AUTO] = NLA_POLICY_MAX(NLA_U8, 1), +}; + +int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info) +{ + __ETHTOOL_DECLARE_LINK_MODE_MASK(fec_link_modes) = {}; + struct ethnl_req_info req_info = {}; + struct nlattr **tb = info->attrs; + struct ethtool_fecparam fec = {}; + const struct ethtool_ops *ops; + struct net_device *dev; + bool mod = false; + u8 fec_auto; + int ret; + + ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_FEC_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + dev = req_info.dev; + ops = dev->ethtool_ops; + ret = -EOPNOTSUPP; + if (!ops->get_fecparam || !ops->set_fecparam) + goto out_dev; + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + ret = ops->get_fecparam(dev, &fec); + if (ret < 0) + goto out_ops; + + ethtool_fec_to_link_modes(fec.fec, fec_link_modes, &fec_auto); + + ret = ethnl_update_bitset(fec_link_modes, + __ETHTOOL_LINK_MODE_MASK_NBITS, + tb[ETHTOOL_A_FEC_MODES], + link_mode_names, info->extack, &mod); + if (ret < 0) + goto out_ops; + ethnl_update_u8(&fec_auto, tb[ETHTOOL_A_FEC_AUTO], &mod); + + ret = 0; + if (!mod) + goto out_ops; + + ret = ethtool_link_modes_to_fecparam(&fec, fec_link_modes, fec_auto); + if (ret) { + NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_FEC_MODES], + "invalid FEC modes requested"); + goto out_ops; + } + if (!fec.fec) { + ret = -EINVAL; + NL_SET_ERR_MSG_ATTR(info->extack, tb[ETHTOOL_A_FEC_MODES], + "no FEC modes set"); + goto out_ops; + } + + ret = dev->ethtool_ops->set_fecparam(dev, &fec); + if (ret < 0) + goto out_ops; + ethtool_notify(dev, ETHTOOL_MSG_FEC_NTF, NULL); + +out_ops: + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); +out_dev: + dev_put(dev); + return ret; +} diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index 50d3c8896f917..705a4b201564a 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -244,6 +244,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_COALESCE_GET] = ðnl_coalesce_request_ops, [ETHTOOL_MSG_PAUSE_GET] = ðnl_pause_request_ops, [ETHTOOL_MSG_EEE_GET] = ðnl_eee_request_ops, + [ETHTOOL_MSG_FEC_GET] = ðnl_fec_request_ops, [ETHTOOL_MSG_TSINFO_GET] = ðnl_tsinfo_request_ops, }; @@ -551,6 +552,7 @@ ethnl_default_notify_ops[ETHTOOL_MSG_KERNEL_MAX + 1] = { [ETHTOOL_MSG_COALESCE_NTF] = ðnl_coalesce_request_ops, [ETHTOOL_MSG_PAUSE_NTF] = ðnl_pause_request_ops, [ETHTOOL_MSG_EEE_NTF] = ðnl_eee_request_ops, + [ETHTOOL_MSG_FEC_NTF] = ðnl_fec_request_ops, }; /* default notification handler */ @@ -643,6 +645,7 @@ static const ethnl_notify_handler_t ethnl_notify_handlers[] = { [ETHTOOL_MSG_COALESCE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_PAUSE_NTF] = ethnl_default_notify, [ETHTOOL_MSG_EEE_NTF] = ethnl_default_notify, + [ETHTOOL_MSG_FEC_NTF] = ethnl_default_notify, }; void ethtool_notify(struct net_device *dev, unsigned int cmd, const void *data) @@ -912,6 +915,22 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_tunnel_info_get_policy, .maxattr = ARRAY_SIZE(ethnl_tunnel_info_get_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_FEC_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_fec_get_policy, + .maxattr = ARRAY_SIZE(ethnl_fec_get_policy) - 1, + }, + { + .cmd = ETHTOOL_MSG_FEC_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_set_fec, + .policy = ethnl_fec_set_policy, + .maxattr = ARRAY_SIZE(ethnl_fec_set_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 6eabd58d81bfe..785f7ee459302 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -344,6 +344,7 @@ extern const struct ethnl_request_ops ethnl_coalesce_request_ops; extern const struct ethnl_request_ops ethnl_pause_request_ops; extern const struct ethnl_request_ops ethnl_eee_request_ops; extern const struct ethnl_request_ops ethnl_tsinfo_request_ops; +extern const struct ethnl_request_ops ethnl_fec_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; @@ -375,6 +376,8 @@ extern const struct nla_policy ethnl_tsinfo_get_policy[ETHTOOL_A_TSINFO_HEADER + extern const struct nla_policy ethnl_cable_test_act_policy[ETHTOOL_A_CABLE_TEST_HEADER + 1]; extern const struct nla_policy ethnl_cable_test_tdr_act_policy[ETHTOOL_A_CABLE_TEST_TDR_CFG + 1]; extern const struct nla_policy ethnl_tunnel_info_get_policy[ETHTOOL_A_TUNNEL_INFO_HEADER + 1]; +extern const struct nla_policy ethnl_fec_get_policy[ETHTOOL_A_FEC_HEADER + 1]; +extern const struct nla_policy ethnl_fec_set_policy[ETHTOOL_A_FEC_AUTO + 1]; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); @@ -392,5 +395,6 @@ int ethnl_act_cable_test_tdr(struct sk_buff *skb, struct genl_info *info); int ethnl_tunnel_info_doit(struct sk_buff *skb, struct genl_info *info); int ethnl_tunnel_info_start(struct netlink_callback *cb); int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb); +int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info); #endif /* _NET_ETHTOOL_NETLINK_H */ From 0d7f76dc11e6df6b883f625c8343aa8fa1f6874b Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 29 Mar 2021 20:59:53 -0700 Subject: [PATCH 02/36] netdevsim: add FEC settings support Add support for ethtool FEC and some ethtool error injection. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- drivers/net/netdevsim/ethtool.c | 36 +++++++++++++++++++++++++++++++ drivers/net/netdevsim/netdevsim.h | 3 +++ 2 files changed, 39 insertions(+) diff --git a/drivers/net/netdevsim/ethtool.c b/drivers/net/netdevsim/ethtool.c index 166f0d6cbcf71..c9ae52595a8f9 100644 --- a/drivers/net/netdevsim/ethtool.c +++ b/drivers/net/netdevsim/ethtool.c @@ -77,6 +77,34 @@ static int nsim_set_ringparam(struct net_device *dev, return 0; } +static int +nsim_get_fecparam(struct net_device *dev, struct ethtool_fecparam *fecparam) +{ + struct netdevsim *ns = netdev_priv(dev); + + if (ns->ethtool.get_err) + return -ns->ethtool.get_err; + memcpy(fecparam, &ns->ethtool.fec, sizeof(ns->ethtool.fec)); + return 0; +} + +static int +nsim_set_fecparam(struct net_device *dev, struct ethtool_fecparam *fecparam) +{ + struct netdevsim *ns = netdev_priv(dev); + u32 fec; + + if (ns->ethtool.set_err) + return -ns->ethtool.set_err; + memcpy(&ns->ethtool.fec, fecparam, sizeof(ns->ethtool.fec)); + fec = fecparam->fec; + if (fec == ETHTOOL_FEC_AUTO) + fec |= ETHTOOL_FEC_OFF; + fec |= ETHTOOL_FEC_NONE; + ns->ethtool.fec.active_fec = 1 << (fls(fec) - 1); + return 0; +} + static const struct ethtool_ops nsim_ethtool_ops = { .supported_coalesce_params = ETHTOOL_COALESCE_ALL_PARAMS, .get_pause_stats = nsim_get_pause_stats, @@ -86,6 +114,8 @@ static const struct ethtool_ops nsim_ethtool_ops = { .get_coalesce = nsim_get_coalesce, .get_ringparam = nsim_get_ringparam, .set_ringparam = nsim_set_ringparam, + .get_fecparam = nsim_get_fecparam, + .set_fecparam = nsim_set_fecparam, }; static void nsim_ethtool_ring_init(struct netdevsim *ns) @@ -104,8 +134,14 @@ void nsim_ethtool_init(struct netdevsim *ns) nsim_ethtool_ring_init(ns); + ns->ethtool.fec.fec = ETHTOOL_FEC_NONE; + ns->ethtool.fec.active_fec = ETHTOOL_FEC_NONE; + ethtool = debugfs_create_dir("ethtool", ns->nsim_dev_port->ddir); + debugfs_create_u32("get_err", 0600, ethtool, &ns->ethtool.get_err); + debugfs_create_u32("set_err", 0600, ethtool, &ns->ethtool.set_err); + dir = debugfs_create_dir("pause", ethtool); debugfs_create_bool("report_stats_rx", 0600, dir, &ns->ethtool.pauseparam.report_stats_rx); diff --git a/drivers/net/netdevsim/netdevsim.h b/drivers/net/netdevsim/netdevsim.h index d735c21def4b1..7ff24e03577b5 100644 --- a/drivers/net/netdevsim/netdevsim.h +++ b/drivers/net/netdevsim/netdevsim.h @@ -60,9 +60,12 @@ struct nsim_ethtool_pauseparam { }; struct nsim_ethtool { + u32 get_err; + u32 set_err; struct nsim_ethtool_pauseparam pauseparam; struct ethtool_coalesce coalesce; struct ethtool_ringparam ring; + struct ethtool_fecparam fec; }; struct netdevsim { From 1da07e5db3564789eb598bc772ea50b547194691 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 29 Mar 2021 20:59:54 -0700 Subject: [PATCH 03/36] selftests: ethtool: add a netdevsim FEC test Test FEC settings, iterate over configs. Signed-off-by: Jakub Kicinski Signed-off-by: David S. Miller --- .../drivers/net/netdevsim/ethtool-common.sh | 5 +- .../drivers/net/netdevsim/ethtool-fec.sh | 110 ++++++++++++++++++ 2 files changed, 114 insertions(+), 1 deletion(-) create mode 100755 tools/testing/selftests/drivers/net/netdevsim/ethtool-fec.sh diff --git a/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh b/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh index 9f64d5c7107b1..7ca1f030d2099 100644 --- a/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh +++ b/tools/testing/selftests/drivers/net/netdevsim/ethtool-common.sh @@ -24,8 +24,11 @@ function check { local code=$1 local str=$2 local exp_str=$3 + local exp_fail=$4 - if [ $code -ne 0 ]; then + [ -z "$exp_fail" ] && cop="-ne" || cop="-eq" + + if [ $code $cop 0 ]; then ((num_errors++)) return fi diff --git a/tools/testing/selftests/drivers/net/netdevsim/ethtool-fec.sh b/tools/testing/selftests/drivers/net/netdevsim/ethtool-fec.sh new file mode 100755 index 0000000000000..0c56746e9ce0e --- /dev/null +++ b/tools/testing/selftests/drivers/net/netdevsim/ethtool-fec.sh @@ -0,0 +1,110 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0-only + +source ethtool-common.sh + +NSIM_NETDEV=$(make_netdev) +[ a$ETHTOOL == a ] && ETHTOOL=ethtool + +set -o pipefail + +# netdevsim starts out with None/None +s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) +check $? "$s" "Configured FEC encodings: None +Active FEC encoding: None" + +# Test Auto +$ETHTOOL --set-fec $NSIM_NETDEV encoding auto +check $? +s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) +check $? "$s" "Configured FEC encodings: Auto +Active FEC encoding: Off" + +# Test case in-sensitivity +for o in off Off OFF; do + $ETHTOOL --set-fec $NSIM_NETDEV encoding $o + check $? + s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) + check $? "$s" "Configured FEC encodings: Off +Active FEC encoding: Off" +done + +for o in BaseR baser BAser; do + $ETHTOOL --set-fec $NSIM_NETDEV encoding $o + check $? + s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) + check $? "$s" "Configured FEC encodings: BaseR +Active FEC encoding: BaseR" +done + +for o in llrs rs; do + $ETHTOOL --set-fec $NSIM_NETDEV encoding $o + check $? + s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) + check $? "$s" "Configured FEC encodings: ${o^^} +Active FEC encoding: ${o^^}" +done + +# Test mutliple bits +$ETHTOOL --set-fec $NSIM_NETDEV encoding rs llrs +check $? +s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) +check $? "$s" "Configured FEC encodings: RS LLRS +Active FEC encoding: LLRS" + +$ETHTOOL --set-fec $NSIM_NETDEV encoding rs off auto +check $? +s=$($ETHTOOL --show-fec $NSIM_NETDEV | tail -2) +check $? "$s" "Configured FEC encodings: Auto Off RS +Active FEC encoding: RS" + +# Make sure other link modes are rejected +$ETHTOOL --set-fec $NSIM_NETDEV encoding FIBRE 2>/dev/null +check $? '' '' 1 + +$ETHTOOL --set-fec $NSIM_NETDEV encoding bla-bla-bla 2>/dev/null +check $? '' '' 1 + +# Try JSON +$ETHTOOL --json --show-fec $NSIM_NETDEV | jq empty >>/dev/null 2>&1 +if [ $? -eq 0 ]; then + $ETHTOOL --set-fec $NSIM_NETDEV encoding auto + check $? + + s=$($ETHTOOL --json --show-fec $NSIM_NETDEV | jq '.[].config[]') + check $? "$s" '"Auto"' + s=$($ETHTOOL --json --show-fec $NSIM_NETDEV | jq '.[].active[]') + check $? "$s" '"Off"' + + $ETHTOOL --set-fec $NSIM_NETDEV encoding auto RS + check $? + + s=$($ETHTOOL --json --show-fec $NSIM_NETDEV | jq '.[].config[]') + check $? "$s" '"Auto" +"RS"' + s=$($ETHTOOL --json --show-fec $NSIM_NETDEV | jq '.[].active[]') + check $? "$s" '"RS"' +fi + +# Test error injection +echo 11 > $NSIM_DEV_DFS/ethtool/get_err + +$ETHTOOL --show-fec $NSIM_NETDEV >>/dev/null 2>&1 +check $? '' '' 1 + +echo 0 > $NSIM_DEV_DFS/ethtool/get_err +echo 11 > $NSIM_DEV_DFS/ethtool/set_err + +$ETHTOOL --show-fec $NSIM_NETDEV >>/dev/null 2>&1 +check $? + +$ETHTOOL --set-fec $NSIM_NETDEV encoding RS 2>/dev/null +check $? '' '' 1 + +if [ $num_errors -eq 0 ]; then + echo "PASSED all $((num_passes)) checks" + exit 0 +else + echo "FAILED $num_errors/$((num_errors+num_passes)) checks" + exit 1 +fi From 48bb5697269a7cbe5194dbb044dc38c517e34c58 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 29 Mar 2021 23:45:51 -0700 Subject: [PATCH 04/36] ip6_tunnel: sit: proper dev_{hold|put} in ndo_[un]init methods Same reasons than for the previous commits : 6289a98f0817 ("sit: proper dev_{hold|put} in ndo_[un]init methods") 40cb881b5aaa ("ip6_vti: proper dev_{hold|put} in ndo_[un]init methods") 7f700334be9a ("ip6_gre: proper dev_{hold|put} in ndo_[un]init methods") After adopting CONFIG_PCPU_DEV_REFCNT=n option, syzbot was able to trigger a warning [1] Issue here is that: - all dev_put() should be paired with a corresponding prior dev_hold(). - A driver doing a dev_put() in its ndo_uninit() MUST also do a dev_hold() in its ndo_init(), only when ndo_init() is returning 0. Otherwise, register_netdevice() would call ndo_uninit() in its error path and release a refcount too soon. [1] WARNING: CPU: 1 PID: 21059 at lib/refcount.c:31 refcount_warn_saturate+0xbf/0x1e0 lib/refcount.c:31 Modules linked in: CPU: 1 PID: 21059 Comm: syz-executor.4 Not tainted 5.12.0-rc4-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:refcount_warn_saturate+0xbf/0x1e0 lib/refcount.c:31 Code: 1d 6a 5a e8 09 31 ff 89 de e8 8d 1a ab fd 84 db 75 e0 e8 d4 13 ab fd 48 c7 c7 a0 e1 c1 89 c6 05 4a 5a e8 09 01 e8 2e 36 fb 04 <0f> 0b eb c4 e8 b8 13 ab fd 0f b6 1d 39 5a e8 09 31 ff 89 de e8 58 RSP: 0018:ffffc900025aefe8 EFLAGS: 00010282 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000040000 RSI: ffffffff815c51f5 RDI: fffff520004b5def RBP: 0000000000000004 R08: 0000000000000000 R09: 0000000000000000 R10: ffffffff815bdf8e R11: 0000000000000000 R12: ffff888023488568 R13: ffff8880254e9000 R14: 00000000dfd82cfd R15: ffff88802ee2d7c0 FS: 00007f13bc590700(0000) GS:ffff8880b9c00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f0943e74000 CR3: 0000000025273000 CR4: 00000000001506f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __refcount_dec include/linux/refcount.h:344 [inline] refcount_dec include/linux/refcount.h:359 [inline] dev_put include/linux/netdevice.h:4135 [inline] ip6_tnl_dev_uninit+0x370/0x3d0 net/ipv6/ip6_tunnel.c:387 register_netdevice+0xadf/0x1500 net/core/dev.c:10308 ip6_tnl_create2+0x1b5/0x400 net/ipv6/ip6_tunnel.c:263 ip6_tnl_newlink+0x312/0x580 net/ipv6/ip6_tunnel.c:2052 __rtnl_newlink+0x1062/0x1710 net/core/rtnetlink.c:3443 rtnl_newlink+0x64/0xa0 net/core/rtnetlink.c:3491 rtnetlink_rcv_msg+0x44e/0xad0 net/core/rtnetlink.c:5553 netlink_rcv_skb+0x153/0x420 net/netlink/af_netlink.c:2502 netlink_unicast_kernel net/netlink/af_netlink.c:1312 [inline] netlink_unicast+0x533/0x7d0 net/netlink/af_netlink.c:1338 netlink_sendmsg+0x856/0xd90 net/netlink/af_netlink.c:1927 sock_sendmsg_nosec net/socket.c:654 [inline] sock_sendmsg+0xcf/0x120 net/socket.c:674 ____sys_sendmsg+0x6e8/0x810 net/socket.c:2350 ___sys_sendmsg+0xf3/0x170 net/socket.c:2404 __sys_sendmsg+0xe5/0x1b0 net/socket.c:2433 do_syscall_64+0x2d/0x70 arch/x86/entry/common.c:46 entry_SYSCALL_64_after_hwframe+0x44/0xae Fixes: 919067cc845f ("net: add CONFIG_PCPU_DEV_REFCNT") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index cd78f5b2cd75e..67ee9d58ec5ef 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -266,7 +266,6 @@ static int ip6_tnl_create2(struct net_device *dev) strcpy(t->parms.name, dev->name); - dev_hold(dev); ip6_tnl_link(ip6n, t); return 0; @@ -1882,6 +1881,7 @@ ip6_tnl_dev_init_gen(struct net_device *dev) dev->min_mtu = ETH_MIN_MTU; dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len; + dev_hold(dev); return 0; destroy_dst: From 53f7c5e1406110b9b8da4b7e2c66023a16bb8714 Mon Sep 17 00:00:00 2001 From: Wan Jiabing Date: Wed, 31 Mar 2021 10:35:53 +0800 Subject: [PATCH 05/36] net: ethernet: stmicro: Remove duplicate struct declaration struct stmmac_safety_stats is declared twice. One has been declared at 29th line. Remove the duplicate. Signed-off-by: Wan Jiabing Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/hwif.h | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/hwif.h b/drivers/net/ethernet/stmicro/stmmac/hwif.h index c9cf89e21a411..2b5022ef1e52e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/hwif.h +++ b/drivers/net/ethernet/stmicro/stmmac/hwif.h @@ -280,7 +280,6 @@ struct stmmac_dma_ops { struct mac_device_info; struct net_device; struct rgmii_adv; -struct stmmac_safety_stats; struct stmmac_tc_entry; struct stmmac_pps_cfg; struct stmmac_rss; From 3cbf7530a163d048a6376cd22fecb9cdcb23b192 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 31 Mar 2021 05:36:42 +0100 Subject: [PATCH 06/36] qrtr: Convert qrtr_ports from IDR to XArray The XArray interface is easier for this driver to use. Also fixes a bug reported by the improper use of GFP_ATOMIC. Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: Manivannan Sadhasivam Signed-off-by: David S. Miller --- net/qrtr/qrtr.c | 42 ++++++++++++++---------------------------- 1 file changed, 14 insertions(+), 28 deletions(-) diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index dfc820ee553a0..4b46c69e14abd 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -20,6 +20,8 @@ /* auto-bind range */ #define QRTR_MIN_EPH_SOCKET 0x4000 #define QRTR_MAX_EPH_SOCKET 0x7fff +#define QRTR_EPH_PORT_RANGE \ + XA_LIMIT(QRTR_MIN_EPH_SOCKET, QRTR_MAX_EPH_SOCKET) /** * struct qrtr_hdr_v1 - (I|R)PCrouter packet header version 1 @@ -106,8 +108,7 @@ static LIST_HEAD(qrtr_all_nodes); static DEFINE_MUTEX(qrtr_node_lock); /* local port allocation management */ -static DEFINE_IDR(qrtr_ports); -static DEFINE_MUTEX(qrtr_port_lock); +static DEFINE_XARRAY_ALLOC(qrtr_ports); /** * struct qrtr_node - endpoint node @@ -653,7 +654,7 @@ static struct qrtr_sock *qrtr_port_lookup(int port) port = 0; rcu_read_lock(); - ipc = idr_find(&qrtr_ports, port); + ipc = xa_load(&qrtr_ports, port); if (ipc) sock_hold(&ipc->sk); rcu_read_unlock(); @@ -695,9 +696,7 @@ static void qrtr_port_remove(struct qrtr_sock *ipc) __sock_put(&ipc->sk); - mutex_lock(&qrtr_port_lock); - idr_remove(&qrtr_ports, port); - mutex_unlock(&qrtr_port_lock); + xa_erase(&qrtr_ports, port); /* Ensure that if qrtr_port_lookup() did enter the RCU read section we * wait for it to up increment the refcount */ @@ -716,29 +715,20 @@ static void qrtr_port_remove(struct qrtr_sock *ipc) */ static int qrtr_port_assign(struct qrtr_sock *ipc, int *port) { - u32 min_port; int rc; - mutex_lock(&qrtr_port_lock); if (!*port) { - min_port = QRTR_MIN_EPH_SOCKET; - rc = idr_alloc_u32(&qrtr_ports, ipc, &min_port, QRTR_MAX_EPH_SOCKET, GFP_ATOMIC); - if (!rc) - *port = min_port; + rc = xa_alloc(&qrtr_ports, port, ipc, QRTR_EPH_PORT_RANGE, + GFP_KERNEL); } else if (*port < QRTR_MIN_EPH_SOCKET && !capable(CAP_NET_ADMIN)) { rc = -EACCES; } else if (*port == QRTR_PORT_CTRL) { - min_port = 0; - rc = idr_alloc_u32(&qrtr_ports, ipc, &min_port, 0, GFP_ATOMIC); + rc = xa_insert(&qrtr_ports, 0, ipc, GFP_KERNEL); } else { - min_port = *port; - rc = idr_alloc_u32(&qrtr_ports, ipc, &min_port, *port, GFP_ATOMIC); - if (!rc) - *port = min_port; + rc = xa_insert(&qrtr_ports, *port, ipc, GFP_KERNEL); } - mutex_unlock(&qrtr_port_lock); - if (rc == -ENOSPC) + if (rc == -EBUSY) return -EADDRINUSE; else if (rc < 0) return rc; @@ -752,20 +742,16 @@ static int qrtr_port_assign(struct qrtr_sock *ipc, int *port) static void qrtr_reset_ports(void) { struct qrtr_sock *ipc; - int id; - - mutex_lock(&qrtr_port_lock); - idr_for_each_entry(&qrtr_ports, ipc, id) { - /* Don't reset control port */ - if (id == 0) - continue; + unsigned long index; + rcu_read_lock(); + xa_for_each_start(&qrtr_ports, index, ipc, 1) { sock_hold(&ipc->sk); ipc->sk.sk_err = ENETRESET; ipc->sk.sk_error_report(&ipc->sk); sock_put(&ipc->sk); } - mutex_unlock(&qrtr_port_lock); + rcu_read_unlock(); } /* Bind socket to address. From b788ff0a7d7dc04da4c938d56cbe96c7fa261983 Mon Sep 17 00:00:00 2001 From: Yixing Liu Date: Wed, 31 Mar 2021 16:18:28 +0800 Subject: [PATCH 07/36] net: ena: fix inaccurate print type Use "%u" to replace "hu%". Signed-off-by: Yixing Liu Signed-off-by: Weihang Li Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_com.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_com.c b/drivers/net/ethernet/amazon/ena/ena_com.c index 02087d443e730..764852ead1d6b 100644 --- a/drivers/net/ethernet/amazon/ena/ena_com.c +++ b/drivers/net/ethernet/amazon/ena/ena_com.c @@ -863,7 +863,7 @@ static u32 ena_com_reg_bar_read32(struct ena_com_dev *ena_dev, u16 offset) if (unlikely(i == timeout)) { netdev_err(ena_dev->net_device, - "Reading reg failed for timeout. expected: req id[%hu] offset[%hu] actual: req id[%hu] offset[%hu]\n", + "Reading reg failed for timeout. expected: req id[%u] offset[%u] actual: req id[%u] offset[%u]\n", mmio_read->seq_num, offset, read_resp->req_id, read_resp->reg_off); ret = ENA_MMIO_READ_TIMEOUT; @@ -2396,7 +2396,7 @@ int ena_com_fill_hash_function(struct ena_com_dev *ena_dev, if (key) { if (key_len != sizeof(hash_key->key)) { netdev_err(ena_dev->net_device, - "key len (%hu) doesn't equal the supported size (%zu)\n", + "key len (%u) doesn't equal the supported size (%zu)\n", key_len, sizeof(hash_key->key)); return -EINVAL; } From e355fa6a3f405d3a3a1d86f2e2e332fb3d4e05d8 Mon Sep 17 00:00:00 2001 From: Yixing Liu Date: Wed, 31 Mar 2021 16:18:29 +0800 Subject: [PATCH 08/36] net: ena: remove extra words from comments Remove the redundant "for" from the commment. Signed-off-by: Yixing Liu Signed-off-by: Weihang Li Signed-off-by: David S. Miller --- drivers/net/ethernet/amazon/ena/ena_netdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/amazon/ena/ena_netdev.c b/drivers/net/ethernet/amazon/ena/ena_netdev.c index 5c062c51b4cb9..881f88754bf6b 100644 --- a/drivers/net/ethernet/amazon/ena/ena_netdev.c +++ b/drivers/net/ethernet/amazon/ena/ena_netdev.c @@ -3975,7 +3975,7 @@ static u32 ena_calc_max_io_queue_num(struct pci_dev *pdev, max_num_io_queues = min_t(u32, max_num_io_queues, io_rx_num); max_num_io_queues = min_t(u32, max_num_io_queues, io_tx_sq_num); max_num_io_queues = min_t(u32, max_num_io_queues, io_tx_cq_num); - /* 1 IRQ for for mgmnt and 1 IRQs for each IO direction */ + /* 1 IRQ for mgmnt and 1 IRQs for each IO direction */ max_num_io_queues = min_t(u32, max_num_io_queues, pci_msix_vec_count(pdev) - 1); if (unlikely(!max_num_io_queues)) { dev_err(&pdev->dev, "The device doesn't have io queues\n"); From ca3fc0aa08370260a1180ac4366cf58fbefc841c Mon Sep 17 00:00:00 2001 From: Yixing Liu Date: Wed, 31 Mar 2021 16:18:30 +0800 Subject: [PATCH 09/36] net: amd8111e: fix inappropriate spaces Delete unncecessary spaces and add some reasonable spaces according to the coding-style of kernel. Signed-off-by: Yixing Liu Signed-off-by: Weihang Li Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/amd8111e.c | 362 ++++++++++++++-------------- 1 file changed, 177 insertions(+), 185 deletions(-) diff --git a/drivers/net/ethernet/amd/amd8111e.c b/drivers/net/ethernet/amd/amd8111e.c index 960d483e89972..4a1220cc6f10a 100644 --- a/drivers/net/ethernet/amd/amd8111e.c +++ b/drivers/net/ethernet/amd/amd8111e.c @@ -100,19 +100,19 @@ static int amd8111e_read_phy(struct amd8111e_priv *lp, { void __iomem *mmio = lp->mmio; unsigned int reg_val; - unsigned int repeat= REPEAT_CNT; + unsigned int repeat = REPEAT_CNT; reg_val = readl(mmio + PHY_ACCESS); while (reg_val & PHY_CMD_ACTIVE) - reg_val = readl( mmio + PHY_ACCESS ); + reg_val = readl(mmio + PHY_ACCESS); - writel( PHY_RD_CMD | ((phy_id & 0x1f) << 21) | - ((reg & 0x1f) << 16), mmio +PHY_ACCESS); - do{ + writel(PHY_RD_CMD | ((phy_id & 0x1f) << 21) | + ((reg & 0x1f) << 16), mmio + PHY_ACCESS); + do { reg_val = readl(mmio + PHY_ACCESS); udelay(30); /* It takes 30 us to read/write data */ } while (--repeat && (reg_val & PHY_CMD_ACTIVE)); - if(reg_val & PHY_RD_ERR) + if (reg_val & PHY_RD_ERR) goto err_phy_read; *val = reg_val & 0xffff; @@ -133,17 +133,17 @@ static int amd8111e_write_phy(struct amd8111e_priv *lp, reg_val = readl(mmio + PHY_ACCESS); while (reg_val & PHY_CMD_ACTIVE) - reg_val = readl( mmio + PHY_ACCESS ); + reg_val = readl(mmio + PHY_ACCESS); - writel( PHY_WR_CMD | ((phy_id & 0x1f) << 21) | + writel(PHY_WR_CMD | ((phy_id & 0x1f) << 21) | ((reg & 0x1f) << 16)|val, mmio + PHY_ACCESS); - do{ + do { reg_val = readl(mmio + PHY_ACCESS); udelay(30); /* It takes 30 us to read/write the data */ } while (--repeat && (reg_val & PHY_CMD_ACTIVE)); - if(reg_val & PHY_RD_ERR) + if (reg_val & PHY_RD_ERR) goto err_phy_write; return 0; @@ -159,7 +159,7 @@ static int amd8111e_mdio_read(struct net_device *dev, int phy_id, int reg_num) struct amd8111e_priv *lp = netdev_priv(dev); unsigned int reg_val; - amd8111e_read_phy(lp,phy_id,reg_num,®_val); + amd8111e_read_phy(lp, phy_id, reg_num, ®_val); return reg_val; } @@ -179,17 +179,17 @@ static void amd8111e_mdio_write(struct net_device *dev, static void amd8111e_set_ext_phy(struct net_device *dev) { struct amd8111e_priv *lp = netdev_priv(dev); - u32 bmcr,advert,tmp; + u32 bmcr, advert, tmp; /* Determine mii register values to set the speed */ advert = amd8111e_mdio_read(dev, lp->ext_phy_addr, MII_ADVERTISE); tmp = advert & ~(ADVERTISE_ALL | ADVERTISE_100BASE4); - switch (lp->ext_phy_option){ + switch (lp->ext_phy_option) { default: case SPEED_AUTONEG: /* advertise all values */ - tmp |= ( ADVERTISE_10HALF|ADVERTISE_10FULL| - ADVERTISE_100HALF|ADVERTISE_100FULL) ; + tmp |= (ADVERTISE_10HALF | ADVERTISE_10FULL | + ADVERTISE_100HALF | ADVERTISE_100FULL); break; case SPEED10_HALF: tmp |= ADVERTISE_10HALF; @@ -224,20 +224,20 @@ static int amd8111e_free_skbs(struct net_device *dev) int i; /* Freeing transmit skbs */ - for(i = 0; i < NUM_TX_BUFFERS; i++){ - if(lp->tx_skbuff[i]){ + for (i = 0; i < NUM_TX_BUFFERS; i++) { + if (lp->tx_skbuff[i]) { dma_unmap_single(&lp->pci_dev->dev, lp->tx_dma_addr[i], lp->tx_skbuff[i]->len, DMA_TO_DEVICE); - dev_kfree_skb (lp->tx_skbuff[i]); + dev_kfree_skb(lp->tx_skbuff[i]); lp->tx_skbuff[i] = NULL; lp->tx_dma_addr[i] = 0; } } /* Freeing previously allocated receive buffers */ - for (i = 0; i < NUM_RX_BUFFERS; i++){ + for (i = 0; i < NUM_RX_BUFFERS; i++) { rx_skbuff = lp->rx_skbuff[i]; - if(rx_skbuff != NULL){ + if (rx_skbuff != NULL) { dma_unmap_single(&lp->pci_dev->dev, lp->rx_dma_addr[i], lp->rx_buff_len - 2, DMA_FROM_DEVICE); @@ -258,13 +258,13 @@ static inline void amd8111e_set_rx_buff_len(struct net_device *dev) struct amd8111e_priv *lp = netdev_priv(dev); unsigned int mtu = dev->mtu; - if (mtu > ETH_DATA_LEN){ + if (mtu > ETH_DATA_LEN) { /* MTU + ethernet header + FCS * + optional VLAN tag + skb reserve space 2 */ lp->rx_buff_len = mtu + ETH_HLEN + 10; lp->options |= OPTION_JUMBO_ENABLE; - } else{ + } else { lp->rx_buff_len = PKT_BUFF_SZ; lp->options &= ~OPTION_JUMBO_ENABLE; } @@ -285,11 +285,11 @@ static int amd8111e_init_ring(struct net_device *dev) lp->tx_ring_idx = 0; - if(lp->opened) + if (lp->opened) /* Free previously allocated transmit and receive skbs */ amd8111e_free_skbs(dev); - else{ + else { /* allocate the tx and rx descriptors */ lp->tx_ring = dma_alloc_coherent(&lp->pci_dev->dev, sizeof(struct amd8111e_tx_dr) * NUM_TX_RING_DR, @@ -312,12 +312,12 @@ static int amd8111e_init_ring(struct net_device *dev) lp->rx_skbuff[i] = netdev_alloc_skb(dev, lp->rx_buff_len); if (!lp->rx_skbuff[i]) { - /* Release previos allocated skbs */ - for(--i; i >= 0 ;i--) - dev_kfree_skb(lp->rx_skbuff[i]); - goto err_free_rx_ring; + /* Release previos allocated skbs */ + for (--i; i >= 0; i--) + dev_kfree_skb(lp->rx_skbuff[i]); + goto err_free_rx_ring; } - skb_reserve(lp->rx_skbuff[i],2); + skb_reserve(lp->rx_skbuff[i], 2); } /* Initilaizing receive descriptors */ for (i = 0; i < NUM_RX_BUFFERS; i++) { @@ -375,40 +375,40 @@ static int amd8111e_set_coalesce(struct net_device *dev, enum coal_mode cmod) case RX_INTR_COAL : timeout = coal_conf->rx_timeout; event_count = coal_conf->rx_event_count; - if( timeout > MAX_TIMEOUT || - event_count > MAX_EVENT_COUNT ) + if (timeout > MAX_TIMEOUT || + event_count > MAX_EVENT_COUNT) return -EINVAL; timeout = timeout * DELAY_TIMER_CONV; writel(VAL0|STINTEN, mmio+INTEN0); - writel((u32)DLY_INT_A_R0|( event_count<< 16 )|timeout, - mmio+DLY_INT_A); + writel((u32)DLY_INT_A_R0 | (event_count << 16) | + timeout, mmio + DLY_INT_A); break; - case TX_INTR_COAL : + case TX_INTR_COAL: timeout = coal_conf->tx_timeout; event_count = coal_conf->tx_event_count; - if( timeout > MAX_TIMEOUT || - event_count > MAX_EVENT_COUNT ) + if (timeout > MAX_TIMEOUT || + event_count > MAX_EVENT_COUNT) return -EINVAL; timeout = timeout * DELAY_TIMER_CONV; - writel(VAL0|STINTEN,mmio+INTEN0); - writel((u32)DLY_INT_B_T0|( event_count<< 16 )|timeout, - mmio+DLY_INT_B); + writel(VAL0 | STINTEN, mmio + INTEN0); + writel((u32)DLY_INT_B_T0 | (event_count << 16) | + timeout, mmio + DLY_INT_B); break; case DISABLE_COAL: - writel(0,mmio+STVAL); - writel(STINTEN, mmio+INTEN0); - writel(0, mmio +DLY_INT_B); - writel(0, mmio+DLY_INT_A); + writel(0, mmio + STVAL); + writel(STINTEN, mmio + INTEN0); + writel(0, mmio + DLY_INT_B); + writel(0, mmio + DLY_INT_A); break; case ENABLE_COAL: /* Start the timer */ - writel((u32)SOFT_TIMER_FREQ, mmio+STVAL); /* 0.5 sec */ - writel(VAL0|STINTEN, mmio+INTEN0); + writel((u32)SOFT_TIMER_FREQ, mmio + STVAL); /* 0.5 sec */ + writel(VAL0 | STINTEN, mmio + INTEN0); break; default: break; @@ -423,67 +423,67 @@ static int amd8111e_restart(struct net_device *dev) { struct amd8111e_priv *lp = netdev_priv(dev); void __iomem *mmio = lp->mmio; - int i,reg_val; + int i, reg_val; /* stop the chip */ writel(RUN, mmio + CMD0); - if(amd8111e_init_ring(dev)) + if (amd8111e_init_ring(dev)) return -ENOMEM; /* enable the port manager and set auto negotiation always */ - writel((u32) VAL1|EN_PMGR, mmio + CMD3 ); - writel((u32)XPHYANE|XPHYRST , mmio + CTRL2); + writel((u32)VAL1 | EN_PMGR, mmio + CMD3); + writel((u32)XPHYANE | XPHYRST, mmio + CTRL2); amd8111e_set_ext_phy(dev); /* set control registers */ reg_val = readl(mmio + CTRL1); reg_val &= ~XMTSP_MASK; - writel( reg_val| XMTSP_128 | CACHE_ALIGN, mmio + CTRL1 ); + writel(reg_val | XMTSP_128 | CACHE_ALIGN, mmio + CTRL1); /* enable interrupt */ - writel( APINT5EN | APINT4EN | APINT3EN | APINT2EN | APINT1EN | + writel(APINT5EN | APINT4EN | APINT3EN | APINT2EN | APINT1EN | APINT0EN | MIIPDTINTEN | MCCIINTEN | MCCINTEN | MREINTEN | SPNDINTEN | MPINTEN | SINTEN | STINTEN, mmio + INTEN0); writel(VAL3 | LCINTEN | VAL1 | TINTEN0 | VAL0 | RINTEN0, mmio + INTEN0); /* initialize tx and rx ring base addresses */ - writel((u32)lp->tx_ring_dma_addr,mmio + XMT_RING_BASE_ADDR0); - writel((u32)lp->rx_ring_dma_addr,mmio+ RCV_RING_BASE_ADDR0); + writel((u32)lp->tx_ring_dma_addr, mmio + XMT_RING_BASE_ADDR0); + writel((u32)lp->rx_ring_dma_addr, mmio + RCV_RING_BASE_ADDR0); writew((u32)NUM_TX_RING_DR, mmio + XMT_RING_LEN0); writew((u16)NUM_RX_RING_DR, mmio + RCV_RING_LEN0); /* set default IPG to 96 */ - writew((u32)DEFAULT_IPG,mmio+IPG); + writew((u32)DEFAULT_IPG, mmio + IPG); writew((u32)(DEFAULT_IPG-IFS1_DELTA), mmio + IFS1); - if(lp->options & OPTION_JUMBO_ENABLE){ + if (lp->options & OPTION_JUMBO_ENABLE) { writel((u32)VAL2|JUMBO, mmio + CMD3); /* Reset REX_UFLO */ - writel( REX_UFLO, mmio + CMD2); + writel(REX_UFLO, mmio + CMD2); /* Should not set REX_UFLO for jumbo frames */ - writel( VAL0 | APAD_XMT|REX_RTRY , mmio + CMD2); - }else{ - writel( VAL0 | APAD_XMT | REX_RTRY|REX_UFLO, mmio + CMD2); + writel(VAL0 | APAD_XMT | REX_RTRY, mmio + CMD2); + } else { + writel(VAL0 | APAD_XMT | REX_RTRY | REX_UFLO, mmio + CMD2); writel((u32)JUMBO, mmio + CMD3); } #if AMD8111E_VLAN_TAG_USED - writel((u32) VAL2|VSIZE|VL_TAG_DEL, mmio + CMD3); + writel((u32)VAL2 | VSIZE | VL_TAG_DEL, mmio + CMD3); #endif - writel( VAL0 | APAD_XMT | REX_RTRY, mmio + CMD2 ); + writel(VAL0 | APAD_XMT | REX_RTRY, mmio + CMD2); /* Setting the MAC address to the device */ for (i = 0; i < ETH_ALEN; i++) - writeb( dev->dev_addr[i], mmio + PADR + i ); + writeb(dev->dev_addr[i], mmio + PADR + i); /* Enable interrupt coalesce */ - if(lp->options & OPTION_INTR_COAL_ENABLE){ + if (lp->options & OPTION_INTR_COAL_ENABLE) { netdev_info(dev, "Interrupt Coalescing Enabled.\n"); - amd8111e_set_coalesce(dev,ENABLE_COAL); + amd8111e_set_coalesce(dev, ENABLE_COAL); } /* set RUN bit to start the chip */ @@ -499,11 +499,11 @@ static int amd8111e_restart(struct net_device *dev) static void amd8111e_init_hw_default(struct amd8111e_priv *lp) { unsigned int reg_val; - unsigned int logic_filter[2] ={0,}; + unsigned int logic_filter[2] = {0,}; void __iomem *mmio = lp->mmio; - /* stop the chip */ + /* stop the chip */ writel(RUN, mmio + CMD0); /* AUTOPOLL0 Register *//*TBD default value is 8100 in FPS */ @@ -519,13 +519,13 @@ static void amd8111e_init_hw_default(struct amd8111e_priv *lp) writel(0, mmio + XMT_RING_BASE_ADDR3); /* Clear CMD0 */ - writel(CMD0_CLEAR,mmio + CMD0); + writel(CMD0_CLEAR, mmio + CMD0); /* Clear CMD2 */ - writel(CMD2_CLEAR, mmio +CMD2); + writel(CMD2_CLEAR, mmio + CMD2); /* Clear CMD7 */ - writel(CMD7_CLEAR , mmio + CMD7); + writel(CMD7_CLEAR, mmio + CMD7); /* Clear DLY_INT_A and DLY_INT_B */ writel(0x0, mmio + DLY_INT_A); @@ -542,16 +542,16 @@ static void amd8111e_init_hw_default(struct amd8111e_priv *lp) writel(0x0, mmio + STVAL); /* Clear INTEN0 */ - writel( INTEN0_CLEAR, mmio + INTEN0); + writel(INTEN0_CLEAR, mmio + INTEN0); /* Clear LADRF */ - writel(0x0 , mmio + LADRF); + writel(0x0, mmio + LADRF); /* Set SRAM_SIZE & SRAM_BOUNDARY registers */ - writel( 0x80010,mmio + SRAM_SIZE); + writel(0x80010, mmio + SRAM_SIZE); /* Clear RCV_RING0_LEN */ - writel(0x0, mmio + RCV_RING_LEN0); + writel(0x0, mmio + RCV_RING_LEN0); /* Clear XMT_RING0/1/2/3_LEN */ writel(0x0, mmio + XMT_RING_LEN0); @@ -571,10 +571,10 @@ static void amd8111e_init_hw_default(struct amd8111e_priv *lp) /* SRAM_SIZE register */ reg_val = readl(mmio + SRAM_SIZE); - if(lp->options & OPTION_JUMBO_ENABLE) - writel( VAL2|JUMBO, mmio + CMD3); + if (lp->options & OPTION_JUMBO_ENABLE) + writel(VAL2 | JUMBO, mmio + CMD3); #if AMD8111E_VLAN_TAG_USED - writel(VAL2|VSIZE|VL_TAG_DEL, mmio + CMD3 ); + writel(VAL2 | VSIZE | VL_TAG_DEL, mmio + CMD3); #endif /* Set default value to CTRL1 Register */ writel(CTRL1_DEFAULT, mmio + CTRL1); @@ -616,14 +616,14 @@ static void amd8111e_stop_chip(struct amd8111e_priv *lp) static void amd8111e_free_ring(struct amd8111e_priv *lp) { /* Free transmit and receive descriptor rings */ - if(lp->rx_ring){ + if (lp->rx_ring) { dma_free_coherent(&lp->pci_dev->dev, sizeof(struct amd8111e_rx_dr) * NUM_RX_RING_DR, lp->rx_ring, lp->rx_ring_dma_addr); lp->rx_ring = NULL; } - if(lp->tx_ring){ + if (lp->tx_ring) { dma_free_coherent(&lp->pci_dev->dev, sizeof(struct amd8111e_tx_dr) * NUM_TX_RING_DR, lp->tx_ring, lp->tx_ring_dma_addr); @@ -643,11 +643,11 @@ static int amd8111e_tx(struct net_device *dev) int tx_index; int status; /* Complete all the transmit packet */ - while (lp->tx_complete_idx != lp->tx_idx){ + while (lp->tx_complete_idx != lp->tx_idx) { tx_index = lp->tx_complete_idx & TX_RING_DR_MOD_MASK; status = le16_to_cpu(lp->tx_ring[tx_index].tx_flags); - if(status & OWN_BIT) + if (status & OWN_BIT) break; /* It still hasn't been Txed */ lp->tx_ring[tx_index].buff_phy_addr = 0; @@ -669,10 +669,10 @@ static int amd8111e_tx(struct net_device *dev) le16_to_cpu(lp->tx_ring[tx_index].buff_count); if (netif_queue_stopped(dev) && - lp->tx_complete_idx > lp->tx_idx - NUM_TX_BUFFERS +2){ + lp->tx_complete_idx > lp->tx_idx - NUM_TX_BUFFERS + 2) { /* The ring is no longer full, clear tbusy. */ /* lp->tx_full = 0; */ - netif_wake_queue (dev); + netif_wake_queue(dev); } } return 0; @@ -685,7 +685,7 @@ static int amd8111e_rx_poll(struct napi_struct *napi, int budget) struct net_device *dev = lp->amd8111e_net_dev; int rx_index = lp->rx_idx & RX_RING_DR_MOD_MASK; void __iomem *mmio = lp->mmio; - struct sk_buff *skb,*new_skb; + struct sk_buff *skb, *new_skb; int min_pkt_len, status; int num_rx_pkt = 0; short pkt_len; @@ -710,7 +710,7 @@ static int amd8111e_rx_poll(struct napi_struct *napi, int budget) goto err_next_pkt; } /* check for STP and ENP */ - if (!((status & STP_BIT) && (status & ENP_BIT))){ + if (!((status & STP_BIT) && (status & ENP_BIT))) { /* resetting flags */ lp->rx_ring[rx_index].rx_flags &= RESET_RX_FLAGS; goto err_next_pkt; @@ -755,7 +755,7 @@ static int amd8111e_rx_poll(struct napi_struct *napi, int budget) skb->protocol = eth_type_trans(skb, dev); #if AMD8111E_VLAN_TAG_USED - if (vtag == TT_VLAN_TAGGED){ + if (vtag == TT_VLAN_TAGGED) { u16 vlan_tag = le16_to_cpu(lp->rx_ring[rx_index].tag_ctrl_info); __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), vlan_tag); } @@ -793,25 +793,25 @@ static int amd8111e_rx_poll(struct napi_struct *napi, int budget) static int amd8111e_link_change(struct net_device *dev) { struct amd8111e_priv *lp = netdev_priv(dev); - int status0,speed; + int status0, speed; /* read the link change */ - status0 = readl(lp->mmio + STAT0); + status0 = readl(lp->mmio + STAT0); - if(status0 & LINK_STATS){ - if(status0 & AUTONEG_COMPLETE) + if (status0 & LINK_STATS) { + if (status0 & AUTONEG_COMPLETE) lp->link_config.autoneg = AUTONEG_ENABLE; else lp->link_config.autoneg = AUTONEG_DISABLE; - if(status0 & FULL_DPLX) + if (status0 & FULL_DPLX) lp->link_config.duplex = DUPLEX_FULL; else lp->link_config.duplex = DUPLEX_HALF; speed = (status0 & SPEED_MASK) >> 7; - if(speed == PHY_SPEED_10) + if (speed == PHY_SPEED_10) lp->link_config.speed = SPEED_10; - else if(speed == PHY_SPEED_100) + else if (speed == PHY_SPEED_100) lp->link_config.speed = SPEED_100; netdev_info(dev, "Link is Up. Speed is %s Mbps %s Duplex\n", @@ -821,8 +821,7 @@ static int amd8111e_link_change(struct net_device *dev) "Full" : "Half"); netif_carrier_on(dev); - } - else{ + } else { lp->link_config.speed = SPEED_INVALID; lp->link_config.duplex = DUPLEX_INVALID; lp->link_config.autoneg = AUTONEG_INVALID; @@ -840,7 +839,7 @@ static int amd8111e_read_mib(void __iomem *mmio, u8 MIB_COUNTER) unsigned int data; unsigned int repeat = REPEAT_CNT; - writew( MIB_RD_CMD | MIB_COUNTER, mmio + MIB_ADDR); + writew(MIB_RD_CMD | MIB_COUNTER, mmio + MIB_ADDR); do { status = readw(mmio + MIB_ADDR); udelay(2); /* controller takes MAX 2 us to get mib data */ @@ -863,7 +862,7 @@ static struct net_device_stats *amd8111e_get_stats(struct net_device *dev) if (!lp->opened) return new_stats; - spin_lock_irqsave (&lp->lock, flags); + spin_lock_irqsave(&lp->lock, flags); /* stats.rx_packets */ new_stats->rx_packets = amd8111e_read_mib(mmio, rcv_broadcast_pkts)+ @@ -943,7 +942,7 @@ static struct net_device_stats *amd8111e_get_stats(struct net_device *dev) /* Reset the mibs for collecting new statistics */ /* writew(MIB_CLEAR, mmio + MIB_ADDR);*/ - spin_unlock_irqrestore (&lp->lock, flags); + spin_unlock_irqrestore(&lp->lock, flags); return new_stats; } @@ -974,96 +973,90 @@ static int amd8111e_calc_coalesce(struct net_device *dev) rx_data_rate = coal_conf->rx_bytes - coal_conf->rx_prev_bytes; coal_conf->rx_prev_bytes = coal_conf->rx_bytes; - if(rx_pkt_rate < 800){ - if(coal_conf->rx_coal_type != NO_COALESCE){ + if (rx_pkt_rate < 800) { + if (coal_conf->rx_coal_type != NO_COALESCE) { coal_conf->rx_timeout = 0x0; coal_conf->rx_event_count = 0; - amd8111e_set_coalesce(dev,RX_INTR_COAL); + amd8111e_set_coalesce(dev, RX_INTR_COAL); coal_conf->rx_coal_type = NO_COALESCE; } - } - else{ + } else { rx_pkt_size = rx_data_rate/rx_pkt_rate; - if (rx_pkt_size < 128){ - if(coal_conf->rx_coal_type != NO_COALESCE){ + if (rx_pkt_size < 128) { + if (coal_conf->rx_coal_type != NO_COALESCE) { coal_conf->rx_timeout = 0; coal_conf->rx_event_count = 0; - amd8111e_set_coalesce(dev,RX_INTR_COAL); + amd8111e_set_coalesce(dev, RX_INTR_COAL); coal_conf->rx_coal_type = NO_COALESCE; } - } - else if ( (rx_pkt_size >= 128) && (rx_pkt_size < 512) ){ + } else if ((rx_pkt_size >= 128) && (rx_pkt_size < 512)) { - if(coal_conf->rx_coal_type != LOW_COALESCE){ + if (coal_conf->rx_coal_type != LOW_COALESCE) { coal_conf->rx_timeout = 1; coal_conf->rx_event_count = 4; - amd8111e_set_coalesce(dev,RX_INTR_COAL); + amd8111e_set_coalesce(dev, RX_INTR_COAL); coal_conf->rx_coal_type = LOW_COALESCE; } - } - else if ((rx_pkt_size >= 512) && (rx_pkt_size < 1024)){ + } else if ((rx_pkt_size >= 512) && (rx_pkt_size < 1024)) { - if(coal_conf->rx_coal_type != MEDIUM_COALESCE){ + if (coal_conf->rx_coal_type != MEDIUM_COALESCE) { coal_conf->rx_timeout = 1; coal_conf->rx_event_count = 4; - amd8111e_set_coalesce(dev,RX_INTR_COAL); + amd8111e_set_coalesce(dev, RX_INTR_COAL); coal_conf->rx_coal_type = MEDIUM_COALESCE; } - } - else if(rx_pkt_size >= 1024){ - if(coal_conf->rx_coal_type != HIGH_COALESCE){ + } else if (rx_pkt_size >= 1024) { + + if (coal_conf->rx_coal_type != HIGH_COALESCE) { coal_conf->rx_timeout = 2; coal_conf->rx_event_count = 3; - amd8111e_set_coalesce(dev,RX_INTR_COAL); + amd8111e_set_coalesce(dev, RX_INTR_COAL); coal_conf->rx_coal_type = HIGH_COALESCE; } } } - /* NOW FOR TX INTR COALESC */ - if(tx_pkt_rate < 800){ - if(coal_conf->tx_coal_type != NO_COALESCE){ + /* NOW FOR TX INTR COALESC */ + if (tx_pkt_rate < 800) { + if (coal_conf->tx_coal_type != NO_COALESCE) { coal_conf->tx_timeout = 0x0; coal_conf->tx_event_count = 0; - amd8111e_set_coalesce(dev,TX_INTR_COAL); + amd8111e_set_coalesce(dev, TX_INTR_COAL); coal_conf->tx_coal_type = NO_COALESCE; } - } - else{ + } else { tx_pkt_size = tx_data_rate/tx_pkt_rate; - if (tx_pkt_size < 128){ + if (tx_pkt_size < 128) { - if(coal_conf->tx_coal_type != NO_COALESCE){ + if (coal_conf->tx_coal_type != NO_COALESCE) { coal_conf->tx_timeout = 0; coal_conf->tx_event_count = 0; - amd8111e_set_coalesce(dev,TX_INTR_COAL); + amd8111e_set_coalesce(dev, TX_INTR_COAL); coal_conf->tx_coal_type = NO_COALESCE; } - } - else if ( (tx_pkt_size >= 128) && (tx_pkt_size < 512) ){ + } else if ((tx_pkt_size >= 128) && (tx_pkt_size < 512)) { - if(coal_conf->tx_coal_type != LOW_COALESCE){ + if (coal_conf->tx_coal_type != LOW_COALESCE) { coal_conf->tx_timeout = 1; coal_conf->tx_event_count = 2; - amd8111e_set_coalesce(dev,TX_INTR_COAL); + amd8111e_set_coalesce(dev, TX_INTR_COAL); coal_conf->tx_coal_type = LOW_COALESCE; } - } - else if ((tx_pkt_size >= 512) && (tx_pkt_size < 1024)){ + } else if ((tx_pkt_size >= 512) && (tx_pkt_size < 1024)) { - if(coal_conf->tx_coal_type != MEDIUM_COALESCE){ + if (coal_conf->tx_coal_type != MEDIUM_COALESCE) { coal_conf->tx_timeout = 2; coal_conf->tx_event_count = 5; - amd8111e_set_coalesce(dev,TX_INTR_COAL); + amd8111e_set_coalesce(dev, TX_INTR_COAL); coal_conf->tx_coal_type = MEDIUM_COALESCE; } } else if (tx_pkt_size >= 1024) { @@ -1091,7 +1084,7 @@ static irqreturn_t amd8111e_interrupt(int irq, void *dev_id) unsigned int intr0, intren0; unsigned int handled = 1; - if(unlikely(dev == NULL)) + if (unlikely(dev == NULL)) return IRQ_NONE; spin_lock(&lp->lock); @@ -1105,7 +1098,7 @@ static irqreturn_t amd8111e_interrupt(int irq, void *dev_id) /* Process all the INT event until INTR bit is clear. */ - if (!(intr0 & INTR)){ + if (!(intr0 & INTR)) { handled = 0; goto err_no_interrupt; } @@ -1140,7 +1133,7 @@ static irqreturn_t amd8111e_interrupt(int irq, void *dev_id) amd8111e_calc_coalesce(dev); err_no_interrupt: - writel( VAL0 | INTREN,mmio + CMD0); + writel(VAL0 | INTREN, mmio + CMD0); spin_unlock(&lp->lock); @@ -1180,7 +1173,7 @@ static int amd8111e_close(struct net_device *dev) netif_carrier_off(lp->amd8111e_net_dev); /* Delete ipg timer */ - if(lp->options & OPTION_DYN_IPG_ENABLE) + if (lp->options & OPTION_DYN_IPG_ENABLE) del_timer_sync(&lp->ipg_data.ipg_timer); spin_unlock_irq(&lp->lock); @@ -1200,8 +1193,8 @@ static int amd8111e_open(struct net_device *dev) { struct amd8111e_priv *lp = netdev_priv(dev); - if(dev->irq ==0 || request_irq(dev->irq, amd8111e_interrupt, IRQF_SHARED, - dev->name, dev)) + if (dev->irq == 0 || request_irq(dev->irq, amd8111e_interrupt, + IRQF_SHARED, dev->name, dev)) return -EAGAIN; napi_enable(&lp->napi); @@ -1210,7 +1203,7 @@ static int amd8111e_open(struct net_device *dev) amd8111e_init_hw_default(lp); - if(amd8111e_restart(dev)){ + if (amd8111e_restart(dev)) { spin_unlock_irq(&lp->lock); napi_disable(&lp->napi); if (dev->irq) @@ -1218,7 +1211,7 @@ static int amd8111e_open(struct net_device *dev) return -ENOMEM; } /* Start ipg timer */ - if(lp->options & OPTION_DYN_IPG_ENABLE){ + if (lp->options & OPTION_DYN_IPG_ENABLE) { add_timer(&lp->ipg_data.ipg_timer); netdev_info(dev, "Dynamic IPG Enabled\n"); } @@ -1289,10 +1282,10 @@ static netdev_tx_t amd8111e_start_xmit(struct sk_buff *skb, lp->tx_idx++; /* Trigger an immediate send poll. */ - writel( VAL1 | TDMD0, lp->mmio + CMD0); - writel( VAL2 | RDMD0,lp->mmio + CMD0); + writel(VAL1 | TDMD0, lp->mmio + CMD0); + writel(VAL2 | RDMD0, lp->mmio + CMD0); - if(amd8111e_tx_queue_avail(lp) < 0){ + if (amd8111e_tx_queue_avail(lp) < 0) { netif_stop_queue(dev); } spin_unlock_irqrestore(&lp->lock, flags); @@ -1326,15 +1319,15 @@ static void amd8111e_set_multicast_list(struct net_device *dev) { struct netdev_hw_addr *ha; struct amd8111e_priv *lp = netdev_priv(dev); - u32 mc_filter[2] ; + u32 mc_filter[2]; int bit_num; - if(dev->flags & IFF_PROMISC){ - writel( VAL2 | PROM, lp->mmio + CMD2); + if (dev->flags & IFF_PROMISC) { + writel(VAL2 | PROM, lp->mmio + CMD2); return; } else - writel( PROM, lp->mmio + CMD2); + writel(PROM, lp->mmio + CMD2); if (dev->flags & IFF_ALLMULTI || netdev_mc_count(dev) > MAX_FILTER_SIZE) { /* get all multicast packet */ @@ -1439,7 +1432,7 @@ static int amd8111e_set_wol(struct net_device *dev, struct ethtool_wolinfo *wol_ if (wol_info->wolopts & WAKE_MAGIC) lp->options |= (OPTION_WOL_ENABLE | OPTION_WAKE_MAGIC_ENABLE); - else if(wol_info->wolopts & WAKE_PHY) + else if (wol_info->wolopts & WAKE_PHY) lp->options |= (OPTION_WOL_ENABLE | OPTION_WAKE_PHY_ENABLE); else @@ -1464,14 +1457,14 @@ static const struct ethtool_ops ops = { * gets/sets driver speed, gets memory mapped register values, forces * auto negotiation, sets/gets WOL options for ethtool application. */ -static int amd8111e_ioctl(struct net_device *dev , struct ifreq *ifr, int cmd) +static int amd8111e_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) { struct mii_ioctl_data *data = if_mii(ifr); struct amd8111e_priv *lp = netdev_priv(dev); int err; u32 mii_regval; - switch(cmd) { + switch (cmd) { case SIOCGMIIPHY: data->phy_id = lp->ext_phy_addr; @@ -1511,7 +1504,7 @@ static int amd8111e_set_mac_address(struct net_device *dev, void *p) spin_lock_irq(&lp->lock); /* Setting the MAC address to the device */ for (i = 0; i < ETH_ALEN; i++) - writeb( dev->dev_addr[i], lp->mmio + PADR + i ); + writeb(dev->dev_addr[i], lp->mmio + PADR + i); spin_unlock_irq(&lp->lock); @@ -1536,22 +1529,22 @@ static int amd8111e_change_mtu(struct net_device *dev, int new_mtu) spin_lock_irq(&lp->lock); - /* stop the chip */ + /* stop the chip */ writel(RUN, lp->mmio + CMD0); dev->mtu = new_mtu; err = amd8111e_restart(dev); spin_unlock_irq(&lp->lock); - if(!err) + if (!err) netif_start_queue(dev); return err; } static int amd8111e_enable_magicpkt(struct amd8111e_priv *lp) { - writel( VAL1|MPPLBA, lp->mmio + CMD3); - writel( VAL0|MPEN_SW, lp->mmio + CMD7); + writel(VAL1 | MPPLBA, lp->mmio + CMD3); + writel(VAL0 | MPEN_SW, lp->mmio + CMD7); /* To eliminate PCI posting bug */ readl(lp->mmio + CMD7); @@ -1562,7 +1555,7 @@ static int amd8111e_enable_link_change(struct amd8111e_priv *lp) { /* Adapter is already stoped/suspended/interrupt-disabled */ - writel(VAL0|LCMODE_SW,lp->mmio + CMD7); + writel(VAL0 | LCMODE_SW, lp->mmio + CMD7); /* To eliminate PCI posting bug */ readl(lp->mmio + CMD7); @@ -1584,7 +1577,7 @@ static void amd8111e_tx_timeout(struct net_device *dev, unsigned int txqueue) spin_lock_irq(&lp->lock); err = amd8111e_restart(dev); spin_unlock_irq(&lp->lock); - if(!err) + if (!err) netif_wake_queue(dev); } @@ -1605,22 +1598,21 @@ static int __maybe_unused amd8111e_suspend(struct device *dev_d) /* stop chip */ spin_lock_irq(&lp->lock); - if(lp->options & OPTION_DYN_IPG_ENABLE) + if (lp->options & OPTION_DYN_IPG_ENABLE) del_timer_sync(&lp->ipg_data.ipg_timer); amd8111e_stop_chip(lp); spin_unlock_irq(&lp->lock); - if(lp->options & OPTION_WOL_ENABLE){ + if (lp->options & OPTION_WOL_ENABLE) { /* enable wol */ - if(lp->options & OPTION_WAKE_MAGIC_ENABLE) + if (lp->options & OPTION_WAKE_MAGIC_ENABLE) amd8111e_enable_magicpkt(lp); - if(lp->options & OPTION_WAKE_PHY_ENABLE) + if (lp->options & OPTION_WAKE_PHY_ENABLE) amd8111e_enable_link_change(lp); device_set_wakeup_enable(dev_d, 1); - } - else{ + } else { device_set_wakeup_enable(dev_d, 0); } @@ -1640,7 +1632,7 @@ static int __maybe_unused amd8111e_resume(struct device *dev_d) spin_lock_irq(&lp->lock); amd8111e_restart(dev); /* Restart ipg timer */ - if(lp->options & OPTION_DYN_IPG_ENABLE) + if (lp->options & OPTION_DYN_IPG_ENABLE) mod_timer(&lp->ipg_data.ipg_timer, jiffies + IPG_CONVERGE_JIFFIES); spin_unlock_irq(&lp->lock); @@ -1657,14 +1649,14 @@ static void amd8111e_config_ipg(struct timer_list *t) unsigned int total_col_cnt; unsigned int tmp_ipg; - if(lp->link_config.duplex == DUPLEX_FULL){ + if (lp->link_config.duplex == DUPLEX_FULL) { ipg_data->ipg = DEFAULT_IPG; return; } - if(ipg_data->ipg_state == SSTATE){ + if (ipg_data->ipg_state == SSTATE) { - if(ipg_data->timer_tick == IPG_STABLE_TIME){ + if (ipg_data->timer_tick == IPG_STABLE_TIME) { ipg_data->timer_tick = 0; ipg_data->ipg = MIN_IPG - IPG_STEP; @@ -1676,7 +1668,7 @@ static void amd8111e_config_ipg(struct timer_list *t) ipg_data->timer_tick++; } - if(ipg_data->ipg_state == CSTATE){ + if (ipg_data->ipg_state == CSTATE) { /* Get the current collision count */ @@ -1684,10 +1676,10 @@ static void amd8111e_config_ipg(struct timer_list *t) amd8111e_read_mib(mmio, xmt_collisions); if ((total_col_cnt - prev_col_cnt) < - (ipg_data->diff_col_cnt)){ + (ipg_data->diff_col_cnt)) { ipg_data->diff_col_cnt = - total_col_cnt - prev_col_cnt ; + total_col_cnt - prev_col_cnt; ipg_data->ipg = ipg_data->current_ipg; } @@ -1696,7 +1688,7 @@ static void amd8111e_config_ipg(struct timer_list *t) if (ipg_data->current_ipg <= MAX_IPG) tmp_ipg = ipg_data->current_ipg; - else{ + else { tmp_ipg = ipg_data->ipg; ipg_data->ipg_state = SSTATE; } @@ -1748,24 +1740,24 @@ static int amd8111e_probe_one(struct pci_dev *pdev, const struct pci_device_id *ent) { int err, i; - unsigned long reg_addr,reg_len; + unsigned long reg_addr, reg_len; struct amd8111e_priv *lp; struct net_device *dev; err = pci_enable_device(pdev); - if(err){ + if (err) { dev_err(&pdev->dev, "Cannot enable new PCI device\n"); return err; } - if(!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)){ + if (!(pci_resource_flags(pdev, 0) & IORESOURCE_MEM)) { dev_err(&pdev->dev, "Cannot find PCI base address\n"); err = -ENODEV; goto err_disable_pdev; } err = pci_request_regions(pdev, MODULE_NAME); - if(err){ + if (err) { dev_err(&pdev->dev, "Cannot obtain PCI resources\n"); goto err_disable_pdev; } @@ -1798,7 +1790,7 @@ static int amd8111e_probe_one(struct pci_dev *pdev, SET_NETDEV_DEV(dev, &pdev->dev); #if AMD8111E_VLAN_TAG_USED - dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX ; + dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX; #endif lp = netdev_priv(dev); @@ -1821,16 +1813,16 @@ static int amd8111e_probe_one(struct pci_dev *pdev, /* Setting user defined parametrs */ lp->ext_phy_option = speed_duplex[card_idx]; - if(coalesce[card_idx]) + if (coalesce[card_idx]) lp->options |= OPTION_INTR_COAL_ENABLE; - if(dynamic_ipg[card_idx++]) + if (dynamic_ipg[card_idx++]) lp->options |= OPTION_DYN_IPG_ENABLE; /* Initialize driver entry points */ dev->netdev_ops = &amd8111e_netdev_ops; dev->ethtool_ops = &ops; - dev->irq =pdev->irq; + dev->irq = pdev->irq; dev->watchdog_timeo = AMD8111E_TX_TIMEOUT; dev->min_mtu = AMD8111E_MIN_MTU; dev->max_mtu = AMD8111E_MAX_MTU; @@ -1861,7 +1853,7 @@ static int amd8111e_probe_one(struct pci_dev *pdev, pci_set_drvdata(pdev, dev); /* Initialize software ipg timer */ - if(lp->options & OPTION_DYN_IPG_ENABLE){ + if (lp->options & OPTION_DYN_IPG_ENABLE) { timer_setup(&lp->ipg_data.ipg_timer, amd8111e_config_ipg, 0); lp->ipg_data.ipg_timer.expires = jiffies + IPG_CONVERGE_JIFFIES; @@ -1870,7 +1862,7 @@ static int amd8111e_probe_one(struct pci_dev *pdev, } /* display driver and device information */ - chip_version = (readl(lp->mmio + CHIPID) & 0xf0000000)>>28; + chip_version = (readl(lp->mmio + CHIPID) & 0xf0000000) >> 28; dev_info(&pdev->dev, "[ Rev %x ] PCI 10/100BaseT Ethernet %pM\n", chip_version, dev->dev_addr); if (lp->ext_phy_id) @@ -1879,7 +1871,7 @@ static int amd8111e_probe_one(struct pci_dev *pdev, else dev_info(&pdev->dev, "Couldn't detect MII PHY, assuming address 0x01\n"); - return 0; + return 0; err_free_dev: free_netdev(dev); @@ -1919,7 +1911,7 @@ MODULE_DEVICE_TABLE(pci, amd8111e_pci_tbl); static SIMPLE_DEV_PM_OPS(amd8111e_pm_ops, amd8111e_suspend, amd8111e_resume); static struct pci_driver amd8111e_driver = { - .name = MODULE_NAME, + .name = MODULE_NAME, .id_table = amd8111e_pci_tbl, .probe = amd8111e_probe_one, .remove = amd8111e_remove_one, From 3f6ebcffaf673490ec95024a8d6e67b890cc53e2 Mon Sep 17 00:00:00 2001 From: Yixing Liu Date: Wed, 31 Mar 2021 16:18:31 +0800 Subject: [PATCH 10/36] net: amd: correct some format issues There should be a blank line after declarations. Signed-off-by: Yixing Liu Signed-off-by: Weihang Li Signed-off-by: David S. Miller --- drivers/net/ethernet/amd/hplance.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/amd/hplance.c b/drivers/net/ethernet/amd/hplance.c index e10aceb2b7674..6784f8748638b 100644 --- a/drivers/net/ethernet/amd/hplance.c +++ b/drivers/net/ethernet/amd/hplance.c @@ -170,6 +170,7 @@ static void hplance_init(struct net_device *dev, struct dio_dev *d) static void hplance_writerap(void *priv, unsigned short value) { struct lance_private *lp = (struct lance_private *)priv; + do { out_be16(lp->base + HPLANCE_REGOFF + LANCE_RAP, value); } while ((in_8(lp->base + HPLANCE_STATUS) & LE_ACK) == 0); @@ -178,6 +179,7 @@ static void hplance_writerap(void *priv, unsigned short value) static void hplance_writerdp(void *priv, unsigned short value) { struct lance_private *lp = (struct lance_private *)priv; + do { out_be16(lp->base + HPLANCE_REGOFF + LANCE_RDP, value); } while ((in_8(lp->base + HPLANCE_STATUS) & LE_ACK) == 0); @@ -187,6 +189,7 @@ static unsigned short hplance_readrdp(void *priv) { struct lance_private *lp = (struct lance_private *)priv; __u16 value; + do { value = in_be16(lp->base + HPLANCE_REGOFF + LANCE_RDP); } while ((in_8(lp->base + HPLANCE_STATUS) & LE_ACK) == 0); From 1f78ff4ff7089b8265278d0bbf937fd8e5958dcf Mon Sep 17 00:00:00 2001 From: Yixing Liu Date: Wed, 31 Mar 2021 16:18:32 +0800 Subject: [PATCH 11/36] net: ocelot: fix a trailling format issue with block comments Use a tralling */ on a separate line for block comments. Signed-off-by: Yixing Liu Signed-off-by: Weihang Li Signed-off-by: David S. Miller --- drivers/net/ethernet/mscc/ocelot.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c index 1a36b416fd9b6..8d06ffaf318a7 100644 --- a/drivers/net/ethernet/mscc/ocelot.c +++ b/drivers/net/ethernet/mscc/ocelot.c @@ -484,7 +484,8 @@ void ocelot_adjust_link(struct ocelot *ocelot, int port, DEV_MAC_ENA_CFG_TX_ENA, DEV_MAC_ENA_CFG); /* Take MAC, Port, Phy (intern) and PCS (SGMII/Serdes) clock out of - * reset */ + * reset + */ ocelot_port_writel(ocelot_port, DEV_CLOCK_CFG_LINK_SPEED(speed), DEV_CLOCK_CFG); From 142c1d2ed96604ec09bbc4076d2a8d09271850d3 Mon Sep 17 00:00:00 2001 From: Yixing Liu Date: Wed, 31 Mar 2021 16:18:33 +0800 Subject: [PATCH 12/36] net: toshiba: fix the trailing format of some block comments Use a trailling */ on a separate line for block comments. Signed-off-by: Yixing Liu Signed-off-by: Weihang Li Signed-off-by: David S. Miller --- drivers/net/ethernet/toshiba/spider_net.c | 42 +++++++++++++++-------- drivers/net/ethernet/toshiba/tc35815.c | 3 +- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/toshiba/spider_net.c b/drivers/net/ethernet/toshiba/spider_net.c index d5a75ef7e3ca9..226a76633e65c 100644 --- a/drivers/net/ethernet/toshiba/spider_net.c +++ b/drivers/net/ethernet/toshiba/spider_net.c @@ -146,7 +146,8 @@ spider_net_read_phy(struct net_device *netdev, int mii_id, int reg) /* we don't use semaphores to wait for an SPIDER_NET_GPROPCMPINT * interrupt, as we poll for the completion of the read operation - * in spider_net_read_phy. Should take about 50 us */ + * in spider_net_read_phy. Should take about 50 us + */ do { readvalue = spider_net_read_reg(card, SPIDER_NET_GPCROPCMD); } while (readvalue & SPIDER_NET_GPREXEC); @@ -387,7 +388,8 @@ spider_net_prepare_rx_descr(struct spider_net_card *card, (~(SPIDER_NET_RXBUF_ALIGN - 1)); /* and we need to have it 128 byte aligned, therefore we allocate a - * bit more */ + * bit more + */ /* allocate an skb */ descr->skb = netdev_alloc_skb(card->netdev, bufsize + SPIDER_NET_RXBUF_ALIGN - 1); @@ -488,7 +490,8 @@ spider_net_refill_rx_chain(struct spider_net_card *card) /* one context doing the refill (and a second context seeing that * and omitting it) is ok. If called by NAPI, we'll be called again * as spider_net_decode_one_descr is called several times. If some - * interrupt calls us, the NAPI is about to clean up anyway. */ + * interrupt calls us, the NAPI is about to clean up anyway. + */ if (!spin_trylock_irqsave(&chain->lock, flags)) return; @@ -523,14 +526,16 @@ spider_net_alloc_rx_skbs(struct spider_net_card *card) /* Put at least one buffer into the chain. if this fails, * we've got a problem. If not, spider_net_refill_rx_chain - * will do the rest at the end of this function. */ + * will do the rest at the end of this function. + */ if (spider_net_prepare_rx_descr(card, chain->head)) goto error; else chain->head = chain->head->next; /* This will allocate the rest of the rx buffers; - * if not, it's business as usual later on. */ + * if not, it's business as usual later on. + */ spider_net_refill_rx_chain(card); spider_net_enable_rxdmac(card); return 0; @@ -706,7 +711,8 @@ spider_net_set_low_watermark(struct spider_net_card *card) int i; /* Measure the length of the queue. Measurement does not - * need to be precise -- does not need a lock. */ + * need to be precise -- does not need a lock. + */ while (descr != card->tx_chain.head) { status = descr->hwdescr->dmac_cmd_status & SPIDER_NET_DESCR_NOT_IN_USE; if (status == SPIDER_NET_DESCR_NOT_IN_USE) @@ -786,7 +792,8 @@ spider_net_release_tx_chain(struct spider_net_card *card, int brutal) /* fallthrough, if we release the descriptors * brutally (then we don't care about - * SPIDER_NET_DESCR_CARDOWNED) */ + * SPIDER_NET_DESCR_CARDOWNED) + */ fallthrough; case SPIDER_NET_DESCR_RESPONSE_ERROR: @@ -948,7 +955,8 @@ spider_net_pass_skb_up(struct spider_net_descr *descr, skb_put(skb, hwdescr->valid_size); /* the card seems to add 2 bytes of junk in front - * of the ethernet frame */ + * of the ethernet frame + */ #define SPIDER_MISALIGN 2 skb_pull(skb, SPIDER_MISALIGN); skb->protocol = eth_type_trans(skb, netdev); @@ -1382,7 +1390,8 @@ spider_net_handle_error_irq(struct spider_net_card *card, u32 status_reg, /* PHY read operation completed */ /* we don't use semaphores, as we poll for the completion * of the read operation in spider_net_read_phy. Should take - * about 50 us */ + * about 50 us + */ show_error = 0; break; case SPIDER_NET_GPWFFINT: @@ -1450,7 +1459,8 @@ spider_net_handle_error_irq(struct spider_net_card *card, u32 status_reg, { case SPIDER_NET_GTMFLLINT: /* TX RAM full may happen on a usual case. - * Logging is not needed. */ + * Logging is not needed. + */ show_error = 0; break; case SPIDER_NET_GRFDFLLINT: @@ -1694,7 +1704,8 @@ spider_net_enable_card(struct spider_net_card *card) { int i; /* the following array consists of (register),(value) pairs - * that are set in this function. A register of 0 ends the list */ + * that are set in this function. A register of 0 ends the list + */ u32 regs[][2] = { { SPIDER_NET_GRESUMINTNUM, 0 }, { SPIDER_NET_GREINTNUM, 0 }, @@ -1757,7 +1768,8 @@ spider_net_enable_card(struct spider_net_card *card) spider_net_write_reg(card, SPIDER_NET_ECMODE, SPIDER_NET_ECMODE_VALUE); /* set chain tail address for RX chains and - * enable DMA */ + * enable DMA + */ spider_net_enable_rxchtails(card); spider_net_enable_rxdmac(card); @@ -1995,7 +2007,8 @@ static void spider_net_link_phy(struct timer_list *t) case BCM54XX_UNKNOWN: /* copper, fiber with and without failed, - * retry from beginning */ + * retry from beginning + */ spider_net_setup_aneg(card); card->medium = BCM54XX_COPPER; break; @@ -2263,7 +2276,8 @@ spider_net_setup_netdev(struct spider_net_card *card) netdev->features |= NETIF_F_RXCSUM; netdev->features |= NETIF_F_IP_CSUM | NETIF_F_LLTX; /* some time: NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX | - * NETIF_F_HW_VLAN_CTAG_FILTER */ + * NETIF_F_HW_VLAN_CTAG_FILTER + */ /* MTU range: 64 - 2294 */ netdev->min_mtu = SPIDER_NET_MIN_MTU; diff --git a/drivers/net/ethernet/toshiba/tc35815.c b/drivers/net/ethernet/toshiba/tc35815.c index 7a6e5ff8e5d43..fedb2bf692612 100644 --- a/drivers/net/ethernet/toshiba/tc35815.c +++ b/drivers/net/ethernet/toshiba/tc35815.c @@ -1914,7 +1914,8 @@ tc35815_set_multicast_list(struct net_device *dev) if (dev->flags & IFF_PROMISC) { /* With some (all?) 100MHalf HUB, controller will hang - * if we enabled promiscuous mode before linkup... */ + * if we enabled promiscuous mode before linkup... + */ struct tc35815_local *lp = netdev_priv(dev); if (!lp->link) From 44d043b53d3867523960f79c6a909c976e15f3f7 Mon Sep 17 00:00:00 2001 From: Yangyang Li Date: Wed, 31 Mar 2021 16:18:34 +0800 Subject: [PATCH 13/36] net: lpc_eth: fix format warnings of block comments Fix the following format warning: 1. Block comments use * on subsequent lines 2. Block comments use a trailing */ on a separate line Signed-off-by: Yangyang Li Signed-off-by: Weihang Li Signed-off-by: David S. Miller --- drivers/net/ethernet/nxp/lpc_eth.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/nxp/lpc_eth.c b/drivers/net/ethernet/nxp/lpc_eth.c index d3cbb4215f5cf..e72fd33a214c1 100644 --- a/drivers/net/ethernet/nxp/lpc_eth.c +++ b/drivers/net/ethernet/nxp/lpc_eth.c @@ -1044,7 +1044,8 @@ static netdev_tx_t lpc_eth_hard_start_xmit(struct sk_buff *skb, if (pldat->num_used_tx_buffs >= (ENET_TX_DESC - 1)) { /* This function should never be called when there are no - buffers */ + * buffers + */ netif_stop_queue(ndev); spin_unlock_irq(&pldat->lock); WARN(1, "BUG! TX request when no free TX buffers!\n"); @@ -1318,7 +1319,8 @@ static int lpc_eth_drv_probe(struct platform_device *pdev) pldat->dma_buff_size = PAGE_ALIGN(pldat->dma_buff_size); /* Allocate a chunk of memory for the DMA ethernet buffers - and descriptors */ + * and descriptors + */ pldat->dma_buff_base_v = dma_alloc_coherent(dev, pldat->dma_buff_size, &dma_handle, @@ -1365,7 +1367,8 @@ static int lpc_eth_drv_probe(struct platform_device *pdev) __lpc_mii_mngt_reset(pldat); /* Force default PHY interface setup in chip, this will probably be - changed by the PHY driver */ + * changed by the PHY driver + */ pldat->link = 0; pldat->speed = 100; pldat->duplex = DUPLEX_FULL; From 1caf8d39c58f3f63193d02928c8dce3fa07cee52 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Mar 2021 10:52:05 -0700 Subject: [PATCH 14/36] inet: shrink inet_timewait_death_row by 48 bytes struct inet_timewait_death_row uses two cache lines, because we want tw_count to use a full cache line to avoid false sharing. Rework its definition and placement in netns_ipv4 so that: 1) We add 60 bytes of padding after tw_count to avoid false sharing, knowing that tcp_death_row will have ____cacheline_aligned_in_smp attribute. 2) We do not risk padding before tcp_death_row, because we move it at the beginning of netns_ipv4, even if new fields are added later. 3) We do not waste 48 bytes of padding after it. Note that I have not changed dccp. pahole result for struct netns_ipv4 before/after the patch : /* size: 832, cachelines: 13, members: 139 */ /* sum members: 721, holes: 12, sum holes: 95 */ /* padding: 16 */ /* paddings: 2, sum paddings: 55 */ -> /* size: 768, cachelines: 12, members: 139 */ /* sum members: 673, holes: 11, sum holes: 39 */ /* padding: 56 */ /* paddings: 2, sum paddings: 7 */ /* forced alignments: 1 */ Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 9c8dd424d79b1..1085ed4e0788d 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -32,14 +32,18 @@ struct inet_hashinfo; struct inet_timewait_death_row { atomic_t tw_count; + char tw_pad[L1_CACHE_BYTES - sizeof(atomic_t)]; - struct inet_hashinfo *hashinfo ____cacheline_aligned_in_smp; + struct inet_hashinfo *hashinfo; int sysctl_max_tw_buckets; }; struct tcp_fastopen_context; struct netns_ipv4 { + /* Please keep tcp_death_row at first field in netns_ipv4 */ + struct inet_timewait_death_row tcp_death_row ____cacheline_aligned_in_smp; + #ifdef CONFIG_SYSCTL struct ctl_table_header *forw_hdr; struct ctl_table_header *frags_hdr; @@ -175,7 +179,6 @@ struct netns_ipv4 { int sysctl_tcp_comp_sack_nr; unsigned long sysctl_tcp_comp_sack_delay_ns; unsigned long sysctl_tcp_comp_sack_slack_ns; - struct inet_timewait_death_row tcp_death_row; int sysctl_max_syn_backlog; int sysctl_tcp_fastopen; const struct tcp_congestion_ops __rcu *tcp_congestion_control; From 490f33c4e70431d0a4d01666a6525fdd43299cde Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Mar 2021 10:52:06 -0700 Subject: [PATCH 15/36] inet: shrink netns_ipv4 by another cache line By shuffling around some fields to remove 8 bytes of hole, we can save one cache line. pahole result before/after the patch : /* size: 768, cachelines: 12, members: 139 */ /* sum members: 673, holes: 11, sum holes: 39 */ /* padding: 56 */ /* paddings: 2, sum paddings: 7 */ /* forced alignments: 1 */ -> /* size: 704, cachelines: 11, members: 139 */ /* sum members: 673, holes: 10, sum holes: 31 */ /* paddings: 2, sum paddings: 7 */ /* forced alignments: 1 */ Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 1085ed4e0788d..538ed69919dc4 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -57,17 +57,17 @@ struct netns_ipv4 { struct mutex ra_mutex; #ifdef CONFIG_IP_MULTIPLE_TABLES struct fib_rules_ops *rules_ops; - bool fib_has_custom_rules; - unsigned int fib_rules_require_fldissect; struct fib_table __rcu *fib_main; struct fib_table __rcu *fib_default; + unsigned int fib_rules_require_fldissect; + bool fib_has_custom_rules; #endif bool fib_has_custom_local_routes; + bool fib_offload_disabled; #ifdef CONFIG_IP_ROUTE_CLASSID int fib_num_tclassid_users; #endif struct hlist_head *fib_table_hash; - bool fib_offload_disabled; struct sock *fibnl; struct sock * __percpu *icmp_sk; From b2908fac5b7b23c03fa1d3e1055ad95ba305c871 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Mar 2021 10:52:07 -0700 Subject: [PATCH 16/36] ipv4: convert fib_notify_on_flag_change sysctl to u8 Reduce footprint of sysctls. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 2 +- net/ipv4/sysctl_net_ipv4.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index 538ed69919dc4..b187ac597b8ce 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -191,7 +191,7 @@ struct netns_ipv4 { int sysctl_udp_wmem_min; int sysctl_udp_rmem_min; - int sysctl_fib_notify_on_flag_change; + u8 sysctl_fib_notify_on_flag_change; #ifdef CONFIG_NET_L3_MASTER_DEV int sysctl_udp_l3mdev_accept; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 9199f507a005e..a2352d8d88cc9 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1364,9 +1364,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "fib_notify_on_flag_change", .data = &init_net.ipv4.sysctl_fib_notify_on_flag_change, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &two, }, From cd04bd022258f4aa6e8392c8133dbbf31da0f12f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Mar 2021 10:52:08 -0700 Subject: [PATCH 17/36] ipv4: convert udp_l3mdev_accept sysctl to u8 Reduce footprint of sysctls. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 2 +- net/ipv4/sysctl_net_ipv4.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index b187ac597b8ce..d309b1b897158 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -194,7 +194,7 @@ struct netns_ipv4 { u8 sysctl_fib_notify_on_flag_change; #ifdef CONFIG_NET_L3_MASTER_DEV - int sysctl_udp_l3mdev_accept; + u8 sysctl_udp_l3mdev_accept; #endif int sysctl_igmp_max_memberships; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index a2352d8d88cc9..1b6ce649a433c 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -1065,9 +1065,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "udp_l3mdev_accept", .data = &init_net.ipv4.sysctl_udp_l3mdev_accept, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, From be205fe6ec4ffd6875f69e61205163fb686a5c74 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Mar 2021 10:52:09 -0700 Subject: [PATCH 18/36] ipv4: convert fib_multipath_{use_neigh|hash_policy} sysctls to u8 Make room for better packing of netns_ipv4 Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 4 ++-- net/ipv4/sysctl_net_ipv4.c | 8 ++++---- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index d309b1b897158..eb6ca07d3b0f5 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -220,8 +220,8 @@ struct netns_ipv4 { #endif #endif #ifdef CONFIG_IP_ROUTE_MULTIPATH - int sysctl_fib_multipath_use_neigh; - int sysctl_fib_multipath_hash_policy; + u8 sysctl_fib_multipath_use_neigh; + u8 sysctl_fib_multipath_hash_policy; #endif struct fib_notifier_ops *notifier_ops; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 1b6ce649a433c..ad75d6bb2df7f 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -456,7 +456,7 @@ static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, ipv4.sysctl_fib_multipath_hash_policy); int ret; - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV4_MPATH_HASH_UPDATE, net); @@ -1038,16 +1038,16 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "fib_multipath_use_neigh", .data = &init_net.ipv4.sysctl_fib_multipath_use_neigh, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, { .procname = "fib_multipath_hash_policy", .data = &init_net.ipv4.sysctl_fib_multipath_hash_policy, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_fib_multipath_hash_policy, .extra1 = SYSCTL_ZERO, From 7d4b37ebb934aa32a54666fe9153d127c33ff89a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Mar 2021 10:52:10 -0700 Subject: [PATCH 19/36] ipv4: convert igmp_link_local_mcast_reports sysctl to u8 This sysctl is a bool, can use less storage. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 2 +- net/ipv4/sysctl_net_ipv4.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index eb6ca07d3b0f5..fafcedf643832 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -197,9 +197,9 @@ struct netns_ipv4 { u8 sysctl_udp_l3mdev_accept; #endif + u8 sysctl_igmp_llm_reports; int sysctl_igmp_max_memberships; int sysctl_igmp_max_msf; - int sysctl_igmp_llm_reports; int sysctl_igmp_qrv; struct ping_group_range ping_group_range; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index ad75d6bb2df7f..fd2b35065bb21 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -848,9 +848,9 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "igmp_link_local_mcast_reports", .data = &init_net.ipv4.sysctl_igmp_llm_reports, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "igmp_max_memberships", From 1c3289c931740f235b29be5182e5f2dfb004593d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Mar 2021 10:52:11 -0700 Subject: [PATCH 20/36] tcp: convert tcp_comp_sack_nr sysctl to u8 tcp_comp_sack_nr max value was already 255. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv4.h | 2 +- net/ipv4/sysctl_net_ipv4.c | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h index fafcedf643832..87e1612497eae 100644 --- a/include/net/netns/ipv4.h +++ b/include/net/netns/ipv4.h @@ -171,12 +171,12 @@ struct netns_ipv4 { u8 sysctl_tcp_min_tso_segs; u8 sysctl_tcp_autocorking; u8 sysctl_tcp_reflect_tos; + u8 sysctl_tcp_comp_sack_nr; int sysctl_tcp_invalid_ratelimit; int sysctl_tcp_pacing_ss_ratio; int sysctl_tcp_pacing_ca_ratio; int sysctl_tcp_wmem[3]; int sysctl_tcp_rmem[3]; - int sysctl_tcp_comp_sack_nr; unsigned long sysctl_tcp_comp_sack_delay_ns; unsigned long sysctl_tcp_comp_sack_slack_ns; int sysctl_max_syn_backlog; diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index fd2b35065bb21..a09e466ce11d0 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -46,7 +46,6 @@ static int tcp_syn_retries_min = 1; static int tcp_syn_retries_max = MAX_TCP_SYNCNT; static int ip_ping_group_range_min[] = { 0, 0 }; static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX }; -static int comp_sack_nr_max = 255; static u32 u32_max_div_HZ = UINT_MAX / HZ; static int one_day_secs = 24 * 3600; @@ -1330,11 +1329,10 @@ static struct ctl_table ipv4_net_table[] = { { .procname = "tcp_comp_sack_nr", .data = &init_net.ipv4.sysctl_tcp_comp_sack_nr, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, - .extra2 = &comp_sack_nr_max, }, { .procname = "tcp_reflect_tos", From a6175633a2af0eae07127311563d2a75096c111a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Mar 2021 10:52:12 -0700 Subject: [PATCH 21/36] ipv6: convert elligible sysctls to u8 Convert most sysctls that can fit in a byte. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv6.h | 24 ++++++++++++------------ net/ipv6/icmp.c | 12 ++++++------ net/ipv6/sysctl_net_ipv6.c | 38 ++++++++++++++++++-------------------- 3 files changed, 36 insertions(+), 38 deletions(-) diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 21c0debbd39ee..84f4a8bec397d 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -20,7 +20,6 @@ struct netns_sysctl_ipv6 { struct ctl_table_header *frags_hdr; struct ctl_table_header *xfrm6_hdr; #endif - int bindv6only; int flush_delay; int ip6_rt_max_size; int ip6_rt_gc_min_interval; @@ -29,21 +28,22 @@ struct netns_sysctl_ipv6 { int ip6_rt_gc_elasticity; int ip6_rt_mtu_expires; int ip6_rt_min_advmss; - int multipath_hash_policy; - int flowlabel_consistency; - int auto_flowlabels; + u8 bindv6only; + u8 multipath_hash_policy; + u8 flowlabel_consistency; + u8 auto_flowlabels; int icmpv6_time; - int icmpv6_echo_ignore_all; - int icmpv6_echo_ignore_multicast; - int icmpv6_echo_ignore_anycast; + u8 icmpv6_echo_ignore_all; + u8 icmpv6_echo_ignore_multicast; + u8 icmpv6_echo_ignore_anycast; DECLARE_BITMAP(icmpv6_ratemask, ICMPV6_MSG_MAX + 1); unsigned long *icmpv6_ratemask_ptr; - int anycast_src_echo_reply; - int ip_nonlocal_bind; - int fwmark_reflect; + u8 anycast_src_echo_reply; + u8 ip_nonlocal_bind; + u8 fwmark_reflect; + u8 flowlabel_state_ranges; int idgen_retries; int idgen_delay; - int flowlabel_state_ranges; int flowlabel_reflect; int max_dst_opts_cnt; int max_hbh_opts_cnt; @@ -51,7 +51,7 @@ struct netns_sysctl_ipv6 { int max_hbh_opts_len; int seg6_flowlabel; bool skip_notify_on_dev_down; - int fib_notify_on_flag_change; + u8 fib_notify_on_flag_change; }; struct netns_ipv6 { diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 29d38d6b55fb3..1bca2b09d77e6 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -1169,23 +1169,23 @@ static struct ctl_table ipv6_icmp_table_template[] = { { .procname = "echo_ignore_all", .data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_all, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "echo_ignore_multicast", .data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_multicast, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "echo_ignore_anycast", .data = &init_net.ipv6.sysctl.icmpv6_echo_ignore_anycast, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dou8vec_minmax, }, { .procname = "ratemask", diff --git a/net/ipv6/sysctl_net_ipv6.c b/net/ipv6/sysctl_net_ipv6.c index 263ab43ed06bc..27102c3d6e1da 100644 --- a/net/ipv6/sysctl_net_ipv6.c +++ b/net/ipv6/sysctl_net_ipv6.c @@ -23,7 +23,6 @@ static int two = 2; static int flowlabel_reflect_max = 0x7; -static int auto_flowlabels_min; static int auto_flowlabels_max = IP6_AUTO_FLOW_LABEL_MAX; static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, @@ -34,7 +33,7 @@ static int proc_rt6_multipath_hash_policy(struct ctl_table *table, int write, net = container_of(table->data, struct net, ipv6.sysctl.multipath_hash_policy); - ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); + ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); if (write && ret == 0) call_netevent_notifiers(NETEVENT_IPV6_MPATH_HASH_UPDATE, net); @@ -45,39 +44,38 @@ static struct ctl_table ipv6_table_template[] = { { .procname = "bindv6only", .data = &init_net.ipv6.sysctl.bindv6only, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "anycast_src_echo_reply", .data = &init_net.ipv6.sysctl.anycast_src_echo_reply, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "flowlabel_consistency", .data = &init_net.ipv6.sysctl.flowlabel_consistency, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "auto_flowlabels", .data = &init_net.ipv6.sysctl.auto_flowlabels, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, - .extra1 = &auto_flowlabels_min, + .proc_handler = proc_dou8vec_minmax, .extra2 = &auto_flowlabels_max }, { .procname = "fwmark_reflect", .data = &init_net.ipv6.sysctl.fwmark_reflect, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "idgen_retries", @@ -96,16 +94,16 @@ static struct ctl_table ipv6_table_template[] = { { .procname = "flowlabel_state_ranges", .data = &init_net.ipv6.sysctl.flowlabel_state_ranges, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "ip_nonlocal_bind", .data = &init_net.ipv6.sysctl.ip_nonlocal_bind, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec + .proc_handler = proc_dou8vec_minmax, }, { .procname = "flowlabel_reflect", @@ -147,7 +145,7 @@ static struct ctl_table ipv6_table_template[] = { { .procname = "fib_multipath_hash_policy", .data = &init_net.ipv6.sysctl.multipath_hash_policy, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, .proc_handler = proc_rt6_multipath_hash_policy, .extra1 = SYSCTL_ZERO, @@ -163,9 +161,9 @@ static struct ctl_table ipv6_table_template[] = { { .procname = "fib_notify_on_flag_change", .data = &init_net.ipv6.sysctl.fib_notify_on_flag_change, - .maxlen = sizeof(int), + .maxlen = sizeof(u8), .mode = 0644, - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dou8vec_minmax, .extra1 = SYSCTL_ZERO, .extra2 = &two, }, From 0dd39d952f75a678b2ebcac8bd60f449f303c755 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Mar 2021 10:52:13 -0700 Subject: [PATCH 22/36] ipv6: move ip6_dst_ops first in netns_ipv6 ip6_dst_ops have cache line alignement. Moving it at beginning of netns_ipv6 removes a 48 byte hole, and shrinks netns_ipv6 from 12 to 11 cache lines. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/net/netns/ipv6.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/net/netns/ipv6.h b/include/net/netns/ipv6.h index 84f4a8bec397d..808f0f79ea9c9 100644 --- a/include/net/netns/ipv6.h +++ b/include/net/netns/ipv6.h @@ -55,6 +55,9 @@ struct netns_sysctl_ipv6 { }; struct netns_ipv6 { + /* Keep ip6_dst_ops at the beginning of netns_sysctl_ipv6 */ + struct dst_ops ip6_dst_ops; + struct netns_sysctl_ipv6 sysctl; struct ipv6_devconf *devconf_all; struct ipv6_devconf *devconf_dflt; @@ -76,7 +79,6 @@ struct netns_ipv6 { struct hlist_head *fib_table_hash; struct fib6_table *fib6_main_tbl; struct list_head fib6_walkers; - struct dst_ops ip6_dst_ops; rwlock_t fib6_walker_lock; spinlock_t fib6_gc_lock; unsigned int ip6_rt_gc_expire; From ac1db7acea67777be1ba86e36e058c479eab6508 Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Wed, 31 Mar 2021 16:36:02 +0800 Subject: [PATCH 23/36] net/tipc: fix missing destroy_workqueue() on error in tipc_crypto_start() Add the missing destroy_workqueue() before return from tipc_crypto_start() in the error handling case. Fixes: 1ef6f7c9390f ("tipc: add automatic session key exchange") Reported-by: Hulk Robot Signed-off-by: Yang Yingliang Signed-off-by: David S. Miller --- net/tipc/crypto.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index 6f64acef73dce..76b8428c94a75 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -1492,6 +1492,8 @@ int tipc_crypto_start(struct tipc_crypto **crypto, struct net *net, /* Allocate statistic structure */ c->stats = alloc_percpu_gfp(struct tipc_crypto_stats, GFP_ATOMIC); if (!c->stats) { + if (c->wq) + destroy_workqueue(c->wq); kfree_sensitive(c); return -ENOMEM; } From 0d7a7b2014b1a499a0fe24c9f3063d7856b5aaaf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 31 Mar 2021 14:38:11 -0700 Subject: [PATCH 24/36] ipv6: remove extra dev_hold() for fallback tunnels My previous commits added a dev_hold() in tunnels ndo_init(), but forgot to remove it from special functions setting up fallback tunnels. Fallback tunnels do call their respective ndo_init() This leads to various reports like : unregister_netdevice: waiting for ip6gre0 to become free. Usage count = 2 Fixes: 48bb5697269a ("ip6_tunnel: sit: proper dev_{hold|put} in ndo_[un]init methods") Fixes: 6289a98f0817 ("sit: proper dev_{hold|put} in ndo_[un]init methods") Fixes: 40cb881b5aaa ("ip6_vti: proper dev_{hold|put} in ndo_[un]init methods") Fixes: 7f700334be9a ("ip6_gre: proper dev_{hold|put} in ndo_[un]init methods") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller --- net/ipv6/ip6_gre.c | 3 --- net/ipv6/ip6_tunnel.c | 1 - net/ipv6/ip6_vti.c | 1 - net/ipv6/sit.c | 1 - 4 files changed, 6 deletions(-) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 9689bf9f46f34..bc224f917bbd5 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -387,7 +387,6 @@ static struct ip6_tnl *ip6gre_tunnel_locate(struct net *net, if (!(nt->parms.o_flags & TUNNEL_SEQ)) dev->features |= NETIF_F_LLTX; - dev_hold(dev); ip6gre_tunnel_link(ign, nt); return nt; @@ -1539,8 +1538,6 @@ static void ip6gre_fb_tunnel_init(struct net_device *dev) strcpy(tunnel->parms.name, dev->name); tunnel->hlen = sizeof(struct ipv6hdr) + 4; - - dev_hold(dev); } static struct inet6_protocol ip6gre_protocol __read_mostly = { diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 67ee9d58ec5ef..07a0a06a9b52b 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1925,7 +1925,6 @@ static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev) struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); t->parms.proto = IPPROTO_IPV6; - dev_hold(dev); rcu_assign_pointer(ip6n->tnls_wc[0], t); return 0; diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index a018afdb3e062..856e46ad0895b 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -963,7 +963,6 @@ static int __net_init vti6_fb_tnl_dev_init(struct net_device *dev) struct vti6_net *ip6n = net_generic(net, vti6_net_id); t->parms.proto = IPPROTO_IPV6; - dev_hold(dev); rcu_assign_pointer(ip6n->tnls_wc[0], t); return 0; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 488d3181aec3a..ff2ca2e7c7f50 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1470,7 +1470,6 @@ static void __net_init ipip6_fb_tunnel_init(struct net_device *dev) iph->ihl = 5; iph->ttl = 64; - dev_hold(dev); rcu_assign_pointer(sitn->tunnels_wc[0], tunnel); } From 2fa423f5f0c6891effd4d5c8bdb91d418001da11 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 31 Mar 2021 23:08:49 +0300 Subject: [PATCH 25/36] net: enetc: consume the error RX buffer descriptors in a dedicated function We can and should check the RX BD errors before starting to build the skb. The only apparent reason why things are done in this backwards order is to spare one call to enetc_rxbd_next. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 43 ++++++++++++-------- 1 file changed, 27 insertions(+), 16 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 5a54976e6a281..362cfba7ce140 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -605,6 +605,28 @@ static void enetc_add_rx_buff_to_skb(struct enetc_bdr *rx_ring, int i, enetc_put_rx_buff(rx_ring, rx_swbd); } +static bool enetc_check_bd_errors_and_consume(struct enetc_bdr *rx_ring, + u32 bd_status, + union enetc_rx_bd **rxbd, int *i) +{ + if (likely(!(bd_status & ENETC_RXBD_LSTATUS(ENETC_RXBD_ERR_MASK)))) + return false; + + enetc_rxbd_next(rx_ring, rxbd, i); + + while (!(bd_status & ENETC_RXBD_LSTATUS_F)) { + dma_rmb(); + bd_status = le32_to_cpu((*rxbd)->r.lstatus); + + enetc_rxbd_next(rx_ring, rxbd, i); + } + + rx_ring->ndev->stats.rx_dropped++; + rx_ring->ndev->stats.rx_errors++; + + return true; +} + #define ENETC_RXBD_BUNDLE 16 /* # of BDs to update at once */ static int enetc_clean_rx_ring(struct enetc_bdr *rx_ring, @@ -634,6 +656,11 @@ static int enetc_clean_rx_ring(struct enetc_bdr *rx_ring, enetc_wr_reg_hot(rx_ring->idr, BIT(rx_ring->index)); dma_rmb(); /* for reading other rxbd fields */ + + if (enetc_check_bd_errors_and_consume(rx_ring, bd_status, + &rxbd, &i)) + break; + size = le16_to_cpu(rxbd->r.buf_len); skb = enetc_map_rx_buff_to_skb(rx_ring, i, size); if (!skb) @@ -645,22 +672,6 @@ static int enetc_clean_rx_ring(struct enetc_bdr *rx_ring, enetc_rxbd_next(rx_ring, &rxbd, &i); - if (unlikely(bd_status & - ENETC_RXBD_LSTATUS(ENETC_RXBD_ERR_MASK))) { - dev_kfree_skb(skb); - while (!(bd_status & ENETC_RXBD_LSTATUS_F)) { - dma_rmb(); - bd_status = le32_to_cpu(rxbd->r.lstatus); - - enetc_rxbd_next(rx_ring, &rxbd, &i); - } - - rx_ring->ndev->stats.rx_dropped++; - rx_ring->ndev->stats.rx_errors++; - - break; - } - /* not last BD in frame? */ while (!(bd_status & ENETC_RXBD_LSTATUS_F)) { bd_status = le32_to_cpu(rxbd->r.lstatus); From a800abd3ecb9acc55821f7ac9bba6c956b36a595 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 31 Mar 2021 23:08:50 +0300 Subject: [PATCH 26/36] net: enetc: move skb creation into enetc_build_skb We need to build an skb from two code paths now: from the plain RX data path and from the XDP data path when the verdict is XDP_PASS. Create a new enetc_build_skb function which contains the essential steps for building an skb based on the first and last positions of buffer descriptors within the RX ring. We also squash the enetc_process_skb function into enetc_build_skb, because what that function did wasn't very meaningful on its own. The "rx_frm_cnt++" instruction has been moved around napi_gro_receive for cosmetic reasons, to be in the same spot as rx_byte_cnt++, which itself must be before napi_gro_receive, because that's when we lose ownership of the skb. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 81 +++++++++++--------- 1 file changed, 44 insertions(+), 37 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 362cfba7ce140..b2071b8dc3168 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -513,13 +513,6 @@ static void enetc_get_offloads(struct enetc_bdr *rx_ring, #endif } -static void enetc_process_skb(struct enetc_bdr *rx_ring, - struct sk_buff *skb) -{ - skb_record_rx_queue(skb, rx_ring->index); - skb->protocol = eth_type_trans(skb, rx_ring->ndev); -} - static bool enetc_page_reusable(struct page *page) { return (!page_is_pfmemalloc(page) && page_ref_count(page) == 1); @@ -627,6 +620,47 @@ static bool enetc_check_bd_errors_and_consume(struct enetc_bdr *rx_ring, return true; } +static struct sk_buff *enetc_build_skb(struct enetc_bdr *rx_ring, + u32 bd_status, union enetc_rx_bd **rxbd, + int *i, int *cleaned_cnt) +{ + struct sk_buff *skb; + u16 size; + + size = le16_to_cpu((*rxbd)->r.buf_len); + skb = enetc_map_rx_buff_to_skb(rx_ring, *i, size); + if (!skb) + return NULL; + + enetc_get_offloads(rx_ring, *rxbd, skb); + + (*cleaned_cnt)++; + + enetc_rxbd_next(rx_ring, rxbd, i); + + /* not last BD in frame? */ + while (!(bd_status & ENETC_RXBD_LSTATUS_F)) { + bd_status = le32_to_cpu((*rxbd)->r.lstatus); + size = ENETC_RXB_DMA_SIZE; + + if (bd_status & ENETC_RXBD_LSTATUS_F) { + dma_rmb(); + size = le16_to_cpu((*rxbd)->r.buf_len); + } + + enetc_add_rx_buff_to_skb(rx_ring, *i, size, skb); + + (*cleaned_cnt)++; + + enetc_rxbd_next(rx_ring, rxbd, i); + } + + skb_record_rx_queue(skb, rx_ring->index); + skb->protocol = eth_type_trans(skb, rx_ring->ndev); + + return skb; +} + #define ENETC_RXBD_BUNDLE 16 /* # of BDs to update at once */ static int enetc_clean_rx_ring(struct enetc_bdr *rx_ring, @@ -643,7 +677,6 @@ static int enetc_clean_rx_ring(struct enetc_bdr *rx_ring, union enetc_rx_bd *rxbd; struct sk_buff *skb; u32 bd_status; - u16 size; if (cleaned_cnt >= ENETC_RXBD_BUNDLE) cleaned_cnt -= enetc_refill_rx_ring(rx_ring, @@ -661,41 +694,15 @@ static int enetc_clean_rx_ring(struct enetc_bdr *rx_ring, &rxbd, &i)) break; - size = le16_to_cpu(rxbd->r.buf_len); - skb = enetc_map_rx_buff_to_skb(rx_ring, i, size); + skb = enetc_build_skb(rx_ring, bd_status, &rxbd, &i, + &cleaned_cnt); if (!skb) break; - enetc_get_offloads(rx_ring, rxbd, skb); - - cleaned_cnt++; - - enetc_rxbd_next(rx_ring, &rxbd, &i); - - /* not last BD in frame? */ - while (!(bd_status & ENETC_RXBD_LSTATUS_F)) { - bd_status = le32_to_cpu(rxbd->r.lstatus); - size = ENETC_RXB_DMA_SIZE; - - if (bd_status & ENETC_RXBD_LSTATUS_F) { - dma_rmb(); - size = le16_to_cpu(rxbd->r.buf_len); - } - - enetc_add_rx_buff_to_skb(rx_ring, i, size, skb); - - cleaned_cnt++; - - enetc_rxbd_next(rx_ring, &rxbd, &i); - } - rx_byte_cnt += skb->len; - - enetc_process_skb(rx_ring, skb); + rx_frm_cnt++; napi_gro_receive(napi, skb); - - rx_frm_cnt++; } rx_ring->next_to_clean = i; From d504498d2eb3bfcbef4ddf3f51eb9f1391c8149f Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 31 Mar 2021 23:08:51 +0300 Subject: [PATCH 27/36] net: enetc: add a dedicated is_eof bit in the TX software BD In the transmit path, if we have a scatter/gather frame, it is put into multiple software buffer descriptors, the last of which has the skb pointer populated (which is necessary for rearming the TX MSI vector and for collecting the two-step TX timestamp from the TX confirmation path). At the moment, this is sufficient, but with XDP_TX, we'll need to service TX software buffer descriptors that don't have an skb pointer, however they might be final nonetheless. So add a dedicated bit for final software BDs that we populate and check explicitly. Also, we keep looking just for an skb when doing TX timestamping, because we don't want/need that for XDP. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 7 +++---- drivers/net/ethernet/freescale/enetc/enetc.h | 1 + 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index b2071b8dc3168..37d2d142a7449 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -157,6 +157,7 @@ static int enetc_map_tx_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb, temp_bd.flags = flags; *txbd = temp_bd; + tx_ring->tx_swbd[i].is_eof = true; tx_ring->tx_swbd[i].skb = skb; enetc_bdr_idx_inc(tx_ring, &i); @@ -316,8 +317,6 @@ static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget) do_tstamp = false; while (bds_to_clean && tx_frm_cnt < ENETC_DEFAULT_TX_WORK) { - bool is_eof = !!tx_swbd->skb; - if (unlikely(tx_swbd->check_wb)) { struct enetc_ndev_priv *priv = netdev_priv(ndev); union enetc_tx_bd *txbd; @@ -335,7 +334,7 @@ static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget) if (likely(tx_swbd->dma)) enetc_unmap_tx_buff(tx_ring, tx_swbd); - if (is_eof) { + if (tx_swbd->skb) { if (unlikely(do_tstamp)) { enetc_tstamp_tx(tx_swbd->skb, tstamp); do_tstamp = false; @@ -355,7 +354,7 @@ static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget) } /* BD iteration loop end */ - if (is_eof) { + if (tx_swbd->is_eof) { tx_frm_cnt++; /* re-arm interrupt source */ enetc_wr_reg_hot(tx_ring->idr, BIT(tx_ring->index) | diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h index 773e412b9f4ea..d9e75644b89c2 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.h +++ b/drivers/net/ethernet/freescale/enetc/enetc.h @@ -25,6 +25,7 @@ struct enetc_tx_swbd { u8 is_dma_page:1; u8 check_wb:1; u8 do_tstamp:1; + u8 is_eof:1; }; #define ENETC_RX_MAXFRM_SIZE ENETC_MAC_MAXFRM_SIZE From 1ee8d6f3bebbdaa7692732c91685b27ae4c612be Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 31 Mar 2021 23:08:52 +0300 Subject: [PATCH 28/36] net: enetc: clean the TX software BD on the TX confirmation path With the future introduction of some new fields into enetc_tx_swbd such as is_xdp_tx, is_xdp_redirect etc, we need not only to set these bits to true from the XDP_TX/XDP_REDIRECT code path, but also to false from the old code paths. This is because TX software buffer descriptors are kept in a ring that is shadow of the hardware TX ring, so these structures keep getting reused, and there is always the possibility that when a software BD is reused (after we ran a full circle through the TX ring), the old user of the tx_swbd had set is_xdp_tx = true, and now we are sending a regular skb, which would need to set is_xdp_tx = false. To be minimally invasive to the old code paths, let's just scrub the software TX BD in the TX confirmation path (enetc_clean_tx_ring), once we know that nobody uses this software TX BD (tx_ring->next_to_clean hasn't yet been updated, and the TX paths check enetc_bd_unused which tells them if there's any more space in the TX ring for a new enqueue). Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 37d2d142a7449..ade05518b4960 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -344,6 +344,10 @@ static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget) } tx_byte_cnt += tx_swbd->len; + /* Scrub the swbd here so we don't have to do that + * when we reuse it during xmit + */ + memset(tx_swbd, 0, sizeof(*tx_swbd)); bds_to_clean--; tx_swbd++; From 65d0cbb414cee012ceee9991d09f5e7c30b49fcc Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 31 Mar 2021 23:08:53 +0300 Subject: [PATCH 29/36] net: enetc: move up enetc_reuse_page and enetc_page_reusable For XDP_TX, we need to call enetc_reuse_page from enetc_clean_tx_ring, so we need to avoid a forward declaration. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 38 ++++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index ade05518b4960..38301d0d7f0c5 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -275,6 +275,25 @@ static int enetc_bd_ready_count(struct enetc_bdr *tx_ring, int ci) return pi >= ci ? pi - ci : tx_ring->bd_count - ci + pi; } +static bool enetc_page_reusable(struct page *page) +{ + return (!page_is_pfmemalloc(page) && page_ref_count(page) == 1); +} + +static void enetc_reuse_page(struct enetc_bdr *rx_ring, + struct enetc_rx_swbd *old) +{ + struct enetc_rx_swbd *new; + + new = &rx_ring->rx_swbd[rx_ring->next_to_alloc]; + + /* next buf that may reuse a page */ + enetc_bdr_idx_inc(rx_ring, &rx_ring->next_to_alloc); + + /* copy page reference */ + *new = *old; +} + static void enetc_get_tx_tstamp(struct enetc_hw *hw, union enetc_tx_bd *txbd, u64 *tstamp) { @@ -516,25 +535,6 @@ static void enetc_get_offloads(struct enetc_bdr *rx_ring, #endif } -static bool enetc_page_reusable(struct page *page) -{ - return (!page_is_pfmemalloc(page) && page_ref_count(page) == 1); -} - -static void enetc_reuse_page(struct enetc_bdr *rx_ring, - struct enetc_rx_swbd *old) -{ - struct enetc_rx_swbd *new; - - new = &rx_ring->rx_swbd[rx_ring->next_to_alloc]; - - /* next buf that may reuse a page */ - enetc_bdr_idx_inc(rx_ring, &rx_ring->next_to_alloc); - - /* copy page reference */ - *new = *old; -} - static struct enetc_rx_swbd *enetc_get_rx_buff(struct enetc_bdr *rx_ring, int i, u16 size) { From d1b15102dd16adc17fd5e4db8a485e6459f98906 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 31 Mar 2021 23:08:54 +0300 Subject: [PATCH 30/36] net: enetc: add support for XDP_DROP and XDP_PASS For the RX ring, enetc uses an allocation scheme based on pages split into two buffers, which is already very efficient in terms of preventing reallocations / maximizing reuse, so I see no reason why I would change that. +--------+--------+--------+--------+--------+--------+--------+ | | | | | | | | | half B | half B | half B | half B | half B | half B | half B | | | | | | | | | +--------+--------+--------+--------+--------+--------+--------+ | | | | | | | | | half A | half A | half A | half A | half A | half A | half A | RX ring | | | | | | | | +--------+--------+--------+--------+--------+--------+--------+ ^ ^ | | next_to_clean next_to_alloc next_to_use +--------+--------+--------+--------+--------+ | | | | | | | half B | half B | half B | half B | half B | | | | | | | +--------+--------+--------+--------+--------+--------+--------+ | | | | | | | | | half B | half B | half A | half A | half A | half A | half A | RX ring | | | | | | | | +--------+--------+--------+--------+--------+--------+--------+ | | | ^ ^ | half A | half A | | | | | | next_to_clean next_to_use +--------+--------+ ^ | next_to_alloc then when enetc_refill_rx_ring is called, whose purpose is to advance next_to_use, it sees that it can take buffers up to next_to_alloc, and it says "oh, hey, rx_swbd->page isn't NULL, I don't need to allocate one!". The only problem is that for default PAGE_SIZE values of 4096, buffer sizes are 2048 bytes. While this is enough for normal skb allocations at an MTU of 1500 bytes, for XDP it isn't, because the XDP headroom is 256 bytes, and including skb_shared_info and alignment, we end up being able to make use of only 1472 bytes, which is insufficient for the default MTU. To solve that problem, we implement scatter/gather processing in the driver, because we would really like to keep the existing allocation scheme. A packet of 1500 bytes is received in a buffer of 1472 bytes and another one of 28 bytes. Because the headroom required by XDP is different (and much larger) than the one required by the network stack, whenever a BPF program is added or deleted on the port, we drain the existing RX buffers and seed new ones with the required headroom. We also keep the required headroom in rx_ring->buffer_offset. The simplest way to implement XDP_PASS, where an skb must be created, is to create an xdp_buff based on the next_to_clean RX BDs, but not clear those BDs from the RX ring yet, just keep the original index at which the BDs for this frame started. Then, if the verdict is XDP_PASS, instead of converting the xdb_buff to an skb, we replay a call to enetc_build_skb (just as in the normal enetc_clean_rx_ring case), starting from the original BD index. We would also like to be minimally invasive to the regular RX data path, and not check whether there is a BPF program attached to the ring on every packet. So we create a separate RX ring processing function for XDP. Because we only install/remove the BPF program while the interface is down, we forgo the rcu_read_lock() in enetc_clean_rx_ring, since there shouldn't be any circumstance in which we are processing packets and there is a potentially freed BPF program attached to the RX ring. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 284 ++++++++++++++++-- drivers/net/ethernet/freescale/enetc/enetc.h | 14 + .../ethernet/freescale/enetc/enetc_ethtool.c | 2 + .../net/ethernet/freescale/enetc/enetc_pf.c | 1 + 4 files changed, 281 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 38301d0d7f0c5..58bb0b78585a8 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -2,6 +2,7 @@ /* Copyright 2017-2019 NXP */ #include "enetc.h" +#include #include #include #include @@ -420,7 +421,7 @@ static bool enetc_new_page(struct enetc_bdr *rx_ring, rx_swbd->dma = addr; rx_swbd->page = page; - rx_swbd->page_offset = ENETC_RXB_PAD; + rx_swbd->page_offset = rx_ring->buffer_offset; return true; } @@ -550,6 +551,8 @@ static void enetc_put_rx_buff(struct enetc_bdr *rx_ring, struct enetc_rx_swbd *rx_swbd) { if (likely(enetc_page_reusable(rx_swbd->page))) { + size_t buffer_size = ENETC_RXB_TRUESIZE - rx_ring->buffer_offset; + rx_swbd->page_offset ^= ENETC_RXB_TRUESIZE; page_ref_inc(rx_swbd->page); @@ -558,8 +561,7 @@ static void enetc_put_rx_buff(struct enetc_bdr *rx_ring, /* sync for use by the device */ dma_sync_single_range_for_device(rx_ring->dev, rx_swbd->dma, rx_swbd->page_offset, - ENETC_RXB_DMA_SIZE, - DMA_FROM_DEVICE); + buffer_size, DMA_FROM_DEVICE); } else { dma_unmap_page(rx_ring->dev, rx_swbd->dma, PAGE_SIZE, DMA_FROM_DEVICE); @@ -576,13 +578,13 @@ static struct sk_buff *enetc_map_rx_buff_to_skb(struct enetc_bdr *rx_ring, void *ba; ba = page_address(rx_swbd->page) + rx_swbd->page_offset; - skb = build_skb(ba - ENETC_RXB_PAD, ENETC_RXB_TRUESIZE); + skb = build_skb(ba - rx_ring->buffer_offset, ENETC_RXB_TRUESIZE); if (unlikely(!skb)) { rx_ring->stats.rx_alloc_errs++; return NULL; } - skb_reserve(skb, ENETC_RXB_PAD); + skb_reserve(skb, rx_ring->buffer_offset); __skb_put(skb, size); enetc_put_rx_buff(rx_ring, rx_swbd); @@ -625,7 +627,7 @@ static bool enetc_check_bd_errors_and_consume(struct enetc_bdr *rx_ring, static struct sk_buff *enetc_build_skb(struct enetc_bdr *rx_ring, u32 bd_status, union enetc_rx_bd **rxbd, - int *i, int *cleaned_cnt) + int *i, int *cleaned_cnt, int buffer_size) { struct sk_buff *skb; u16 size; @@ -644,7 +646,7 @@ static struct sk_buff *enetc_build_skb(struct enetc_bdr *rx_ring, /* not last BD in frame? */ while (!(bd_status & ENETC_RXBD_LSTATUS_F)) { bd_status = le32_to_cpu((*rxbd)->r.lstatus); - size = ENETC_RXB_DMA_SIZE; + size = buffer_size; if (bd_status & ENETC_RXBD_LSTATUS_F) { dma_rmb(); @@ -698,7 +700,7 @@ static int enetc_clean_rx_ring(struct enetc_bdr *rx_ring, break; skb = enetc_build_skb(rx_ring, bd_status, &rxbd, &i, - &cleaned_cnt); + &cleaned_cnt, ENETC_RXB_DMA_SIZE); if (!skb) break; @@ -716,10 +718,174 @@ static int enetc_clean_rx_ring(struct enetc_bdr *rx_ring, return rx_frm_cnt; } +static void enetc_map_rx_buff_to_xdp(struct enetc_bdr *rx_ring, int i, + struct xdp_buff *xdp_buff, u16 size) +{ + struct enetc_rx_swbd *rx_swbd = enetc_get_rx_buff(rx_ring, i, size); + void *hard_start = page_address(rx_swbd->page) + rx_swbd->page_offset; + struct skb_shared_info *shinfo; + + xdp_prepare_buff(xdp_buff, hard_start - rx_ring->buffer_offset, + rx_ring->buffer_offset, size, false); + + shinfo = xdp_get_shared_info_from_buff(xdp_buff); + shinfo->nr_frags = 0; +} + +static void enetc_add_rx_buff_to_xdp(struct enetc_bdr *rx_ring, int i, + u16 size, struct xdp_buff *xdp_buff) +{ + struct skb_shared_info *shinfo = xdp_get_shared_info_from_buff(xdp_buff); + struct enetc_rx_swbd *rx_swbd = enetc_get_rx_buff(rx_ring, i, size); + skb_frag_t *frag = &shinfo->frags[shinfo->nr_frags]; + + skb_frag_off_set(frag, rx_swbd->page_offset); + skb_frag_size_set(frag, size); + __skb_frag_set_page(frag, rx_swbd->page); + + shinfo->nr_frags++; +} + +static void enetc_build_xdp_buff(struct enetc_bdr *rx_ring, u32 bd_status, + union enetc_rx_bd **rxbd, int *i, + int *cleaned_cnt, struct xdp_buff *xdp_buff) +{ + u16 size = le16_to_cpu((*rxbd)->r.buf_len); + + xdp_init_buff(xdp_buff, ENETC_RXB_TRUESIZE, &rx_ring->xdp.rxq); + + enetc_map_rx_buff_to_xdp(rx_ring, *i, xdp_buff, size); + (*cleaned_cnt)++; + enetc_rxbd_next(rx_ring, rxbd, i); + + /* not last BD in frame? */ + while (!(bd_status & ENETC_RXBD_LSTATUS_F)) { + bd_status = le32_to_cpu((*rxbd)->r.lstatus); + size = ENETC_RXB_DMA_SIZE_XDP; + + if (bd_status & ENETC_RXBD_LSTATUS_F) { + dma_rmb(); + size = le16_to_cpu((*rxbd)->r.buf_len); + } + + enetc_add_rx_buff_to_xdp(rx_ring, *i, size, xdp_buff); + (*cleaned_cnt)++; + enetc_rxbd_next(rx_ring, rxbd, i); + } +} + +/* Reuse the current page without performing half-page buffer flipping */ +static void enetc_put_xdp_buff(struct enetc_bdr *rx_ring, + struct enetc_rx_swbd *rx_swbd) +{ + enetc_reuse_page(rx_ring, rx_swbd); + + /* sync for use by the device */ + dma_sync_single_range_for_device(rx_ring->dev, rx_swbd->dma, + rx_swbd->page_offset, + ENETC_RXB_DMA_SIZE_XDP, + DMA_FROM_DEVICE); + + rx_swbd->page = NULL; +} + +static void enetc_xdp_drop(struct enetc_bdr *rx_ring, int rx_ring_first, + int rx_ring_last) +{ + while (rx_ring_first != rx_ring_last) { + enetc_put_xdp_buff(rx_ring, + &rx_ring->rx_swbd[rx_ring_first]); + enetc_bdr_idx_inc(rx_ring, &rx_ring_first); + } + rx_ring->stats.xdp_drops++; +} + +static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, + struct napi_struct *napi, int work_limit, + struct bpf_prog *prog) +{ + int rx_frm_cnt = 0, rx_byte_cnt = 0; + int cleaned_cnt, i; + u32 xdp_act; + + cleaned_cnt = enetc_bd_unused(rx_ring); + /* next descriptor to process */ + i = rx_ring->next_to_clean; + + while (likely(rx_frm_cnt < work_limit)) { + union enetc_rx_bd *rxbd, *orig_rxbd; + int orig_i, orig_cleaned_cnt; + struct xdp_buff xdp_buff; + struct sk_buff *skb; + u32 bd_status; + + if (cleaned_cnt >= ENETC_RXBD_BUNDLE) + cleaned_cnt -= enetc_refill_rx_ring(rx_ring, + cleaned_cnt); + + rxbd = enetc_rxbd(rx_ring, i); + bd_status = le32_to_cpu(rxbd->r.lstatus); + if (!bd_status) + break; + + enetc_wr_reg_hot(rx_ring->idr, BIT(rx_ring->index)); + dma_rmb(); /* for reading other rxbd fields */ + + if (enetc_check_bd_errors_and_consume(rx_ring, bd_status, + &rxbd, &i)) + break; + + orig_rxbd = rxbd; + orig_cleaned_cnt = cleaned_cnt; + orig_i = i; + + enetc_build_xdp_buff(rx_ring, bd_status, &rxbd, &i, + &cleaned_cnt, &xdp_buff); + + xdp_act = bpf_prog_run_xdp(prog, &xdp_buff); + + switch (xdp_act) { + case XDP_ABORTED: + trace_xdp_exception(rx_ring->ndev, prog, xdp_act); + fallthrough; + case XDP_DROP: + enetc_xdp_drop(rx_ring, orig_i, i); + break; + case XDP_PASS: + rxbd = orig_rxbd; + cleaned_cnt = orig_cleaned_cnt; + i = orig_i; + + skb = enetc_build_skb(rx_ring, bd_status, &rxbd, + &i, &cleaned_cnt, + ENETC_RXB_DMA_SIZE_XDP); + if (unlikely(!skb)) + /* Exit the switch/case, not the loop */ + break; + + napi_gro_receive(napi, skb); + break; + default: + bpf_warn_invalid_xdp_action(xdp_act); + } + + rx_frm_cnt++; + } + + rx_ring->next_to_clean = i; + + rx_ring->stats.packets += rx_frm_cnt; + rx_ring->stats.bytes += rx_byte_cnt; + + return rx_frm_cnt; +} + static int enetc_poll(struct napi_struct *napi, int budget) { struct enetc_int_vector *v = container_of(napi, struct enetc_int_vector, napi); + struct enetc_bdr *rx_ring = &v->rx_ring; + struct bpf_prog *prog; bool complete = true; int work_done; int i; @@ -730,7 +896,11 @@ static int enetc_poll(struct napi_struct *napi, int budget) if (!enetc_clean_tx_ring(&v->tx_ring[i], budget)) complete = false; - work_done = enetc_clean_rx_ring(&v->rx_ring, napi, budget); + prog = rx_ring->xdp.prog; + if (prog) + work_done = enetc_clean_rx_ring_xdp(rx_ring, napi, budget, prog); + else + work_done = enetc_clean_rx_ring(rx_ring, napi, budget); if (work_done == budget) complete = false; if (work_done) @@ -1120,7 +1290,10 @@ static void enetc_setup_rxbdr(struct enetc_hw *hw, struct enetc_bdr *rx_ring) enetc_rxbdr_wr(hw, idx, ENETC_RBLENR, ENETC_RTBLENR_LEN(rx_ring->bd_count)); - enetc_rxbdr_wr(hw, idx, ENETC_RBBSR, ENETC_RXB_DMA_SIZE); + if (rx_ring->xdp.prog) + enetc_rxbdr_wr(hw, idx, ENETC_RBBSR, ENETC_RXB_DMA_SIZE_XDP); + else + enetc_rxbdr_wr(hw, idx, ENETC_RBBSR, ENETC_RXB_DMA_SIZE); enetc_rxbdr_wr(hw, idx, ENETC_RBPIR, 0); @@ -1511,6 +1684,54 @@ int enetc_setup_tc(struct net_device *ndev, enum tc_setup_type type, } } +static int enetc_setup_xdp_prog(struct net_device *dev, struct bpf_prog *prog, + struct netlink_ext_ack *extack) +{ + struct enetc_ndev_priv *priv = netdev_priv(dev); + struct bpf_prog *old_prog; + bool is_up; + int i; + + /* The buffer layout is changing, so we need to drain the old + * RX buffers and seed new ones. + */ + is_up = netif_running(dev); + if (is_up) + dev_close(dev); + + old_prog = xchg(&priv->xdp_prog, prog); + if (old_prog) + bpf_prog_put(old_prog); + + for (i = 0; i < priv->num_rx_rings; i++) { + struct enetc_bdr *rx_ring = priv->rx_ring[i]; + + rx_ring->xdp.prog = prog; + + if (prog) + rx_ring->buffer_offset = XDP_PACKET_HEADROOM; + else + rx_ring->buffer_offset = ENETC_RXB_PAD; + } + + if (is_up) + return dev_open(dev, extack); + + return 0; +} + +int enetc_setup_bpf(struct net_device *dev, struct netdev_bpf *xdp) +{ + switch (xdp->command) { + case XDP_SETUP_PROG: + return enetc_setup_xdp_prog(dev, xdp->prog, xdp->extack); + default: + return -EINVAL; + } + + return 0; +} + struct net_device_stats *enetc_get_stats(struct net_device *ndev) { struct enetc_ndev_priv *priv = netdev_priv(ndev); @@ -1727,6 +1948,28 @@ int enetc_alloc_msix(struct enetc_ndev_priv *priv) priv->int_vector[i] = v; + bdr = &v->rx_ring; + bdr->index = i; + bdr->ndev = priv->ndev; + bdr->dev = priv->dev; + bdr->bd_count = priv->rx_bd_count; + bdr->buffer_offset = ENETC_RXB_PAD; + priv->rx_ring[i] = bdr; + + err = xdp_rxq_info_reg(&bdr->xdp.rxq, priv->ndev, i, 0); + if (err) { + kfree(v); + goto fail; + } + + err = xdp_rxq_info_reg_mem_model(&bdr->xdp.rxq, + MEM_TYPE_PAGE_SHARED, NULL); + if (err) { + xdp_rxq_info_unreg(&bdr->xdp.rxq); + kfree(v); + goto fail; + } + /* init defaults for adaptive IC */ if (priv->ic_mode & ENETC_IC_RX_ADAPTIVE) { v->rx_ictt = 0x1; @@ -1754,22 +1997,20 @@ int enetc_alloc_msix(struct enetc_ndev_priv *priv) bdr->bd_count = priv->tx_bd_count; priv->tx_ring[idx] = bdr; } - - bdr = &v->rx_ring; - bdr->index = i; - bdr->ndev = priv->ndev; - bdr->dev = priv->dev; - bdr->bd_count = priv->rx_bd_count; - priv->rx_ring[i] = bdr; } return 0; fail: while (i--) { - netif_napi_del(&priv->int_vector[i]->napi); - cancel_work_sync(&priv->int_vector[i]->rx_dim.work); - kfree(priv->int_vector[i]); + struct enetc_int_vector *v = priv->int_vector[i]; + struct enetc_bdr *rx_ring = &v->rx_ring; + + xdp_rxq_info_unreg_mem_model(&rx_ring->xdp.rxq); + xdp_rxq_info_unreg(&rx_ring->xdp.rxq); + netif_napi_del(&v->napi); + cancel_work_sync(&v->rx_dim.work); + kfree(v); } pci_free_irq_vectors(pdev); @@ -1783,7 +2024,10 @@ void enetc_free_msix(struct enetc_ndev_priv *priv) for (i = 0; i < priv->bdr_int_num; i++) { struct enetc_int_vector *v = priv->int_vector[i]; + struct enetc_bdr *rx_ring = &v->rx_ring; + xdp_rxq_info_unreg_mem_model(&rx_ring->xdp.rxq); + xdp_rxq_info_unreg(&rx_ring->xdp.rxq); netif_napi_del(&v->napi); cancel_work_sync(&v->rx_dim.work); } diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h index d9e75644b89c2..5815addfe9661 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.h +++ b/drivers/net/ethernet/freescale/enetc/enetc.h @@ -33,6 +33,8 @@ struct enetc_tx_swbd { #define ENETC_RXB_PAD NET_SKB_PAD /* add extra space if needed */ #define ENETC_RXB_DMA_SIZE \ (SKB_WITH_OVERHEAD(ENETC_RXB_TRUESIZE) - ENETC_RXB_PAD) +#define ENETC_RXB_DMA_SIZE_XDP \ + (SKB_WITH_OVERHEAD(ENETC_RXB_TRUESIZE) - XDP_PACKET_HEADROOM) struct enetc_rx_swbd { dma_addr_t dma; @@ -44,6 +46,12 @@ struct enetc_ring_stats { unsigned int packets; unsigned int bytes; unsigned int rx_alloc_errs; + unsigned int xdp_drops; +}; + +struct enetc_xdp_data { + struct xdp_rxq_info rxq; + struct bpf_prog *prog; }; #define ENETC_RX_RING_DEFAULT_SIZE 512 @@ -72,6 +80,9 @@ struct enetc_bdr { }; void __iomem *idr; /* Interrupt Detect Register pointer */ + int buffer_offset; + struct enetc_xdp_data xdp; + struct enetc_ring_stats stats; dma_addr_t bd_dma_base; @@ -276,6 +287,8 @@ struct enetc_ndev_priv { struct phylink *phylink; int ic_mode; u32 tx_ictt; + + struct bpf_prog *xdp_prog; }; /* Messaging */ @@ -315,6 +328,7 @@ int enetc_set_features(struct net_device *ndev, int enetc_ioctl(struct net_device *ndev, struct ifreq *rq, int cmd); int enetc_setup_tc(struct net_device *ndev, enum tc_setup_type type, void *type_data); +int enetc_setup_bpf(struct net_device *dev, struct netdev_bpf *xdp); /* ethtool */ void enetc_set_ethtool_ops(struct net_device *ndev); diff --git a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c index 89e558135432b..0183c13f8c1e1 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c @@ -192,6 +192,7 @@ static const struct { static const char rx_ring_stats[][ETH_GSTRING_LEN] = { "Rx ring %2d frames", "Rx ring %2d alloc errors", + "Rx ring %2d XDP drops", }; static const char tx_ring_stats[][ETH_GSTRING_LEN] = { @@ -273,6 +274,7 @@ static void enetc_get_ethtool_stats(struct net_device *ndev, for (i = 0; i < priv->num_rx_rings; i++) { data[o++] = priv->rx_ring[i]->stats.packets; data[o++] = priv->rx_ring[i]->stats.rx_alloc_errs; + data[o++] = priv->rx_ring[i]->stats.xdp_drops; } if (!enetc_si_is_pf(priv->si)) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c index 5e95afd61c87a..0484dbe13422b 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c @@ -707,6 +707,7 @@ static const struct net_device_ops enetc_ndev_ops = { .ndo_set_features = enetc_pf_set_features, .ndo_do_ioctl = enetc_ioctl, .ndo_setup_tc = enetc_setup_tc, + .ndo_bpf = enetc_setup_bpf, }; static void enetc_pf_netdev_setup(struct enetc_si *si, struct net_device *ndev, From 7ed2bc80074ed4ed30e0cab323305bde851f7a87 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 31 Mar 2021 23:08:55 +0300 Subject: [PATCH 31/36] net: enetc: add support for XDP_TX For reflecting packets back into the interface they came from, we create an array of TX software BDs derived from the RX software BDs. Therefore, we need to extend the TX software BD structure to contain most of the stuff that's already present in the RX software BD structure, for reasons that will become evident in a moment. For a frame with the XDP_TX verdict, we don't reuse any buffer right away as we do for XDP_DROP (the same page half) or XDP_PASS (the other page half, same as the skb code path). Because the buffer transfers ownership from the RX ring to the TX ring, reusing any page half right away is very dangerous. So what we can do is we can recycle the same page half as soon as TX is complete. The code path is: enetc_poll -> enetc_clean_rx_ring_xdp -> enetc_xdp_tx -> enetc_refill_rx_ring (time passes, another MSI interrupt is raised) enetc_poll -> enetc_clean_tx_ring -> enetc_recycle_xdp_tx_buff But that creates a problem, because there is a potentially large time window between enetc_xdp_tx and enetc_recycle_xdp_tx_buff, period in which we'll have less and less RX buffers. Basically, when the ship starts sinking, the knee-jerk reaction is to let enetc_refill_rx_ring do what it does for the standard skb code path (refill every 16 consumed buffers), but that turns out to be very inefficient. The problem is that we have no rx_swbd->page at our disposal from the enetc_reuse_page path, so enetc_refill_rx_ring would have to call enetc_new_page for every buffer that we refill (if we choose to refill at this early stage). Very inefficient, it only makes the problem worse, because page allocation is an expensive process, and CPU time is exactly what we're lacking. Additionally, there is an even bigger problem: if we let enetc_refill_rx_ring top up the ring's buffers again from the RX path, remember that the buffers sent to transmission haven't disappeared anywhere. They will be eventually sent, and processed in enetc_clean_tx_ring, and an attempt will be made to recycle them. But surprise, the RX ring is already full of new buffers, because we were premature in deciding that we should refill. So not only we took the expensive decision of allocating new pages, but now we must throw away perfectly good and reusable buffers. So what we do is we implement an elastic refill mechanism, which keeps track of the number of in-flight XDP_TX buffer descriptors. We top up the RX ring only up to the total ring capacity minus the number of BDs that are in flight (because we know that those BDs will return to us eventually). The enetc driver manages 1 RX ring per CPU, and the default TX ring management is the same. So we do XDP_TX towards the TX ring of the same index, because it is affined to the same CPU. This will probably not produce great results when we have a tc-taprio/tc-mqprio qdisc on the interface, because in that case, the number of TX rings might be greater, but I didn't add any checks for that yet (mostly because I didn't know what checks to add). It should also be noted that we need to change the DMA mapping direction for RX buffers, since they may now be reflected into the TX ring of the same device. We choose to use DMA_BIDIRECTIONAL instead of unmapping and remapping as DMA_TO_DEVICE, because performance is better this way. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 217 ++++++++++++++++-- drivers/net/ethernet/freescale/enetc/enetc.h | 25 ++ .../ethernet/freescale/enetc/enetc_ethtool.c | 11 +- 3 files changed, 228 insertions(+), 25 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 58bb0b78585a8..ba5313a5d7a48 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -8,21 +8,20 @@ #include #include -/* ENETC overhead: optional extension BD + 1 BD gap */ -#define ENETC_TXBDS_NEEDED(val) ((val) + 2) -/* max # of chained Tx BDs is 15, including head and extension BD */ -#define ENETC_MAX_SKB_FRAGS 13 -#define ENETC_TXBDS_MAX_NEEDED ENETC_TXBDS_NEEDED(ENETC_MAX_SKB_FRAGS + 1) - static void enetc_unmap_tx_buff(struct enetc_bdr *tx_ring, struct enetc_tx_swbd *tx_swbd) { + /* For XDP_TX, pages come from RX, whereas for the other contexts where + * we have is_dma_page_set, those come from skb_frag_dma_map. We need + * to match the DMA mapping length, so we need to differentiate those. + */ if (tx_swbd->is_dma_page) dma_unmap_page(tx_ring->dev, tx_swbd->dma, - tx_swbd->len, DMA_TO_DEVICE); + tx_swbd->is_xdp_tx ? PAGE_SIZE : tx_swbd->len, + tx_swbd->dir); else dma_unmap_single(tx_ring->dev, tx_swbd->dma, - tx_swbd->len, DMA_TO_DEVICE); + tx_swbd->len, tx_swbd->dir); tx_swbd->dma = 0; } @@ -38,6 +37,13 @@ static void enetc_free_tx_skb(struct enetc_bdr *tx_ring, } } +/* Let H/W know BD ring has been updated */ +static void enetc_update_tx_ring_tail(struct enetc_bdr *tx_ring) +{ + /* includes wmb() */ + enetc_wr_reg_hot(tx_ring->tpir, tx_ring->next_to_use); +} + static int enetc_map_tx_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb, int active_offloads) { @@ -68,6 +74,7 @@ static int enetc_map_tx_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb, tx_swbd->dma = dma; tx_swbd->len = len; tx_swbd->is_dma_page = 0; + tx_swbd->dir = DMA_TO_DEVICE; count++; do_vlan = skb_vlan_tag_present(skb); @@ -150,6 +157,7 @@ static int enetc_map_tx_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb, tx_swbd->dma = dma; tx_swbd->len = len; tx_swbd->is_dma_page = 1; + tx_swbd->dir = DMA_TO_DEVICE; count++; } @@ -166,8 +174,7 @@ static int enetc_map_tx_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb, skb_tx_timestamp(skb); - /* let H/W know BD ring has been updated */ - enetc_wr_reg_hot(tx_ring->tpir, i); /* includes wmb() */ + enetc_update_tx_ring_tail(tx_ring); return count; @@ -320,6 +327,43 @@ static void enetc_tstamp_tx(struct sk_buff *skb, u64 tstamp) } } +static void enetc_recycle_xdp_tx_buff(struct enetc_bdr *tx_ring, + struct enetc_tx_swbd *tx_swbd) +{ + struct enetc_ndev_priv *priv = netdev_priv(tx_ring->ndev); + struct enetc_bdr *rx_ring = priv->rx_ring[tx_ring->index]; + struct enetc_rx_swbd rx_swbd = { + .dma = tx_swbd->dma, + .page = tx_swbd->page, + .page_offset = tx_swbd->page_offset, + .dir = tx_swbd->dir, + .len = tx_swbd->len, + }; + + if (likely(enetc_swbd_unused(rx_ring))) { + enetc_reuse_page(rx_ring, &rx_swbd); + + /* sync for use by the device */ + dma_sync_single_range_for_device(rx_ring->dev, rx_swbd.dma, + rx_swbd.page_offset, + ENETC_RXB_DMA_SIZE_XDP, + rx_swbd.dir); + + rx_ring->stats.recycles++; + } else { + /* RX ring is already full, we need to unmap and free the + * page, since there's nothing useful we can do with it. + */ + rx_ring->stats.recycle_failures++; + + dma_unmap_page(rx_ring->dev, rx_swbd.dma, PAGE_SIZE, + rx_swbd.dir); + __free_page(rx_swbd.page); + } + + rx_ring->xdp.xdp_tx_in_flight--; +} + static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget) { struct net_device *ndev = tx_ring->ndev; @@ -351,7 +395,9 @@ static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget) } } - if (likely(tx_swbd->dma)) + if (tx_swbd->is_xdp_tx) + enetc_recycle_xdp_tx_buff(tx_ring, tx_swbd); + else if (likely(tx_swbd->dma)) enetc_unmap_tx_buff(tx_ring, tx_swbd); if (tx_swbd->skb) { @@ -405,6 +451,7 @@ static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget) static bool enetc_new_page(struct enetc_bdr *rx_ring, struct enetc_rx_swbd *rx_swbd) { + bool xdp = !!(rx_ring->xdp.prog); struct page *page; dma_addr_t addr; @@ -412,7 +459,10 @@ static bool enetc_new_page(struct enetc_bdr *rx_ring, if (unlikely(!page)) return false; - addr = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE); + /* For XDP_TX, we forgo dma_unmap -> dma_map */ + rx_swbd->dir = xdp ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE; + + addr = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, rx_swbd->dir); if (unlikely(dma_mapping_error(rx_ring->dev, addr))) { __free_page(page); @@ -536,6 +586,10 @@ static void enetc_get_offloads(struct enetc_bdr *rx_ring, #endif } +/* This gets called during the non-XDP NAPI poll cycle as well as on XDP_PASS, + * so it needs to work with both DMA_FROM_DEVICE as well as DMA_BIDIRECTIONAL + * mapped buffers. + */ static struct enetc_rx_swbd *enetc_get_rx_buff(struct enetc_bdr *rx_ring, int i, u16 size) { @@ -543,7 +597,7 @@ static struct enetc_rx_swbd *enetc_get_rx_buff(struct enetc_bdr *rx_ring, dma_sync_single_range_for_cpu(rx_ring->dev, rx_swbd->dma, rx_swbd->page_offset, - size, DMA_FROM_DEVICE); + size, rx_swbd->dir); return rx_swbd; } @@ -561,10 +615,10 @@ static void enetc_put_rx_buff(struct enetc_bdr *rx_ring, /* sync for use by the device */ dma_sync_single_range_for_device(rx_ring->dev, rx_swbd->dma, rx_swbd->page_offset, - buffer_size, DMA_FROM_DEVICE); + buffer_size, rx_swbd->dir); } else { - dma_unmap_page(rx_ring->dev, rx_swbd->dma, - PAGE_SIZE, DMA_FROM_DEVICE); + dma_unmap_page(rx_ring->dev, rx_swbd->dma, PAGE_SIZE, + rx_swbd->dir); } rx_swbd->page = NULL; @@ -718,6 +772,61 @@ static int enetc_clean_rx_ring(struct enetc_bdr *rx_ring, return rx_frm_cnt; } +static void enetc_xdp_map_tx_buff(struct enetc_bdr *tx_ring, int i, + struct enetc_tx_swbd *tx_swbd, + int frm_len) +{ + union enetc_tx_bd *txbd = ENETC_TXBD(*tx_ring, i); + + prefetchw(txbd); + + enetc_clear_tx_bd(txbd); + txbd->addr = cpu_to_le64(tx_swbd->dma + tx_swbd->page_offset); + txbd->buf_len = cpu_to_le16(tx_swbd->len); + txbd->frm_len = cpu_to_le16(frm_len); + + memcpy(&tx_ring->tx_swbd[i], tx_swbd, sizeof(*tx_swbd)); +} + +/* Puts in the TX ring one XDP frame, mapped as an array of TX software buffer + * descriptors. + */ +static bool enetc_xdp_tx(struct enetc_bdr *tx_ring, + struct enetc_tx_swbd *xdp_tx_arr, int num_tx_swbd) +{ + struct enetc_tx_swbd *tmp_tx_swbd = xdp_tx_arr; + int i, k, frm_len = tmp_tx_swbd->len; + + if (unlikely(enetc_bd_unused(tx_ring) < ENETC_TXBDS_NEEDED(num_tx_swbd))) + return false; + + while (unlikely(!tmp_tx_swbd->is_eof)) { + tmp_tx_swbd++; + frm_len += tmp_tx_swbd->len; + } + + i = tx_ring->next_to_use; + + for (k = 0; k < num_tx_swbd; k++) { + struct enetc_tx_swbd *xdp_tx_swbd = &xdp_tx_arr[k]; + + enetc_xdp_map_tx_buff(tx_ring, i, xdp_tx_swbd, frm_len); + + /* last BD needs 'F' bit set */ + if (xdp_tx_swbd->is_eof) { + union enetc_tx_bd *txbd = ENETC_TXBD(*tx_ring, i); + + txbd->flags = ENETC_TXBD_FLAGS_F; + } + + enetc_bdr_idx_inc(tx_ring, &i); + } + + tx_ring->next_to_use = i; + + return true; +} + static void enetc_map_rx_buff_to_xdp(struct enetc_bdr *rx_ring, int i, struct xdp_buff *xdp_buff, u16 size) { @@ -725,6 +834,9 @@ static void enetc_map_rx_buff_to_xdp(struct enetc_bdr *rx_ring, int i, void *hard_start = page_address(rx_swbd->page) + rx_swbd->page_offset; struct skb_shared_info *shinfo; + /* To be used for XDP_TX */ + rx_swbd->len = size; + xdp_prepare_buff(xdp_buff, hard_start - rx_ring->buffer_offset, rx_ring->buffer_offset, size, false); @@ -739,6 +851,9 @@ static void enetc_add_rx_buff_to_xdp(struct enetc_bdr *rx_ring, int i, struct enetc_rx_swbd *rx_swbd = enetc_get_rx_buff(rx_ring, i, size); skb_frag_t *frag = &shinfo->frags[shinfo->nr_frags]; + /* To be used for XDP_TX */ + rx_swbd->len = size; + skb_frag_off_set(frag, rx_swbd->page_offset); skb_frag_size_set(frag, size); __skb_frag_set_page(frag, rx_swbd->page); @@ -780,15 +895,48 @@ static void enetc_put_xdp_buff(struct enetc_bdr *rx_ring, { enetc_reuse_page(rx_ring, rx_swbd); - /* sync for use by the device */ dma_sync_single_range_for_device(rx_ring->dev, rx_swbd->dma, rx_swbd->page_offset, ENETC_RXB_DMA_SIZE_XDP, - DMA_FROM_DEVICE); + rx_swbd->dir); rx_swbd->page = NULL; } +/* Convert RX buffer descriptors to TX buffer descriptors. These will be + * recycled back into the RX ring in enetc_clean_tx_ring. We need to scrub the + * RX software BDs because the ownership of the buffer no longer belongs to the + * RX ring, so enetc_refill_rx_ring may not reuse rx_swbd->page. + */ +static int enetc_rx_swbd_to_xdp_tx_swbd(struct enetc_tx_swbd *xdp_tx_arr, + struct enetc_bdr *rx_ring, + int rx_ring_first, int rx_ring_last) +{ + int n = 0; + + for (; rx_ring_first != rx_ring_last; + n++, enetc_bdr_idx_inc(rx_ring, &rx_ring_first)) { + struct enetc_rx_swbd *rx_swbd = &rx_ring->rx_swbd[rx_ring_first]; + struct enetc_tx_swbd *tx_swbd = &xdp_tx_arr[n]; + + /* No need to dma_map, we already have DMA_BIDIRECTIONAL */ + tx_swbd->dma = rx_swbd->dma; + tx_swbd->dir = rx_swbd->dir; + tx_swbd->page = rx_swbd->page; + tx_swbd->page_offset = rx_swbd->page_offset; + tx_swbd->len = rx_swbd->len; + tx_swbd->is_dma_page = true; + tx_swbd->is_xdp_tx = true; + tx_swbd->is_eof = false; + memset(rx_swbd, 0, sizeof(*rx_swbd)); + } + + /* We rely on caller providing an rx_ring_last > rx_ring_first */ + xdp_tx_arr[n - 1].is_eof = true; + + return n; +} + static void enetc_xdp_drop(struct enetc_bdr *rx_ring, int rx_ring_first, int rx_ring_last) { @@ -804,6 +952,10 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, struct napi_struct *napi, int work_limit, struct bpf_prog *prog) { + struct enetc_tx_swbd xdp_tx_arr[ENETC_MAX_SKB_FRAGS] = {0}; + struct enetc_ndev_priv *priv = netdev_priv(rx_ring->ndev); + struct enetc_bdr *tx_ring = priv->tx_ring[rx_ring->index]; + int xdp_tx_bd_cnt, xdp_tx_frm_cnt = 0; int rx_frm_cnt = 0, rx_byte_cnt = 0; int cleaned_cnt, i; u32 xdp_act; @@ -819,10 +971,6 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, struct sk_buff *skb; u32 bd_status; - if (cleaned_cnt >= ENETC_RXBD_BUNDLE) - cleaned_cnt -= enetc_refill_rx_ring(rx_ring, - cleaned_cnt); - rxbd = enetc_rxbd(rx_ring, i); bd_status = le32_to_cpu(rxbd->r.lstatus); if (!bd_status) @@ -865,6 +1013,20 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, napi_gro_receive(napi, skb); break; + case XDP_TX: + xdp_tx_bd_cnt = enetc_rx_swbd_to_xdp_tx_swbd(xdp_tx_arr, + rx_ring, + orig_i, i); + + if (!enetc_xdp_tx(tx_ring, xdp_tx_arr, xdp_tx_bd_cnt)) { + enetc_xdp_drop(rx_ring, orig_i, i); + tx_ring->stats.xdp_tx_drops++; + } else { + tx_ring->stats.xdp_tx += xdp_tx_bd_cnt; + rx_ring->xdp.xdp_tx_in_flight += xdp_tx_bd_cnt; + xdp_tx_frm_cnt++; + } + break; default: bpf_warn_invalid_xdp_action(xdp_act); } @@ -877,6 +1039,13 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, rx_ring->stats.packets += rx_frm_cnt; rx_ring->stats.bytes += rx_byte_cnt; + if (xdp_tx_frm_cnt) + enetc_update_tx_ring_tail(tx_ring); + + if (cleaned_cnt > rx_ring->xdp.xdp_tx_in_flight) + enetc_refill_rx_ring(rx_ring, enetc_bd_unused(rx_ring) - + rx_ring->xdp.xdp_tx_in_flight); + return rx_frm_cnt; } @@ -1141,8 +1310,8 @@ static void enetc_free_rx_ring(struct enetc_bdr *rx_ring) if (!rx_swbd->page) continue; - dma_unmap_page(rx_ring->dev, rx_swbd->dma, - PAGE_SIZE, DMA_FROM_DEVICE); + dma_unmap_page(rx_ring->dev, rx_swbd->dma, PAGE_SIZE, + rx_swbd->dir); __free_page(rx_swbd->page); rx_swbd->page = NULL; } diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h index 5815addfe9661..864da962ae213 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.h +++ b/drivers/net/ethernet/freescale/enetc/enetc.h @@ -21,11 +21,15 @@ struct enetc_tx_swbd { struct sk_buff *skb; dma_addr_t dma; + struct page *page; /* valid only if is_xdp_tx */ + u16 page_offset; /* valid only if is_xdp_tx */ u16 len; + enum dma_data_direction dir; u8 is_dma_page:1; u8 check_wb:1; u8 do_tstamp:1; u8 is_eof:1; + u8 is_xdp_tx:1; }; #define ENETC_RX_MAXFRM_SIZE ENETC_MAC_MAXFRM_SIZE @@ -40,18 +44,31 @@ struct enetc_rx_swbd { dma_addr_t dma; struct page *page; u16 page_offset; + enum dma_data_direction dir; + u16 len; }; +/* ENETC overhead: optional extension BD + 1 BD gap */ +#define ENETC_TXBDS_NEEDED(val) ((val) + 2) +/* max # of chained Tx BDs is 15, including head and extension BD */ +#define ENETC_MAX_SKB_FRAGS 13 +#define ENETC_TXBDS_MAX_NEEDED ENETC_TXBDS_NEEDED(ENETC_MAX_SKB_FRAGS + 1) + struct enetc_ring_stats { unsigned int packets; unsigned int bytes; unsigned int rx_alloc_errs; unsigned int xdp_drops; + unsigned int xdp_tx; + unsigned int xdp_tx_drops; + unsigned int recycles; + unsigned int recycle_failures; }; struct enetc_xdp_data { struct xdp_rxq_info rxq; struct bpf_prog *prog; + int xdp_tx_in_flight; }; #define ENETC_RX_RING_DEFAULT_SIZE 512 @@ -104,6 +121,14 @@ static inline int enetc_bd_unused(struct enetc_bdr *bdr) return bdr->bd_count + bdr->next_to_clean - bdr->next_to_use - 1; } +static inline int enetc_swbd_unused(struct enetc_bdr *bdr) +{ + if (bdr->next_to_clean > bdr->next_to_alloc) + return bdr->next_to_clean - bdr->next_to_alloc - 1; + + return bdr->bd_count + bdr->next_to_clean - bdr->next_to_alloc - 1; +} + /* Control BD ring */ #define ENETC_CBDR_DEFAULT_SIZE 64 struct enetc_cbdr { diff --git a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c index 0183c13f8c1e1..37821a8b225e6 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c @@ -193,10 +193,14 @@ static const char rx_ring_stats[][ETH_GSTRING_LEN] = { "Rx ring %2d frames", "Rx ring %2d alloc errors", "Rx ring %2d XDP drops", + "Rx ring %2d recycles", + "Rx ring %2d recycle failures", }; static const char tx_ring_stats[][ETH_GSTRING_LEN] = { "Tx ring %2d frames", + "Tx ring %2d XDP frames", + "Tx ring %2d XDP drops", }; static int enetc_get_sset_count(struct net_device *ndev, int sset) @@ -268,13 +272,18 @@ static void enetc_get_ethtool_stats(struct net_device *ndev, for (i = 0; i < ARRAY_SIZE(enetc_si_counters); i++) data[o++] = enetc_rd64(hw, enetc_si_counters[i].reg); - for (i = 0; i < priv->num_tx_rings; i++) + for (i = 0; i < priv->num_tx_rings; i++) { data[o++] = priv->tx_ring[i]->stats.packets; + data[o++] = priv->tx_ring[i]->stats.xdp_tx; + data[o++] = priv->tx_ring[i]->stats.xdp_tx_drops; + } for (i = 0; i < priv->num_rx_rings; i++) { data[o++] = priv->rx_ring[i]->stats.packets; data[o++] = priv->rx_ring[i]->stats.rx_alloc_errs; data[o++] = priv->rx_ring[i]->stats.xdp_drops; + data[o++] = priv->rx_ring[i]->stats.recycles; + data[o++] = priv->rx_ring[i]->stats.recycle_failures; } if (!enetc_si_is_pf(priv->si)) From d6a2829e82cff9e5ec10b8ee293488b57399ed01 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 31 Mar 2021 23:08:56 +0300 Subject: [PATCH 32/36] net: enetc: increase RX ring default size As explained in the XDP_TX patch, when receiving a burst of frames with the XDP_TX verdict, there is a momentary dip in the number of available RX buffers. The system will eventually recover as TX completions will start kicking in and refilling our RX BD ring again. But until that happens, we need to survive with as few out-of-buffer discards as possible. This increases the memory footprint of the driver in order to avoid discards at 2.5Gbps line rate 64B packet sizes, the maximum speed available for testing on 1 port on NXP LS1028A. Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h index 864da962ae213..d0619fcbbe97f 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.h +++ b/drivers/net/ethernet/freescale/enetc/enetc.h @@ -71,7 +71,7 @@ struct enetc_xdp_data { int xdp_tx_in_flight; }; -#define ENETC_RX_RING_DEFAULT_SIZE 512 +#define ENETC_RX_RING_DEFAULT_SIZE 2048 #define ENETC_TX_RING_DEFAULT_SIZE 256 #define ENETC_DEFAULT_TX_WORK (ENETC_TX_RING_DEFAULT_SIZE / 2) From 9d2b68cc108db2fdb35022ed2d88cfb305c441a6 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Wed, 31 Mar 2021 23:08:57 +0300 Subject: [PATCH 33/36] net: enetc: add support for XDP_REDIRECT The driver implementation of the XDP_REDIRECT action reuses parts from XDP_TX, most notably the enetc_xdp_tx function which transmits an array of TX software BDs. Only this time, the buffers don't have DMA mappings, we need to create them. When a BPF program reaches the XDP_REDIRECT verdict for a frame, we can employ the same buffer reuse strategy as for the normal processing path and for XDP_PASS: we can flip to the other page half and seed that to the RX ring. Note that scatter/gather support is there, but disabled due to lack of multi-buffer support in XDP (which is added by this series): https://patchwork.kernel.org/project/netdevbpf/cover/cover.1616179034.git.lorenzo@kernel.org/ Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- drivers/net/ethernet/freescale/enetc/enetc.c | 212 +++++++++++++++++- drivers/net/ethernet/freescale/enetc/enetc.h | 11 +- .../ethernet/freescale/enetc/enetc_ethtool.c | 6 + .../net/ethernet/freescale/enetc/enetc_pf.c | 1 + 4 files changed, 218 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index ba5313a5d7a48..57049ae972015 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -8,6 +8,23 @@ #include #include +static struct sk_buff *enetc_tx_swbd_get_skb(struct enetc_tx_swbd *tx_swbd) +{ + if (tx_swbd->is_xdp_tx || tx_swbd->is_xdp_redirect) + return NULL; + + return tx_swbd->skb; +} + +static struct xdp_frame * +enetc_tx_swbd_get_xdp_frame(struct enetc_tx_swbd *tx_swbd) +{ + if (tx_swbd->is_xdp_redirect) + return tx_swbd->xdp_frame; + + return NULL; +} + static void enetc_unmap_tx_buff(struct enetc_bdr *tx_ring, struct enetc_tx_swbd *tx_swbd) { @@ -25,14 +42,20 @@ static void enetc_unmap_tx_buff(struct enetc_bdr *tx_ring, tx_swbd->dma = 0; } -static void enetc_free_tx_skb(struct enetc_bdr *tx_ring, - struct enetc_tx_swbd *tx_swbd) +static void enetc_free_tx_frame(struct enetc_bdr *tx_ring, + struct enetc_tx_swbd *tx_swbd) { + struct xdp_frame *xdp_frame = enetc_tx_swbd_get_xdp_frame(tx_swbd); + struct sk_buff *skb = enetc_tx_swbd_get_skb(tx_swbd); + if (tx_swbd->dma) enetc_unmap_tx_buff(tx_ring, tx_swbd); - if (tx_swbd->skb) { - dev_kfree_skb_any(tx_swbd->skb); + if (xdp_frame) { + xdp_return_frame(tx_swbd->xdp_frame); + tx_swbd->xdp_frame = NULL; + } else if (skb) { + dev_kfree_skb_any(skb); tx_swbd->skb = NULL; } } @@ -183,7 +206,7 @@ static int enetc_map_tx_buffs(struct enetc_bdr *tx_ring, struct sk_buff *skb, do { tx_swbd = &tx_ring->tx_swbd[i]; - enetc_free_tx_skb(tx_ring, tx_swbd); + enetc_free_tx_frame(tx_ring, tx_swbd); if (i == 0) i = tx_ring->bd_count; i--; @@ -381,6 +404,9 @@ static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget) do_tstamp = false; while (bds_to_clean && tx_frm_cnt < ENETC_DEFAULT_TX_WORK) { + struct xdp_frame *xdp_frame = enetc_tx_swbd_get_xdp_frame(tx_swbd); + struct sk_buff *skb = enetc_tx_swbd_get_skb(tx_swbd); + if (unlikely(tx_swbd->check_wb)) { struct enetc_ndev_priv *priv = netdev_priv(ndev); union enetc_tx_bd *txbd; @@ -400,12 +426,15 @@ static bool enetc_clean_tx_ring(struct enetc_bdr *tx_ring, int napi_budget) else if (likely(tx_swbd->dma)) enetc_unmap_tx_buff(tx_ring, tx_swbd); - if (tx_swbd->skb) { + if (xdp_frame) { + xdp_return_frame(xdp_frame); + tx_swbd->xdp_frame = NULL; + } else if (skb) { if (unlikely(do_tstamp)) { - enetc_tstamp_tx(tx_swbd->skb, tstamp); + enetc_tstamp_tx(skb, tstamp); do_tstamp = false; } - napi_consume_skb(tx_swbd->skb, napi_budget); + napi_consume_skb(skb, napi_budget); tx_swbd->skb = NULL; } @@ -827,6 +856,109 @@ static bool enetc_xdp_tx(struct enetc_bdr *tx_ring, return true; } +static int enetc_xdp_frame_to_xdp_tx_swbd(struct enetc_bdr *tx_ring, + struct enetc_tx_swbd *xdp_tx_arr, + struct xdp_frame *xdp_frame) +{ + struct enetc_tx_swbd *xdp_tx_swbd = &xdp_tx_arr[0]; + struct skb_shared_info *shinfo; + void *data = xdp_frame->data; + int len = xdp_frame->len; + skb_frag_t *frag; + dma_addr_t dma; + unsigned int f; + int n = 0; + + dma = dma_map_single(tx_ring->dev, data, len, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx_ring->dev, dma))) { + netdev_err(tx_ring->ndev, "DMA map error\n"); + return -1; + } + + xdp_tx_swbd->dma = dma; + xdp_tx_swbd->dir = DMA_TO_DEVICE; + xdp_tx_swbd->len = len; + xdp_tx_swbd->is_xdp_redirect = true; + xdp_tx_swbd->is_eof = false; + xdp_tx_swbd->xdp_frame = NULL; + + n++; + xdp_tx_swbd = &xdp_tx_arr[n]; + + shinfo = xdp_get_shared_info_from_frame(xdp_frame); + + for (f = 0, frag = &shinfo->frags[0]; f < shinfo->nr_frags; + f++, frag++) { + data = skb_frag_address(frag); + len = skb_frag_size(frag); + + dma = dma_map_single(tx_ring->dev, data, len, DMA_TO_DEVICE); + if (unlikely(dma_mapping_error(tx_ring->dev, dma))) { + /* Undo the DMA mapping for all fragments */ + while (n-- >= 0) + enetc_unmap_tx_buff(tx_ring, &xdp_tx_arr[n]); + + netdev_err(tx_ring->ndev, "DMA map error\n"); + return -1; + } + + xdp_tx_swbd->dma = dma; + xdp_tx_swbd->dir = DMA_TO_DEVICE; + xdp_tx_swbd->len = len; + xdp_tx_swbd->is_xdp_redirect = true; + xdp_tx_swbd->is_eof = false; + xdp_tx_swbd->xdp_frame = NULL; + + n++; + xdp_tx_swbd = &xdp_tx_arr[n]; + } + + xdp_tx_arr[n - 1].is_eof = true; + xdp_tx_arr[n - 1].xdp_frame = xdp_frame; + + return n; +} + +int enetc_xdp_xmit(struct net_device *ndev, int num_frames, + struct xdp_frame **frames, u32 flags) +{ + struct enetc_tx_swbd xdp_redirect_arr[ENETC_MAX_SKB_FRAGS] = {0}; + struct enetc_ndev_priv *priv = netdev_priv(ndev); + struct enetc_bdr *tx_ring; + int xdp_tx_bd_cnt, i, k; + int xdp_tx_frm_cnt = 0; + + tx_ring = priv->tx_ring[smp_processor_id()]; + + prefetchw(ENETC_TXBD(*tx_ring, tx_ring->next_to_use)); + + for (k = 0; k < num_frames; k++) { + xdp_tx_bd_cnt = enetc_xdp_frame_to_xdp_tx_swbd(tx_ring, + xdp_redirect_arr, + frames[k]); + if (unlikely(xdp_tx_bd_cnt < 0)) + break; + + if (unlikely(!enetc_xdp_tx(tx_ring, xdp_redirect_arr, + xdp_tx_bd_cnt))) { + for (i = 0; i < xdp_tx_bd_cnt; i++) + enetc_unmap_tx_buff(tx_ring, + &xdp_redirect_arr[i]); + tx_ring->stats.xdp_tx_drops++; + break; + } + + xdp_tx_frm_cnt++; + } + + if (unlikely((flags & XDP_XMIT_FLUSH) || k != xdp_tx_frm_cnt)) + enetc_update_tx_ring_tail(tx_ring); + + tx_ring->stats.xdp_tx += xdp_tx_frm_cnt; + + return xdp_tx_frm_cnt; +} + static void enetc_map_rx_buff_to_xdp(struct enetc_bdr *rx_ring, int i, struct xdp_buff *xdp_buff, u16 size) { @@ -948,14 +1080,31 @@ static void enetc_xdp_drop(struct enetc_bdr *rx_ring, int rx_ring_first, rx_ring->stats.xdp_drops++; } +static void enetc_xdp_free(struct enetc_bdr *rx_ring, int rx_ring_first, + int rx_ring_last) +{ + while (rx_ring_first != rx_ring_last) { + struct enetc_rx_swbd *rx_swbd = &rx_ring->rx_swbd[rx_ring_first]; + + if (rx_swbd->page) { + dma_unmap_page(rx_ring->dev, rx_swbd->dma, PAGE_SIZE, + rx_swbd->dir); + __free_page(rx_swbd->page); + rx_swbd->page = NULL; + } + enetc_bdr_idx_inc(rx_ring, &rx_ring_first); + } + rx_ring->stats.xdp_redirect_failures++; +} + static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, struct napi_struct *napi, int work_limit, struct bpf_prog *prog) { + int xdp_tx_bd_cnt, xdp_tx_frm_cnt = 0, xdp_redirect_frm_cnt = 0; struct enetc_tx_swbd xdp_tx_arr[ENETC_MAX_SKB_FRAGS] = {0}; struct enetc_ndev_priv *priv = netdev_priv(rx_ring->ndev); struct enetc_bdr *tx_ring = priv->tx_ring[rx_ring->index]; - int xdp_tx_bd_cnt, xdp_tx_frm_cnt = 0; int rx_frm_cnt = 0, rx_byte_cnt = 0; int cleaned_cnt, i; u32 xdp_act; @@ -969,6 +1118,7 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, int orig_i, orig_cleaned_cnt; struct xdp_buff xdp_buff; struct sk_buff *skb; + int tmp_orig_i, err; u32 bd_status; rxbd = enetc_rxbd(rx_ring, i); @@ -1026,6 +1176,43 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, rx_ring->xdp.xdp_tx_in_flight += xdp_tx_bd_cnt; xdp_tx_frm_cnt++; } + break; + case XDP_REDIRECT: + /* xdp_return_frame does not support S/G in the sense + * that it leaks the fragments (__xdp_return should not + * call page_frag_free only for the initial buffer). + * Until XDP_REDIRECT gains support for S/G let's keep + * the code structure in place, but dead. We drop the + * S/G frames ourselves to avoid memory leaks which + * would otherwise leave the kernel OOM. + */ + if (unlikely(cleaned_cnt - orig_cleaned_cnt != 1)) { + enetc_xdp_drop(rx_ring, orig_i, i); + rx_ring->stats.xdp_redirect_sg++; + break; + } + + tmp_orig_i = orig_i; + + while (orig_i != i) { + enetc_put_rx_buff(rx_ring, + &rx_ring->rx_swbd[orig_i]); + enetc_bdr_idx_inc(rx_ring, &orig_i); + } + + err = xdp_do_redirect(rx_ring->ndev, &xdp_buff, prog); + if (unlikely(err)) { + enetc_xdp_free(rx_ring, tmp_orig_i, i); + } else { + xdp_redirect_frm_cnt++; + rx_ring->stats.xdp_redirect++; + } + + if (unlikely(xdp_redirect_frm_cnt > ENETC_DEFAULT_TX_WORK)) { + xdp_do_flush_map(); + xdp_redirect_frm_cnt = 0; + } + break; default: bpf_warn_invalid_xdp_action(xdp_act); @@ -1039,6 +1226,9 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, rx_ring->stats.packets += rx_frm_cnt; rx_ring->stats.bytes += rx_byte_cnt; + if (xdp_redirect_frm_cnt) + xdp_do_flush_map(); + if (xdp_tx_frm_cnt) enetc_update_tx_ring_tail(tx_ring); @@ -1173,7 +1363,7 @@ static void enetc_free_txbdr(struct enetc_bdr *txr) int size, i; for (i = 0; i < txr->bd_count; i++) - enetc_free_tx_skb(txr, &txr->tx_swbd[i]); + enetc_free_tx_frame(txr, &txr->tx_swbd[i]); size = txr->bd_count * sizeof(union enetc_tx_bd); @@ -1290,7 +1480,7 @@ static void enetc_free_tx_ring(struct enetc_bdr *tx_ring) for (i = 0; i < tx_ring->bd_count; i++) { struct enetc_tx_swbd *tx_swbd = &tx_ring->tx_swbd[i]; - enetc_free_tx_skb(tx_ring, tx_swbd); + enetc_free_tx_frame(tx_ring, tx_swbd); } tx_ring->next_to_clean = 0; diff --git a/drivers/net/ethernet/freescale/enetc/enetc.h b/drivers/net/ethernet/freescale/enetc/enetc.h index d0619fcbbe97f..05474f46b0d91 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.h +++ b/drivers/net/ethernet/freescale/enetc/enetc.h @@ -19,7 +19,10 @@ (ETH_FCS_LEN + ETH_HLEN + VLAN_HLEN)) struct enetc_tx_swbd { - struct sk_buff *skb; + union { + struct sk_buff *skb; + struct xdp_frame *xdp_frame; + }; dma_addr_t dma; struct page *page; /* valid only if is_xdp_tx */ u16 page_offset; /* valid only if is_xdp_tx */ @@ -30,6 +33,7 @@ struct enetc_tx_swbd { u8 do_tstamp:1; u8 is_eof:1; u8 is_xdp_tx:1; + u8 is_xdp_redirect:1; }; #define ENETC_RX_MAXFRM_SIZE ENETC_MAC_MAXFRM_SIZE @@ -61,6 +65,9 @@ struct enetc_ring_stats { unsigned int xdp_drops; unsigned int xdp_tx; unsigned int xdp_tx_drops; + unsigned int xdp_redirect; + unsigned int xdp_redirect_failures; + unsigned int xdp_redirect_sg; unsigned int recycles; unsigned int recycle_failures; }; @@ -354,6 +361,8 @@ int enetc_ioctl(struct net_device *ndev, struct ifreq *rq, int cmd); int enetc_setup_tc(struct net_device *ndev, enum tc_setup_type type, void *type_data); int enetc_setup_bpf(struct net_device *dev, struct netdev_bpf *xdp); +int enetc_xdp_xmit(struct net_device *ndev, int num_frames, + struct xdp_frame **frames, u32 flags); /* ethtool */ void enetc_set_ethtool_ops(struct net_device *ndev); diff --git a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c index 37821a8b225e6..7cc81b453bd7a 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_ethtool.c @@ -195,6 +195,9 @@ static const char rx_ring_stats[][ETH_GSTRING_LEN] = { "Rx ring %2d XDP drops", "Rx ring %2d recycles", "Rx ring %2d recycle failures", + "Rx ring %2d redirects", + "Rx ring %2d redirect failures", + "Rx ring %2d redirect S/G", }; static const char tx_ring_stats[][ETH_GSTRING_LEN] = { @@ -284,6 +287,9 @@ static void enetc_get_ethtool_stats(struct net_device *ndev, data[o++] = priv->rx_ring[i]->stats.xdp_drops; data[o++] = priv->rx_ring[i]->stats.recycles; data[o++] = priv->rx_ring[i]->stats.recycle_failures; + data[o++] = priv->rx_ring[i]->stats.xdp_redirect; + data[o++] = priv->rx_ring[i]->stats.xdp_redirect_failures; + data[o++] = priv->rx_ring[i]->stats.xdp_redirect_sg; } if (!enetc_si_is_pf(priv->si)) diff --git a/drivers/net/ethernet/freescale/enetc/enetc_pf.c b/drivers/net/ethernet/freescale/enetc/enetc_pf.c index 0484dbe13422b..f61fedf462e5e 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc_pf.c +++ b/drivers/net/ethernet/freescale/enetc/enetc_pf.c @@ -708,6 +708,7 @@ static const struct net_device_ops enetc_ndev_ops = { .ndo_do_ioctl = enetc_ioctl, .ndo_setup_tc = enetc_setup_tc, .ndo_bpf = enetc_setup_bpf, + .ndo_xdp_xmit = enetc_xdp_xmit, }; static void enetc_pf_netdev_setup(struct enetc_si *si, struct net_device *ndev, From b494ba5a3cf822fa99fb941cd1c293da21f4f927 Mon Sep 17 00:00:00 2001 From: Voon Weifeng Date: Thu, 1 Apr 2021 00:18:25 +0800 Subject: [PATCH 34/36] net: stmmac: enable MTL ECC Error Address Status Over-ride by default Turn on the MEEAO field of MTL_ECC_Control_Register by default. As the MTL ECC Error Address Status Over-ride(MEEAO) is set by default, the following error address fields will hold the last valid address where the error is detected. Signed-off-by: Voon Weifeng Signed-off-by: Tan Tee Min Co-developed-by: Wong Vee Khee Signed-off-by: Wong Vee Khee Signed-off-by: David S. Miller --- drivers/net/ethernet/stmicro/stmmac/dwmac5.c | 1 + drivers/net/ethernet/stmicro/stmmac/dwmac5.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c index 5b010ebfede9f..d8c6ff7252377 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac5.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac5.c @@ -192,6 +192,7 @@ int dwmac5_safety_feat_config(void __iomem *ioaddr, unsigned int asp) /* 1. Enable Safety Features */ value = readl(ioaddr + MTL_ECC_CONTROL); + value |= MEEAO; /* MTL ECC Error Addr Status Override */ value |= TSOEE; /* TSO ECC */ value |= MRXPEE; /* MTL RX Parser ECC */ value |= MESTEE; /* MTL EST ECC */ diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac5.h b/drivers/net/ethernet/stmicro/stmmac/dwmac5.h index ff555d8b0cdf9..6b2fd37b29ada 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac5.h +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac5.h @@ -98,6 +98,7 @@ #define ADDR GENMASK(15, 0) #define MTL_RXP_IACC_DATA 0x00000cb4 #define MTL_ECC_CONTROL 0x00000cc0 +#define MEEAO BIT(8) #define TSOEE BIT(4) #define MRXPEE BIT(3) #define MESTEE BIT(2) From 917e2e6c57980e2255c5eb8ddd77ed670ae49752 Mon Sep 17 00:00:00 2001 From: Frank Wunderlich Date: Wed, 31 Mar 2021 15:34:37 +0200 Subject: [PATCH 35/36] net: mediatek: add flow offload for mt7623 mt7623 uses offload version 2 too tested on Bananapi-R2 Signed-off-by: Frank Wunderlich Signed-off-by: David S. Miller --- drivers/net/ethernet/mediatek/mtk_eth_soc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c b/drivers/net/ethernet/mediatek/mtk_eth_soc.c index 0396f0db855f4..810def064f11b 100644 --- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c +++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c @@ -3202,6 +3202,7 @@ static const struct mtk_soc_data mt7623_data = { .hw_features = MTK_HW_FEATURES, .required_clks = MT7623_CLKS_BITMAP, .required_pctl = true, + .offload_version = 2, }; static const struct mtk_soc_data mt7629_data = { From 040806343bb4ef6365166eae666ced8a91b95321 Mon Sep 17 00:00:00 2001 From: Carlos Llamas Date: Thu, 1 Apr 2021 00:40:20 +0000 Subject: [PATCH 36/36] selftests/net: so_txtime multi-host support SO_TXTIME hardware offload requires testing across devices, either between machines or separate network namespaces. Split up SO_TXTIME test into tx and rx modes, so traffic can be sent from one process to another. Create a veth-pair on different namespaces and bind each process to an end point via [-S]ource and [-D]estination parameters. Optional start [-t]ime parameter can be passed to synchronize the test across the hosts (with synchorinzed clocks). Signed-off-by: Carlos Llamas Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- tools/testing/selftests/net/so_txtime.c | 247 +++++++++++++++++------ tools/testing/selftests/net/so_txtime.sh | 97 +++++++-- 2 files changed, 259 insertions(+), 85 deletions(-) diff --git a/tools/testing/selftests/net/so_txtime.c b/tools/testing/selftests/net/so_txtime.c index b4cca382d1254..59067f64b7753 100644 --- a/tools/testing/selftests/net/so_txtime.c +++ b/tools/testing/selftests/net/so_txtime.c @@ -2,9 +2,12 @@ /* * Test the SO_TXTIME API * - * Takes two streams of { payload, delivery time }[], one input and one output. - * Sends the input stream and verifies arrival matches the output stream. - * The two streams can differ due to out-of-order delivery and drops. + * Takes a stream of { payload, delivery time }[], to be sent across two + * processes. Start this program on two separate network namespaces or + * connected hosts, one instance in transmit mode and the other in receive + * mode using the '-r' option. Receiver will compare arrival timestamps to + * the expected stream. Sender will read transmit timestamps from the error + * queue. The streams can differ due to out-of-order delivery and drops. */ #define _GNU_SOURCE @@ -28,14 +31,17 @@ #include #include #include +#include static int cfg_clockid = CLOCK_TAI; -static bool cfg_do_ipv4; -static bool cfg_do_ipv6; static uint16_t cfg_port = 8000; static int cfg_variance_us = 4000; +static uint64_t cfg_start_time_ns; +static int cfg_mark; +static bool cfg_rx; static uint64_t glob_tstart; +static uint64_t tdeliver_max; /* encode one timed transmission (of a 1B payload) */ struct timed_send { @@ -44,18 +50,21 @@ struct timed_send { }; #define MAX_NUM_PKT 8 -static struct timed_send cfg_in[MAX_NUM_PKT]; -static struct timed_send cfg_out[MAX_NUM_PKT]; +static struct timed_send cfg_buf[MAX_NUM_PKT]; static int cfg_num_pkt; static int cfg_errq_level; static int cfg_errq_type; -static uint64_t gettime_ns(void) +static struct sockaddr_storage cfg_dst_addr; +static struct sockaddr_storage cfg_src_addr; +static socklen_t cfg_alen; + +static uint64_t gettime_ns(clockid_t clock) { struct timespec ts; - if (clock_gettime(cfg_clockid, &ts)) + if (clock_gettime(clock, &ts)) error(1, errno, "gettime"); return ts.tv_sec * (1000ULL * 1000 * 1000) + ts.tv_nsec; @@ -75,6 +84,8 @@ static void do_send_one(int fdt, struct timed_send *ts) msg.msg_iov = &iov; msg.msg_iovlen = 1; + msg.msg_name = (struct sockaddr *)&cfg_dst_addr; + msg.msg_namelen = cfg_alen; if (ts->delay_us >= 0) { memset(control, 0, sizeof(control)); @@ -82,6 +93,8 @@ static void do_send_one(int fdt, struct timed_send *ts) msg.msg_controllen = sizeof(control); tdeliver = glob_tstart + ts->delay_us * 1000; + tdeliver_max = tdeliver_max > tdeliver ? + tdeliver_max : tdeliver; cm = CMSG_FIRSTHDR(&msg); cm->cmsg_level = SOL_SOCKET; @@ -98,7 +111,7 @@ static void do_send_one(int fdt, struct timed_send *ts) } -static bool do_recv_one(int fdr, struct timed_send *ts) +static void do_recv_one(int fdr, struct timed_send *ts) { int64_t tstop, texpect; char rbuf[2]; @@ -106,13 +119,13 @@ static bool do_recv_one(int fdr, struct timed_send *ts) ret = recv(fdr, rbuf, sizeof(rbuf), 0); if (ret == -1 && errno == EAGAIN) - return true; + error(1, EAGAIN, "recv: timeout"); if (ret == -1) error(1, errno, "read"); if (ret != 1) error(1, 0, "read: %dB", ret); - tstop = (gettime_ns() - glob_tstart) / 1000; + tstop = (gettime_ns(cfg_clockid) - glob_tstart) / 1000; texpect = ts->delay_us >= 0 ? ts->delay_us : 0; fprintf(stderr, "payload:%c delay:%lld expected:%lld (us)\n", @@ -123,8 +136,6 @@ static bool do_recv_one(int fdr, struct timed_send *ts) if (llabs(tstop - texpect) > cfg_variance_us) error(1, 0, "exceeds variance (%d us)", cfg_variance_us); - - return false; } static void do_recv_verify_empty(int fdr) @@ -137,18 +148,18 @@ static void do_recv_verify_empty(int fdr) error(1, 0, "recv: not empty as expected (%d, %d)", ret, errno); } -static void do_recv_errqueue_timeout(int fdt) +static int do_recv_errqueue_timeout(int fdt) { char control[CMSG_SPACE(sizeof(struct sock_extended_err)) + CMSG_SPACE(sizeof(struct sockaddr_in6))] = {0}; char data[sizeof(struct ethhdr) + sizeof(struct ipv6hdr) + sizeof(struct udphdr) + 1]; struct sock_extended_err *err; + int ret, num_tstamp = 0; struct msghdr msg = {0}; struct iovec iov = {0}; struct cmsghdr *cm; int64_t tstamp = 0; - int ret; iov.iov_base = data; iov.iov_len = sizeof(data); @@ -206,9 +217,47 @@ static void do_recv_errqueue_timeout(int fdt) msg.msg_flags = 0; msg.msg_controllen = sizeof(control); + num_tstamp++; } - error(1, 0, "recv: timeout"); + return num_tstamp; +} + +static void recv_errqueue_msgs(int fdt) +{ + struct pollfd pfd = { .fd = fdt, .events = POLLERR }; + const int timeout_ms = 10; + int ret, num_tstamp = 0; + + do { + ret = poll(&pfd, 1, timeout_ms); + if (ret == -1) + error(1, errno, "poll"); + + if (ret && (pfd.revents & POLLERR)) + num_tstamp += do_recv_errqueue_timeout(fdt); + + if (num_tstamp == cfg_num_pkt) + break; + + } while (gettime_ns(cfg_clockid) < tdeliver_max); +} + +static void start_time_wait(void) +{ + uint64_t now; + int err; + + if (!cfg_start_time_ns) + return; + + now = gettime_ns(CLOCK_REALTIME); + if (cfg_start_time_ns < now) + return; + + err = usleep((cfg_start_time_ns - now) / 1000); + if (err) + error(1, errno, "usleep"); } static void setsockopt_txtime(int fd) @@ -245,6 +294,10 @@ static int setup_tx(struct sockaddr *addr, socklen_t alen) setsockopt_txtime(fd); + if (cfg_mark && + setsockopt(fd, SOL_SOCKET, SO_MARK, &cfg_mark, sizeof(cfg_mark))) + error(1, errno, "setsockopt mark"); + return fd; } @@ -266,31 +319,70 @@ static int setup_rx(struct sockaddr *addr, socklen_t alen) return fd; } -static void do_test(struct sockaddr *addr, socklen_t alen) +static void do_test_tx(struct sockaddr *addr, socklen_t alen) { - int fdt, fdr, i; + int fdt, i; fprintf(stderr, "\nSO_TXTIME ipv%c clock %s\n", addr->sa_family == PF_INET ? '4' : '6', cfg_clockid == CLOCK_TAI ? "tai" : "monotonic"); fdt = setup_tx(addr, alen); - fdr = setup_rx(addr, alen); - glob_tstart = gettime_ns(); + start_time_wait(); + glob_tstart = gettime_ns(cfg_clockid); for (i = 0; i < cfg_num_pkt; i++) - do_send_one(fdt, &cfg_in[i]); + do_send_one(fdt, &cfg_buf[i]); + + recv_errqueue_msgs(fdt); + + if (close(fdt)) + error(1, errno, "close t"); +} + +static void do_test_rx(struct sockaddr *addr, socklen_t alen) +{ + int fdr, i; + + fdr = setup_rx(addr, alen); + + start_time_wait(); + glob_tstart = gettime_ns(cfg_clockid); + for (i = 0; i < cfg_num_pkt; i++) - if (do_recv_one(fdr, &cfg_out[i])) - do_recv_errqueue_timeout(fdt); + do_recv_one(fdr, &cfg_buf[i]); do_recv_verify_empty(fdr); if (close(fdr)) error(1, errno, "close r"); - if (close(fdt)) - error(1, errno, "close t"); +} + +static void setup_sockaddr(int domain, const char *str_addr, + struct sockaddr_storage *sockaddr) +{ + struct sockaddr_in6 *addr6 = (void *) sockaddr; + struct sockaddr_in *addr4 = (void *) sockaddr; + + switch (domain) { + case PF_INET: + memset(addr4, 0, sizeof(*addr4)); + addr4->sin_family = AF_INET; + addr4->sin_port = htons(cfg_port); + if (str_addr && + inet_pton(AF_INET, str_addr, &(addr4->sin_addr)) != 1) + error(1, 0, "ipv4 parse error: %s", str_addr); + break; + case PF_INET6: + memset(addr6, 0, sizeof(*addr6)); + addr6->sin6_family = AF_INET6; + addr6->sin6_port = htons(cfg_port); + if (str_addr && + inet_pton(AF_INET6, str_addr, &(addr6->sin6_addr)) != 1) + error(1, 0, "ipv6 parse error: %s", str_addr); + break; + } } static int parse_io(const char *optarg, struct timed_send *array) @@ -323,17 +415,46 @@ static int parse_io(const char *optarg, struct timed_send *array) return aoff / 2; } +static void usage(const char *progname) +{ + fprintf(stderr, "\nUsage: %s [options] \n" + "Options:\n" + " -4 only IPv4\n" + " -6 only IPv6\n" + " -c monotonic (default) or tai\n" + " -D destination IP address (server)\n" + " -S source IP address (client)\n" + " -r run rx mode\n" + " -t start time (UTC nanoseconds)\n" + " -m socket mark\n" + "\n", + progname); + exit(1); +} + static void parse_opts(int argc, char **argv) { - int c, ilen, olen; + char *daddr = NULL, *saddr = NULL; + int domain = PF_UNSPEC; + int c; - while ((c = getopt(argc, argv, "46c:")) != -1) { + while ((c = getopt(argc, argv, "46c:S:D:rt:m:")) != -1) { switch (c) { case '4': - cfg_do_ipv4 = true; + if (domain != PF_UNSPEC) + error(1, 0, "Pass one of -4 or -6"); + domain = PF_INET; + cfg_alen = sizeof(struct sockaddr_in); + cfg_errq_level = SOL_IP; + cfg_errq_type = IP_RECVERR; break; case '6': - cfg_do_ipv6 = true; + if (domain != PF_UNSPEC) + error(1, 0, "Pass one of -4 or -6"); + domain = PF_INET6; + cfg_alen = sizeof(struct sockaddr_in6); + cfg_errq_level = SOL_IPV6; + cfg_errq_type = IPV6_RECVERR; break; case 'c': if (!strcmp(optarg, "tai")) @@ -344,50 +465,50 @@ static void parse_opts(int argc, char **argv) else error(1, 0, "unknown clock id %s", optarg); break; + case 'S': + saddr = optarg; + break; + case 'D': + daddr = optarg; + break; + case 'r': + cfg_rx = true; + break; + case 't': + cfg_start_time_ns = strtol(optarg, NULL, 0); + break; + case 'm': + cfg_mark = strtol(optarg, NULL, 0); + break; default: - error(1, 0, "parse error at %d", optind); + usage(argv[0]); } } - if (argc - optind != 2) - error(1, 0, "Usage: %s [-46] -c ", argv[0]); + if (argc - optind != 1) + usage(argv[0]); + + if (domain == PF_UNSPEC) + error(1, 0, "Pass one of -4 or -6"); + if (!daddr) + error(1, 0, "-D required\n"); + if (!cfg_rx && !saddr) + error(1, 0, "-S required\n"); - ilen = parse_io(argv[optind], cfg_in); - olen = parse_io(argv[optind + 1], cfg_out); - if (ilen != olen) - error(1, 0, "i/o streams len mismatch (%d, %d)\n", ilen, olen); - cfg_num_pkt = ilen; + setup_sockaddr(domain, daddr, &cfg_dst_addr); + setup_sockaddr(domain, saddr, &cfg_src_addr); + + cfg_num_pkt = parse_io(argv[optind], cfg_buf); } int main(int argc, char **argv) { parse_opts(argc, argv); - if (cfg_do_ipv6) { - struct sockaddr_in6 addr6 = {0}; - - addr6.sin6_family = AF_INET6; - addr6.sin6_port = htons(cfg_port); - addr6.sin6_addr = in6addr_loopback; - - cfg_errq_level = SOL_IPV6; - cfg_errq_type = IPV6_RECVERR; - - do_test((void *)&addr6, sizeof(addr6)); - } - - if (cfg_do_ipv4) { - struct sockaddr_in addr4 = {0}; - - addr4.sin_family = AF_INET; - addr4.sin_port = htons(cfg_port); - addr4.sin_addr.s_addr = htonl(INADDR_LOOPBACK); - - cfg_errq_level = SOL_IP; - cfg_errq_type = IP_RECVERR; - - do_test((void *)&addr4, sizeof(addr4)); - } + if (cfg_rx) + do_test_rx((void *)&cfg_dst_addr, cfg_alen); + else + do_test_tx((void *)&cfg_src_addr, cfg_alen); return 0; } diff --git a/tools/testing/selftests/net/so_txtime.sh b/tools/testing/selftests/net/so_txtime.sh index 3f7800eaecb1e..3f06f4d286a98 100755 --- a/tools/testing/selftests/net/so_txtime.sh +++ b/tools/testing/selftests/net/so_txtime.sh @@ -3,32 +3,85 @@ # # Regression tests for the SO_TXTIME interface -# Run in network namespace -if [[ $# -eq 0 ]]; then - if ! ./in_netns.sh $0 __subprocess; then - # test is time sensitive, can be flaky - echo "test failed: retry once" - ./in_netns.sh $0 __subprocess +set -e + +readonly DEV="veth0" +readonly BIN="./so_txtime" + +readonly RAND="$(mktemp -u XXXXXX)" +readonly NSPREFIX="ns-${RAND}" +readonly NS1="${NSPREFIX}1" +readonly NS2="${NSPREFIX}2" + +readonly SADDR4='192.168.1.1' +readonly DADDR4='192.168.1.2' +readonly SADDR6='fd::1' +readonly DADDR6='fd::2' + +cleanup() { + ip netns del "${NS2}" + ip netns del "${NS1}" +} + +trap cleanup EXIT + +# Create virtual ethernet pair between network namespaces +ip netns add "${NS1}" +ip netns add "${NS2}" + +ip link add "${DEV}" netns "${NS1}" type veth \ + peer name "${DEV}" netns "${NS2}" + +# Bring the devices up +ip -netns "${NS1}" link set "${DEV}" up +ip -netns "${NS2}" link set "${DEV}" up + +# Set fixed MAC addresses on the devices +ip -netns "${NS1}" link set dev "${DEV}" address 02:02:02:02:02:02 +ip -netns "${NS2}" link set dev "${DEV}" address 06:06:06:06:06:06 + +# Add fixed IP addresses to the devices +ip -netns "${NS1}" addr add 192.168.1.1/24 dev "${DEV}" +ip -netns "${NS2}" addr add 192.168.1.2/24 dev "${DEV}" +ip -netns "${NS1}" addr add fd::1/64 dev "${DEV}" nodad +ip -netns "${NS2}" addr add fd::2/64 dev "${DEV}" nodad + +do_test() { + local readonly IP="$1" + local readonly CLOCK="$2" + local readonly TXARGS="$3" + local readonly RXARGS="$4" + + if [[ "${IP}" == "4" ]]; then + local readonly SADDR="${SADDR4}" + local readonly DADDR="${DADDR4}" + elif [[ "${IP}" == "6" ]]; then + local readonly SADDR="${SADDR6}" + local readonly DADDR="${DADDR6}" + else + echo "Invalid IP version ${IP}" + exit 1 fi - exit $? -fi + local readonly START="$(date +%s%N --date="+ 0.1 seconds")" + ip netns exec "${NS2}" "${BIN}" -"${IP}" -c "${CLOCK}" -t "${START}" -S "${SADDR}" -D "${DADDR}" "${RXARGS}" -r & + ip netns exec "${NS1}" "${BIN}" -"${IP}" -c "${CLOCK}" -t "${START}" -S "${SADDR}" -D "${DADDR}" "${TXARGS}" + wait "$!" +} -set -e +ip netns exec "${NS1}" tc qdisc add dev "${DEV}" root fq +do_test 4 mono a,-1 a,-1 +do_test 6 mono a,0 a,0 +do_test 6 mono a,10 a,10 +do_test 4 mono a,10,b,20 a,10,b,20 +do_test 6 mono a,20,b,10 b,20,a,20 -tc qdisc add dev lo root fq -./so_txtime -4 -6 -c mono a,-1 a,-1 -./so_txtime -4 -6 -c mono a,0 a,0 -./so_txtime -4 -6 -c mono a,10 a,10 -./so_txtime -4 -6 -c mono a,10,b,20 a,10,b,20 -./so_txtime -4 -6 -c mono a,20,b,10 b,20,a,20 - -if tc qdisc replace dev lo root etf clockid CLOCK_TAI delta 400000; then - ! ./so_txtime -4 -6 -c tai a,-1 a,-1 - ! ./so_txtime -4 -6 -c tai a,0 a,0 - ./so_txtime -4 -6 -c tai a,10 a,10 - ./so_txtime -4 -6 -c tai a,10,b,20 a,10,b,20 - ./so_txtime -4 -6 -c tai a,20,b,10 b,10,a,20 +if ip netns exec "${NS1}" tc qdisc replace dev "${DEV}" root etf clockid CLOCK_TAI delta 400000; then + ! do_test 4 tai a,-1 a,-1 + ! do_test 6 tai a,0 a,0 + do_test 6 tai a,10 a,10 + do_test 4 tai a,10,b,20 a,10,b,20 + do_test 6 tai a,20,b,10 b,10,a,20 else echo "tc ($(tc -V)) does not support qdisc etf. skipping" fi