From c6b486fb33680ad5a3a6390ce693c835caaae3f7 Mon Sep 17 00:00:00 2001 From: Siddharth Vadapalli Date: Mon, 3 Apr 2023 14:33:21 +0530 Subject: [PATCH 1/7] net: ethernet: ti: am65-cpsw: Fix mdio cleanup in probe In the am65_cpsw_nuss_probe() function's cleanup path, the call to of_platform_device_destroy() for the common->mdio_dev device is invoked unconditionally. It is possible that either the MDIO node is not present in the device-tree, or the MDIO node is disabled in the device-tree. In both these cases, the MDIO device is not created, resulting in a NULL pointer dereference when the of_platform_device_destroy() function is invoked on the common->mdio_dev device on the cleanup path. Fix this by ensuring that the common->mdio_dev device exists, before attempting to invoke of_platform_device_destroy(). Fixes: a45cfcc69a25 ("net: ethernet: ti: am65-cpsw-nuss: use of_platform_device_create() for mdio") Signed-off-by: Siddharth Vadapalli Reviewed-by: Roger Quadros Link: https://lore.kernel.org/r/20230403090321.835877-1-s-vadapalli@ti.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 4e3861c47708c..bcea87b7151c0 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -2926,7 +2926,8 @@ static int am65_cpsw_nuss_probe(struct platform_device *pdev) am65_cpsw_nuss_phylink_cleanup(common); am65_cpts_release(common->cpts); err_of_clear: - of_platform_device_destroy(common->mdio_dev, NULL); + if (common->mdio_dev) + of_platform_device_destroy(common->mdio_dev, NULL); err_pm_clear: pm_runtime_put_sync(dev); pm_runtime_disable(dev); @@ -2956,7 +2957,8 @@ static int am65_cpsw_nuss_remove(struct platform_device *pdev) am65_cpts_release(common->cpts); am65_cpsw_disable_serdes_phy(common); - of_platform_device_destroy(common->mdio_dev, NULL); + if (common->mdio_dev) + of_platform_device_destroy(common->mdio_dev, NULL); pm_runtime_put_sync(&pdev->dev); pm_runtime_disable(&pdev->dev); From 218c597325f4faf7b7a6049233a30d7842b5b2dc Mon Sep 17 00:00:00 2001 From: Corinna Vinschen Date: Mon, 3 Apr 2023 14:11:20 +0200 Subject: [PATCH 2/7] net: stmmac: fix up RX flow hash indirection table when setting channels stmmac_reinit_queues() fails to fix up the RX hash. Even if the number of channels gets restricted, the output of `ethtool -x' indicates that all RX queues are used: $ ethtool -l enp0s29f2 Channel parameters for enp0s29f2: Pre-set maximums: RX: 8 TX: 8 Other: n/a Combined: n/a Current hardware settings: RX: 8 TX: 8 Other: n/a Combined: n/a $ ethtool -x enp0s29f2 RX flow hash indirection table for enp0s29f2 with 8 RX ring(s): 0: 0 1 2 3 4 5 6 7 8: 0 1 2 3 4 5 6 7 [...] $ ethtool -L enp0s29f2 rx 3 $ ethtool -x enp0s29f2 RX flow hash indirection table for enp0s29f2 with 3 RX ring(s): 0: 0 1 2 3 4 5 6 7 8: 0 1 2 3 4 5 6 7 [...] Fix this by setting the indirection table according to the number of specified queues. The result is now as expected: $ ethtool -L enp0s29f2 rx 3 $ ethtool -x enp0s29f2 RX flow hash indirection table for enp0s29f2 with 3 RX ring(s): 0: 0 1 2 0 1 2 0 1 8: 2 0 1 2 0 1 2 0 [...] Tested on Intel Elkhart Lake. Fixes: 0366f7e06a6b ("net: stmmac: add ethtool support for get/set channels") Signed-off-by: Corinna Vinschen Link: https://lore.kernel.org/r/20230403121120.489138-1-vinschen@redhat.com Signed-off-by: Paolo Abeni --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index d41a5f92aee7a..59cbf3597eb42 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -6950,7 +6950,7 @@ static void stmmac_napi_del(struct net_device *dev) int stmmac_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt) { struct stmmac_priv *priv = netdev_priv(dev); - int ret = 0; + int ret = 0, i; if (netif_running(dev)) stmmac_release(dev); @@ -6959,6 +6959,10 @@ int stmmac_reinit_queues(struct net_device *dev, u32 rx_cnt, u32 tx_cnt) priv->plat->rx_queues_to_use = rx_cnt; priv->plat->tx_queues_to_use = tx_cnt; + if (!netif_is_rxfh_configured(dev)) + for (i = 0; i < ARRAY_SIZE(priv->rss.table); i++) + priv->rss.table[i] = ethtool_rxfh_indir_default(i, + rx_cnt); stmmac_napi_add(dev); From 0a78cf7264d29abeca098eae0b188a10aabc8a32 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 3 Apr 2023 12:49:58 -0700 Subject: [PATCH 3/7] raw: Fix NULL deref in raw_get_next(). Dae R. Jeong reported a NULL deref in raw_get_next() [0]. It seems that the repro was running these sequences in parallel so that one thread was iterating on a socket that was being freed in another netns. unshare(0x40060200) r0 = syz_open_procfs(0x0, &(0x7f0000002080)='net/raw\x00') socket$inet_icmp_raw(0x2, 0x3, 0x1) pread64(r0, &(0x7f0000000000)=""/10, 0xa, 0x10000000007f) After commit 0daf07e52709 ("raw: convert raw sockets to RCU"), we use RCU and hlist_nulls_for_each_entry() to iterate over SOCK_RAW sockets. However, we should use spinlock for slow paths to avoid the NULL deref. Also, SOCK_RAW does not use SLAB_TYPESAFE_BY_RCU, and the slab object is not reused during iteration in the grace period. In fact, the lockless readers do not check the nulls marker with get_nulls_value(). So, SOCK_RAW should use hlist instead of hlist_nulls. Instead of adding an unnecessary barrier by sk_nulls_for_each_rcu(), let's convert hlist_nulls to hlist and use sk_for_each_rcu() for fast paths and sk_for_each() and spinlock for /proc/net/raw. [0]: general protection fault, probably for non-canonical address 0xdffffc0000000005: 0000 [#1] PREEMPT SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000028-0x000000000000002f] CPU: 2 PID: 20952 Comm: syz-executor.0 Not tainted 6.2.0-g048ec869bafd-dirty #7 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 RIP: 0010:read_pnet include/net/net_namespace.h:383 [inline] RIP: 0010:sock_net include/net/sock.h:649 [inline] RIP: 0010:raw_get_next net/ipv4/raw.c:974 [inline] RIP: 0010:raw_get_idx net/ipv4/raw.c:986 [inline] RIP: 0010:raw_seq_start+0x431/0x800 net/ipv4/raw.c:995 Code: ef e8 33 3d 94 f7 49 8b 6d 00 4c 89 ef e8 b7 65 5f f7 49 89 ed 49 83 c5 98 0f 84 9a 00 00 00 48 83 c5 c8 48 89 e8 48 c1 e8 03 <42> 80 3c 30 00 74 08 48 89 ef e8 00 3d 94 f7 4c 8b 7d 00 48 89 ef RSP: 0018:ffffc9001154f9b0 EFLAGS: 00010206 RAX: 0000000000000005 RBX: 1ffff1100302c8fd RCX: 0000000000000000 RDX: 0000000000000028 RSI: ffffc9001154f988 RDI: ffffc9000f77a338 RBP: 0000000000000029 R08: ffffffff8a50ffb4 R09: fffffbfff24b6bd9 R10: fffffbfff24b6bd9 R11: 0000000000000000 R12: ffff88801db73b78 R13: fffffffffffffff9 R14: dffffc0000000000 R15: 0000000000000030 FS: 00007f843ae8e700(0000) GS:ffff888063700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000055bb9614b35f CR3: 000000003c672000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: seq_read_iter+0x4c6/0x10f0 fs/seq_file.c:225 seq_read+0x224/0x320 fs/seq_file.c:162 pde_read fs/proc/inode.c:316 [inline] proc_reg_read+0x23f/0x330 fs/proc/inode.c:328 vfs_read+0x31e/0xd30 fs/read_write.c:468 ksys_pread64 fs/read_write.c:665 [inline] __do_sys_pread64 fs/read_write.c:675 [inline] __se_sys_pread64 fs/read_write.c:672 [inline] __x64_sys_pread64+0x1e9/0x280 fs/read_write.c:672 do_syscall_x64 arch/x86/entry/common.c:51 [inline] do_syscall_64+0x4e/0xa0 arch/x86/entry/common.c:82 entry_SYSCALL_64_after_hwframe+0x63/0xcd RIP: 0033:0x478d29 Code: f7 d8 64 89 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 bc ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007f843ae8dbe8 EFLAGS: 00000246 ORIG_RAX: 0000000000000011 RAX: ffffffffffffffda RBX: 0000000000791408 RCX: 0000000000478d29 RDX: 000000000000000a RSI: 0000000020000000 RDI: 0000000000000003 RBP: 00000000f477909a R08: 0000000000000000 R09: 0000000000000000 R10: 000010000000007f R11: 0000000000000246 R12: 0000000000791740 R13: 0000000000791414 R14: 0000000000791408 R15: 00007ffc2eb48a50 Modules linked in: ---[ end trace 0000000000000000 ]--- RIP: 0010:read_pnet include/net/net_namespace.h:383 [inline] RIP: 0010:sock_net include/net/sock.h:649 [inline] RIP: 0010:raw_get_next net/ipv4/raw.c:974 [inline] RIP: 0010:raw_get_idx net/ipv4/raw.c:986 [inline] RIP: 0010:raw_seq_start+0x431/0x800 net/ipv4/raw.c:995 Code: ef e8 33 3d 94 f7 49 8b 6d 00 4c 89 ef e8 b7 65 5f f7 49 89 ed 49 83 c5 98 0f 84 9a 00 00 00 48 83 c5 c8 48 89 e8 48 c1 e8 03 <42> 80 3c 30 00 74 08 48 89 ef e8 00 3d 94 f7 4c 8b 7d 00 48 89 ef RSP: 0018:ffffc9001154f9b0 EFLAGS: 00010206 RAX: 0000000000000005 RBX: 1ffff1100302c8fd RCX: 0000000000000000 RDX: 0000000000000028 RSI: ffffc9001154f988 RDI: ffffc9000f77a338 RBP: 0000000000000029 R08: ffffffff8a50ffb4 R09: fffffbfff24b6bd9 R10: fffffbfff24b6bd9 R11: 0000000000000000 R12: ffff88801db73b78 R13: fffffffffffffff9 R14: dffffc0000000000 R15: 0000000000000030 FS: 00007f843ae8e700(0000) GS:ffff888063700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f92ff166000 CR3: 000000003c672000 CR4: 00000000003506e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Fixes: 0daf07e52709 ("raw: convert raw sockets to RCU") Reported-by: syzbot Reported-by: Dae R. Jeong Link: https://lore.kernel.org/netdev/ZCA2mGV_cmq7lIfV@dragonet/ Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- include/net/raw.h | 4 ++-- net/ipv4/raw.c | 36 +++++++++++++++++++----------------- net/ipv4/raw_diag.c | 10 ++++------ net/ipv6/raw.c | 10 ++++------ 4 files changed, 29 insertions(+), 31 deletions(-) diff --git a/include/net/raw.h b/include/net/raw.h index 2c004c20ed996..3af5289fdead9 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -37,7 +37,7 @@ int raw_rcv(struct sock *, struct sk_buff *); struct raw_hashinfo { spinlock_t lock; - struct hlist_nulls_head ht[RAW_HTABLE_SIZE] ____cacheline_aligned; + struct hlist_head ht[RAW_HTABLE_SIZE] ____cacheline_aligned; }; static inline u32 raw_hashfunc(const struct net *net, u32 proto) @@ -51,7 +51,7 @@ static inline void raw_hashinfo_init(struct raw_hashinfo *hashinfo) spin_lock_init(&hashinfo->lock); for (i = 0; i < RAW_HTABLE_SIZE; i++) - INIT_HLIST_NULLS_HEAD(&hashinfo->ht[i], i); + INIT_HLIST_HEAD(&hashinfo->ht[i]); } #ifdef CONFIG_PROC_FS diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 94df935ee0c5a..8088a5011e7df 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -91,12 +91,12 @@ EXPORT_SYMBOL_GPL(raw_v4_hashinfo); int raw_hash_sk(struct sock *sk) { struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; - struct hlist_nulls_head *hlist; + struct hlist_head *hlist; hlist = &h->ht[raw_hashfunc(sock_net(sk), inet_sk(sk)->inet_num)]; spin_lock(&h->lock); - __sk_nulls_add_node_rcu(sk, hlist); + sk_add_node_rcu(sk, hlist); sock_set_flag(sk, SOCK_RCU_FREE); spin_unlock(&h->lock); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); @@ -110,7 +110,7 @@ void raw_unhash_sk(struct sock *sk) struct raw_hashinfo *h = sk->sk_prot->h.raw_hash; spin_lock(&h->lock); - if (__sk_nulls_del_node_init_rcu(sk)) + if (sk_del_node_init_rcu(sk)) sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1); spin_unlock(&h->lock); } @@ -163,16 +163,15 @@ static int icmp_filter(const struct sock *sk, const struct sk_buff *skb) static int raw_v4_input(struct net *net, struct sk_buff *skb, const struct iphdr *iph, int hash) { - struct hlist_nulls_head *hlist; - struct hlist_nulls_node *hnode; int sdif = inet_sdif(skb); + struct hlist_head *hlist; int dif = inet_iif(skb); int delivered = 0; struct sock *sk; hlist = &raw_v4_hashinfo.ht[hash]; rcu_read_lock(); - sk_nulls_for_each(sk, hnode, hlist) { + sk_for_each_rcu(sk, hlist) { if (!raw_v4_match(net, sk, iph->protocol, iph->saddr, iph->daddr, dif, sdif)) continue; @@ -264,10 +263,9 @@ static void raw_err(struct sock *sk, struct sk_buff *skb, u32 info) void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) { struct net *net = dev_net(skb->dev); - struct hlist_nulls_head *hlist; - struct hlist_nulls_node *hnode; int dif = skb->dev->ifindex; int sdif = inet_sdif(skb); + struct hlist_head *hlist; const struct iphdr *iph; struct sock *sk; int hash; @@ -276,7 +274,7 @@ void raw_icmp_error(struct sk_buff *skb, int protocol, u32 info) hlist = &raw_v4_hashinfo.ht[hash]; rcu_read_lock(); - sk_nulls_for_each(sk, hnode, hlist) { + sk_for_each_rcu(sk, hlist) { iph = (const struct iphdr *)skb->data; if (!raw_v4_match(net, sk, iph->protocol, iph->daddr, iph->saddr, dif, sdif)) @@ -950,14 +948,13 @@ static struct sock *raw_get_first(struct seq_file *seq, int bucket) { struct raw_hashinfo *h = pde_data(file_inode(seq->file)); struct raw_iter_state *state = raw_seq_private(seq); - struct hlist_nulls_head *hlist; - struct hlist_nulls_node *hnode; + struct hlist_head *hlist; struct sock *sk; for (state->bucket = bucket; state->bucket < RAW_HTABLE_SIZE; ++state->bucket) { hlist = &h->ht[state->bucket]; - sk_nulls_for_each(sk, hnode, hlist) { + sk_for_each(sk, hlist) { if (sock_net(sk) == seq_file_net(seq)) return sk; } @@ -970,7 +967,7 @@ static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk) struct raw_iter_state *state = raw_seq_private(seq); do { - sk = sk_nulls_next(sk); + sk = sk_next(sk); } while (sk && sock_net(sk) != seq_file_net(seq)); if (!sk) @@ -989,9 +986,12 @@ static struct sock *raw_get_idx(struct seq_file *seq, loff_t pos) } void *raw_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(RCU) + __acquires(&h->lock) { - rcu_read_lock(); + struct raw_hashinfo *h = pde_data(file_inode(seq->file)); + + spin_lock(&h->lock); + return *pos ? raw_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; } EXPORT_SYMBOL_GPL(raw_seq_start); @@ -1010,9 +1010,11 @@ void *raw_seq_next(struct seq_file *seq, void *v, loff_t *pos) EXPORT_SYMBOL_GPL(raw_seq_next); void raw_seq_stop(struct seq_file *seq, void *v) - __releases(RCU) + __releases(&h->lock) { - rcu_read_unlock(); + struct raw_hashinfo *h = pde_data(file_inode(seq->file)); + + spin_unlock(&h->lock); } EXPORT_SYMBOL_GPL(raw_seq_stop); diff --git a/net/ipv4/raw_diag.c b/net/ipv4/raw_diag.c index 999321834b94a..da3591a66a169 100644 --- a/net/ipv4/raw_diag.c +++ b/net/ipv4/raw_diag.c @@ -57,8 +57,7 @@ static bool raw_lookup(struct net *net, struct sock *sk, static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r) { struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); - struct hlist_nulls_head *hlist; - struct hlist_nulls_node *hnode; + struct hlist_head *hlist; struct sock *sk; int slot; @@ -68,7 +67,7 @@ static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 rcu_read_lock(); for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) { hlist = &hashinfo->ht[slot]; - sk_nulls_for_each(sk, hnode, hlist) { + sk_for_each_rcu(sk, hlist) { if (raw_lookup(net, sk, r)) { /* * Grab it and keep until we fill @@ -142,9 +141,8 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, struct raw_hashinfo *hashinfo = raw_get_hashinfo(r); struct net *net = sock_net(skb->sk); struct inet_diag_dump_data *cb_data; - struct hlist_nulls_head *hlist; - struct hlist_nulls_node *hnode; int num, s_num, slot, s_slot; + struct hlist_head *hlist; struct sock *sk = NULL; struct nlattr *bc; @@ -161,7 +159,7 @@ static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, num = 0; hlist = &hashinfo->ht[slot]; - sk_nulls_for_each(sk, hnode, hlist) { + sk_for_each_rcu(sk, hlist) { struct inet_sock *inet = inet_sk(sk); if (!net_eq(sock_net(sk), net)) diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index bac9ba747bdec..a327aa481df48 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -141,10 +141,9 @@ EXPORT_SYMBOL(rawv6_mh_filter_unregister); static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) { struct net *net = dev_net(skb->dev); - struct hlist_nulls_head *hlist; - struct hlist_nulls_node *hnode; const struct in6_addr *saddr; const struct in6_addr *daddr; + struct hlist_head *hlist; struct sock *sk; bool delivered = false; __u8 hash; @@ -155,7 +154,7 @@ static bool ipv6_raw_deliver(struct sk_buff *skb, int nexthdr) hash = raw_hashfunc(net, nexthdr); hlist = &raw_v6_hashinfo.ht[hash]; rcu_read_lock(); - sk_nulls_for_each(sk, hnode, hlist) { + sk_for_each_rcu(sk, hlist) { int filtered; if (!raw_v6_match(net, sk, nexthdr, daddr, saddr, @@ -333,15 +332,14 @@ void raw6_icmp_error(struct sk_buff *skb, int nexthdr, u8 type, u8 code, int inner_offset, __be32 info) { struct net *net = dev_net(skb->dev); - struct hlist_nulls_head *hlist; - struct hlist_nulls_node *hnode; + struct hlist_head *hlist; struct sock *sk; int hash; hash = raw_hashfunc(net, nexthdr); hlist = &raw_v6_hashinfo.ht[hash]; rcu_read_lock(); - sk_nulls_for_each(sk, hnode, hlist) { + sk_for_each_rcu(sk, hlist) { /* Note: ipv6_hdr(skb) != skb->data */ const struct ipv6hdr *ip6h = (const struct ipv6hdr *)skb->data; From ab5fb73ffa01072b4d8031cc05801fa1cb653bee Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 3 Apr 2023 12:49:59 -0700 Subject: [PATCH 4/7] ping: Fix potentail NULL deref for /proc/net/icmp. After commit dbca1596bbb0 ("ping: convert to RCU lookups, get rid of rwlock"), we use RCU for ping sockets, but we should use spinlock for /proc/net/icmp to avoid a potential NULL deref mentioned in the previous patch. Let's go back to using spinlock there. Note we can convert ping sockets to use hlist instead of hlist_nulls because we do not use SLAB_TYPESAFE_BY_RCU for ping sockets. Fixes: dbca1596bbb0 ("ping: convert to RCU lookups, get rid of rwlock") Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/ipv4/ping.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index 409ec2a1f95b0..5178a3f3cb537 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -1089,13 +1089,13 @@ static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos) } void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family) - __acquires(RCU) + __acquires(ping_table.lock) { struct ping_iter_state *state = seq->private; state->bucket = 0; state->family = family; - rcu_read_lock(); + spin_lock(&ping_table.lock); return *pos ? ping_get_idx(seq, *pos-1) : SEQ_START_TOKEN; } @@ -1121,9 +1121,9 @@ void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos) EXPORT_SYMBOL_GPL(ping_seq_next); void ping_seq_stop(struct seq_file *seq, void *v) - __releases(RCU) + __releases(ping_table.lock) { - rcu_read_unlock(); + spin_unlock(&ping_table.lock); } EXPORT_SYMBOL_GPL(ping_seq_stop); From e847c7675e19ef344913724dc68f83df31ad6a17 Mon Sep 17 00:00:00 2001 From: Andy Roulin Date: Mon, 3 Apr 2023 14:20:53 -0700 Subject: [PATCH 5/7] ethtool: reset #lanes when lanes is omitted If the number of lanes was forced and then subsequently the user omits this parameter, the ksettings->lanes is reset. The driver should then reset the number of lanes to the device's default for the specified speed. However, although the ksettings->lanes is set to 0, the mod variable is not set to true to indicate the driver and userspace should be notified of the changes. The consequence is that the same ethtool operation will produce different results based on the initial state. If the initial state is: $ ethtool swp1 | grep -A 3 'Speed: ' Speed: 500000Mb/s Lanes: 2 Duplex: Full Auto-negotiation: on then executing 'ethtool -s swp1 speed 50000 autoneg off' will yield: $ ethtool swp1 | grep -A 3 'Speed: ' Speed: 500000Mb/s Lanes: 2 Duplex: Full Auto-negotiation: off While if the initial state is: $ ethtool swp1 | grep -A 3 'Speed: ' Speed: 500000Mb/s Lanes: 1 Duplex: Full Auto-negotiation: off executing the same 'ethtool -s swp1 speed 50000 autoneg off' results in: $ ethtool swp1 | grep -A 3 'Speed: ' Speed: 500000Mb/s Lanes: 1 Duplex: Full Auto-negotiation: off This patch fixes this behavior. Omitting lanes will always results in the driver choosing the default lane width for the chosen speed. In this scenario, regardless of the initial state, the end state will be, e.g., $ ethtool swp1 | grep -A 3 'Speed: ' Speed: 500000Mb/s Lanes: 2 Duplex: Full Auto-negotiation: off Fixes: 012ce4dd3102 ("ethtool: Extend link modes settings uAPI with lanes") Signed-off-by: Andy Roulin Reviewed-by: Danielle Ratson Reviewed-by: Ido Schimmel Link: https://lore.kernel.org/r/ac238d6b-8726-8156-3810-6471291dbc7f@nvidia.com Signed-off-by: Jakub Kicinski --- net/ethtool/linkmodes.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index fab66c169b9fa..20165e07ef901 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -270,11 +270,12 @@ static int ethnl_update_linkmodes(struct genl_info *info, struct nlattr **tb, "lanes configuration not supported by device"); return -EOPNOTSUPP; } - } else if (!lsettings->autoneg) { - /* If autoneg is off and lanes parameter is not passed from user, - * set the lanes parameter to 0. + } else if (!lsettings->autoneg && ksettings->lanes) { + /* If autoneg is off and lanes parameter is not passed from user but + * it was defined previously then set the lanes parameter to 0. */ ksettings->lanes = 0; + *mod = true; } ret = ethnl_update_bitset(ksettings->link_modes.advertising, From a1865f2e7d10dde00d35a2122b38d2e469ae67ed Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 3 Apr 2023 21:46:43 +0000 Subject: [PATCH 6/7] netlink: annotate lockless accesses to nlk->max_recvmsg_len syzbot reported a data-race in data-race in netlink_recvmsg() [1] Indeed, netlink_recvmsg() can be run concurrently, and netlink_dump() also needs protection. [1] BUG: KCSAN: data-race in netlink_recvmsg / netlink_recvmsg read to 0xffff888141840b38 of 8 bytes by task 23057 on cpu 0: netlink_recvmsg+0xea/0x730 net/netlink/af_netlink.c:1988 sock_recvmsg_nosec net/socket.c:1017 [inline] sock_recvmsg net/socket.c:1038 [inline] __sys_recvfrom+0x1ee/0x2e0 net/socket.c:2194 __do_sys_recvfrom net/socket.c:2212 [inline] __se_sys_recvfrom net/socket.c:2208 [inline] __x64_sys_recvfrom+0x78/0x90 net/socket.c:2208 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd write to 0xffff888141840b38 of 8 bytes by task 23037 on cpu 1: netlink_recvmsg+0x114/0x730 net/netlink/af_netlink.c:1989 sock_recvmsg_nosec net/socket.c:1017 [inline] sock_recvmsg net/socket.c:1038 [inline] ____sys_recvmsg+0x156/0x310 net/socket.c:2720 ___sys_recvmsg net/socket.c:2762 [inline] do_recvmmsg+0x2e5/0x710 net/socket.c:2856 __sys_recvmmsg net/socket.c:2935 [inline] __do_sys_recvmmsg net/socket.c:2958 [inline] __se_sys_recvmmsg net/socket.c:2951 [inline] __x64_sys_recvmmsg+0xe2/0x160 net/socket.c:2951 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x63/0xcd value changed: 0x0000000000000000 -> 0x0000000000001000 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 23037 Comm: syz-executor.2 Not tainted 6.3.0-rc4-syzkaller-00195-g5a57b48fdfcb #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/02/2023 Fixes: 9063e21fb026 ("netlink: autosize skb lengthes") Reported-by: syzbot Signed-off-by: Eric Dumazet Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20230403214643.768555-1-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/netlink/af_netlink.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index c642776597531..f365dfdd672d7 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -1952,7 +1952,7 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, struct scm_cookie scm; struct sock *sk = sock->sk; struct netlink_sock *nlk = nlk_sk(sk); - size_t copied; + size_t copied, max_recvmsg_len; struct sk_buff *skb, *data_skb; int err, ret; @@ -1985,9 +1985,10 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, #endif /* Record the max length of recvmsg() calls for future allocations */ - nlk->max_recvmsg_len = max(nlk->max_recvmsg_len, len); - nlk->max_recvmsg_len = min_t(size_t, nlk->max_recvmsg_len, - SKB_WITH_OVERHEAD(32768)); + max_recvmsg_len = max(READ_ONCE(nlk->max_recvmsg_len), len); + max_recvmsg_len = min_t(size_t, max_recvmsg_len, + SKB_WITH_OVERHEAD(32768)); + WRITE_ONCE(nlk->max_recvmsg_len, max_recvmsg_len); copied = data_skb->len; if (len < copied) { @@ -2236,6 +2237,7 @@ static int netlink_dump(struct sock *sk) struct netlink_ext_ack extack = {}; struct netlink_callback *cb; struct sk_buff *skb = NULL; + size_t max_recvmsg_len; struct module *module; int err = -ENOBUFS; int alloc_min_size; @@ -2258,8 +2260,9 @@ static int netlink_dump(struct sock *sk) cb = &nlk->cb; alloc_min_size = max_t(int, cb->min_dump_alloc, NLMSG_GOODSIZE); - if (alloc_min_size < nlk->max_recvmsg_len) { - alloc_size = nlk->max_recvmsg_len; + max_recvmsg_len = READ_ONCE(nlk->max_recvmsg_len); + if (alloc_min_size < max_recvmsg_len) { + alloc_size = max_recvmsg_len; skb = alloc_skb(alloc_size, (GFP_KERNEL & ~__GFP_DIRECT_RECLAIM) | __GFP_NOWARN | __GFP_NORETRY); From 3ce9345580974863c060fa32971537996a7b2d57 Mon Sep 17 00:00:00 2001 From: Shailend Chand Date: Mon, 3 Apr 2023 10:28:09 -0700 Subject: [PATCH 7/7] gve: Secure enough bytes in the first TX desc for all TCP pkts Non-GSO TCP packets whose SKBs' linear portion did not include the entire TCP header were not populating the first Tx descriptor with as many bytes as the vNIC expected. This change ensures that all TCP packets populate the first descriptor with the correct number of bytes. Fixes: 893ce44df565 ("gve: Add basic driver framework for Compute Engine Virtual NIC") Signed-off-by: Shailend Chand Link: https://lore.kernel.org/r/20230403172809.2939306-1-shailend@google.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/google/gve/gve.h | 2 ++ drivers/net/ethernet/google/gve/gve_tx.c | 12 +++++------- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/google/gve/gve.h b/drivers/net/ethernet/google/gve/gve.h index 64eb0442c82fd..005cb9dfe078b 100644 --- a/drivers/net/ethernet/google/gve/gve.h +++ b/drivers/net/ethernet/google/gve/gve.h @@ -47,6 +47,8 @@ #define GVE_RX_BUFFER_SIZE_DQO 2048 +#define GVE_GQ_TX_MIN_PKT_DESC_BYTES 182 + /* Each slot in the desc ring has a 1:1 mapping to a slot in the data ring */ struct gve_rx_desc_queue { struct gve_rx_desc *desc_ring; /* the descriptor ring */ diff --git a/drivers/net/ethernet/google/gve/gve_tx.c b/drivers/net/ethernet/google/gve/gve_tx.c index 4888bf05fbedb..5e11b82367545 100644 --- a/drivers/net/ethernet/google/gve/gve_tx.c +++ b/drivers/net/ethernet/google/gve/gve_tx.c @@ -284,8 +284,8 @@ static inline int gve_skb_fifo_bytes_required(struct gve_tx_ring *tx, int bytes; int hlen; - hlen = skb_is_gso(skb) ? skb_checksum_start_offset(skb) + - tcp_hdrlen(skb) : skb_headlen(skb); + hlen = skb_is_gso(skb) ? skb_checksum_start_offset(skb) + tcp_hdrlen(skb) : + min_t(int, GVE_GQ_TX_MIN_PKT_DESC_BYTES, skb->len); pad_bytes = gve_tx_fifo_pad_alloc_one_frag(&tx->tx_fifo, hlen); @@ -454,13 +454,11 @@ static int gve_tx_add_skb_copy(struct gve_priv *priv, struct gve_tx_ring *tx, st pkt_desc = &tx->desc[idx]; l4_hdr_offset = skb_checksum_start_offset(skb); - /* If the skb is gso, then we want the tcp header in the first segment - * otherwise we want the linear portion of the skb (which will contain - * the checksum because skb->csum_start and skb->csum_offset are given - * relative to skb->head) in the first segment. + /* If the skb is gso, then we want the tcp header alone in the first segment + * otherwise we want the minimum required by the gVNIC spec. */ hlen = is_gso ? l4_hdr_offset + tcp_hdrlen(skb) : - skb_headlen(skb); + min_t(int, GVE_GQ_TX_MIN_PKT_DESC_BYTES, skb->len); info->skb = skb; /* We don't want to split the header, so if necessary, pad to the end