From 472547778de24e2764ab325268dd5b77e6923939 Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Thu, 22 Oct 2020 13:27:38 -0700 Subject: [PATCH 01/65] selftest/bpf: Fix profiler test using CO-RE relocation for enums Instead of hard-coding invalid pids_cgrp_id, use Kconfig to detect the presence of that enum value and CO-RE to capture its actual value in the hosts's kernel. Fixes: 03d4d13fab3f ("selftests/bpf: Add profiler test") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Tested-by: Song Liu Link: https://lore.kernel.org/bpf/20201022202739.3667367-1-andrii@kernel.org --- tools/testing/selftests/bpf/progs/profiler.inc.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/bpf/progs/profiler.inc.h b/tools/testing/selftests/bpf/progs/profiler.inc.h index 00578311a4233e..30982a7e4d0f78 100644 --- a/tools/testing/selftests/bpf/progs/profiler.inc.h +++ b/tools/testing/selftests/bpf/progs/profiler.inc.h @@ -243,7 +243,10 @@ static ino_t get_inode_from_kernfs(struct kernfs_node* node) } } -int pids_cgrp_id = 1; +extern bool CONFIG_CGROUP_PIDS __kconfig __weak; +enum cgroup_subsys_id___local { + pids_cgrp_id___local = 123, /* value doesn't matter */ +}; static INLINE void* populate_cgroup_info(struct cgroup_data_t* cgroup_data, struct task_struct* task, @@ -253,7 +256,9 @@ static INLINE void* populate_cgroup_info(struct cgroup_data_t* cgroup_data, BPF_CORE_READ(task, nsproxy, cgroup_ns, root_cset, dfl_cgrp, kn); struct kernfs_node* proc_kernfs = BPF_CORE_READ(task, cgroups, dfl_cgrp, kn); - if (ENABLE_CGROUP_V1_RESOLVER) { + if (ENABLE_CGROUP_V1_RESOLVER && CONFIG_CGROUP_PIDS) { + int cgrp_id = bpf_core_enum_value(enum cgroup_subsys_id___local, + pids_cgrp_id___local); #ifdef UNROLL #pragma unroll #endif @@ -262,7 +267,7 @@ static INLINE void* populate_cgroup_info(struct cgroup_data_t* cgroup_data, BPF_CORE_READ(task, cgroups, subsys[i]); if (subsys != NULL) { int subsys_id = BPF_CORE_READ(subsys, ss, id); - if (subsys_id == pids_cgrp_id) { + if (subsys_id == cgrp_id) { proc_kernfs = BPF_CORE_READ(subsys, cgroup, kn); root_kernfs = BPF_CORE_READ(subsys, ss, root, kf_root, kn); break; From 343a3e8bc635bd4c58d45a4fe67f9c3a78fbd191 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 26 Oct 2020 17:20:50 +0100 Subject: [PATCH 02/65] bpf: Fix -Wshadow warnings There are thousands of warnings about one macro in a W=2 build: include/linux/filter.h:561:6: warning: declaration of 'ret' shadows a previous local [-Wshadow] Prefix all the locals in that macro with __ to avoid most of these warnings. Fixes: 492ecee892c2 ("bpf: enable program stats") Signed-off-by: Arnd Bergmann Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201026162110.3710415-1-arnd@kernel.org --- include/linux/filter.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/include/linux/filter.h b/include/linux/filter.h index 72d62cbc1578f9..1b62397bd12471 100644 --- a/include/linux/filter.h +++ b/include/linux/filter.h @@ -558,21 +558,21 @@ struct sk_filter { DECLARE_STATIC_KEY_FALSE(bpf_stats_enabled_key); #define __BPF_PROG_RUN(prog, ctx, dfunc) ({ \ - u32 ret; \ + u32 __ret; \ cant_migrate(); \ if (static_branch_unlikely(&bpf_stats_enabled_key)) { \ - struct bpf_prog_stats *stats; \ - u64 start = sched_clock(); \ - ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ - stats = this_cpu_ptr(prog->aux->stats); \ - u64_stats_update_begin(&stats->syncp); \ - stats->cnt++; \ - stats->nsecs += sched_clock() - start; \ - u64_stats_update_end(&stats->syncp); \ + struct bpf_prog_stats *__stats; \ + u64 __start = sched_clock(); \ + __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ + __stats = this_cpu_ptr(prog->aux->stats); \ + u64_stats_update_begin(&__stats->syncp); \ + __stats->cnt++; \ + __stats->nsecs += sched_clock() - __start; \ + u64_stats_update_end(&__stats->syncp); \ } else { \ - ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ + __ret = dfunc(ctx, (prog)->insnsi, (prog)->bpf_func); \ } \ - ret; }) + __ret; }) #define BPF_PROG_RUN(prog, ctx) \ __BPF_PROG_RUN(prog, ctx, bpf_dispatcher_nop_func) From c66dca98a24cb5f3493dd08d40bcfa94a220fa92 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Toke=20H=C3=B8iland-J=C3=B8rgensen?= Date: Tue, 27 Oct 2020 00:36:23 +0100 Subject: [PATCH 03/65] samples/bpf: Set rlimit for memlock to infinity in all samples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The memlock rlimit is a notorious source of failure for BPF programs. Most of the samples just set it to infinity, but a few used a lower limit. The problem with unconditionally setting a lower limit is that this will also override the limit if the system-wide setting is *higher* than the limit being set, which can lead to failures on systems that lock a lot of memory, but set 'ulimit -l' to unlimited before running a sample. One fix for this is to only conditionally set the limit if the current limit is lower, but it is simpler to just unify all the samples and have them all set the limit to infinity. Signed-off-by: Toke Høiland-Jørgensen Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Jesper Dangaard Brouer Link: https://lore.kernel.org/bpf/20201026233623.91728-1-toke@redhat.com --- samples/bpf/task_fd_query_user.c | 2 +- samples/bpf/tracex2_user.c | 2 +- samples/bpf/tracex3_user.c | 2 +- samples/bpf/xdp_redirect_cpu_user.c | 2 +- samples/bpf/xdp_rxq_info_user.c | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/samples/bpf/task_fd_query_user.c b/samples/bpf/task_fd_query_user.c index 4a74531dc40310..b68bd2f8fdc924 100644 --- a/samples/bpf/task_fd_query_user.c +++ b/samples/bpf/task_fd_query_user.c @@ -290,7 +290,7 @@ static int test_debug_fs_uprobe(char *binary_path, long offset, bool is_return) int main(int argc, char **argv) { - struct rlimit r = {1024*1024, RLIM_INFINITY}; + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; extern char __executable_start; char filename[256], buf[256]; __u64 uprobe_file_offset; diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c index 3e36b3e4e3efdb..3d6eab711d23e1 100644 --- a/samples/bpf/tracex2_user.c +++ b/samples/bpf/tracex2_user.c @@ -116,7 +116,7 @@ static void int_exit(int sig) int main(int ac, char **argv) { - struct rlimit r = {1024*1024, RLIM_INFINITY}; + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; long key, next_key, value; struct bpf_link *links[2]; struct bpf_program *prog; diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c index 70e987775c156f..83e0fecbb01ace 100644 --- a/samples/bpf/tracex3_user.c +++ b/samples/bpf/tracex3_user.c @@ -107,7 +107,7 @@ static void print_hist(int fd) int main(int ac, char **argv) { - struct rlimit r = {1024*1024, RLIM_INFINITY}; + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; struct bpf_link *links[2]; struct bpf_program *prog; struct bpf_object *obj; diff --git a/samples/bpf/xdp_redirect_cpu_user.c b/samples/bpf/xdp_redirect_cpu_user.c index 6fb8dbde62c5e0..f78cb18319aaf9 100644 --- a/samples/bpf/xdp_redirect_cpu_user.c +++ b/samples/bpf/xdp_redirect_cpu_user.c @@ -765,7 +765,7 @@ static int load_cpumap_prog(char *file_name, char *prog_name, int main(int argc, char **argv) { - struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY}; + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; char *prog_name = "xdp_cpu_map5_lb_hash_ip_pairs"; char *mprog_filename = "xdp_redirect_kern.o"; char *redir_interface = NULL, *redir_map = NULL; diff --git a/samples/bpf/xdp_rxq_info_user.c b/samples/bpf/xdp_rxq_info_user.c index caa4e7ffcfc7b8..93fa1bc54f131e 100644 --- a/samples/bpf/xdp_rxq_info_user.c +++ b/samples/bpf/xdp_rxq_info_user.c @@ -450,7 +450,7 @@ static void stats_poll(int interval, int action, __u32 cfg_opt) int main(int argc, char **argv) { __u32 cfg_options= NO_TOUCH ; /* Default: Don't touch packet memory */ - struct rlimit r = {10 * 1024 * 1024, RLIM_INFINITY}; + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; struct bpf_prog_load_attr prog_load_attr = { .prog_type = BPF_PROG_TYPE_XDP, }; From 821f5c90130d15f8f725412d714d05df3b9e0fac Mon Sep 17 00:00:00 2001 From: Andrii Nakryiko Date: Wed, 28 Oct 2020 11:12:04 -0700 Subject: [PATCH 04/65] bpf: Add struct bpf_redir_neigh forward declaration to BPF helper defs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Forward-declare struct bpf_redir_neigh in bpf_helper_defs.h to avoid compiler warning about unknown structs. Fixes: ba452c9e996d ("bpf: Fix bpf_redirect_neigh helper api to support supplying nexthop") Signed-off-by: Andrii Nakryiko Signed-off-by: Daniel Borkmann Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20201028181204.111241-1-andrii@kernel.org --- scripts/bpf_helpers_doc.py | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/bpf_helpers_doc.py b/scripts/bpf_helpers_doc.py index 6769caae142fbc..31484377b8b11a 100755 --- a/scripts/bpf_helpers_doc.py +++ b/scripts/bpf_helpers_doc.py @@ -408,6 +408,7 @@ class PrinterHelpers(Printer): 'struct bpf_perf_event_data', 'struct bpf_perf_event_value', 'struct bpf_pidns_info', + 'struct bpf_redir_neigh', 'struct bpf_sock', 'struct bpf_sock_addr', 'struct bpf_sock_ops', From e5e1a4bc916d29958c3b587354293738fcb984d7 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Tue, 27 Oct 2020 13:32:01 +0100 Subject: [PATCH 05/65] xsk: Fix possible memory leak at socket close MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix a possible memory leak at xsk socket close that is caused by the refcounting of the umem object being wrong. The reference count of the umem was decremented only after the pool had been freed. Note that if the buffer pool is destroyed, it is important that the umem is destroyed after the pool, otherwise the umem would disappear while the driver is still running. And as the buffer pool needs to be destroyed in a work queue, the umem is also (if its refcount reaches zero) destroyed after the buffer pool in that same work queue. What was missing is that the refcount also needs to be decremented when the pool is not freed and when the pool has not even been created. The first case happens when the refcount of the pool is higher than 1, i.e. it is still being used by some other socket using the same device and queue id. In this case, it is safe to decrement the refcount of the umem outside of the work queue as the umem will never be freed because the refcount of the umem is always greater than or equal to the refcount of the buffer pool. The second case is if the buffer pool has not been created yet, i.e. the socket was closed before it was bound but after the umem was created. In this case, it is safe to destroy the umem outside of the work queue, since there is no pool that can use it by definition. Fixes: 1c1efc2af158 ("xsk: Create and free buffer pool independently from umem") Reported-by: syzbot+eb71df123dc2be2c1456@syzkaller.appspotmail.com Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Björn Töpel Link: https://lore.kernel.org/bpf/1603801921-2712-1-git-send-email-magnus.karlsson@gmail.com --- include/net/xsk_buff_pool.h | 2 +- net/xdp/xsk.c | 3 ++- net/xdp/xsk_buff_pool.c | 7 +++++-- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/net/xsk_buff_pool.h b/include/net/xsk_buff_pool.h index 0140d086dc84ea..01755b838c7450 100644 --- a/include/net/xsk_buff_pool.h +++ b/include/net/xsk_buff_pool.h @@ -86,7 +86,7 @@ int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem, void xp_destroy(struct xsk_buff_pool *pool); void xp_release(struct xdp_buff_xsk *xskb); void xp_get_pool(struct xsk_buff_pool *pool); -void xp_put_pool(struct xsk_buff_pool *pool); +bool xp_put_pool(struct xsk_buff_pool *pool); void xp_clear_dev(struct xsk_buff_pool *pool); void xp_add_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs); void xp_del_xsk(struct xsk_buff_pool *pool, struct xdp_sock *xs); diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index b71a32eeae65b2..cfbec3989a769e 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -1146,7 +1146,8 @@ static void xsk_destruct(struct sock *sk) if (!sock_flag(sk, SOCK_DEAD)) return; - xp_put_pool(xs->pool); + if (!xp_put_pool(xs->pool)) + xdp_put_umem(xs->umem); sk_refcnt_debug_dec(sk); } diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index 64c9e55d4d4e78..8a3bf4e1318e1d 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -251,15 +251,18 @@ void xp_get_pool(struct xsk_buff_pool *pool) refcount_inc(&pool->users); } -void xp_put_pool(struct xsk_buff_pool *pool) +bool xp_put_pool(struct xsk_buff_pool *pool) { if (!pool) - return; + return false; if (refcount_dec_and_test(&pool->users)) { INIT_WORK(&pool->work, xp_release_deferred); schedule_work(&pool->work); + return true; } + + return false; } static struct xsk_dma_map *xp_find_dma_map(struct xsk_buff_pool *pool) From 1e6f5dcc1b9ec9068f5d38331cec38b35498edf5 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Oct 2020 16:36:45 -0700 Subject: [PATCH 06/65] tools, bpftool: Avoid array index warnings. The bpf_caps array is shorter without CAP_BPF, avoid out of bounds reads if this isn't defined. Working around this avoids -Wno-array-bounds with clang. Signed-off-by: Ian Rogers Signed-off-by: Daniel Borkmann Reviewed-by: Tobias Klauser Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201027233646.3434896-1-irogers@google.com --- tools/bpf/bpftool/feature.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/feature.c b/tools/bpf/bpftool/feature.c index a43a6f10b564c3..359960a8f1defc 100644 --- a/tools/bpf/bpftool/feature.c +++ b/tools/bpf/bpftool/feature.c @@ -843,9 +843,14 @@ static int handle_perms(void) else p_err("missing %s%s%s%s%s%s%s%srequired for full feature probing; run as root or use 'unprivileged'", capability_msg(bpf_caps, 0), +#ifdef CAP_BPF capability_msg(bpf_caps, 1), capability_msg(bpf_caps, 2), - capability_msg(bpf_caps, 3)); + capability_msg(bpf_caps, 3) +#else + "", "", "", "", "", "" +#endif /* CAP_BPF */ + ); goto exit_free; } From 0698ac66e01019528f0db4191ae3aaf9978e67da Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 27 Oct 2020 16:36:46 -0700 Subject: [PATCH 07/65] tools, bpftool: Remove two unused variables. Avoid an unused variable warning. Signed-off-by: Ian Rogers Signed-off-by: Daniel Borkmann Reviewed-by: Tobias Klauser Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201027233646.3434896-2-irogers@google.com --- tools/bpf/bpftool/skeleton/profiler.bpf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/bpf/bpftool/skeleton/profiler.bpf.c b/tools/bpf/bpftool/skeleton/profiler.bpf.c index 4e3512f700c085..ce5b65e07ab107 100644 --- a/tools/bpf/bpftool/skeleton/profiler.bpf.c +++ b/tools/bpf/bpftool/skeleton/profiler.bpf.c @@ -70,7 +70,7 @@ int BPF_PROG(fentry_XXX) static inline void fexit_update_maps(u32 id, struct bpf_perf_event_value *after) { - struct bpf_perf_event_value *before, diff, *accum; + struct bpf_perf_event_value *before, diff; before = bpf_map_lookup_elem(&fentry_readings, &id); /* only account samples with a valid fentry_reading */ @@ -95,7 +95,7 @@ int BPF_PROG(fexit_XXX) { struct bpf_perf_event_value readings[MAX_NUM_MATRICS]; u32 cpu = bpf_get_smp_processor_id(); - u32 i, one = 1, zero = 0; + u32 i, zero = 0; int err; u64 *count; From 080b6f40763565f65ebb9540219c71ce885cf568 Mon Sep 17 00:00:00 2001 From: Ard Biesheuvel Date: Wed, 28 Oct 2020 18:15:05 +0100 Subject: [PATCH 08/65] bpf: Don't rely on GCC __attribute__((optimize)) to disable GCSE Commit 3193c0836 ("bpf: Disable GCC -fgcse optimization for ___bpf_prog_run()") introduced a __no_fgcse macro that expands to a function scope __attribute__((optimize("-fno-gcse"))), to disable a GCC specific optimization that was causing trouble on x86 builds, and was not expected to have any positive effect in the first place. However, as the GCC manual documents, __attribute__((optimize)) is not for production use, and results in all other optimization options to be forgotten for the function in question. This can cause all kinds of trouble, but in one particular reported case, it causes -fno-asynchronous-unwind-tables to be disregarded, resulting in .eh_frame info to be emitted for the function. This reverts commit 3193c0836, and instead, it disables the -fgcse optimization for the entire source file, but only when building for X86 using GCC with CONFIG_BPF_JIT_ALWAYS_ON disabled. Note that the original commit states that CONFIG_RETPOLINE=n triggers the issue, whereas CONFIG_RETPOLINE=y performs better without the optimization, so it is kept disabled in both cases. Fixes: 3193c0836f20 ("bpf: Disable GCC -fgcse optimization for ___bpf_prog_run()") Signed-off-by: Ard Biesheuvel Signed-off-by: Alexei Starovoitov Tested-by: Geert Uytterhoeven Reviewed-by: Nick Desaulniers Link: https://lore.kernel.org/lkml/CAMuHMdUg0WJHEcq6to0-eODpXPOywLot6UD2=GFHpzoj_hCoBQ@mail.gmail.com/ Link: https://lore.kernel.org/bpf/20201028171506.15682-2-ardb@kernel.org --- include/linux/compiler-gcc.h | 2 -- include/linux/compiler_types.h | 4 ---- kernel/bpf/Makefile | 6 +++++- kernel/bpf/core.c | 2 +- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index d1e3c6896b7168..5deb37024574e7 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -175,5 +175,3 @@ #else #define __diag_GCC_8(s) #endif - -#define __no_fgcse __attribute__((optimize("-fno-gcse"))) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 6e390d58a9f8c5..ac3fa37a84f940 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -247,10 +247,6 @@ struct ftrace_likely_data { #define asm_inline asm #endif -#ifndef __no_fgcse -# define __no_fgcse -#endif - /* Are two types/vars the same type (ignoring qualifiers)? */ #define __same_type(a, b) __builtin_types_compatible_p(typeof(a), typeof(b)) diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index bdc8cd1b6767ba..c1b9f71ee6aac4 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,6 +1,10 @@ # SPDX-License-Identifier: GPL-2.0 obj-y := core.o -CFLAGS_core.o += $(call cc-disable-warning, override-init) +ifneq ($(CONFIG_BPF_JIT_ALWAYS_ON),y) +# ___bpf_prog_run() needs GCSE disabled on x86; see 3193c0836f203 for details +cflags-nogcse-$(CONFIG_X86)$(CONFIG_CC_IS_GCC) := -fno-gcse +endif +CFLAGS_core.o += $(call cc-disable-warning, override-init) $(cflags-nogcse-yy) obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o tnum.o bpf_iter.o map_iter.o task_iter.o prog_iter.o obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o bpf_lru_list.o lpm_trie.o map_in_map.o diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 9268d77898b72b..55454d2278b17c 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -1369,7 +1369,7 @@ u64 __weak bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) * * Decode and execute eBPF instructions. */ -static u64 __no_fgcse ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) +static u64 ___bpf_prog_run(u64 *regs, const struct bpf_insn *insn, u64 *stack) { #define BPF_INSN_2_LBL(x, y) [BPF_##x | BPF_##y] = &&x##_##y #define BPF_INSN_3_LBL(x, y, z) [BPF_##x | BPF_##y | BPF_##z] = &&x##_##y##_##z From 7a078d2d18801bba7bde7337a823d7342299acf7 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 29 Oct 2020 15:37:07 -0700 Subject: [PATCH 09/65] libbpf, hashmap: Fix undefined behavior in hash_bits If bits is 0, the case when the map is empty, then the >> is the size of the register which is undefined behavior - on x86 it is the same as a shift by 0. Fix by handling the 0 case explicitly and guarding calls to hash_bits for empty maps in hashmap__for_each_key_entry and hashmap__for_each_entry_safe. Fixes: e3b924224028 ("libbpf: add resizable non-thread safe internal hashmap") Suggested-by: Andrii Nakryiko , Signed-off-by: Ian Rogers Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201029223707.494059-1-irogers@google.com --- tools/lib/bpf/hashmap.h | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/tools/lib/bpf/hashmap.h b/tools/lib/bpf/hashmap.h index d9b385fe808c69..10a4c4cd13cf7c 100644 --- a/tools/lib/bpf/hashmap.h +++ b/tools/lib/bpf/hashmap.h @@ -15,6 +15,9 @@ static inline size_t hash_bits(size_t h, int bits) { /* shuffle bits and return requested number of upper bits */ + if (bits == 0) + return 0; + #if (__SIZEOF_SIZE_T__ == __SIZEOF_LONG_LONG__) /* LP64 case */ return (h * 11400714819323198485llu) >> (__SIZEOF_LONG_LONG__ * 8 - bits); @@ -174,17 +177,17 @@ bool hashmap__find(const struct hashmap *map, const void *key, void **value); * @key: key to iterate entries for */ #define hashmap__for_each_key_entry(map, cur, _key) \ - for (cur = ({ size_t bkt = hash_bits(map->hash_fn((_key), map->ctx),\ - map->cap_bits); \ - map->buckets ? map->buckets[bkt] : NULL; }); \ + for (cur = map->buckets \ + ? map->buckets[hash_bits(map->hash_fn((_key), map->ctx), map->cap_bits)] \ + : NULL; \ cur; \ cur = cur->next) \ if (map->equal_fn(cur->key, (_key), map->ctx)) #define hashmap__for_each_key_entry_safe(map, cur, tmp, _key) \ - for (cur = ({ size_t bkt = hash_bits(map->hash_fn((_key), map->ctx),\ - map->cap_bits); \ - cur = map->buckets ? map->buckets[bkt] : NULL; }); \ + for (cur = map->buckets \ + ? map->buckets[hash_bits(map->hash_fn((_key), map->ctx), map->cap_bits)] \ + : NULL; \ cur && ({ tmp = cur->next; true; }); \ cur = tmp) \ if (map->equal_fn(cur->key, (_key), map->ctx)) From f78331f74cacb33d87cd60376dacc5bd397959e2 Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Tue, 3 Nov 2020 10:41:29 +0100 Subject: [PATCH 10/65] libbpf: Fix null dereference in xsk_socket__delete Fix a possible null pointer dereference in xsk_socket__delete that will occur if a null pointer is fed into the function. Fixes: 2f6324a3937f ("libbpf: Support shared umems between queues and devices") Reported-by: Andrii Nakryiko Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1604396490-12129-2-git-send-email-magnus.karlsson@gmail.com --- tools/lib/bpf/xsk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index e3c98c007825cf..504b7a85d44567 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -891,13 +891,14 @@ int xsk_umem__delete(struct xsk_umem *umem) void xsk_socket__delete(struct xsk_socket *xsk) { size_t desc_sz = sizeof(struct xdp_desc); - struct xsk_ctx *ctx = xsk->ctx; struct xdp_mmap_offsets off; + struct xsk_ctx *ctx; int err; if (!xsk) return; + ctx = xsk->ctx; if (ctx->prog_fd != -1) { xsk_delete_bpf_maps(xsk); close(ctx->prog_fd); From 25cf73b9ff88fd4608699a0313f820758b4c252d Mon Sep 17 00:00:00 2001 From: Magnus Karlsson Date: Tue, 3 Nov 2020 10:41:30 +0100 Subject: [PATCH 11/65] libbpf: Fix possible use after free in xsk_socket__delete Fix a possible use after free in xsk_socket__delete that will happen if xsk_put_ctx() frees the ctx. To fix, save the umem reference taken from the context and just use that instead. Fixes: 2f6324a3937f ("libbpf: Support shared umems between queues and devices") Signed-off-by: Magnus Karlsson Signed-off-by: Daniel Borkmann Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/1604396490-12129-3-git-send-email-magnus.karlsson@gmail.com --- tools/lib/bpf/xsk.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/tools/lib/bpf/xsk.c b/tools/lib/bpf/xsk.c index 504b7a85d44567..9bc537d0b92da7 100644 --- a/tools/lib/bpf/xsk.c +++ b/tools/lib/bpf/xsk.c @@ -892,6 +892,7 @@ void xsk_socket__delete(struct xsk_socket *xsk) { size_t desc_sz = sizeof(struct xdp_desc); struct xdp_mmap_offsets off; + struct xsk_umem *umem; struct xsk_ctx *ctx; int err; @@ -899,6 +900,7 @@ void xsk_socket__delete(struct xsk_socket *xsk) return; ctx = xsk->ctx; + umem = ctx->umem; if (ctx->prog_fd != -1) { xsk_delete_bpf_maps(xsk); close(ctx->prog_fd); @@ -918,11 +920,11 @@ void xsk_socket__delete(struct xsk_socket *xsk) xsk_put_ctx(ctx); - ctx->umem->refcount--; + umem->refcount--; /* Do not close an fd that also has an associated umem connected * to it. */ - if (xsk->fd != ctx->umem->fd) + if (xsk->fd != umem->fd) close(xsk->fd); free(xsk); } From e68e28b4a9d71261e3f8fd05a72d6cf0b443a493 Mon Sep 17 00:00:00 2001 From: Maor Dickman Date: Wed, 30 Sep 2020 16:31:11 +0300 Subject: [PATCH 12/65] net/mlx5e: Fix modify header actions memory leak Modify header actions are allocated during parse tc actions and only freed during the flow creation, however, on error flow the allocated memory is wrongly unfreed. Fix this by calling dealloc_mod_hdr_actions in __mlx5e_add_fdb_flow and mlx5e_add_nic_flow error flow. Fixes: d7e75a325cb2 ("net/mlx5e: Add offloading of E-Switch TC pedit (header re-write) actions") Fixes: 2f4fe4cab073 ("net/mlx5e: Add offloading of NIC TC pedit (header re-write) actions") Signed-off-by: Maor Dickman Reviewed-by: Paul Blakey Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index e3a968e9e2a033..2e2fa0440032dd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -4658,6 +4658,7 @@ __mlx5e_add_fdb_flow(struct mlx5e_priv *priv, return flow; err_free: + dealloc_mod_hdr_actions(&parse_attr->mod_hdr_acts); mlx5e_flow_put(priv, flow); out: return ERR_PTR(err); @@ -4802,6 +4803,7 @@ mlx5e_add_nic_flow(struct mlx5e_priv *priv, return 0; err_free: + dealloc_mod_hdr_actions(&parse_attr->mod_hdr_acts); mlx5e_flow_put(priv, flow); out: return err; From 78c906e430b13d30a8cfbdef4ccbbe1686841a9e Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Mon, 31 Aug 2020 16:17:29 +0300 Subject: [PATCH 13/65] net/mlx5e: Protect encap route dev from concurrent release In functions mlx5e_route_lookup_ipv{4|6}() route_dev can be arbitrary net device and not necessary mlx5 eswitch port representor. As such, in order to ensure that route_dev is not destroyed concurrent the code needs either explicitly take reference to the device before releasing reference to rtable instance or ensure that caller holds rtnl lock. First approach is chosen as a fix since rtnl lock dependency was intentionally removed from mlx5 TC layer. To prevent unprotected usage of route_dev in encap code take a reference to the device before releasing rt. Don't save direct pointer to the device in mlx5_encap_entry structure and use ifindex instead. Modify users of route_dev pointer to properly obtain the net device instance from its ifindex. Fixes: 61086f391044 ("net/mlx5e: Protect encap hash table with mutex") Fixes: 6707f74be862 ("net/mlx5e: Update hw flows when encap source mac changed") Signed-off-by: Vlad Buslov Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- .../ethernet/mellanox/mlx5/core/en/rep/tc.c | 6 +- .../ethernet/mellanox/mlx5/core/en/tc_tun.c | 72 ++++++++++++------- .../net/ethernet/mellanox/mlx5/core/en_rep.h | 2 +- 3 files changed, 52 insertions(+), 28 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c index e36e505d38ad81..d29af7b9c695a6 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/rep/tc.c @@ -107,12 +107,16 @@ void mlx5e_rep_update_flows(struct mlx5e_priv *priv, mlx5e_tc_encap_flows_del(priv, e, &flow_list); if (neigh_connected && !(e->flags & MLX5_ENCAP_ENTRY_VALID)) { + struct net_device *route_dev; + ether_addr_copy(e->h_dest, ha); ether_addr_copy(eth->h_dest, ha); /* Update the encap source mac, in case that we delete * the flows when encap source mac changed. */ - ether_addr_copy(eth->h_source, e->route_dev->dev_addr); + route_dev = __dev_get_by_index(dev_net(priv->netdev), e->route_dev_ifindex); + if (route_dev) + ether_addr_copy(eth->h_source, route_dev->dev_addr); mlx5e_tc_encap_flows_add(priv, e, &flow_list); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c index 7cce85faa16faf..90930e54b6f288 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_tun.c @@ -77,13 +77,13 @@ static int get_route_and_out_devs(struct mlx5e_priv *priv, return 0; } -static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv, - struct net_device *mirred_dev, - struct net_device **out_dev, - struct net_device **route_dev, - struct flowi4 *fl4, - struct neighbour **out_n, - u8 *out_ttl) +static int mlx5e_route_lookup_ipv4_get(struct mlx5e_priv *priv, + struct net_device *mirred_dev, + struct net_device **out_dev, + struct net_device **route_dev, + struct flowi4 *fl4, + struct neighbour **out_n, + u8 *out_ttl) { struct neighbour *n; struct rtable *rt; @@ -117,18 +117,28 @@ static int mlx5e_route_lookup_ipv4(struct mlx5e_priv *priv, ip_rt_put(rt); return ret; } + dev_hold(*route_dev); if (!(*out_ttl)) *out_ttl = ip4_dst_hoplimit(&rt->dst); n = dst_neigh_lookup(&rt->dst, &fl4->daddr); ip_rt_put(rt); - if (!n) + if (!n) { + dev_put(*route_dev); return -ENOMEM; + } *out_n = n; return 0; } +static void mlx5e_route_lookup_ipv4_put(struct net_device *route_dev, + struct neighbour *n) +{ + neigh_release(n); + dev_put(route_dev); +} + static const char *mlx5e_netdev_kind(struct net_device *dev) { if (dev->rtnl_link_ops) @@ -193,8 +203,8 @@ int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv, fl4.saddr = tun_key->u.ipv4.src; ttl = tun_key->ttl; - err = mlx5e_route_lookup_ipv4(priv, mirred_dev, &out_dev, &route_dev, - &fl4, &n, &ttl); + err = mlx5e_route_lookup_ipv4_get(priv, mirred_dev, &out_dev, &route_dev, + &fl4, &n, &ttl); if (err) return err; @@ -223,7 +233,7 @@ int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv, e->m_neigh.family = n->ops->family; memcpy(&e->m_neigh.dst_ip, n->primary_key, n->tbl->key_len); e->out_dev = out_dev; - e->route_dev = route_dev; + e->route_dev_ifindex = route_dev->ifindex; /* It's important to add the neigh to the hash table before checking * the neigh validity state. So if we'll get a notification, in case the @@ -278,7 +288,7 @@ int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv, e->flags |= MLX5_ENCAP_ENTRY_VALID; mlx5e_rep_queue_neigh_stats_work(netdev_priv(out_dev)); - neigh_release(n); + mlx5e_route_lookup_ipv4_put(route_dev, n); return err; destroy_neigh_entry: @@ -286,18 +296,18 @@ int mlx5e_tc_tun_create_header_ipv4(struct mlx5e_priv *priv, free_encap: kfree(encap_header); release_neigh: - neigh_release(n); + mlx5e_route_lookup_ipv4_put(route_dev, n); return err; } #if IS_ENABLED(CONFIG_INET) && IS_ENABLED(CONFIG_IPV6) -static int mlx5e_route_lookup_ipv6(struct mlx5e_priv *priv, - struct net_device *mirred_dev, - struct net_device **out_dev, - struct net_device **route_dev, - struct flowi6 *fl6, - struct neighbour **out_n, - u8 *out_ttl) +static int mlx5e_route_lookup_ipv6_get(struct mlx5e_priv *priv, + struct net_device *mirred_dev, + struct net_device **out_dev, + struct net_device **route_dev, + struct flowi6 *fl6, + struct neighbour **out_n, + u8 *out_ttl) { struct dst_entry *dst; struct neighbour *n; @@ -318,15 +328,25 @@ static int mlx5e_route_lookup_ipv6(struct mlx5e_priv *priv, return ret; } + dev_hold(*route_dev); n = dst_neigh_lookup(dst, &fl6->daddr); dst_release(dst); - if (!n) + if (!n) { + dev_put(*route_dev); return -ENOMEM; + } *out_n = n; return 0; } +static void mlx5e_route_lookup_ipv6_put(struct net_device *route_dev, + struct neighbour *n) +{ + neigh_release(n); + dev_put(route_dev); +} + int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv, struct net_device *mirred_dev, struct mlx5e_encap_entry *e) @@ -348,8 +368,8 @@ int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv, fl6.daddr = tun_key->u.ipv6.dst; fl6.saddr = tun_key->u.ipv6.src; - err = mlx5e_route_lookup_ipv6(priv, mirred_dev, &out_dev, &route_dev, - &fl6, &n, &ttl); + err = mlx5e_route_lookup_ipv6_get(priv, mirred_dev, &out_dev, &route_dev, + &fl6, &n, &ttl); if (err) return err; @@ -378,7 +398,7 @@ int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv, e->m_neigh.family = n->ops->family; memcpy(&e->m_neigh.dst_ip, n->primary_key, n->tbl->key_len); e->out_dev = out_dev; - e->route_dev = route_dev; + e->route_dev_ifindex = route_dev->ifindex; /* It's importent to add the neigh to the hash table before checking * the neigh validity state. So if we'll get a notification, in case the @@ -433,7 +453,7 @@ int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv, e->flags |= MLX5_ENCAP_ENTRY_VALID; mlx5e_rep_queue_neigh_stats_work(netdev_priv(out_dev)); - neigh_release(n); + mlx5e_route_lookup_ipv6_put(route_dev, n); return err; destroy_neigh_entry: @@ -441,7 +461,7 @@ int mlx5e_tc_tun_create_header_ipv6(struct mlx5e_priv *priv, free_encap: kfree(encap_header); release_neigh: - neigh_release(n); + mlx5e_route_lookup_ipv6_put(route_dev, n); return err; } #endif diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h index 9020d1419bcf62..8932c387d46a3e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.h @@ -186,7 +186,7 @@ struct mlx5e_encap_entry { unsigned char h_dest[ETH_ALEN]; /* destination eth addr */ struct net_device *out_dev; - struct net_device *route_dev; + int route_dev_ifindex; struct mlx5e_tc_tunnel *tunnel; int reformat_type; u8 flags; From f42139ba49791ab6b12443c60044872705b74a1e Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 8 Oct 2020 11:34:03 +0300 Subject: [PATCH 14/65] net/mlx5e: Use spin_lock_bh for async_icosq_lock async_icosq_lock may be taken from softirq and non-softirq contexts. It requires protection with spin_lock_bh, otherwise a softirq may be triggered in the middle of the critical section, and it may deadlock if it tries to take the same lock. This patch fixes such a scenario by using spin_lock_bh to disable softirqs on that CPU while inside the critical section. Fixes: 8d94b590f1e4 ("net/mlx5e: Turn XSK ICOSQ into a general asynchronous one") Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en/xsk/setup.c | 4 ++-- .../net/ethernet/mellanox/mlx5/core/en/xsk/tx.c | 4 ++-- .../ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c | 14 +++++++------- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c index 4e574ac730198c..be3465ba38ca1f 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/setup.c @@ -122,9 +122,9 @@ void mlx5e_activate_xsk(struct mlx5e_channel *c) set_bit(MLX5E_RQ_STATE_ENABLED, &c->xskrq.state); /* TX queue is created active. */ - spin_lock(&c->async_icosq_lock); + spin_lock_bh(&c->async_icosq_lock); mlx5e_trigger_irq(&c->async_icosq); - spin_unlock(&c->async_icosq_lock); + spin_unlock_bh(&c->async_icosq_lock); } void mlx5e_deactivate_xsk(struct mlx5e_channel *c) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c index fb671a45712995..8e96260fce1df3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/xsk/tx.c @@ -36,9 +36,9 @@ int mlx5e_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags) if (test_and_set_bit(MLX5E_SQ_STATE_PENDING_XSK_TX, &c->async_icosq.state)) return 0; - spin_lock(&c->async_icosq_lock); + spin_lock_bh(&c->async_icosq_lock); mlx5e_trigger_irq(&c->async_icosq); - spin_unlock(&c->async_icosq_lock); + spin_unlock_bh(&c->async_icosq_lock); } return 0; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c index ccaccb9fc2f7b3..7f6221b8b1f7f5 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_rx.c @@ -188,7 +188,7 @@ static int post_rx_param_wqes(struct mlx5e_channel *c, err = 0; sq = &c->async_icosq; - spin_lock(&c->async_icosq_lock); + spin_lock_bh(&c->async_icosq_lock); cseg = post_static_params(sq, priv_rx); if (IS_ERR(cseg)) @@ -199,7 +199,7 @@ static int post_rx_param_wqes(struct mlx5e_channel *c, mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, cseg); unlock: - spin_unlock(&c->async_icosq_lock); + spin_unlock_bh(&c->async_icosq_lock); return err; @@ -265,10 +265,10 @@ resync_post_get_progress_params(struct mlx5e_icosq *sq, BUILD_BUG_ON(MLX5E_KTLS_GET_PROGRESS_WQEBBS != 1); - spin_lock(&sq->channel->async_icosq_lock); + spin_lock_bh(&sq->channel->async_icosq_lock); if (unlikely(!mlx5e_wqc_has_room_for(&sq->wq, sq->cc, sq->pc, 1))) { - spin_unlock(&sq->channel->async_icosq_lock); + spin_unlock_bh(&sq->channel->async_icosq_lock); err = -ENOSPC; goto err_dma_unmap; } @@ -299,7 +299,7 @@ resync_post_get_progress_params(struct mlx5e_icosq *sq, icosq_fill_wi(sq, pi, &wi); sq->pc++; mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, cseg); - spin_unlock(&sq->channel->async_icosq_lock); + spin_unlock_bh(&sq->channel->async_icosq_lock); return 0; @@ -360,7 +360,7 @@ static int resync_handle_seq_match(struct mlx5e_ktls_offload_context_rx *priv_rx err = 0; sq = &c->async_icosq; - spin_lock(&c->async_icosq_lock); + spin_lock_bh(&c->async_icosq_lock); cseg = post_static_params(sq, priv_rx); if (IS_ERR(cseg)) { @@ -372,7 +372,7 @@ static int resync_handle_seq_match(struct mlx5e_ktls_offload_context_rx *priv_rx mlx5e_notify_hw(&sq->wq, sq->pc, sq->uar_map, cseg); priv_rx->stats->tls_resync_res_ok++; unlock: - spin_unlock(&c->async_icosq_lock); + spin_unlock_bh(&c->async_icosq_lock); return err; } From 465e7baab6d93b399344f5868f84c177ab5cd16f Mon Sep 17 00:00:00 2001 From: Maor Gottlieb Date: Wed, 21 Oct 2020 08:42:49 +0300 Subject: [PATCH 15/65] net/mlx5: Fix deletion of duplicate rules When a rule is duplicated, the refcount of the rule is increased so only the second deletion of the rule should cause destruction of the FTE. Currently, the FTE will be destroyed in the first deletion of rule since the modify_mask will be 0. Fix it and call to destroy FTE only if all the rules (FTE's children) have been removed. Fixes: 718ce4d601db ("net/mlx5: Consolidate update FTE for all removal changes") Signed-off-by: Maor Gottlieb Reviewed-by: Mark Bloch Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/fs_core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 16091838bfcf90..325a5b0d6829ee 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -2010,10 +2010,11 @@ void mlx5_del_flow_rules(struct mlx5_flow_handle *handle) down_write_ref_node(&fte->node, false); for (i = handle->num_rules - 1; i >= 0; i--) tree_remove_node(&handle->rule[i]->node, true); - if (fte->modify_mask && fte->dests_size) { - modify_fte(fte); + if (fte->dests_size) { + if (fte->modify_mask) + modify_fte(fte); up_write_ref_node(&fte->node, false); - } else { + } else if (list_empty(&fte->node.children)) { del_hw_fte(&fte->node); /* Avoid double call to del_hw_fte */ fte->node.del_hw_func = NULL; From ae35859445607f7f18dd4f332749219cd636ed59 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Mon, 2 Nov 2020 12:41:28 +0200 Subject: [PATCH 16/65] net/mlx5: E-switch, Avoid extack error log for disabled vport When E-switch vport is disabled, querying its hardware address is unsupported. Avoid setting extack error log message in such case. Fixes: f099fde16db3 ("net/mlx5: E-switch, Support querying port function mac address") Signed-off-by: Parav Pandit Reviewed-by: Roi Dayan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c index 6e6a9a56399283..e8e6294c7ccae1 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c @@ -1902,8 +1902,6 @@ int mlx5_devlink_port_function_hw_addr_get(struct devlink *devlink, ether_addr_copy(hw_addr, vport->info.mac); *hw_addr_len = ETH_ALEN; err = 0; - } else { - NL_SET_ERR_MSG_MOD(extack, "Eswitch vport is disabled"); } mutex_unlock(&esw->state_lock); return err; From c5eb51adf06b2644fa28d4af886bfdcc53e288da Mon Sep 17 00:00:00 2001 From: Aya Levin Date: Wed, 23 Sep 2020 12:58:44 +0300 Subject: [PATCH 17/65] net/mlx5e: Fix VXLAN synchronization after function reload During driver reload, perform firmware tear-down which results in firmware losing the configured VXLAN ports. These ports are still available in the driver's database. Fix this by cleaning up driver's VXLAN database in the nic unload flow, before firmware tear-down. With that, minimize mlx5_vxlan_destroy() to remove only what was added in mlx5_vxlan_create() and warn on leftover UDP ports. Fixes: 18a2b7f969c9 ("net/mlx5: convert to new udp_tunnel infrastructure") Signed-off-by: Aya Levin Reviewed-by: Moshe Shemesh Signed-off-by: Saeed Mahameed --- .../net/ethernet/mellanox/mlx5/core/en_main.c | 1 + .../ethernet/mellanox/mlx5/core/lib/vxlan.c | 23 ++++++++++++++----- .../ethernet/mellanox/mlx5/core/lib/vxlan.h | 2 ++ 3 files changed, 20 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index b3f02aac7f268c..ebce97921e03ca 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -5253,6 +5253,7 @@ static void mlx5e_nic_disable(struct mlx5e_priv *priv) mlx5e_disable_async_events(priv); mlx5_lag_remove(mdev); + mlx5_vxlan_reset_to_default(mdev->vxlan); } int mlx5e_update_nic_rx(struct mlx5e_priv *priv) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c index 3315afe2f8dce6..38084400ee8fab 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.c @@ -167,6 +167,17 @@ struct mlx5_vxlan *mlx5_vxlan_create(struct mlx5_core_dev *mdev) } void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan) +{ + if (!mlx5_vxlan_allowed(vxlan)) + return; + + mlx5_vxlan_del_port(vxlan, IANA_VXLAN_UDP_PORT); + WARN_ON(!hash_empty(vxlan->htable)); + + kfree(vxlan); +} + +void mlx5_vxlan_reset_to_default(struct mlx5_vxlan *vxlan) { struct mlx5_vxlan_port *vxlanp; struct hlist_node *tmp; @@ -175,12 +186,12 @@ void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan) if (!mlx5_vxlan_allowed(vxlan)) return; - /* Lockless since we are the only hash table consumers*/ hash_for_each_safe(vxlan->htable, bkt, tmp, vxlanp, hlist) { - hash_del(&vxlanp->hlist); - mlx5_vxlan_core_del_port_cmd(vxlan->mdev, vxlanp->udp_port); - kfree(vxlanp); + /* Don't delete default UDP port added by the HW. + * Remove only user configured ports + */ + if (vxlanp->udp_port == IANA_VXLAN_UDP_PORT) + continue; + mlx5_vxlan_del_port(vxlan, vxlanp->udp_port); } - - kfree(vxlan); } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h index ec766529f49b69..34ef662da35ed4 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/vxlan.h @@ -56,6 +56,7 @@ void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan); int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port); int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port); bool mlx5_vxlan_lookup_port(struct mlx5_vxlan *vxlan, u16 port); +void mlx5_vxlan_reset_to_default(struct mlx5_vxlan *vxlan); #else static inline struct mlx5_vxlan* mlx5_vxlan_create(struct mlx5_core_dev *mdev) { return ERR_PTR(-EOPNOTSUPP); } @@ -63,6 +64,7 @@ static inline void mlx5_vxlan_destroy(struct mlx5_vxlan *vxlan) { return; } static inline int mlx5_vxlan_add_port(struct mlx5_vxlan *vxlan, u16 port) { return -EOPNOTSUPP; } static inline int mlx5_vxlan_del_port(struct mlx5_vxlan *vxlan, u16 port) { return -EOPNOTSUPP; } static inline bool mlx5_vxlan_lookup_port(struct mlx5_vxlan *vxlan, u16 port) { return false; } +static inline void mlx5_vxlan_reset_to_default(struct mlx5_vxlan *vxlan) { return; } #endif #endif /* __MLX5_VXLAN_H__ */ From 1a50cf9a67ff2241c2949d30bc11c8dd4280eef8 Mon Sep 17 00:00:00 2001 From: Maxim Mikityanskiy Date: Thu, 22 Oct 2020 12:49:51 +0300 Subject: [PATCH 18/65] net/mlx5e: Fix incorrect access of RCU-protected xdp_prog rq->xdp_prog is RCU-protected and should be accessed only with rcu_access_pointer for the NULL check in mlx5e_poll_rx_cq. rq->xdp_prog may change on the fly only from one non-NULL value to another non-NULL value, so the checks in mlx5e_xdp_handle and mlx5e_poll_rx_cq will have the same result during one NAPI cycle, meaning that no additional synchronization is needed. Fixes: fe45386a2082 ("net/mlx5e: Use RCU to protect rq->xdp_prog") Signed-off-by: Maxim Mikityanskiy Reviewed-by: Tariq Toukan Signed-off-by: Saeed Mahameed --- drivers/net/ethernet/mellanox/mlx5/core/en_rx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c index 599f5b5ebc978f..6628a0197b4e08 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c @@ -1584,7 +1584,7 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget) } while ((++work_done < budget) && (cqe = mlx5_cqwq_get_cqe(cqwq))); out: - if (rq->xdp_prog) + if (rcu_access_pointer(rq->xdp_prog)) mlx5e_xdp_rx_poll_complete(rq); mlx5_cqwq_update_db_record(cqwq); From f9b7ff0d7f7a466a920424246e7ddc2b84c87e52 Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Thu, 5 Nov 2020 11:52:30 +0000 Subject: [PATCH 19/65] tools/bpftool: Fix attaching flow dissector My earlier patch to reject non-zero arguments to flow dissector attach broke attaching via bpftool. Instead of 0 it uses -1 for target_fd. Fix this by passing a zero argument when attaching the flow dissector. Fixes: 1b514239e859 ("bpf: flow_dissector: Check value of unused flags to BPF_PROG_ATTACH") Reported-by: Jiri Benc Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Acked-by: Song Liu Link: https://lore.kernel.org/bpf/20201105115230.296657-1-lmb@cloudflare.com --- tools/bpf/bpftool/prog.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/prog.c b/tools/bpf/bpftool/prog.c index d942c1e3372c75..acdb2c245f0a46 100644 --- a/tools/bpf/bpftool/prog.c +++ b/tools/bpf/bpftool/prog.c @@ -940,7 +940,7 @@ static int parse_attach_detach_args(int argc, char **argv, int *progfd, } if (*attach_type == BPF_FLOW_DISSECTOR) { - *mapfd = -1; + *mapfd = 0; return 0; } From 7c0afcad7507636529e6a5a2a5eef5482619a449 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 5 Nov 2020 11:51:09 -0800 Subject: [PATCH 20/65] bpf: BPF_PRELOAD depends on BPF_SYSCALL Fix build error when BPF_SYSCALL is not set/enabled but BPF_PRELOAD is by making BPF_PRELOAD depend on BPF_SYSCALL. ERROR: modpost: "bpf_preload_ops" [kernel/bpf/preload/bpf_preload.ko] undefined! Reported-by: kernel test robot Reported-by: Randy Dunlap Signed-off-by: Randy Dunlap Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201105195109.26232-1-rdunlap@infradead.org --- kernel/bpf/preload/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/bpf/preload/Kconfig b/kernel/bpf/preload/Kconfig index ace49111d3a366..26bced26247380 100644 --- a/kernel/bpf/preload/Kconfig +++ b/kernel/bpf/preload/Kconfig @@ -6,6 +6,7 @@ config USERMODE_DRIVER menuconfig BPF_PRELOAD bool "Preload BPF file system with kernel specific program and map iterators" depends on BPF + depends on BPF_SYSCALL # The dependency on !COMPILE_TEST prevents it from being enabled # in allmodconfig or allyesconfig configurations depends on !COMPILE_TEST From d3bec0138bfbe58606fc1d6f57a4cdc1a20218db Mon Sep 17 00:00:00 2001 From: David Verbeiren Date: Wed, 4 Nov 2020 12:23:32 +0100 Subject: [PATCH 21/65] bpf: Zero-fill re-used per-cpu map element Zero-fill element values for all other cpus than current, just as when not using prealloc. This is the only way the bpf program can ensure known initial values for all cpus ('onallcpus' cannot be set when coming from the bpf program). The scenario is: bpf program inserts some elements in a per-cpu map, then deletes some (or userspace does). When later adding new elements using bpf_map_update_elem(), the bpf program can only set the value of the new elements for the current cpu. When prealloc is enabled, previously deleted elements are re-used. Without the fix, values for other cpus remain whatever they were when the re-used entry was previously freed. A selftest is added to validate correct operation in above scenario as well as in case of LRU per-cpu map element re-use. Fixes: 6c9059817432 ("bpf: pre-allocate hash map elements") Signed-off-by: David Verbeiren Signed-off-by: Alexei Starovoitov Acked-by: Matthieu Baerts Acked-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20201104112332.15191-1-david.verbeiren@tessares.net --- kernel/bpf/hashtab.c | 30 ++- .../selftests/bpf/prog_tests/map_init.c | 214 ++++++++++++++++++ .../selftests/bpf/progs/test_map_init.c | 33 +++ 3 files changed, 275 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/bpf/prog_tests/map_init.c create mode 100644 tools/testing/selftests/bpf/progs/test_map_init.c diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 1815e97d4c9c1b..1fccba6e88c4ed 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -821,6 +821,32 @@ static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr, } } +static void pcpu_init_value(struct bpf_htab *htab, void __percpu *pptr, + void *value, bool onallcpus) +{ + /* When using prealloc and not setting the initial value on all cpus, + * zero-fill element values for other cpus (just as what happens when + * not using prealloc). Otherwise, bpf program has no way to ensure + * known initial values for cpus other than current one + * (onallcpus=false always when coming from bpf prog). + */ + if (htab_is_prealloc(htab) && !onallcpus) { + u32 size = round_up(htab->map.value_size, 8); + int current_cpu = raw_smp_processor_id(); + int cpu; + + for_each_possible_cpu(cpu) { + if (cpu == current_cpu) + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value, + size); + else + memset(per_cpu_ptr(pptr, cpu), 0, size); + } + } else { + pcpu_copy_value(htab, pptr, value, onallcpus); + } +} + static bool fd_htab_map_needs_adjust(const struct bpf_htab *htab) { return htab->map.map_type == BPF_MAP_TYPE_HASH_OF_MAPS && @@ -891,7 +917,7 @@ static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, } } - pcpu_copy_value(htab, pptr, value, onallcpus); + pcpu_init_value(htab, pptr, value, onallcpus); if (!prealloc) htab_elem_set_ptr(l_new, key_size, pptr); @@ -1183,7 +1209,7 @@ static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key, pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size), value, onallcpus); } else { - pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size), + pcpu_init_value(htab, htab_elem_get_ptr(l_new, key_size), value, onallcpus); hlist_nulls_add_head_rcu(&l_new->hash_node, head); l_new = NULL; diff --git a/tools/testing/selftests/bpf/prog_tests/map_init.c b/tools/testing/selftests/bpf/prog_tests/map_init.c new file mode 100644 index 00000000000000..14a31109dd0e0d --- /dev/null +++ b/tools/testing/selftests/bpf/prog_tests/map_init.c @@ -0,0 +1,214 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Copyright (c) 2020 Tessares SA */ + +#include +#include "test_map_init.skel.h" + +#define TEST_VALUE 0x1234 +#define FILL_VALUE 0xdeadbeef + +static int nr_cpus; +static int duration; + +typedef unsigned long long map_key_t; +typedef unsigned long long map_value_t; +typedef struct { + map_value_t v; /* padding */ +} __bpf_percpu_val_align pcpu_map_value_t; + + +static int map_populate(int map_fd, int num) +{ + pcpu_map_value_t value[nr_cpus]; + int i, err; + map_key_t key; + + for (i = 0; i < nr_cpus; i++) + bpf_percpu(value, i) = FILL_VALUE; + + for (key = 1; key <= num; key++) { + err = bpf_map_update_elem(map_fd, &key, value, BPF_NOEXIST); + if (!ASSERT_OK(err, "bpf_map_update_elem")) + return -1; + } + + return 0; +} + +static struct test_map_init *setup(enum bpf_map_type map_type, int map_sz, + int *map_fd, int populate) +{ + struct test_map_init *skel; + int err; + + skel = test_map_init__open(); + if (!ASSERT_OK_PTR(skel, "skel_open")) + return NULL; + + err = bpf_map__set_type(skel->maps.hashmap1, map_type); + if (!ASSERT_OK(err, "bpf_map__set_type")) + goto error; + + err = bpf_map__set_max_entries(skel->maps.hashmap1, map_sz); + if (!ASSERT_OK(err, "bpf_map__set_max_entries")) + goto error; + + err = test_map_init__load(skel); + if (!ASSERT_OK(err, "skel_load")) + goto error; + + *map_fd = bpf_map__fd(skel->maps.hashmap1); + if (CHECK(*map_fd < 0, "bpf_map__fd", "failed\n")) + goto error; + + err = map_populate(*map_fd, populate); + if (!ASSERT_OK(err, "map_populate")) + goto error_map; + + return skel; + +error_map: + close(*map_fd); +error: + test_map_init__destroy(skel); + return NULL; +} + +/* executes bpf program that updates map with key, value */ +static int prog_run_insert_elem(struct test_map_init *skel, map_key_t key, + map_value_t value) +{ + struct test_map_init__bss *bss; + + bss = skel->bss; + + bss->inKey = key; + bss->inValue = value; + bss->inPid = getpid(); + + if (!ASSERT_OK(test_map_init__attach(skel), "skel_attach")) + return -1; + + /* Let tracepoint trigger */ + syscall(__NR_getpgid); + + test_map_init__detach(skel); + + return 0; +} + +static int check_values_one_cpu(pcpu_map_value_t *value, map_value_t expected) +{ + int i, nzCnt = 0; + map_value_t val; + + for (i = 0; i < nr_cpus; i++) { + val = bpf_percpu(value, i); + if (val) { + if (CHECK(val != expected, "map value", + "unexpected for cpu %d: 0x%llx\n", i, val)) + return -1; + nzCnt++; + } + } + + if (CHECK(nzCnt != 1, "map value", "set for %d CPUs instead of 1!\n", + nzCnt)) + return -1; + + return 0; +} + +/* Add key=1 elem with values set for all CPUs + * Delete elem key=1 + * Run bpf prog that inserts new key=1 elem with value=0x1234 + * (bpf prog can only set value for current CPU) + * Lookup Key=1 and check value is as expected for all CPUs: + * value set by bpf prog for one CPU, 0 for all others + */ +static void test_pcpu_map_init(void) +{ + pcpu_map_value_t value[nr_cpus]; + struct test_map_init *skel; + int map_fd, err; + map_key_t key; + + /* max 1 elem in map so insertion is forced to reuse freed entry */ + skel = setup(BPF_MAP_TYPE_PERCPU_HASH, 1, &map_fd, 1); + if (!ASSERT_OK_PTR(skel, "prog_setup")) + return; + + /* delete element so the entry can be re-used*/ + key = 1; + err = bpf_map_delete_elem(map_fd, &key); + if (!ASSERT_OK(err, "bpf_map_delete_elem")) + goto cleanup; + + /* run bpf prog that inserts new elem, re-using the slot just freed */ + err = prog_run_insert_elem(skel, key, TEST_VALUE); + if (!ASSERT_OK(err, "prog_run_insert_elem")) + goto cleanup; + + /* check that key=1 was re-created by bpf prog */ + err = bpf_map_lookup_elem(map_fd, &key, value); + if (!ASSERT_OK(err, "bpf_map_lookup_elem")) + goto cleanup; + + /* and has expected values */ + check_values_one_cpu(value, TEST_VALUE); + +cleanup: + test_map_init__destroy(skel); +} + +/* Add key=1 and key=2 elems with values set for all CPUs + * Run bpf prog that inserts new key=3 elem + * (only for current cpu; other cpus should have initial value = 0) + * Lookup Key=1 and check value is as expected for all CPUs + */ +static void test_pcpu_lru_map_init(void) +{ + pcpu_map_value_t value[nr_cpus]; + struct test_map_init *skel; + int map_fd, err; + map_key_t key; + + /* Set up LRU map with 2 elements, values filled for all CPUs. + * With these 2 elements, the LRU map is full + */ + skel = setup(BPF_MAP_TYPE_LRU_PERCPU_HASH, 2, &map_fd, 2); + if (!ASSERT_OK_PTR(skel, "prog_setup")) + return; + + /* run bpf prog that inserts new key=3 element, re-using LRU slot */ + key = 3; + err = prog_run_insert_elem(skel, key, TEST_VALUE); + if (!ASSERT_OK(err, "prog_run_insert_elem")) + goto cleanup; + + /* check that key=3 replaced one of earlier elements */ + err = bpf_map_lookup_elem(map_fd, &key, value); + if (!ASSERT_OK(err, "bpf_map_lookup_elem")) + goto cleanup; + + /* and has expected values */ + check_values_one_cpu(value, TEST_VALUE); + +cleanup: + test_map_init__destroy(skel); +} + +void test_map_init(void) +{ + nr_cpus = bpf_num_possible_cpus(); + if (nr_cpus <= 1) { + printf("%s:SKIP: >1 cpu needed for this test\n", __func__); + test__skip(); + return; + } + + if (test__start_subtest("pcpu_map_init")) + test_pcpu_map_init(); + if (test__start_subtest("pcpu_lru_map_init")) + test_pcpu_lru_map_init(); +} diff --git a/tools/testing/selftests/bpf/progs/test_map_init.c b/tools/testing/selftests/bpf/progs/test_map_init.c new file mode 100644 index 00000000000000..c89d28ead67370 --- /dev/null +++ b/tools/testing/selftests/bpf/progs/test_map_init.c @@ -0,0 +1,33 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2020 Tessares SA */ + +#include "vmlinux.h" +#include + +__u64 inKey = 0; +__u64 inValue = 0; +__u32 inPid = 0; + +struct { + __uint(type, BPF_MAP_TYPE_PERCPU_HASH); + __uint(max_entries, 2); + __type(key, __u64); + __type(value, __u64); +} hashmap1 SEC(".maps"); + + +SEC("tp/syscalls/sys_enter_getpgid") +int sysenter_getpgid(const void *ctx) +{ + /* Just do it for once, when called from our own test prog. This + * ensures the map value is only updated for a single CPU. + */ + int cur_pid = bpf_get_current_pid_tgid() >> 32; + + if (cur_pid == inPid) + bpf_map_update_elem(&hashmap1, &inKey, &inValue, BPF_NOEXIST); + + return 0; +} + +char _license[] SEC("license") = "GPL"; From 6f64e477830000746c1f992050fbd45c03c89429 Mon Sep 17 00:00:00 2001 From: KP Singh Date: Thu, 5 Nov 2020 23:06:51 +0000 Subject: [PATCH 22/65] bpf: Update verification logic for LSM programs The current logic checks if the name of the BTF type passed in attach_btf_id starts with "bpf_lsm_", this is not sufficient as it also allows attachment to non-LSM hooks like the very function that performs this check, i.e. bpf_lsm_verify_prog. In order to ensure that this verification logic allows attachment to only LSM hooks, the LSM_HOOK definitions in lsm_hook_defs.h are used to generate a BTF_ID set. Upon verification, the attach_btf_id of the program being attached is checked for presence in this set. Fixes: 9e4e01dfd325 ("bpf: lsm: Implement attach, detach and execution") Signed-off-by: KP Singh Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20201105230651.2621917-1-kpsingh@chromium.org --- kernel/bpf/bpf_lsm.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/bpf/bpf_lsm.c b/kernel/bpf/bpf_lsm.c index 78ea8a7bd27feb..56cc5a915f6703 100644 --- a/kernel/bpf/bpf_lsm.c +++ b/kernel/bpf/bpf_lsm.c @@ -13,6 +13,7 @@ #include #include #include +#include /* For every LSM hook that allows attachment of BPF programs, declare a nop * function where a BPF program can be attached. @@ -26,7 +27,11 @@ noinline RET bpf_lsm_##NAME(__VA_ARGS__) \ #include #undef LSM_HOOK -#define BPF_LSM_SYM_PREFX "bpf_lsm_" +#define LSM_HOOK(RET, DEFAULT, NAME, ...) BTF_ID(func, bpf_lsm_##NAME) +BTF_SET_START(bpf_lsm_hooks) +#include +#undef LSM_HOOK +BTF_SET_END(bpf_lsm_hooks) int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, const struct bpf_prog *prog) @@ -37,8 +42,7 @@ int bpf_lsm_verify_prog(struct bpf_verifier_log *vlog, return -EINVAL; } - if (strncmp(BPF_LSM_SYM_PREFX, prog->aux->attach_func_name, - sizeof(BPF_LSM_SYM_PREFX) - 1)) { + if (!btf_id_set_contains(&bpf_lsm_hooks, prog->aux->attach_btf_id)) { bpf_log(vlog, "attach_btf_id %u points to wrong type name %s\n", prog->aux->attach_btf_id, prog->aux->attach_func_name); return -EINVAL; From cc6528bc9a0c901c83b8220a2e2617f3354d6dd9 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 5 Nov 2020 15:28:42 +0100 Subject: [PATCH 23/65] r8169: fix potential skb double free in an error path The caller of rtl8169_tso_csum_v2() frees the skb if false is returned. eth_skb_pad() internally frees the skb on error what would result in a double free. Therefore use __skb_put_padto() directly and instruct it to not free the skb on error. Fixes: b423e9ae49d7 ("r8169: fix offloaded tx checksum for small packets.") Reported-by: Jakub Kicinski Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/f7e68191-acff-9ded-4263-c016428a8762@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 7766d73823eb6a..4cb43a980ce9f9 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -4163,7 +4163,8 @@ static bool rtl8169_tso_csum_v2(struct rtl8169_private *tp, opts[1] |= transport_offset << TCPHO_SHIFT; } else { if (unlikely(skb->len < ETH_ZLEN && rtl_test_hw_pad_bug(tp))) - return !eth_skb_pad(skb); + /* eth_skb_pad would free the skb on error */ + return !__skb_put_padto(skb, ETH_ZLEN, false); } return true; From 847f0a2bfd2fe16d6afa537816b313b71f32e139 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Thu, 5 Nov 2020 18:14:47 +0100 Subject: [PATCH 24/65] r8169: disable hw csum for short packets on all chip versions RTL8125B has same or similar short packet hw padding bug as RTL8168evl. The main workaround has been extended accordingly, however we have to disable also hw checksumming for short packets on affected new chip versions. Instead of checking for an affected chip version let's simply disable hw checksumming for short packets in general. v2: - remove the version checks and disable short packet hw csum in general - reflect this in commit title and message Fixes: 0439297be951 ("r8169: add support for RTL8125B") Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/7fbb35f0-e244-ef65-aa55-3872d7d38698@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/realtek/r8169_main.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 4cb43a980ce9f9..85d9c3e30c6994 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -4343,18 +4343,9 @@ static netdev_features_t rtl8169_features_check(struct sk_buff *skb, rtl_chip_supports_csum_v2(tp)) features &= ~NETIF_F_ALL_TSO; } else if (skb->ip_summed == CHECKSUM_PARTIAL) { - if (skb->len < ETH_ZLEN) { - switch (tp->mac_version) { - case RTL_GIGA_MAC_VER_11: - case RTL_GIGA_MAC_VER_12: - case RTL_GIGA_MAC_VER_17: - case RTL_GIGA_MAC_VER_34: - features &= ~NETIF_F_CSUM_MASK; - break; - default: - break; - } - } + /* work around hw bug on some chip versions */ + if (skb->len < ETH_ZLEN) + features &= ~NETIF_F_CSUM_MASK; if (transport_offset > TCPHO_MAX && rtl_chip_supports_csum_v2(tp)) From 4e0396c59559264442963b349ab71f66e471f84d Mon Sep 17 00:00:00 2001 From: Vadym Kochan Date: Fri, 6 Nov 2020 18:11:25 +0200 Subject: [PATCH 25/65] net: marvell: prestera: fix compilation with CONFIG_BRIDGE=m With CONFIG_BRIDGE=m the compilation fails: ld: drivers/net/ethernet/marvell/prestera/prestera_switchdev.o: in function `prestera_bridge_port_event': prestera_switchdev.c:(.text+0x2ebd): undefined reference to `br_vlan_enabled' in case the driver is statically enabled. Fix it by adding 'BRIDGE || BRIDGE=n' dependency. Fixes: e1189d9a5fbe ("net: marvell: prestera: Add Switchdev driver implementation") Reported-by: Randy Dunlap Signed-off-by: Vadym Kochan Acked-by: Randy Dunlap # build-tested Link: https://lore.kernel.org/r/20201106161128.24069-1-vadym.kochan@plvision.eu Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/marvell/prestera/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/marvell/prestera/Kconfig b/drivers/net/ethernet/marvell/prestera/Kconfig index b1fcc44f566af7..b6f20e2034c685 100644 --- a/drivers/net/ethernet/marvell/prestera/Kconfig +++ b/drivers/net/ethernet/marvell/prestera/Kconfig @@ -6,6 +6,7 @@ config PRESTERA tristate "Marvell Prestera Switch ASICs support" depends on NET_SWITCHDEV && VLAN_8021Q + depends on BRIDGE || BRIDGE=n select NET_DEVLINK help This driver supports Marvell Prestera Switch ASICs family. From 8ef9ba4d666614497a057d09b0a6eafc1e34eadf Mon Sep 17 00:00:00 2001 From: Oliver Herms Date: Tue, 3 Nov 2020 11:41:33 +0100 Subject: [PATCH 26/65] IPv6: Set SIT tunnel hard_header_len to zero Due to the legacy usage of hard_header_len for SIT tunnels while already using infrastructure from net/ipv4/ip_tunnel.c the calculation of the path MTU in tnl_update_pmtu is incorrect. This leads to unnecessary creation of MTU exceptions for any flow going over a SIT tunnel. As SIT tunnels do not have a header themsevles other than their transport (L3, L2) headers we're leaving hard_header_len set to zero as tnl_update_pmtu is already taking care of the transport headers sizes. This will also help avoiding unnecessary IPv6 GC runs and spinlock contention seen when using SIT tunnels and for more than net.ipv6.route.gc_thresh flows. Fixes: c54419321455 ("GRE: Refactor GRE tunneling code.") Signed-off-by: Oliver Herms Acked-by: Willem de Bruijn Link: https://lore.kernel.org/r/20201103104133.GA1573211@tws Signed-off-by: Jakub Kicinski --- net/ipv6/sit.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 5e2c34c0ac9736..5e7983cb61546f 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1128,7 +1128,6 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev) if (tdev && !netif_is_l3_master(tdev)) { int t_hlen = tunnel->hlen + sizeof(struct iphdr); - dev->hard_header_len = tdev->hard_header_len + sizeof(struct iphdr); dev->mtu = tdev->mtu - t_hlen; if (dev->mtu < IPV6_MIN_MTU) dev->mtu = IPV6_MIN_MTU; @@ -1426,7 +1425,6 @@ static void ipip6_tunnel_setup(struct net_device *dev) dev->priv_destructor = ipip6_dev_free; dev->type = ARPHRD_SIT; - dev->hard_header_len = LL_MAX_HEADER + t_hlen; dev->mtu = ETH_DATA_LEN - t_hlen; dev->min_mtu = IPV6_MIN_MTU; dev->max_mtu = IP6_MAX_MTU - t_hlen; From 77a2d673d5c9d1d359b5652ff75043273c5dea28 Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Fri, 6 Nov 2020 17:59:52 +0100 Subject: [PATCH 27/65] tunnels: Fix off-by-one in lower MTU bounds for ICMP/ICMPv6 replies Jianlin reports that a bridged IPv6 VXLAN endpoint, carrying IPv6 packets over a link with a PMTU estimation of exactly 1350 bytes, won't trigger ICMPv6 Packet Too Big replies when the encapsulated datagrams exceed said PMTU value. VXLAN over IPv6 adds 70 bytes of overhead, so an ICMPv6 reply indicating 1280 bytes as inner MTU would be legitimate and expected. This comes from an off-by-one error I introduced in checks added as part of commit 4cb47a8644cc ("tunnels: PMTU discovery support for directly bridged IP packets"), whose purpose was to prevent sending ICMPv6 Packet Too Big messages with an MTU lower than the smallest permissible IPv6 link MTU, i.e. 1280 bytes. In iptunnel_pmtud_check_icmpv6(), avoid triggering a reply only if the advertised MTU would be less than, and not equal to, 1280 bytes. Also fix the analogous comparison for IPv4, that is, skip the ICMP reply only if the resulting MTU is strictly less than 576 bytes. This becomes apparent while running the net/pmtu.sh bridged VXLAN or GENEVE selftests with adjusted lower-link MTU values. Using e.g. GENEVE, setting ll_mtu to the values reported below, in the test_pmtu_ipvX_over_bridged_vxlanY_or_geneveY_exception() test function, we can see failures on the following tests: test | ll_mtu -------------------------------|-------- pmtu_ipv4_br_geneve4_exception | 626 pmtu_ipv6_br_geneve4_exception | 1330 pmtu_ipv6_br_geneve6_exception | 1350 owing to the different tunneling overheads implied by the corresponding configurations. Reported-by: Jianlin Shi Fixes: 4cb47a8644cc ("tunnels: PMTU discovery support for directly bridged IP packets") Signed-off-by: Stefano Brivio Link: https://lore.kernel.org/r/4f5fc2f33bfdf8409549fafd4f952b008bf04d63.1604681709.git.sbrivio@redhat.com Signed-off-by: Jakub Kicinski --- net/ipv4/ip_tunnel_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 25f1caf5abf93c..e25be2d01a7ac0 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -263,7 +263,7 @@ static int iptunnel_pmtud_check_icmp(struct sk_buff *skb, int mtu) const struct icmphdr *icmph = icmp_hdr(skb); const struct iphdr *iph = ip_hdr(skb); - if (mtu <= 576 || iph->frag_off != htons(IP_DF)) + if (mtu < 576 || iph->frag_off != htons(IP_DF)) return 0; if (ipv4_is_lbcast(iph->daddr) || ipv4_is_multicast(iph->daddr) || @@ -359,7 +359,7 @@ static int iptunnel_pmtud_check_icmpv6(struct sk_buff *skb, int mtu) __be16 frag_off; int offset; - if (mtu <= IPV6_MIN_MTU) + if (mtu < IPV6_MIN_MTU) return 0; if (stype == IPV6_ADDR_ANY || stype == IPV6_ADDR_MULTICAST || From 413691384a37fe27f43460226c4160e33140e638 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Sun, 8 Nov 2020 00:46:15 +0000 Subject: [PATCH 28/65] ethtool: netlink: add missing netdev_features_change() call After updating userspace Ethtool from 5.7 to 5.9, I noticed that NETDEV_FEAT_CHANGE is no more raised when changing netdev features through Ethtool. That's because the old Ethtool ioctl interface always calls netdev_features_change() at the end of user request processing to inform the kernel that our netdevice has some features changed, but the new Netlink interface does not. Instead, it just notifies itself with ETHTOOL_MSG_FEATURES_NTF. Replace this ethtool_notify() call with netdev_features_change(), so the kernel will be aware of any features changes, just like in case with the ioctl interface. This does not omit Ethtool notifications, as Ethtool itself listens to NETDEV_FEAT_CHANGE and drops ETHTOOL_MSG_FEATURES_NTF on it (net/ethtool/netlink.c:ethnl_netdev_event()). From v1 [1]: - dropped extra new line as advised by Jakub; - no functional changes. [1] https://lore.kernel.org/netdev/AlZXQ2o5uuTVHCfNGOiGgJ8vJ3KgO5YIWAnQjH0cDE@cp3-web-009.plabs.ch Fixes: 0980bfcd6954 ("ethtool: set netdev features with FEATURES_SET request") Signed-off-by: Alexander Lobakin Reviewed-by: Michal Kubecek Link: https://lore.kernel.org/r/ahA2YWXYICz5rbUSQqNG4roJ8OlJzzYQX7PTiG80@cp4-web-028.plabs.ch Signed-off-by: Jakub Kicinski --- net/ethtool/features.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ethtool/features.c b/net/ethtool/features.c index 8ee4cdbd6b82f1..1c9f4df273bd55 100644 --- a/net/ethtool/features.c +++ b/net/ethtool/features.c @@ -280,7 +280,7 @@ int ethnl_set_features(struct sk_buff *skb, struct genl_info *info) active_diff_mask, compact); } if (mod) - ethtool_notify(dev, ETHTOOL_MSG_FEATURES_NTF, NULL); + netdev_features_change(dev); out_rtnl: rtnl_unlock(); From 16eb0eb835c77c5e8824b8aa90b11b00ddc5c122 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonathan=20Neusch=C3=A4fer?= Date: Sat, 7 Nov 2020 23:08:21 +0100 Subject: [PATCH 29/65] docs: networking: phy: s/2.5 times faster/2.5 times as fast/ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 2.5 times faster would be 3.5 Gbps (4.375 Gbaud after 8b/10b encoding). Signed-off-by: Jonathan Neuschäfer Link: https://lore.kernel.org/r/20201107220822.1291215-1-j.neuschaefer@gmx.net Signed-off-by: Jakub Kicinski --- Documentation/networking/phy.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/networking/phy.rst b/Documentation/networking/phy.rst index 256106054c8cb0..b2f7ec794bc8b6 100644 --- a/Documentation/networking/phy.rst +++ b/Documentation/networking/phy.rst @@ -247,8 +247,8 @@ Some of the interface modes are described below: speeds (see below.) ``PHY_INTERFACE_MODE_2500BASEX`` - This defines a variant of 1000BASE-X which is clocked 2.5 times faster, - than the 802.3 standard giving a fixed bit rate of 3.125Gbaud. + This defines a variant of 1000BASE-X which is clocked 2.5 times as fast + as the 802.3 standard, giving a fixed bit rate of 3.125Gbaud. ``PHY_INTERFACE_MODE_SGMII`` This is used for Cisco SGMII, which is a modification of 1000BASE-X From 989ef49bdf100cc772b3a8737089df36b1ab1e30 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Sun, 8 Nov 2020 19:49:59 +0100 Subject: [PATCH 30/65] mptcp: provide rmem[0] limit The mptcp proto struct currently does not provide the required limit for forward memory scheduling. Under pressure sk_rmem_schedule() will unconditionally try to use such field and will oops. Address the issue inheriting the tcp limit, as we already do for the wmem one. Fixes: 9c3f94e1681b ("mptcp: add missing memory scheduling in the rx path") Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts Link: https://lore.kernel.org/r/37af798bd46f402fb7c79f57ebbdd00614f5d7fa.1604861097.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski --- net/mptcp/protocol.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index e7419fd15d84c9..88f2a7a0ccb86b 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2467,6 +2467,7 @@ static struct proto mptcp_prot = { .memory_pressure = &tcp_memory_pressure, .stream_memory_free = mptcp_memory_free, .sysctl_wmem_offset = offsetof(struct net, ipv4.sysctl_tcp_wmem), + .sysctl_rmem_offset = offsetof(struct net, ipv4.sysctl_tcp_rmem), .sysctl_mem = sysctl_tcp_mem, .obj_size = sizeof(struct mptcp_sock), .slab_flags = SLAB_TYPESAFE_BY_RCU, From 97adb13dc9ba08ecd4758bc59efc0205f5cbf377 Mon Sep 17 00:00:00 2001 From: Vlad Buslov Date: Sat, 7 Nov 2020 13:19:28 +0200 Subject: [PATCH 31/65] selftest: fix flower terse dump tests Iproute2 tc classifier terse dump has been accepted with modified syntax. Update the tests accordingly. Signed-off-by: Vlad Buslov Fixes: e7534fd42a99 ("selftests: implement flower classifier terse dump tests") Link: https://lore.kernel.org/r/20201107111928.453534-1-vlad@buslov.dev Signed-off-by: Jakub Kicinski --- .../testing/selftests/tc-testing/tc-tests/filters/tests.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json b/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json index bb543bf69d6945..361235ad574be1 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json +++ b/tools/testing/selftests/tc-testing/tc-tests/filters/tests.json @@ -100,7 +100,7 @@ ], "cmdUnderTest": "$TC filter add dev $DEV2 protocol ip pref 1 ingress flower dst_mac e4:11:22:11:4a:51 action drop", "expExitCode": "0", - "verifyCmd": "$TC filter show terse dev $DEV2 ingress", + "verifyCmd": "$TC -br filter show dev $DEV2 ingress", "matchPattern": "filter protocol ip pref 1 flower.*handle", "matchCount": "1", "teardown": [ @@ -119,7 +119,7 @@ ], "cmdUnderTest": "$TC filter add dev $DEV2 protocol ip pref 1 ingress flower dst_mac e4:11:22:11:4a:51 action drop", "expExitCode": "0", - "verifyCmd": "$TC filter show terse dev $DEV2 ingress", + "verifyCmd": "$TC -br filter show dev $DEV2 ingress", "matchPattern": " dst_mac e4:11:22:11:4a:51", "matchCount": "0", "teardown": [ From 3a7001788fed0311d6fb77ed0dabe7bed3567bc0 Mon Sep 17 00:00:00 2001 From: Slawomir Laba Date: Wed, 14 Oct 2020 08:54:09 +0000 Subject: [PATCH 32/65] i40e: Fix MAC address setting for a VF via Host/VM Fix MAC setting flow for the PF driver. Update the unicast VF's MAC address in VF structure if it is a new setting in i40e_vc_add_mac_addr_msg. When unicast MAC address gets deleted, record that and set the new unicast MAC address that is already waiting in the filter list. This logic is based on the order of messages arriving to the PF driver. Without this change the MAC address setting was interpreted incorrectly in the following use cases: 1) Print incorrect VF MAC or zero MAC ip link show dev $pf 2) Don't preserve MAC between driver reload rmmod iavf; modprobe iavf 3) Update VF MAC when macvlan was set ip link add link $vf address $mac $vf.1 type macvlan 4) Failed to update mac address when VF was trusted ip link set dev $vf address $mac This includes all other configurations including above commands. Fixes: f657a6e1313b ("i40e: Fix VF driver MAC address configuration") Signed-off-by: Slawomir Laba Tested-by: Konrad Jankowski Signed-off-by: Tony Nguyen --- .../ethernet/intel/i40e/i40e_virtchnl_pf.c | 26 +++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index c96e2f2d4cba56..4919d22d7b6bde 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -2713,6 +2713,10 @@ static int i40e_vc_add_mac_addr_msg(struct i40e_vf *vf, u8 *msg) spin_unlock_bh(&vsi->mac_filter_hash_lock); goto error_param; } + if (is_valid_ether_addr(al->list[i].addr) && + is_zero_ether_addr(vf->default_lan_addr.addr)) + ether_addr_copy(vf->default_lan_addr.addr, + al->list[i].addr); } } spin_unlock_bh(&vsi->mac_filter_hash_lock); @@ -2740,6 +2744,7 @@ static int i40e_vc_del_mac_addr_msg(struct i40e_vf *vf, u8 *msg) { struct virtchnl_ether_addr_list *al = (struct virtchnl_ether_addr_list *)msg; + bool was_unimac_deleted = false; struct i40e_pf *pf = vf->pf; struct i40e_vsi *vsi = NULL; i40e_status ret = 0; @@ -2759,6 +2764,8 @@ static int i40e_vc_del_mac_addr_msg(struct i40e_vf *vf, u8 *msg) ret = I40E_ERR_INVALID_MAC_ADDR; goto error_param; } + if (ether_addr_equal(al->list[i].addr, vf->default_lan_addr.addr)) + was_unimac_deleted = true; } vsi = pf->vsi[vf->lan_vsi_idx]; @@ -2779,10 +2786,25 @@ static int i40e_vc_del_mac_addr_msg(struct i40e_vf *vf, u8 *msg) dev_err(&pf->pdev->dev, "Unable to program VF %d MAC filters, error %d\n", vf->vf_id, ret); + if (vf->trusted && was_unimac_deleted) { + struct i40e_mac_filter *f; + struct hlist_node *h; + u8 *macaddr = NULL; + int bkt; + + /* set last unicast mac address as default */ + spin_lock_bh(&vsi->mac_filter_hash_lock); + hash_for_each_safe(vsi->mac_filter_hash, bkt, h, f, hlist) { + if (is_valid_ether_addr(f->macaddr)) + macaddr = f->macaddr; + } + if (macaddr) + ether_addr_copy(vf->default_lan_addr.addr, macaddr); + spin_unlock_bh(&vsi->mac_filter_hash_lock); + } error_param: /* send the response to the VF */ - return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_DEL_ETH_ADDR, - ret); + return i40e_vc_send_resp_to_vf(vf, VIRTCHNL_OP_DEL_ETH_ADDR, ret); } /** From 1773482fd8cecd5b060d409853f8145be3064a41 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 16 Sep 2020 17:32:28 +0300 Subject: [PATCH 33/65] i40e, xsk: uninitialized variable in i40e_clean_rx_irq_zc() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "failure" variable is used without being initialized. It should be set to false. Fixes: 8cbf74149903 ("i40e, xsk: move buffer allocation out of the Rx processing loop") Signed-off-by: Dan Carpenter Acked-by: Björn Töpel Tested-by: George Kuruvinakunnel Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/i40e/i40e_xsk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 6acede0acdca59..567fd67e900efd 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -281,8 +281,8 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) unsigned int total_rx_bytes = 0, total_rx_packets = 0; u16 cleaned_count = I40E_DESC_UNUSED(rx_ring); unsigned int xdp_res, xdp_xmit = 0; + bool failure = false; struct sk_buff *skb; - bool failure; while (likely(total_rx_packets < (unsigned int)budget)) { union i40e_rx_desc *rx_desc; From 6b7ed22ae4c96a415001f0c3116ebee15bb8491a Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Fri, 25 Sep 2020 11:35:37 -0700 Subject: [PATCH 34/65] igc: Fix returning wrong statistics 'igc_update_stats()' was not updating 'netdev->stats', so the returned statistics, for example, requested by: $ ip -s link show dev enp3s0 were not being updated and were always zero. Fix by returning a set of statistics that are actually being updated (adapter->stats64). Fixes: c9a11c23ceb6 ("igc: Add netdev") Signed-off-by: Vinicius Costa Gomes Tested-by: Aaron Brown Signed-off-by: Tony Nguyen --- drivers/net/ethernet/intel/igc/igc_main.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index 9112dff075cf7d..b673ac1199bbc7 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -3891,21 +3891,23 @@ static int igc_change_mtu(struct net_device *netdev, int new_mtu) } /** - * igc_get_stats - Get System Network Statistics + * igc_get_stats64 - Get System Network Statistics * @netdev: network interface device structure + * @stats: rtnl_link_stats64 pointer * * Returns the address of the device statistics structure. * The statistics are updated here and also from the timer callback. */ -static struct net_device_stats *igc_get_stats(struct net_device *netdev) +static void igc_get_stats64(struct net_device *netdev, + struct rtnl_link_stats64 *stats) { struct igc_adapter *adapter = netdev_priv(netdev); + spin_lock(&adapter->stats64_lock); if (!test_bit(__IGC_RESETTING, &adapter->state)) igc_update_stats(adapter); - - /* only return the current stats */ - return &netdev->stats; + memcpy(stats, &adapter->stats64, sizeof(*stats)); + spin_unlock(&adapter->stats64_lock); } static netdev_features_t igc_fix_features(struct net_device *netdev, @@ -4855,7 +4857,7 @@ static const struct net_device_ops igc_netdev_ops = { .ndo_set_rx_mode = igc_set_rx_mode, .ndo_set_mac_address = igc_set_mac, .ndo_change_mtu = igc_change_mtu, - .ndo_get_stats = igc_get_stats, + .ndo_get_stats64 = igc_get_stats64, .ndo_fix_features = igc_fix_features, .ndo_set_features = igc_set_features, .ndo_features_check = igc_features_check, From 5fb7f75bc138c868df2df40d386c7244122cca77 Mon Sep 17 00:00:00 2001 From: Tony Nguyen Date: Mon, 9 Nov 2020 16:07:35 -0800 Subject: [PATCH 35/65] MAINTAINERS: Update repositories for Intel Ethernet Drivers Update Intel Ethernet Drivers repositories to new locations. Signed-off-by: Tony Nguyen --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index cd123d0a6a2dc2..9e826b55fcd9eb 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8829,8 +8829,8 @@ S: Supported W: http://www.intel.com/support/feedback.htm W: http://e1000.sourceforge.net/ Q: http://patchwork.ozlabs.org/project/intel-wired-lan/list/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/net-queue.git -T: git git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/net-queue.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/tnguy/next-queue.git F: Documentation/networking/device_drivers/ethernet/intel/ F: drivers/net/ethernet/intel/ F: drivers/net/ethernet/intel/*/ From 866358ec331f8faa394995fb4b511af1db0247c8 Mon Sep 17 00:00:00 2001 From: Paul Moore Date: Sun, 8 Nov 2020 09:08:26 -0500 Subject: [PATCH 36/65] netlabel: fix our progress tracking in netlbl_unlabel_staticlist() The current NetLabel code doesn't correctly keep track of the netlink dump state in some cases, in particular when multiple interfaces with large configurations are loaded. The problem manifests itself by not reporting the full configuration to userspace, even though it is loaded and active in the kernel. This patch fixes this by ensuring that the dump state is properly reset when necessary inside the netlbl_unlabel_staticlist() function. Fixes: 8cc44579d1bd ("NetLabel: Introduce static network labels for unlabeled connections") Signed-off-by: Paul Moore Link: https://lore.kernel.org/r/160484450633.3752.16512718263560813473.stgit@sifl Signed-off-by: Jakub Kicinski --- net/netlabel/netlabel_unlabeled.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c index 2e8e3f7b21110e..fc55c9116da0a8 100644 --- a/net/netlabel/netlabel_unlabeled.c +++ b/net/netlabel/netlabel_unlabeled.c @@ -1166,12 +1166,13 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, struct netlbl_unlhsh_walk_arg cb_arg; u32 skip_bkt = cb->args[0]; u32 skip_chain = cb->args[1]; - u32 iter_bkt; - u32 iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0; + u32 skip_addr4 = cb->args[2]; + u32 iter_bkt, iter_chain, iter_addr4 = 0, iter_addr6 = 0; struct netlbl_unlhsh_iface *iface; struct list_head *iter_list; struct netlbl_af4list *addr4; #if IS_ENABLED(CONFIG_IPV6) + u32 skip_addr6 = cb->args[3]; struct netlbl_af6list *addr6; #endif @@ -1182,7 +1183,7 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, rcu_read_lock(); for (iter_bkt = skip_bkt; iter_bkt < rcu_dereference(netlbl_unlhsh)->size; - iter_bkt++, iter_chain = 0, iter_addr4 = 0, iter_addr6 = 0) { + iter_bkt++) { iter_list = &rcu_dereference(netlbl_unlhsh)->tbl[iter_bkt]; list_for_each_entry_rcu(iface, iter_list, list) { if (!iface->valid || @@ -1190,7 +1191,7 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, continue; netlbl_af4list_foreach_rcu(addr4, &iface->addr4_list) { - if (iter_addr4++ < cb->args[2]) + if (iter_addr4++ < skip_addr4) continue; if (netlbl_unlabel_staticlist_gen( NLBL_UNLABEL_C_STATICLIST, @@ -1203,10 +1204,12 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, goto unlabel_staticlist_return; } } + iter_addr4 = 0; + skip_addr4 = 0; #if IS_ENABLED(CONFIG_IPV6) netlbl_af6list_foreach_rcu(addr6, &iface->addr6_list) { - if (iter_addr6++ < cb->args[3]) + if (iter_addr6++ < skip_addr6) continue; if (netlbl_unlabel_staticlist_gen( NLBL_UNLABEL_C_STATICLIST, @@ -1219,8 +1222,12 @@ static int netlbl_unlabel_staticlist(struct sk_buff *skb, goto unlabel_staticlist_return; } } + iter_addr6 = 0; + skip_addr6 = 0; #endif /* IPv6 */ } + iter_chain = 0; + skip_chain = 0; } unlabel_staticlist_return: From 902a66e08ceaadb9a7a1ab3a4f3af611cd1d8cba Mon Sep 17 00:00:00 2001 From: Sven Van Asbroeck Date: Sun, 8 Nov 2020 12:12:24 -0500 Subject: [PATCH 37/65] lan743x: correctly handle chips with internal PHY Commit 6f197fb63850 ("lan743x: Added fixed link and RGMII support") assumes that chips with an internal PHY will never have a devicetree entry. This is incorrect: even for these chips, a devicetree entry can be useful e.g. to pass the mac address from bootloader to chip: &pcie { status = "okay"; host@0 { reg = <0 0 0 0 0>; #address-cells = <3>; #size-cells = <2>; lan7430: ethernet@0 { /* LAN7430 with internal PHY */ compatible = "microchip,lan743x"; status = "okay"; reg = <0 0 0 0 0>; /* filled in by bootloader */ local-mac-address = [00 00 00 00 00 00]; }; }; }; If a devicetree entry is present, the driver will not attach the chip to its internal phy, and the chip will be non-operational. Fix by tweaking the phy connection algorithm: - first try to connect to a phy specified in the devicetree (could be 'real' phy, or just a 'fixed-link') - if that doesn't succeed, try to connect to an internal phy, even if the chip has a devnode Tested on a LAN7430 with internal PHY. I cannot test a device using fixed-link, as I do not have access to one. Fixes: 6f197fb63850 ("lan743x: Added fixed link and RGMII support") Tested-by: Sven Van Asbroeck # lan7430 Reviewed-by: Andrew Lunn Signed-off-by: Sven Van Asbroeck Link: https://lore.kernel.org/r/20201108171224.23829-1-TheSven73@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/lan743x_main.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index a1938842f828e3..bd77877cf1ccce 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -1026,9 +1026,9 @@ static int lan743x_phy_open(struct lan743x_adapter *adapter) netdev = adapter->netdev; phynode = of_node_get(adapter->pdev->dev.of_node); - adapter->phy_mode = PHY_INTERFACE_MODE_GMII; if (phynode) { + /* try devicetree phy, or fixed link */ of_get_phy_mode(phynode, &adapter->phy_mode); if (of_phy_is_fixed_link(phynode)) { @@ -1044,13 +1044,15 @@ static int lan743x_phy_open(struct lan743x_adapter *adapter) lan743x_phy_link_status_change, 0, adapter->phy_mode); of_node_put(phynode); - if (!phydev) - goto return_error; - } else { + } + + if (!phydev) { + /* try internal phy */ phydev = phy_find_first(adapter->mdiobus); if (!phydev) goto return_error; + adapter->phy_mode = PHY_INTERFACE_MODE_GMII; ret = phy_connect_direct(netdev, phydev, lan743x_phy_link_status_change, adapter->phy_mode); From f3037c5a31b58a73b32a36e938ad0560085acadd Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Sun, 8 Nov 2020 22:44:02 +0100 Subject: [PATCH 38/65] net: phy: realtek: support paged operations on RTL8201CP The RTL8401-internal PHY identifies as RTL8201CP, and the init sequence in r8169, copied from vendor driver r8168, uses paged operations. Therefore set the same paged operation callbacks as for the other Realtek PHY's. Fixes: cdafdc29ef75 ("r8169: sync support for RTL8401 with vendor driver") Signed-off-by: Heiner Kallweit Link: https://lore.kernel.org/r/69882f7a-ca2f-e0c7-ae83-c9b6937282cd@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/phy/realtek.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c index fb1db713b7fb73..575580d3ffe041 100644 --- a/drivers/net/phy/realtek.c +++ b/drivers/net/phy/realtek.c @@ -551,6 +551,8 @@ static struct phy_driver realtek_drvs[] = { { PHY_ID_MATCH_EXACT(0x00008201), .name = "RTL8201CP Ethernet", + .read_page = rtl821x_read_page, + .write_page = rtl821x_write_page, }, { PHY_ID_MATCH_EXACT(0x001cc816), .name = "RTL8201F Fast Ethernet", From 909172a149749242990a6e64cb55d55460d4e417 Mon Sep 17 00:00:00 2001 From: Mao Wenan Date: Tue, 10 Nov 2020 08:16:31 +0800 Subject: [PATCH 39/65] net: Update window_clamp if SOCK_RCVBUF is set When net.ipv4.tcp_syncookies=1 and syn flood is happened, cookie_v4_check or cookie_v6_check tries to redo what tcp_v4_send_synack or tcp_v6_send_synack did, rsk_window_clamp will be changed if SOCK_RCVBUF is set, which will make rcv_wscale is different, the client still operates with initial window scale and can overshot granted window, the client use the initial scale but local server use new scale to advertise window value, and session work abnormally. Fixes: e88c64f0a425 ("tcp: allow effective reduction of TCP's rcv-buffer via setsockopt") Signed-off-by: Mao Wenan Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/1604967391-123737-1-git-send-email-wenan.mao@linux.alibaba.com Signed-off-by: Jakub Kicinski --- net/ipv4/syncookies.c | 9 +++++++-- net/ipv6/syncookies.c | 10 ++++++++-- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 6ac473b47f30d4..00dc3f943c80b5 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -331,7 +331,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) __u32 cookie = ntohl(th->ack_seq) - 1; struct sock *ret = sk; struct request_sock *req; - int mss; + int full_space, mss; struct rtable *rt; __u8 rcv_wscale; struct flowi4 fl4; @@ -427,8 +427,13 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) /* Try to redo what tcp_v4_send_synack did. */ req->rsk_window_clamp = tp->window_clamp ? :dst_metric(&rt->dst, RTAX_WINDOW); + /* limit the window selection if the user enforce a smaller rx buffer */ + full_space = tcp_full_space(sk); + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) + req->rsk_window_clamp = full_space; - tcp_select_initial_window(sk, tcp_full_space(sk), req->mss, + tcp_select_initial_window(sk, full_space, req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(&rt->dst, RTAX_INITRWND)); diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c index e796a64be308cc..9b6cae1e49d910 100644 --- a/net/ipv6/syncookies.c +++ b/net/ipv6/syncookies.c @@ -136,7 +136,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) __u32 cookie = ntohl(th->ack_seq) - 1; struct sock *ret = sk; struct request_sock *req; - int mss; + int full_space, mss; struct dst_entry *dst; __u8 rcv_wscale; u32 tsoff = 0; @@ -241,7 +241,13 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) } req->rsk_window_clamp = tp->window_clamp ? :dst_metric(dst, RTAX_WINDOW); - tcp_select_initial_window(sk, tcp_full_space(sk), req->mss, + /* limit the window selection if the user enforce a smaller rx buffer */ + full_space = tcp_full_space(sk); + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->rsk_window_clamp > full_space || req->rsk_window_clamp == 0)) + req->rsk_window_clamp = full_space; + + tcp_select_initial_window(sk, full_space, req->mss, &req->rsk_rcv_wnd, &req->rsk_window_clamp, ireq->wscale_ok, &rcv_wscale, dst_metric(dst, RTAX_INITRWND)); From 2bae900b9419db3f3e43bbda3194657235fee096 Mon Sep 17 00:00:00 2001 From: zhangxiaoxu Date: Mon, 9 Nov 2020 09:44:16 -0500 Subject: [PATCH 40/65] net: dsa: mv88e6xxx: Fix memleak in mv88e6xxx_region_atu_snapshot When mv88e6xxx_fid_map return error, we lost free the table. Fix it. Fixes: bfb255428966 ("net: dsa: mv88e6xxx: Add devlink regions") Reported-by: Hulk Robot Signed-off-by: zhangxiaoxu Reviewed-by: Andrew Lunn Link: https://lore.kernel.org/r/20201109144416.1540867-1-zhangxiaoxu5@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/dsa/mv88e6xxx/devlink.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/mv88e6xxx/devlink.c b/drivers/net/dsa/mv88e6xxx/devlink.c index 10cd1bfd81a0a6..ade04c036fd9d2 100644 --- a/drivers/net/dsa/mv88e6xxx/devlink.c +++ b/drivers/net/dsa/mv88e6xxx/devlink.c @@ -393,8 +393,10 @@ static int mv88e6xxx_region_atu_snapshot(struct devlink *dl, mv88e6xxx_reg_lock(chip); err = mv88e6xxx_fid_map(chip, fid_bitmap); - if (err) + if (err) { + kfree(table); goto out; + } while (1) { fid = find_next_bit(fid_bitmap, MV88E6XXX_N_FID, fid + 1); From 2b52a4b65bc8f14520fe6e996ea7fb3f7e400761 Mon Sep 17 00:00:00 2001 From: Sven Van Asbroeck Date: Mon, 9 Nov 2020 15:38:28 -0500 Subject: [PATCH 41/65] lan743x: fix "BUG: invalid wait context" when setting rx mode In the net core, the struct net_device_ops -> ndo_set_rx_mode() callback is called with the dev->addr_list_lock spinlock held. However, this driver's ndo_set_rx_mode callback eventually calls lan743x_dp_write(), which acquires a mutex. Mutex acquisition may sleep, and this is not allowed when holding a spinlock. Fix by removing the dp_lock mutex entirely. Its purpose is to prevent concurrent accesses to the data port. No concurrent accesses are possible, because the dev->addr_list_lock spinlock in the core only lets through one thread at a time. Fixes: 23f0703c125b ("lan743x: Add main source files for new lan743x driver") Signed-off-by: Sven Van Asbroeck Link: https://lore.kernel.org/r/20201109203828.5115-1-TheSven73@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/lan743x_main.c | 12 +++--------- drivers/net/ethernet/microchip/lan743x_main.h | 3 --- 2 files changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index bd77877cf1ccce..173158656559d7 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -674,14 +674,12 @@ static int lan743x_intr_open(struct lan743x_adapter *adapter) static int lan743x_dp_write(struct lan743x_adapter *adapter, u32 select, u32 addr, u32 length, u32 *buf) { - int ret = -EIO; u32 dp_sel; int i; - mutex_lock(&adapter->dp_lock); if (lan743x_csr_wait_for_bit(adapter, DP_SEL, DP_SEL_DPRDY_, 1, 40, 100, 100)) - goto unlock; + return -EIO; dp_sel = lan743x_csr_read(adapter, DP_SEL); dp_sel &= ~DP_SEL_MASK_; dp_sel |= select; @@ -693,13 +691,10 @@ static int lan743x_dp_write(struct lan743x_adapter *adapter, lan743x_csr_write(adapter, DP_CMD, DP_CMD_WRITE_); if (lan743x_csr_wait_for_bit(adapter, DP_SEL, DP_SEL_DPRDY_, 1, 40, 100, 100)) - goto unlock; + return -EIO; } - ret = 0; -unlock: - mutex_unlock(&adapter->dp_lock); - return ret; + return 0; } static u32 lan743x_mac_mii_access(u16 id, u16 index, int read) @@ -2735,7 +2730,6 @@ static int lan743x_hardware_init(struct lan743x_adapter *adapter, adapter->intr.irq = adapter->pdev->irq; lan743x_csr_write(adapter, INT_EN_CLR, 0xFFFFFFFF); - mutex_init(&adapter->dp_lock); ret = lan743x_gpio_init(adapter); if (ret) diff --git a/drivers/net/ethernet/microchip/lan743x_main.h b/drivers/net/ethernet/microchip/lan743x_main.h index c61a404113179d..a536f4a4994dfa 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.h +++ b/drivers/net/ethernet/microchip/lan743x_main.h @@ -712,9 +712,6 @@ struct lan743x_adapter { struct lan743x_csr csr; struct lan743x_intr intr; - /* lock, used to prevent concurrent access to data port */ - struct mutex dp_lock; - struct lan743x_gpio gpio; struct lan743x_ptp ptp; From 4031eeafa71eaf22ae40a15606a134ae86345daf Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Mon, 9 Nov 2020 08:57:05 +0100 Subject: [PATCH 42/65] net/af_iucv: fix null pointer dereference on shutdown syzbot reported the following KASAN finding: BUG: KASAN: nullptr-dereference in iucv_send_ctrl+0x390/0x3f0 net/iucv/af_iucv.c:385 Read of size 2 at addr 000000000000021e by task syz-executor907/519 CPU: 0 PID: 519 Comm: syz-executor907 Not tainted 5.9.0-syzkaller-07043-gbcf9877ad213 #0 Hardware name: IBM 3906 M04 701 (KVM/Linux) Call Trace: [<00000000c576af60>] unwind_start arch/s390/include/asm/unwind.h:65 [inline] [<00000000c576af60>] show_stack+0x180/0x228 arch/s390/kernel/dumpstack.c:135 [<00000000c9dcd1f8>] __dump_stack lib/dump_stack.c:77 [inline] [<00000000c9dcd1f8>] dump_stack+0x268/0x2f0 lib/dump_stack.c:118 [<00000000c5fed016>] print_address_description.constprop.0+0x5e/0x218 mm/kasan/report.c:383 [<00000000c5fec82a>] __kasan_report mm/kasan/report.c:517 [inline] [<00000000c5fec82a>] kasan_report+0x11a/0x168 mm/kasan/report.c:534 [<00000000c98b5b60>] iucv_send_ctrl+0x390/0x3f0 net/iucv/af_iucv.c:385 [<00000000c98b6262>] iucv_sock_shutdown+0x44a/0x4c0 net/iucv/af_iucv.c:1457 [<00000000c89d3a54>] __sys_shutdown+0x12c/0x1c8 net/socket.c:2204 [<00000000c89d3b70>] __do_sys_shutdown net/socket.c:2212 [inline] [<00000000c89d3b70>] __s390x_sys_shutdown+0x38/0x48 net/socket.c:2210 [<00000000c9e36eac>] system_call+0xe0/0x28c arch/s390/kernel/entry.S:415 There is nothing to shutdown if a connection has never been established. Besides that iucv->hs_dev is not yet initialized if a socket is in IUCV_OPEN state and iucv->path is not yet initialized if socket is in IUCV_BOUND state. So, just skip the shutdown calls for a socket in these states. Fixes: eac3731bd04c ("[S390]: Add AF_IUCV socket support") Fixes: 82492a355fac ("af_iucv: add shutdown for HS transport") Reviewed-by: Vasily Gorbik Signed-off-by: Ursula Braun [jwi: correct one Fixes tag] Signed-off-by: Julian Wiedmann Signed-off-by: Jakub Kicinski --- net/iucv/af_iucv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c index d805720746672d..047238f01ba6d0 100644 --- a/net/iucv/af_iucv.c +++ b/net/iucv/af_iucv.c @@ -1434,7 +1434,8 @@ static int iucv_sock_shutdown(struct socket *sock, int how) break; } - if (how == SEND_SHUTDOWN || how == SHUTDOWN_MASK) { + if ((how == SEND_SHUTDOWN || how == SHUTDOWN_MASK) && + sk->sk_state == IUCV_CONNECTED) { if (iucv->transport == AF_IUCV_TRANS_IUCV) { txmsg.class = 0; txmsg.tag = 0; From 4711497ae85d90de903671989daf5145054c123e Mon Sep 17 00:00:00 2001 From: Ursula Braun Date: Mon, 9 Nov 2020 08:57:06 +0100 Subject: [PATCH 43/65] MAINTAINERS: remove Ursula Braun as s390 network maintainer I am retiring soon. Thus this patch removes myself from the MAINTAINERS file (s390 network). Signed-off-by: Ursula Braun [jwi: fix up the subject] Signed-off-by: Julian Wiedmann Signed-off-by: Jakub Kicinski --- MAINTAINERS | 3 --- 1 file changed, 3 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index cd123d0a6a2dc2..fa11fe773df510 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15246,7 +15246,6 @@ F: drivers/iommu/s390-iommu.c S390 IUCV NETWORK LAYER M: Julian Wiedmann M: Karsten Graul -M: Ursula Braun L: linux-s390@vger.kernel.org S: Supported W: http://www.ibm.com/developerworks/linux/linux390/ @@ -15257,7 +15256,6 @@ F: net/iucv/ S390 NETWORK DRIVERS M: Julian Wiedmann M: Karsten Graul -M: Ursula Braun L: linux-s390@vger.kernel.org S: Supported W: http://www.ibm.com/developerworks/linux/linux390/ @@ -15828,7 +15826,6 @@ S: Maintained F: drivers/misc/sgi-xp/ SHARED MEMORY COMMUNICATIONS (SMC) SOCKETS -M: Ursula Braun M: Karsten Graul L: linux-s390@vger.kernel.org S: Supported From fa6882c63621821f73cc806f291208e1c6ea6187 Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Mon, 9 Nov 2020 22:09:13 +0800 Subject: [PATCH 44/65] tipc: fix memory leak in tipc_topsrv_start() kmemleak report a memory leak as follows: unreferenced object 0xffff88810a596800 (size 512): comm "ip", pid 21558, jiffies 4297568990 (age 112.120s) hex dump (first 32 bytes): 00 00 00 00 ad 4e ad de ff ff ff ff 00 00 00 00 .....N.......... ff ff ff ff ff ff ff ff 00 83 60 b0 ff ff ff ff ..........`..... backtrace: [<0000000022bbe21f>] tipc_topsrv_init_net+0x1f3/0xa70 [<00000000fe15ddf7>] ops_init+0xa8/0x3c0 [<00000000138af6f2>] setup_net+0x2de/0x7e0 [<000000008c6807a3>] copy_net_ns+0x27d/0x530 [<000000006b21adbd>] create_new_namespaces+0x382/0xa30 [<00000000bb169746>] unshare_nsproxy_namespaces+0xa1/0x1d0 [<00000000fe2e42bc>] ksys_unshare+0x39c/0x780 [<0000000009ba3b19>] __x64_sys_unshare+0x2d/0x40 [<00000000614ad866>] do_syscall_64+0x56/0xa0 [<00000000a1b5ca3c>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 'srv' is malloced in tipc_topsrv_start() but not free before leaving from the error handling cases. We need to free it. Fixes: 5c45ab24ac77 ("tipc: make struct tipc_server private for server.c") Reported-by: Hulk Robot Signed-off-by: Wang Hai Link: https://lore.kernel.org/r/20201109140913.47370-1-wanghai38@huawei.com Signed-off-by: Jakub Kicinski --- net/tipc/topsrv.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index 5f6f86051c8301..13f3143609f9ef 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -664,12 +664,18 @@ static int tipc_topsrv_start(struct net *net) ret = tipc_topsrv_work_start(srv); if (ret < 0) - return ret; + goto err_start; ret = tipc_topsrv_create_listener(srv); if (ret < 0) - tipc_topsrv_work_stop(srv); + goto err_create; + return 0; + +err_create: + tipc_topsrv_work_stop(srv); +err_start: + kfree(srv); return ret; } From df392aefe96b9f94efb01ef298b617bab346a9be Mon Sep 17 00:00:00 2001 From: Michael Walle Date: Mon, 9 Nov 2020 12:04:36 +0100 Subject: [PATCH 45/65] arm64: dts: fsl-ls1028a-kontron-sl28: specify in-band mode for ENETC Since commit 71b77a7a27a3 ("enetc: Migrate to PHYLINK and PCS_LYNX") the network port of the Kontron sl28 board is broken. After the migration to phylink the device tree has to specify the in-band-mode property. Add it. Fixes: 71b77a7a27a3 ("enetc: Migrate to PHYLINK and PCS_LYNX") Suggested-by: Vladimir Oltean Signed-off-by: Michael Walle Reviewed-by: Vladimir Oltean Link: https://lore.kernel.org/r/20201109110436.5906-1-michael@walle.cc Signed-off-by: Jakub Kicinski --- arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28.dts | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28.dts b/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28.dts index f46eb47cfa4df0..8161dd23797122 100644 --- a/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28.dts +++ b/arch/arm64/boot/dts/freescale/fsl-ls1028a-kontron-sl28.dts @@ -75,6 +75,7 @@ &enetc_port0 { phy-handle = <&phy0>; phy-connection-type = "sgmii"; + managed = "in-band-status"; status = "okay"; mdio { From 361182308766a265b6c521879b34302617a8c209 Mon Sep 17 00:00:00 2001 From: Martin Schiller Date: Mon, 9 Nov 2020 07:54:49 +0100 Subject: [PATCH 46/65] net/x25: Fix null-ptr-deref in x25_connect This fixes a regression for blocking connects introduced by commit 4becb7ee5b3d ("net/x25: Fix x25_neigh refcnt leak when x25 disconnect"). The x25->neighbour is already set to "NULL" by x25_disconnect() now, while a blocking connect is waiting in x25_wait_for_connection_establishment(). Therefore x25->neighbour must not be accessed here again and x25->state is also already set to X25_STATE_0 by x25_disconnect(). Fixes: 4becb7ee5b3d ("net/x25: Fix x25_neigh refcnt leak when x25 disconnect") Signed-off-by: Martin Schiller Reviewed-by: Xie He Link: https://lore.kernel.org/r/20201109065449.9014-1-ms@dev.tdt.de Signed-off-by: Jakub Kicinski --- net/x25/af_x25.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 0bbb283f23c96f..046d3fee66a901 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -825,7 +825,7 @@ static int x25_connect(struct socket *sock, struct sockaddr *uaddr, sock->state = SS_CONNECTED; rc = 0; out_put_neigh: - if (rc) { + if (rc && x25->neighbour) { read_lock_bh(&x25_list_lock); x25_neigh_put(x25->neighbour); x25->neighbour = NULL; From 9d2e5e9eeb59524a59b461fe256139826d464e1e Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Mon, 9 Nov 2020 16:21:31 +0530 Subject: [PATCH 47/65] cxgb4/ch_ktls: decrypted bit is not enough If skb has retransmit data starting before start marker, e.g. ccs, decrypted bit won't be set for that, and if it has some data to encrypt, then it must be given to crypto ULD. So in place of decrypted, check if socket is tls offloaded. Also, unless skb has some data to encrypt, no need to give it for tls offload handling. v2->v3: - Removed ifdef. Fixes: 5a4b9fe7fece ("cxgb4/chcr: complete record tx handling") Signed-off-by: Rohit Maheshwari Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 1 + drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h | 5 +++++ drivers/net/ethernet/chelsio/cxgb4/sge.c | 3 ++- .../net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 4 ---- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c index a952fe198eb923..7fd264a6d08546 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c @@ -1176,6 +1176,7 @@ static u16 cxgb_select_queue(struct net_device *dev, struct sk_buff *skb, txq = netdev_pick_tx(dev, skb, sb_dev); if (xfrm_offload(skb) || is_ptp_enabled(skb, dev) || skb->encapsulation || + cxgb4_is_ktls_skb(skb) || (proto != IPPROTO_TCP && proto != IPPROTO_UDP)) txq = txq % pi->nqsets; diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h index b169776ab4845a..e2a4941fa802d0 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h @@ -493,6 +493,11 @@ struct cxgb4_uld_info { #endif }; +static inline bool cxgb4_is_ktls_skb(struct sk_buff *skb) +{ + return skb->sk && tls_is_sk_tx_device_offloaded(skb->sk); +} + void cxgb4_uld_enable(struct adapter *adap); void cxgb4_register_uld(enum cxgb4_uld type, const struct cxgb4_uld_info *p); int cxgb4_unregister_uld(enum cxgb4_uld type); diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index a9e9c7ae565dca..01bd9c0dfe4eff 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -1422,7 +1422,8 @@ static netdev_tx_t cxgb4_eth_xmit(struct sk_buff *skb, struct net_device *dev) #endif /* CHELSIO_IPSEC_INLINE */ #if IS_ENABLED(CONFIG_CHELSIO_TLS_DEVICE) - if (skb->decrypted) + if (cxgb4_is_ktls_skb(skb) && + (skb->len - (skb_transport_offset(skb) + tcp_hdrlen(skb)))) return adap->uld[CXGB4_ULD_KTLS].tx_handler(skb, dev); #endif /* CHELSIO_TLS_DEVICE */ diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index 5195f692f14d1f..43c723c72c618f 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -1878,10 +1878,6 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) mss = skb_is_gso(skb) ? skb_shinfo(skb)->gso_size : skb->data_len; - /* check if we haven't set it for ktls offload */ - if (!skb->sk || !tls_is_sk_tx_device_offloaded(skb->sk)) - goto out; - tls_ctx = tls_get_ctx(skb->sk); if (unlikely(tls_ctx->netdev != dev)) goto out; From b1b5cb18032b37ab69b23a461eb8be1a44fcfc3b Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Mon, 9 Nov 2020 16:21:32 +0530 Subject: [PATCH 48/65] ch_ktls: Correction in finding correct length There is a possibility of linear skbs coming in. Correcting the length extraction logic. v2->v3: - Separated un-related changes from this patch. Fixes: 5a4b9fe7fece ("cxgb4/chcr: complete record tx handling") Signed-off-by: Rohit Maheshwari Signed-off-by: Jakub Kicinski --- .../chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index 43c723c72c618f..447aec7ae95485 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -967,7 +967,7 @@ chcr_ktls_write_tcp_options(struct chcr_ktls_info *tx_info, struct sk_buff *skb, /* packet length = eth hdr len + ip hdr len + tcp hdr len * (including options). */ - pktlen = skb->len - skb->data_len; + pktlen = skb_transport_offset(skb) + tcp_hdrlen(skb); ctrl = sizeof(*cpl) + pktlen; len16 = DIV_ROUND_UP(sizeof(*wr) + ctrl, 16); @@ -1860,6 +1860,7 @@ static int chcr_short_record_handler(struct chcr_ktls_info *tx_info, /* nic tls TX handler */ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) { + u32 tls_end_offset, tcp_seq, skb_data_len, skb_offset; struct ch_ktls_port_stats_debug *port_stats; struct chcr_ktls_ofld_ctx_tx *tx_ctx; struct ch_ktls_stats_debug *stats; @@ -1867,7 +1868,6 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) int data_len, qidx, ret = 0, mss; struct tls_record_info *record; struct chcr_ktls_info *tx_info; - u32 tls_end_offset, tcp_seq; struct tls_context *tls_ctx; struct sk_buff *local_skb; struct sge_eth_txq *q; @@ -1875,8 +1875,11 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) unsigned long flags; tcp_seq = ntohl(th->seq); + skb_offset = skb_transport_offset(skb) + tcp_hdrlen(skb); + skb_data_len = skb->len - skb_offset; + data_len = skb_data_len; - mss = skb_is_gso(skb) ? skb_shinfo(skb)->gso_size : skb->data_len; + mss = skb_is_gso(skb) ? skb_shinfo(skb)->gso_size : data_len; tls_ctx = tls_get_ctx(skb->sk); if (unlikely(tls_ctx->netdev != dev)) @@ -1922,8 +1925,6 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) /* copy skb contents into local skb */ chcr_ktls_skb_copy(skb, local_skb); - /* go through the skb and send only one record at a time. */ - data_len = skb->data_len; /* TCP segments can be in received either complete or partial. * chcr_end_part_handler will handle cases if complete record or end * part of the record is received. Incase of partial end part of record, @@ -2020,9 +2021,9 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) } while (data_len > 0); - tx_info->prev_seq = ntohl(th->seq) + skb->data_len; + tx_info->prev_seq = ntohl(th->seq) + skb_data_len; atomic64_inc(&port_stats->ktls_tx_encrypted_packets); - atomic64_add(skb->data_len, &port_stats->ktls_tx_encrypted_bytes); + atomic64_add(skb_data_len, &port_stats->ktls_tx_encrypted_bytes); /* tcp finish is set, send a separate tcp msg including all the options * as well. From 86716b51d14fc2201938939b323ba3ad99186910 Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Mon, 9 Nov 2020 16:21:33 +0530 Subject: [PATCH 49/65] ch_ktls: Update cheksum information Checksum update was missing in the WR. Fixes: 429765a149f1 ("chcr: handle partial end part of a record") Signed-off-by: Rohit Maheshwari Signed-off-by: Jakub Kicinski --- .../chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index 447aec7ae95485..b7a3e757ee72da 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -959,6 +959,7 @@ chcr_ktls_write_tcp_options(struct chcr_ktls_info *tx_info, struct sk_buff *skb, struct iphdr *ip; int credits; u8 buf[150]; + u64 cntrl1; void *pos; iplen = skb_network_header_len(skb); @@ -997,22 +998,28 @@ chcr_ktls_write_tcp_options(struct chcr_ktls_info *tx_info, struct sk_buff *skb, TXPKT_PF_V(tx_info->adap->pf)); cpl->pack = 0; cpl->len = htons(pktlen); - /* checksum offload */ - cpl->ctrl1 = 0; - - pos = cpl + 1; memcpy(buf, skb->data, pktlen); if (tx_info->ip_family == AF_INET) { /* we need to correct ip header len */ ip = (struct iphdr *)(buf + maclen); ip->tot_len = htons(pktlen - maclen); + cntrl1 = TXPKT_CSUM_TYPE_V(TX_CSUM_TCPIP); #if IS_ENABLED(CONFIG_IPV6) } else { ip6 = (struct ipv6hdr *)(buf + maclen); ip6->payload_len = htons(pktlen - maclen - iplen); + cntrl1 = TXPKT_CSUM_TYPE_V(TX_CSUM_TCPIP6); #endif } + + cntrl1 |= T6_TXPKT_ETHHDR_LEN_V(maclen - ETH_HLEN) | + TXPKT_IPHDR_LEN_V(iplen); + /* checksum offload */ + cpl->ctrl1 = cpu_to_be64(cntrl1); + + pos = cpl + 1; + /* now take care of the tcp header, if fin is not set then clear push * bit as well, and if fin is set, it will be sent at the last so we * need to update the tcp sequence number as per the last packet. From 687823d2d104df8226eacba74fda9f4ba3aecd6c Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Mon, 9 Nov 2020 16:21:34 +0530 Subject: [PATCH 50/65] cxgb4/ch_ktls: creating skbs causes panic Creating SKB per tls record and freeing the original one causes panic. There will be race if connection reset is requested. By freeing original skb, refcnt will be decremented and that means, there is no pending record to send, and so tls_dev_del will be requested in control path while SKB of related connection is in queue. Better approach is to use same SKB to send one record (partial data) at a time. We still have to create a new SKB when partial last part of a record is requested. This fix introduces new API cxgb4_write_partial_sgl() to send partial part of skb. Present cxgb4_write_sgl can only provide feasibility to start from an offset which limits to header only and it can write sgls for the whole skb len. But this new API will help in both. It can start from any offset and can end writing in middle of the skb. v4->v5: - Removed extra changes. Fixes: 429765a149f1 ("chcr: handle partial end part of a record") Signed-off-by: Rohit Maheshwari Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/chelsio/cxgb4/cxgb4.h | 3 + drivers/net/ethernet/chelsio/cxgb4/sge.c | 108 +++++++ .../chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 284 +++++++----------- 3 files changed, 226 insertions(+), 169 deletions(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h index 3352dad6ca9993..27308600da1532 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h @@ -2124,6 +2124,9 @@ void cxgb4_inline_tx_skb(const struct sk_buff *skb, const struct sge_txq *q, void cxgb4_write_sgl(const struct sk_buff *skb, struct sge_txq *q, struct ulptx_sgl *sgl, u64 *end, unsigned int start, const dma_addr_t *addr); +void cxgb4_write_partial_sgl(const struct sk_buff *skb, struct sge_txq *q, + struct ulptx_sgl *sgl, u64 *end, + const dma_addr_t *addr, u32 start, u32 send_len); void cxgb4_ring_tx_db(struct adapter *adap, struct sge_txq *q, int n); int t4_set_vlan_acl(struct adapter *adap, unsigned int mbox, unsigned int vf, u16 vlan); diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index 01bd9c0dfe4eff..196652a114c5fb 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -890,6 +890,114 @@ void cxgb4_write_sgl(const struct sk_buff *skb, struct sge_txq *q, } EXPORT_SYMBOL(cxgb4_write_sgl); +/* cxgb4_write_partial_sgl - populate SGL for partial packet + * @skb: the packet + * @q: the Tx queue we are writing into + * @sgl: starting location for writing the SGL + * @end: points right after the end of the SGL + * @addr: the list of bus addresses for the SGL elements + * @start: start offset in the SKB where partial data starts + * @len: length of data from @start to send out + * + * This API will handle sending out partial data of a skb if required. + * Unlike cxgb4_write_sgl, @start can be any offset into the skb data, + * and @len will decide how much data after @start offset to send out. + */ +void cxgb4_write_partial_sgl(const struct sk_buff *skb, struct sge_txq *q, + struct ulptx_sgl *sgl, u64 *end, + const dma_addr_t *addr, u32 start, u32 len) +{ + struct ulptx_sge_pair buf[MAX_SKB_FRAGS / 2 + 1] = {0}, *to; + u32 frag_size, skb_linear_data_len = skb_headlen(skb); + struct skb_shared_info *si = skb_shinfo(skb); + u8 i = 0, frag_idx = 0, nfrags = 0; + skb_frag_t *frag; + + /* Fill the first SGL either from linear data or from partial + * frag based on @start. + */ + if (unlikely(start < skb_linear_data_len)) { + frag_size = min(len, skb_linear_data_len - start); + sgl->len0 = htonl(frag_size); + sgl->addr0 = cpu_to_be64(addr[0] + start); + len -= frag_size; + nfrags++; + } else { + start -= skb_linear_data_len; + frag = &si->frags[frag_idx]; + frag_size = skb_frag_size(frag); + /* find the first frag */ + while (start >= frag_size) { + start -= frag_size; + frag_idx++; + frag = &si->frags[frag_idx]; + frag_size = skb_frag_size(frag); + } + + frag_size = min(len, skb_frag_size(frag) - start); + sgl->len0 = cpu_to_be32(frag_size); + sgl->addr0 = cpu_to_be64(addr[frag_idx + 1] + start); + len -= frag_size; + nfrags++; + frag_idx++; + } + + /* If the entire partial data fit in one SGL, then send it out + * now. + */ + if (!len) + goto done; + + /* Most of the complexity below deals with the possibility we hit the + * end of the queue in the middle of writing the SGL. For this case + * only we create the SGL in a temporary buffer and then copy it. + */ + to = (u8 *)end > (u8 *)q->stat ? buf : sgl->sge; + + /* If the skb couldn't fit in first SGL completely, fill the + * rest of the frags in subsequent SGLs. Note that each SGL + * pair can store 2 frags. + */ + while (len) { + frag_size = min(len, skb_frag_size(&si->frags[frag_idx])); + to->len[i & 1] = cpu_to_be32(frag_size); + to->addr[i & 1] = cpu_to_be64(addr[frag_idx + 1]); + if (i && (i & 1)) + to++; + nfrags++; + frag_idx++; + i++; + len -= frag_size; + } + + /* If we ended in an odd boundary, then set the second SGL's + * length in the pair to 0. + */ + if (i & 1) + to->len[1] = cpu_to_be32(0); + + /* Copy from temporary buffer to Tx ring, in case we hit the + * end of the queue in the middle of writing the SGL. + */ + if (unlikely((u8 *)end > (u8 *)q->stat)) { + u32 part0 = (u8 *)q->stat - (u8 *)sgl->sge, part1; + + if (likely(part0)) + memcpy(sgl->sge, buf, part0); + part1 = (u8 *)end - (u8 *)q->stat; + memcpy(q->desc, (u8 *)buf + part0, part1); + end = (void *)q->desc + part1; + } + + /* 0-pad to multiple of 16 */ + if ((uintptr_t)end & 8) + *end = 0; +done: + sgl->cmd_nsge = htonl(ULPTX_CMD_V(ULP_TX_SC_DSGL) | + ULPTX_NSGE_V(nfrags)); +} +EXPORT_SYMBOL(cxgb4_write_partial_sgl); + /* This function copies 64 byte coalesced work request to * memory mapped BAR2 space. For coalesced WR SGE fetches * data from the FIFO instead of from Host. diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index b7a3e757ee72da..950841988ffeaf 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -14,6 +14,50 @@ static LIST_HEAD(uld_ctx_list); static DEFINE_MUTEX(dev_mutex); +/* chcr_get_nfrags_to_send: get the remaining nfrags after start offset + * @skb: skb + * @start: start offset. + * @len: how much data to send after @start + */ +static int chcr_get_nfrags_to_send(struct sk_buff *skb, u32 start, u32 len) +{ + struct skb_shared_info *si = skb_shinfo(skb); + u32 frag_size, skb_linear_data_len = skb_headlen(skb); + u8 nfrags = 0, frag_idx = 0; + skb_frag_t *frag; + + /* if its a linear skb then return 1 */ + if (!skb_is_nonlinear(skb)) + return 1; + + if (unlikely(start < skb_linear_data_len)) { + frag_size = min(len, skb_linear_data_len - start); + start = 0; + } else { + start -= skb_linear_data_len; + + frag = &si->frags[frag_idx]; + frag_size = skb_frag_size(frag); + while (start >= frag_size) { + start -= frag_size; + frag_idx++; + frag = &si->frags[frag_idx]; + frag_size = skb_frag_size(frag); + } + frag_size = min(len, skb_frag_size(frag) - start); + } + len -= frag_size; + nfrags++; + + while (len) { + frag_size = min(len, skb_frag_size(&si->frags[frag_idx])); + len -= frag_size; + nfrags++; + frag_idx++; + } + return nfrags; +} + static int chcr_init_tcb_fields(struct chcr_ktls_info *tx_info); /* * chcr_ktls_save_keys: calculate and save crypto keys. @@ -865,35 +909,15 @@ static int chcr_ktls_xmit_tcb_cpls(struct chcr_ktls_info *tx_info, return 0; } -/* - * chcr_ktls_skb_copy - * @nskb - new skb where the frags to be added. - * @skb - old skb from which frags will be copied. - */ -static void chcr_ktls_skb_copy(struct sk_buff *skb, struct sk_buff *nskb) -{ - int i; - - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { - skb_shinfo(nskb)->frags[i] = skb_shinfo(skb)->frags[i]; - __skb_frag_ref(&skb_shinfo(nskb)->frags[i]); - } - - skb_shinfo(nskb)->nr_frags = skb_shinfo(skb)->nr_frags; - nskb->len += skb->data_len; - nskb->data_len = skb->data_len; - nskb->truesize += skb->data_len; -} - /* * chcr_ktls_get_tx_flits * returns number of flits to be sent out, it includes key context length, WR * size and skb fragments. */ static unsigned int -chcr_ktls_get_tx_flits(const struct sk_buff *skb, unsigned int key_ctx_len) +chcr_ktls_get_tx_flits(u32 nr_frags, unsigned int key_ctx_len) { - return chcr_sgl_len(skb_shinfo(skb)->nr_frags) + + return chcr_sgl_len(nr_frags) + DIV_ROUND_UP(key_ctx_len + CHCR_KTLS_WR_SIZE, 8); } @@ -1038,71 +1062,6 @@ chcr_ktls_write_tcp_options(struct chcr_ktls_info *tx_info, struct sk_buff *skb, return 0; } -/* chcr_ktls_skb_shift - Shifts request length paged data from skb to another. - * @tgt- buffer into which tail data gets added - * @skb- buffer from which the paged data comes from - * @shiftlen- shift up to this many bytes - */ -static int chcr_ktls_skb_shift(struct sk_buff *tgt, struct sk_buff *skb, - int shiftlen) -{ - skb_frag_t *fragfrom, *fragto; - int from, to, todo; - - WARN_ON(shiftlen > skb->data_len); - - todo = shiftlen; - from = 0; - to = 0; - fragfrom = &skb_shinfo(skb)->frags[from]; - - while ((todo > 0) && (from < skb_shinfo(skb)->nr_frags)) { - fragfrom = &skb_shinfo(skb)->frags[from]; - fragto = &skb_shinfo(tgt)->frags[to]; - - if (todo >= skb_frag_size(fragfrom)) { - *fragto = *fragfrom; - todo -= skb_frag_size(fragfrom); - from++; - to++; - - } else { - __skb_frag_ref(fragfrom); - skb_frag_page_copy(fragto, fragfrom); - skb_frag_off_copy(fragto, fragfrom); - skb_frag_size_set(fragto, todo); - - skb_frag_off_add(fragfrom, todo); - skb_frag_size_sub(fragfrom, todo); - todo = 0; - - to++; - break; - } - } - - /* Ready to "commit" this state change to tgt */ - skb_shinfo(tgt)->nr_frags = to; - - /* Reposition in the original skb */ - to = 0; - while (from < skb_shinfo(skb)->nr_frags) - skb_shinfo(skb)->frags[to++] = skb_shinfo(skb)->frags[from++]; - - skb_shinfo(skb)->nr_frags = to; - - WARN_ON(todo > 0 && !skb_shinfo(skb)->nr_frags); - - skb->len -= shiftlen; - skb->data_len -= shiftlen; - skb->truesize -= shiftlen; - tgt->len += shiftlen; - tgt->data_len += shiftlen; - tgt->truesize += shiftlen; - - return shiftlen; -} - /* * chcr_ktls_xmit_wr_complete: This sends out the complete record. If an skb * received has partial end part of the record, send out the complete record, so @@ -1118,6 +1077,8 @@ static int chcr_ktls_skb_shift(struct sk_buff *tgt, struct sk_buff *skb, static int chcr_ktls_xmit_wr_complete(struct sk_buff *skb, struct chcr_ktls_info *tx_info, struct sge_eth_txq *q, u32 tcp_seq, + bool is_last_wr, u32 data_len, + u32 skb_offset, u32 nfrags, bool tcp_push, u32 mss) { u32 len16, wr_mid = 0, flits = 0, ndesc, cipher_start; @@ -1133,7 +1094,7 @@ static int chcr_ktls_xmit_wr_complete(struct sk_buff *skb, u64 *end; /* get the number of flits required */ - flits = chcr_ktls_get_tx_flits(skb, tx_info->key_ctx_len); + flits = chcr_ktls_get_tx_flits(nfrags, tx_info->key_ctx_len); /* number of descriptors */ ndesc = chcr_flits_to_desc(flits); /* check if enough credits available */ @@ -1162,6 +1123,9 @@ static int chcr_ktls_xmit_wr_complete(struct sk_buff *skb, return NETDEV_TX_BUSY; } + if (!is_last_wr) + skb_get(skb); + pos = &q->q.desc[q->q.pidx]; end = (u64 *)pos + flits; /* FW_ULPTX_WR */ @@ -1194,7 +1158,7 @@ static int chcr_ktls_xmit_wr_complete(struct sk_buff *skb, CPL_TX_SEC_PDU_CPLLEN_V(CHCR_CPL_TX_SEC_PDU_LEN_64BIT) | CPL_TX_SEC_PDU_PLACEHOLDER_V(1) | CPL_TX_SEC_PDU_IVINSRTOFST_V(TLS_HEADER_SIZE + 1)); - cpl->pldlen = htonl(skb->data_len); + cpl->pldlen = htonl(data_len); /* encryption should start after tls header size + iv size */ cipher_start = TLS_HEADER_SIZE + tx_info->iv_size + 1; @@ -1236,7 +1200,7 @@ static int chcr_ktls_xmit_wr_complete(struct sk_buff *skb, /* CPL_TX_DATA */ tx_data = (void *)pos; OPCODE_TID(tx_data) = htonl(MK_OPCODE_TID(CPL_TX_DATA, tx_info->tid)); - tx_data->len = htonl(TX_DATA_MSS_V(mss) | TX_LENGTH_V(skb->data_len)); + tx_data->len = htonl(TX_DATA_MSS_V(mss) | TX_LENGTH_V(data_len)); tx_data->rsvd = htonl(tcp_seq); @@ -1256,8 +1220,8 @@ static int chcr_ktls_xmit_wr_complete(struct sk_buff *skb, } /* send the complete packet except the header */ - cxgb4_write_sgl(skb, &q->q, pos, end, skb->len - skb->data_len, - sgl_sdesc->addr); + cxgb4_write_partial_sgl(skb, &q->q, pos, end, sgl_sdesc->addr, + skb_offset, data_len); sgl_sdesc->skb = skb; chcr_txq_advance(&q->q, ndesc); @@ -1289,10 +1253,11 @@ static int chcr_ktls_xmit_wr_short(struct sk_buff *skb, struct sge_eth_txq *q, u32 tcp_seq, bool tcp_push, u32 mss, u32 tls_rec_offset, u8 *prior_data, - u32 prior_data_len) + u32 prior_data_len, u32 data_len, + u32 skb_offset) { + u32 len16, wr_mid = 0, cipher_start, nfrags; struct adapter *adap = tx_info->adap; - u32 len16, wr_mid = 0, cipher_start; unsigned int flits = 0, ndesc; int credits, left, last_desc; struct tx_sw_desc *sgl_sdesc; @@ -1305,10 +1270,11 @@ static int chcr_ktls_xmit_wr_short(struct sk_buff *skb, void *pos; u64 *end; + nfrags = chcr_get_nfrags_to_send(skb, skb_offset, data_len); /* get the number of flits required, it's a partial record so 2 flits * (AES_BLOCK_SIZE) will be added. */ - flits = chcr_ktls_get_tx_flits(skb, tx_info->key_ctx_len) + 2; + flits = chcr_ktls_get_tx_flits(nfrags, tx_info->key_ctx_len) + 2; /* get the correct 8 byte IV of this record */ iv_record = cpu_to_be64(tx_info->iv + tx_info->record_no); /* If it's a middle record and not 16 byte aligned to run AES CTR, need @@ -1380,7 +1346,7 @@ static int chcr_ktls_xmit_wr_short(struct sk_buff *skb, htonl(CPL_TX_SEC_PDU_OPCODE_V(CPL_TX_SEC_PDU) | CPL_TX_SEC_PDU_CPLLEN_V(CHCR_CPL_TX_SEC_PDU_LEN_64BIT) | CPL_TX_SEC_PDU_IVINSRTOFST_V(1)); - cpl->pldlen = htonl(skb->data_len + AES_BLOCK_LEN + prior_data_len); + cpl->pldlen = htonl(data_len + AES_BLOCK_LEN + prior_data_len); cpl->aadstart_cipherstop_hi = htonl(CPL_TX_SEC_PDU_CIPHERSTART_V(cipher_start)); cpl->cipherstop_lo_authinsert = 0; @@ -1411,7 +1377,7 @@ static int chcr_ktls_xmit_wr_short(struct sk_buff *skb, tx_data = (void *)pos; OPCODE_TID(tx_data) = htonl(MK_OPCODE_TID(CPL_TX_DATA, tx_info->tid)); tx_data->len = htonl(TX_DATA_MSS_V(mss) | - TX_LENGTH_V(skb->data_len + prior_data_len)); + TX_LENGTH_V(data_len + prior_data_len)); tx_data->rsvd = htonl(tcp_seq); tx_data->flags = htonl(TX_BYPASS_F); if (tcp_push) @@ -1444,8 +1410,8 @@ static int chcr_ktls_xmit_wr_short(struct sk_buff *skb, if (prior_data_len) pos = chcr_copy_to_txd(prior_data, &q->q, pos, 16); /* send the complete packet except the header */ - cxgb4_write_sgl(skb, &q->q, pos, end, skb->len - skb->data_len, - sgl_sdesc->addr); + cxgb4_write_partial_sgl(skb, &q->q, pos, end, sgl_sdesc->addr, + skb_offset, data_len); sgl_sdesc->skb = skb; chcr_txq_advance(&q->q, ndesc); @@ -1473,6 +1439,7 @@ static int chcr_ktls_tx_plaintxt(struct chcr_ktls_info *tx_info, struct sk_buff *skb, u32 tcp_seq, u32 mss, bool tcp_push, struct sge_eth_txq *q, u32 port_id, u8 *prior_data, + u32 data_len, u32 skb_offset, u32 prior_data_len) { int credits, left, len16, last_desc; @@ -1482,14 +1449,16 @@ static int chcr_ktls_tx_plaintxt(struct chcr_ktls_info *tx_info, struct ulptx_idata *idata; struct ulp_txpkt *ulptx; struct fw_ulptx_wr *wr; - u32 wr_mid = 0; + u32 wr_mid = 0, nfrags; void *pos; u64 *end; flits = DIV_ROUND_UP(CHCR_PLAIN_TX_DATA_LEN, 8); - flits += chcr_sgl_len(skb_shinfo(skb)->nr_frags); + nfrags = chcr_get_nfrags_to_send(skb, skb_offset, data_len); + flits += chcr_sgl_len(nfrags); if (prior_data_len) flits += 2; + /* WR will need len16 */ len16 = DIV_ROUND_UP(flits, 2); /* check how many descriptors needed */ @@ -1542,7 +1511,7 @@ static int chcr_ktls_tx_plaintxt(struct chcr_ktls_info *tx_info, tx_data = (struct cpl_tx_data *)(idata + 1); OPCODE_TID(tx_data) = htonl(MK_OPCODE_TID(CPL_TX_DATA, tx_info->tid)); tx_data->len = htonl(TX_DATA_MSS_V(mss) | - TX_LENGTH_V(skb->data_len + prior_data_len)); + TX_LENGTH_V(data_len + prior_data_len)); /* set tcp seq number */ tx_data->rsvd = htonl(tcp_seq); tx_data->flags = htonl(TX_BYPASS_F); @@ -1566,8 +1535,8 @@ static int chcr_ktls_tx_plaintxt(struct chcr_ktls_info *tx_info, end = pos + left; } /* send the complete packet including the header */ - cxgb4_write_sgl(skb, &q->q, pos, end, skb->len - skb->data_len, - sgl_sdesc->addr); + cxgb4_write_partial_sgl(skb, &q->q, pos, end, sgl_sdesc->addr, + skb_offset, data_len); sgl_sdesc->skb = skb; chcr_txq_advance(&q->q, ndesc); @@ -1578,9 +1547,11 @@ static int chcr_ktls_tx_plaintxt(struct chcr_ktls_info *tx_info, /* * chcr_ktls_copy_record_in_skb * @nskb - new skb where the frags to be added. + * @skb - old skb, to copy socket and destructor details. * @record - specific record which has complete 16k record in frags. */ static void chcr_ktls_copy_record_in_skb(struct sk_buff *nskb, + struct sk_buff *skb, struct tls_record_info *record) { int i = 0; @@ -1595,6 +1566,9 @@ static void chcr_ktls_copy_record_in_skb(struct sk_buff *nskb, nskb->data_len = record->len; nskb->len += record->len; nskb->truesize += record->len; + nskb->sk = skb->sk; + nskb->destructor = skb->destructor; + refcount_add(nskb->truesize, &nskb->sk->sk_wmem_alloc); } /* @@ -1666,7 +1640,7 @@ static int chcr_end_part_handler(struct chcr_ktls_info *tx_info, struct sk_buff *skb, struct tls_record_info *record, u32 tcp_seq, int mss, bool tcp_push_no_fin, - struct sge_eth_txq *q, + struct sge_eth_txq *q, u32 skb_offset, u32 tls_end_offset, bool last_wr) { struct sk_buff *nskb = NULL; @@ -1675,13 +1649,14 @@ static int chcr_end_part_handler(struct chcr_ktls_info *tx_info, nskb = skb; atomic64_inc(&tx_info->adap->ch_ktls_stats.ktls_tx_complete_pkts); } else { - dev_kfree_skb_any(skb); - - nskb = alloc_skb(0, GFP_KERNEL); - if (!nskb) + nskb = alloc_skb(0, GFP_ATOMIC); + if (!nskb) { + dev_kfree_skb_any(skb); return NETDEV_TX_BUSY; + } + /* copy complete record in skb */ - chcr_ktls_copy_record_in_skb(nskb, record); + chcr_ktls_copy_record_in_skb(nskb, skb, record); /* packet is being sent from the beginning, update the tcp_seq * accordingly. */ @@ -1691,10 +1666,20 @@ static int chcr_end_part_handler(struct chcr_ktls_info *tx_info, */ if (chcr_ktls_update_snd_una(tx_info, q)) goto out; + /* reset skb offset */ + skb_offset = 0; + + if (last_wr) + dev_kfree_skb_any(skb); + + last_wr = true; + atomic64_inc(&tx_info->adap->ch_ktls_stats.ktls_tx_end_pkts); } if (chcr_ktls_xmit_wr_complete(nskb, tx_info, q, tcp_seq, + last_wr, record->len, skb_offset, + record->num_frags, (last_wr && tcp_push_no_fin), mss)) { goto out; @@ -1730,41 +1715,32 @@ static int chcr_short_record_handler(struct chcr_ktls_info *tx_info, struct sk_buff *skb, struct tls_record_info *record, u32 tcp_seq, int mss, bool tcp_push_no_fin, + u32 data_len, u32 skb_offset, struct sge_eth_txq *q, u32 tls_end_offset) { u32 tls_rec_offset = tcp_seq - tls_record_start_seq(record); u8 prior_data[16] = {0}; u32 prior_data_len = 0; - u32 data_len; /* check if the skb is ending in middle of tag/HASH, its a big * trouble, send the packet before the HASH. */ - int remaining_record = tls_end_offset - skb->data_len; + int remaining_record = tls_end_offset - data_len; if (remaining_record > 0 && remaining_record < TLS_CIPHER_AES_GCM_128_TAG_SIZE) { - int trimmed_len = skb->data_len - + int trimmed_len = data_len - (TLS_CIPHER_AES_GCM_128_TAG_SIZE - remaining_record); - struct sk_buff *tmp_skb = NULL; /* don't process the pkt if it is only a partial tag */ - if (skb->data_len < TLS_CIPHER_AES_GCM_128_TAG_SIZE) + if (data_len < TLS_CIPHER_AES_GCM_128_TAG_SIZE) goto out; - WARN_ON(trimmed_len > skb->data_len); - - /* shift to those many bytes */ - tmp_skb = alloc_skb(0, GFP_KERNEL); - if (unlikely(!tmp_skb)) - goto out; + WARN_ON(trimmed_len > data_len); - chcr_ktls_skb_shift(tmp_skb, skb, trimmed_len); - /* free the last trimmed portion */ - dev_kfree_skb_any(skb); - skb = tmp_skb; + data_len = trimmed_len; atomic64_inc(&tx_info->adap->ch_ktls_stats.ktls_tx_trimmed_pkts); } - data_len = skb->data_len; + /* check if the middle record's start point is 16 byte aligned. CTR * needs 16 byte aligned start point to start encryption. */ @@ -1825,9 +1801,6 @@ static int chcr_short_record_handler(struct chcr_ktls_info *tx_info, } /* reset tcp_seq as per the prior_data_required len */ tcp_seq -= prior_data_len; - /* include prio_data_len for further calculation. - */ - data_len += prior_data_len; } /* reset snd una, so the middle record won't send the already * sent part. @@ -1844,6 +1817,7 @@ static int chcr_short_record_handler(struct chcr_ktls_info *tx_info, tcp_push_no_fin, q, tx_info->port_id, prior_data, + data_len, skb_offset, prior_data_len)) { goto out; } @@ -1854,7 +1828,7 @@ static int chcr_short_record_handler(struct chcr_ktls_info *tx_info, if (chcr_ktls_xmit_wr_short(skb, tx_info, q, tcp_seq, tcp_push_no_fin, mss, tls_rec_offset, prior_data, - prior_data_len)) { + prior_data_len, data_len, skb_offset)) { goto out; } @@ -1876,7 +1850,6 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) struct tls_record_info *record; struct chcr_ktls_info *tx_info; struct tls_context *tls_ctx; - struct sk_buff *local_skb; struct sge_eth_txq *q; struct adapter *adap; unsigned long flags; @@ -1898,14 +1871,6 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) if (unlikely(!tx_info)) goto out; - /* don't touch the original skb, make a new skb to extract each records - * and send them separately. - */ - local_skb = alloc_skb(0, GFP_KERNEL); - - if (unlikely(!local_skb)) - return NETDEV_TX_BUSY; - adap = tx_info->adap; stats = &adap->ch_ktls_stats; port_stats = &stats->ktls_port[tx_info->port_id]; @@ -1925,13 +1890,9 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) ntohl(th->ack_seq), ntohs(th->window)); if (ret) { - dev_kfree_skb_any(local_skb); return NETDEV_TX_BUSY; } - /* copy skb contents into local skb */ - chcr_ktls_skb_copy(skb, local_skb); - /* TCP segments can be in received either complete or partial. * chcr_end_part_handler will handle cases if complete record or end * part of the record is received. Incase of partial end part of record, @@ -1961,7 +1922,6 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) atomic64_inc(&port_stats->ktls_tx_skip_no_sync_data); goto out; } - /* increase page reference count of the record, so that there * won't be any chance of page free in middle if in case stack * receives ACK and try to delete the record. @@ -1977,44 +1937,28 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) tcp_seq, record->end_seq, tx_info->prev_seq, data_len); /* if a tls record is finishing in this SKB */ if (tls_end_offset <= data_len) { - struct sk_buff *nskb = NULL; - - if (tls_end_offset < data_len) { - nskb = alloc_skb(0, GFP_KERNEL); - if (unlikely(!nskb)) { - ret = -ENOMEM; - goto clear_ref; - } - - chcr_ktls_skb_shift(nskb, local_skb, - tls_end_offset); - } else { - /* its the only record in this skb, directly - * point it. - */ - nskb = local_skb; - } - ret = chcr_end_part_handler(tx_info, nskb, record, + ret = chcr_end_part_handler(tx_info, skb, record, tcp_seq, mss, (!th->fin && th->psh), q, + skb_offset, tls_end_offset, - (nskb == local_skb)); - - if (ret && nskb != local_skb) - dev_kfree_skb_any(local_skb); + skb_offset + + tls_end_offset == skb->len); data_len -= tls_end_offset; /* tcp_seq increment is required to handle next record. */ tcp_seq += tls_end_offset; + skb_offset += tls_end_offset; } else { - ret = chcr_short_record_handler(tx_info, local_skb, + ret = chcr_short_record_handler(tx_info, skb, record, tcp_seq, mss, (!th->fin && th->psh), + data_len, skb_offset, q, tls_end_offset); data_len = 0; } -clear_ref: + /* clear the frag ref count which increased locally before */ for (i = 0; i < record->num_frags; i++) { /* clear the frag ref count */ @@ -2022,7 +1966,8 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) } /* if any failure, come out from the loop. */ if (ret) - goto out; + return NETDEV_TX_OK; + /* length should never be less than 0 */ WARN_ON(data_len < 0); @@ -2038,6 +1983,7 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) if (th->fin) chcr_ktls_write_tcp_options(tx_info, skb, q, tx_info->tx_chan); + return NETDEV_TX_OK; out: dev_kfree_skb_any(skb); return NETDEV_TX_OK; From c68a28a9e2798a4602dde1c77046a3b577eb31f4 Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Mon, 9 Nov 2020 16:21:35 +0530 Subject: [PATCH 51/65] ch_ktls: Correction in trimmed_len calculation trimmed length calculation goes wrong if skb has only tag part to send. It should be zero if there is no data bytes apart from TAG. Fixes: dc05f3df8fac ("chcr: Handle first or middle part of record") Signed-off-by: Rohit Maheshwari Signed-off-by: Jakub Kicinski --- .../chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index 950841988ffeaf..4286decce0957e 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -1729,10 +1729,13 @@ static int chcr_short_record_handler(struct chcr_ktls_info *tx_info, if (remaining_record > 0 && remaining_record < TLS_CIPHER_AES_GCM_128_TAG_SIZE) { - int trimmed_len = data_len - - (TLS_CIPHER_AES_GCM_128_TAG_SIZE - remaining_record); - /* don't process the pkt if it is only a partial tag */ - if (data_len < TLS_CIPHER_AES_GCM_128_TAG_SIZE) + int trimmed_len = 0; + + if (tls_end_offset > TLS_CIPHER_AES_GCM_128_TAG_SIZE) + trimmed_len = data_len - + (TLS_CIPHER_AES_GCM_128_TAG_SIZE - + remaining_record); + if (!trimmed_len) goto out; WARN_ON(trimmed_len > data_len); From 83deb094dd5c636a790da3914008570c9fd1693f Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Mon, 9 Nov 2020 16:21:36 +0530 Subject: [PATCH 52/65] ch_ktls: missing handling of header alone If an skb has only header part which doesn't start from beginning, is not being handled properly. Fixes: dc05f3df8fac ("chcr: Handle first or middle part of record") Signed-off-by: Rohit Maheshwari Signed-off-by: Jakub Kicinski --- .../chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 25 ++++++++----------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index 4286decce0957e..8a54fce9bfaec0 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -1744,6 +1744,17 @@ static int chcr_short_record_handler(struct chcr_ktls_info *tx_info, atomic64_inc(&tx_info->adap->ch_ktls_stats.ktls_tx_trimmed_pkts); } + /* check if it is only the header part. */ + if (tls_rec_offset + data_len <= (TLS_HEADER_SIZE + tx_info->iv_size)) { + if (chcr_ktls_tx_plaintxt(tx_info, skb, tcp_seq, mss, + tcp_push_no_fin, q, + tx_info->port_id, prior_data, + data_len, skb_offset, prior_data_len)) + goto out; + + return 0; + } + /* check if the middle record's start point is 16 byte aligned. CTR * needs 16 byte aligned start point to start encryption. */ @@ -1812,20 +1823,6 @@ static int chcr_short_record_handler(struct chcr_ktls_info *tx_info, goto out; atomic64_inc(&tx_info->adap->ch_ktls_stats.ktls_tx_middle_pkts); } else { - /* Else means, its a partial first part of the record. Check if - * its only the header, don't need to send for encryption then. - */ - if (data_len <= TLS_HEADER_SIZE + tx_info->iv_size) { - if (chcr_ktls_tx_plaintxt(tx_info, skb, tcp_seq, mss, - tcp_push_no_fin, q, - tx_info->port_id, - prior_data, - data_len, skb_offset, - prior_data_len)) { - goto out; - } - return 0; - } atomic64_inc(&tx_info->adap->ch_ktls_stats.ktls_tx_start_pkts); } From 63ee4591fa2f97dc08ce37514f214fc0430e9dc3 Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Mon, 9 Nov 2020 16:21:37 +0530 Subject: [PATCH 53/65] ch_ktls: Correction in middle record handling If a record starts in middle, reset TCB UNA so that we could avoid sending out extra packet which is needed to make it 16 byte aligned to start AES CTR. Check also considers prev_seq, which should be what is actually sent, not the skb data length. Avoid updating partial TAG to HW at any point of time, that's why we need to check if remaining part is smaller than TAG size, then reset TX_MAX to be TAG starting sequence number. Fixes: 5a4b9fe7fece ("cxgb4/chcr: complete record tx handling") Signed-off-by: Rohit Maheshwari Signed-off-by: Jakub Kicinski --- .../chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 50 ++++++++++++------- 1 file changed, 31 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index 8a54fce9bfaec0..026c66599d1e66 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -827,7 +827,7 @@ static void *chcr_write_cpl_set_tcb_ulp(struct chcr_ktls_info *tx_info, */ static int chcr_ktls_xmit_tcb_cpls(struct chcr_ktls_info *tx_info, struct sge_eth_txq *q, u64 tcp_seq, - u64 tcp_ack, u64 tcp_win) + u64 tcp_ack, u64 tcp_win, bool offset) { bool first_wr = ((tx_info->prev_ack == 0) && (tx_info->prev_win == 0)); struct ch_ktls_port_stats_debug *port_stats; @@ -862,7 +862,7 @@ static int chcr_ktls_xmit_tcb_cpls(struct chcr_ktls_info *tx_info, cpl++; } /* reset snd una if it's a re-transmit pkt */ - if (tcp_seq != tx_info->prev_seq) { + if (tcp_seq != tx_info->prev_seq || offset) { /* reset snd_una */ port_stats = &tx_info->adap->ch_ktls_stats.ktls_port[tx_info->port_id]; @@ -871,7 +871,8 @@ static int chcr_ktls_xmit_tcb_cpls(struct chcr_ktls_info *tx_info, TCB_SND_UNA_RAW_V (TCB_SND_UNA_RAW_M), TCB_SND_UNA_RAW_V(0), 0); - atomic64_inc(&port_stats->ktls_tx_ooo); + if (tcp_seq != tx_info->prev_seq) + atomic64_inc(&port_stats->ktls_tx_ooo); cpl++; } /* update ack */ @@ -1661,11 +1662,6 @@ static int chcr_end_part_handler(struct chcr_ktls_info *tx_info, * accordingly. */ tcp_seq = tls_record_start_seq(record); - /* reset snd una, so the middle record won't send the already - * sent part. - */ - if (chcr_ktls_update_snd_una(tx_info, q)) - goto out; /* reset skb offset */ skb_offset = 0; @@ -1684,6 +1680,7 @@ static int chcr_end_part_handler(struct chcr_ktls_info *tx_info, mss)) { goto out; } + tx_info->prev_seq = record->end_seq; return 0; out: dev_kfree_skb_any(nskb); @@ -1752,6 +1749,7 @@ static int chcr_short_record_handler(struct chcr_ktls_info *tx_info, data_len, skb_offset, prior_data_len)) goto out; + tx_info->prev_seq = tcp_seq + data_len; return 0; } @@ -1832,6 +1830,7 @@ static int chcr_short_record_handler(struct chcr_ktls_info *tx_info, goto out; } + tx_info->prev_seq = tcp_seq + data_len + prior_data_len; return 0; out: dev_kfree_skb_any(skb); @@ -1885,13 +1884,6 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) if (ret) return NETDEV_TX_BUSY; } - /* update tcb */ - ret = chcr_ktls_xmit_tcb_cpls(tx_info, q, ntohl(th->seq), - ntohl(th->ack_seq), - ntohs(th->window)); - if (ret) { - return NETDEV_TX_BUSY; - } /* TCP segments can be in received either complete or partial. * chcr_end_part_handler will handle cases if complete record or end @@ -1922,6 +1914,30 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) atomic64_inc(&port_stats->ktls_tx_skip_no_sync_data); goto out; } + tls_end_offset = record->end_seq - tcp_seq; + + pr_debug("seq 0x%x, end_seq 0x%x prev_seq 0x%x, datalen 0x%x\n", + tcp_seq, record->end_seq, tx_info->prev_seq, data_len); + /* update tcb for the skb */ + if (skb_data_len == data_len) { + u32 tx_max = tcp_seq; + + if (!tls_record_is_start_marker(record) && + tls_end_offset < TLS_CIPHER_AES_GCM_128_TAG_SIZE) + tx_max = record->end_seq - + TLS_CIPHER_AES_GCM_128_TAG_SIZE; + + ret = chcr_ktls_xmit_tcb_cpls(tx_info, q, tx_max, + ntohl(th->ack_seq), + ntohs(th->window), + tls_end_offset != + record->len); + if (ret) { + spin_unlock_irqrestore(&tx_ctx->base.lock, + flags); + goto out; + } + } /* increase page reference count of the record, so that there * won't be any chance of page free in middle if in case stack * receives ACK and try to delete the record. @@ -1931,10 +1947,7 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) /* lock cleared */ spin_unlock_irqrestore(&tx_ctx->base.lock, flags); - tls_end_offset = record->end_seq - tcp_seq; - pr_debug("seq 0x%x, end_seq 0x%x prev_seq 0x%x, datalen 0x%x\n", - tcp_seq, record->end_seq, tx_info->prev_seq, data_len); /* if a tls record is finishing in this SKB */ if (tls_end_offset <= data_len) { ret = chcr_end_part_handler(tx_info, skb, record, @@ -1973,7 +1986,6 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) } while (data_len > 0); - tx_info->prev_seq = ntohl(th->seq) + skb_data_len; atomic64_inc(&port_stats->ktls_tx_encrypted_packets); atomic64_add(skb_data_len, &port_stats->ktls_tx_encrypted_bytes); From 9478e083941c873d60a97b232760a14dec6c69d3 Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Mon, 9 Nov 2020 16:21:38 +0530 Subject: [PATCH 54/65] ch_ktls: packet handling prior to start marker There could be a case where ACK for tls exchanges prior to start marker is missed out, and by the time tls is offloaded. This pkt should not be discarded and handled carefully. It could be plaintext alone or plaintext + finish as well. Fixes: 5a4b9fe7fece ("cxgb4/chcr: complete record tx handling") Signed-off-by: Rohit Maheshwari Signed-off-by: Jakub Kicinski --- .../chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 38 ++++++++++++++++--- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index 026c66599d1e66..bbda71b7f98bb2 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -1909,11 +1909,6 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) goto out; } - if (unlikely(tls_record_is_start_marker(record))) { - spin_unlock_irqrestore(&tx_ctx->base.lock, flags); - atomic64_inc(&port_stats->ktls_tx_skip_no_sync_data); - goto out; - } tls_end_offset = record->end_seq - tcp_seq; pr_debug("seq 0x%x, end_seq 0x%x prev_seq 0x%x, datalen 0x%x\n", @@ -1938,6 +1933,39 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) goto out; } } + + if (unlikely(tls_record_is_start_marker(record))) { + atomic64_inc(&port_stats->ktls_tx_skip_no_sync_data); + /* If tls_end_offset < data_len, means there is some + * data after start marker, which needs encryption, send + * plaintext first and take skb refcount. else send out + * complete pkt as plaintext. + */ + if (tls_end_offset < data_len) + skb_get(skb); + else + tls_end_offset = data_len; + + ret = chcr_ktls_tx_plaintxt(tx_info, skb, tcp_seq, mss, + (!th->fin && th->psh), q, + tx_info->port_id, NULL, + tls_end_offset, skb_offset, + 0); + + spin_unlock_irqrestore(&tx_ctx->base.lock, flags); + if (ret) { + /* free the refcount taken earlier */ + if (tls_end_offset < data_len) + dev_kfree_skb_any(skb); + goto out; + } + + data_len -= tls_end_offset; + tcp_seq = record->end_seq; + skb_offset += tls_end_offset; + continue; + } + /* increase page reference count of the record, so that there * won't be any chance of page free in middle if in case stack * receives ACK and try to delete the record. From 659bf0383d15b07e492e27443d87736b24171558 Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Mon, 9 Nov 2020 16:21:39 +0530 Subject: [PATCH 55/65] ch_ktls: don't free skb before sending FIN If its a last packet and fin is set. Make sure FIN is informed to HW before skb gets freed. Fixes: 429765a149f1 ("chcr: handle partial end part of a record") Signed-off-by: Rohit Maheshwari Signed-off-by: Jakub Kicinski --- .../chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index bbda71b7f98bb2..a8062e038ebce6 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -1932,6 +1932,9 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) flags); goto out; } + + if (th->fin) + skb_get(skb); } if (unlikely(tls_record_is_start_marker(record))) { @@ -2006,8 +2009,11 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) __skb_frag_unref(&record->frags[i]); } /* if any failure, come out from the loop. */ - if (ret) + if (ret) { + if (th->fin) + dev_kfree_skb_any(skb); return NETDEV_TX_OK; + } /* length should never be less than 0 */ WARN_ON(data_len < 0); @@ -2020,8 +2026,10 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) /* tcp finish is set, send a separate tcp msg including all the options * as well. */ - if (th->fin) + if (th->fin) { chcr_ktls_write_tcp_options(tx_info, skb, q, tx_info->tx_chan); + dev_kfree_skb_any(skb); + } return NETDEV_TX_OK; out: From 21f82acbb8b4e8812521d405479b6fc3790078de Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Mon, 9 Nov 2020 16:21:40 +0530 Subject: [PATCH 56/65] ch_ktls/cxgb4: handle partial tag alone SKBs If TCP congestion caused a very small packets which only has some part fo the TAG, and that too is not till the end. HW can't handle such case, so falling back to sw crypto in such cases. v1->v2: - Marked chcr_ktls_sw_fallback() static. Fixes: dc05f3df8fac ("chcr: Handle first or middle part of record") Signed-off-by: Rohit Maheshwari Signed-off-by: Jakub Kicinski --- .../ethernet/chelsio/cxgb4/cxgb4_debugfs.c | 2 + .../net/ethernet/chelsio/cxgb4/cxgb4_uld.h | 1 + .../chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 116 +++++++++++++++++- .../chelsio/inline_crypto/ch_ktls/chcr_ktls.h | 1 + 4 files changed, 119 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c index 0273f40b85f766..17410fe8662679 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c @@ -3573,6 +3573,8 @@ static int chcr_stats_show(struct seq_file *seq, void *v) atomic64_read(&adap->ch_ktls_stats.ktls_tx_complete_pkts)); seq_printf(seq, "TX trim pkts : %20llu\n", atomic64_read(&adap->ch_ktls_stats.ktls_tx_trimmed_pkts)); + seq_printf(seq, "TX sw fallback : %20llu\n", + atomic64_read(&adap->ch_ktls_stats.ktls_tx_fallback)); while (i < MAX_NPORTS) { ktls_port = &adap->ch_ktls_stats.ktls_port[i]; seq_printf(seq, "Port %d\n", i); diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h index e2a4941fa802d0..1b49f2fa9b1852 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h +++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h @@ -388,6 +388,7 @@ struct ch_ktls_stats_debug { atomic64_t ktls_tx_retransmit_pkts; atomic64_t ktls_tx_complete_pkts; atomic64_t ktls_tx_trimmed_pkts; + atomic64_t ktls_tx_fallback; }; #endif diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index a8062e038ebce6..b182c940b4a099 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -1545,6 +1545,88 @@ static int chcr_ktls_tx_plaintxt(struct chcr_ktls_info *tx_info, return 0; } +static int chcr_ktls_tunnel_pkt(struct chcr_ktls_info *tx_info, + struct sk_buff *skb, + struct sge_eth_txq *q) +{ + u32 ctrl, iplen, maclen, wr_mid = 0, len16; + struct tx_sw_desc *sgl_sdesc; + struct fw_eth_tx_pkt_wr *wr; + struct cpl_tx_pkt_core *cpl; + unsigned int flits, ndesc; + int credits, last_desc; + u64 cntrl1, *end; + void *pos; + + ctrl = sizeof(*cpl); + flits = DIV_ROUND_UP(sizeof(*wr) + ctrl, 8); + + flits += chcr_sgl_len(skb_shinfo(skb)->nr_frags + 1); + len16 = DIV_ROUND_UP(flits, 2); + /* check how many descriptors needed */ + ndesc = DIV_ROUND_UP(flits, 8); + + credits = chcr_txq_avail(&q->q) - ndesc; + if (unlikely(credits < 0)) { + chcr_eth_txq_stop(q); + return -ENOMEM; + } + + if (unlikely(credits < ETHTXQ_STOP_THRES)) { + chcr_eth_txq_stop(q); + wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F; + } + + last_desc = q->q.pidx + ndesc - 1; + if (last_desc >= q->q.size) + last_desc -= q->q.size; + sgl_sdesc = &q->q.sdesc[last_desc]; + + if (unlikely(cxgb4_map_skb(tx_info->adap->pdev_dev, skb, + sgl_sdesc->addr) < 0)) { + memset(sgl_sdesc->addr, 0, sizeof(sgl_sdesc->addr)); + q->mapping_err++; + return -ENOMEM; + } + + iplen = skb_network_header_len(skb); + maclen = skb_mac_header_len(skb); + + pos = &q->q.desc[q->q.pidx]; + end = (u64 *)pos + flits; + wr = pos; + + /* Firmware work request header */ + wr->op_immdlen = htonl(FW_WR_OP_V(FW_ETH_TX_PKT_WR) | + FW_WR_IMMDLEN_V(ctrl)); + + wr->equiq_to_len16 = htonl(wr_mid | FW_WR_LEN16_V(len16)); + wr->r3 = 0; + + cpl = (void *)(wr + 1); + + /* CPL header */ + cpl->ctrl0 = htonl(TXPKT_OPCODE_V(CPL_TX_PKT) | + TXPKT_INTF_V(tx_info->tx_chan) | + TXPKT_PF_V(tx_info->adap->pf)); + cpl->pack = 0; + cntrl1 = TXPKT_CSUM_TYPE_V(tx_info->ip_family == AF_INET ? + TX_CSUM_TCPIP : TX_CSUM_TCPIP6); + cntrl1 |= T6_TXPKT_ETHHDR_LEN_V(maclen - ETH_HLEN) | + TXPKT_IPHDR_LEN_V(iplen); + /* checksum offload */ + cpl->ctrl1 = cpu_to_be64(cntrl1); + cpl->len = htons(skb->len); + + pos = cpl + 1; + + cxgb4_write_sgl(skb, &q->q, pos, end, 0, sgl_sdesc->addr); + sgl_sdesc->skb = skb; + chcr_txq_advance(&q->q, ndesc); + cxgb4_ring_tx_db(tx_info->adap, &q->q, ndesc); + return 0; +} + /* * chcr_ktls_copy_record_in_skb * @nskb - new skb where the frags to be added. @@ -1733,7 +1815,7 @@ static int chcr_short_record_handler(struct chcr_ktls_info *tx_info, (TLS_CIPHER_AES_GCM_128_TAG_SIZE - remaining_record); if (!trimmed_len) - goto out; + return FALLBACK; WARN_ON(trimmed_len > data_len); @@ -1837,6 +1919,34 @@ static int chcr_short_record_handler(struct chcr_ktls_info *tx_info, return NETDEV_TX_BUSY; } +static int chcr_ktls_sw_fallback(struct sk_buff *skb, + struct chcr_ktls_info *tx_info, + struct sge_eth_txq *q) +{ + u32 data_len, skb_offset; + struct sk_buff *nskb; + struct tcphdr *th; + + nskb = tls_encrypt_skb(skb); + + if (!nskb) + return 0; + + th = tcp_hdr(nskb); + skb_offset = skb_transport_offset(nskb) + tcp_hdrlen(nskb); + data_len = nskb->len - skb_offset; + skb_tx_timestamp(nskb); + + if (chcr_ktls_tunnel_pkt(tx_info, nskb, q)) + goto out; + + tx_info->prev_seq = ntohl(th->seq) + data_len; + atomic64_inc(&tx_info->adap->ch_ktls_stats.ktls_tx_fallback); + return 0; +out: + dev_kfree_skb_any(nskb); + return 0; +} /* nic tls TX handler */ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -2012,6 +2122,10 @@ static int chcr_ktls_xmit(struct sk_buff *skb, struct net_device *dev) if (ret) { if (th->fin) dev_kfree_skb_any(skb); + + if (ret == FALLBACK) + return chcr_ktls_sw_fallback(skb, tx_info, q); + return NETDEV_TX_OK; } diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.h b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.h index c1651b1431a031..18b3b1f024156c 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.h +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.h @@ -26,6 +26,7 @@ #define CHCR_KTLS_WR_SIZE (CHCR_PLAIN_TX_DATA_LEN +\ sizeof(struct cpl_tx_sec_pdu)) +#define FALLBACK 35 enum ch_ktls_open_state { CH_KTLS_OPEN_SUCCESS = 0, From 7d01c428c86b525dc780226924d74df2048cf411 Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Mon, 9 Nov 2020 16:21:41 +0530 Subject: [PATCH 57/65] ch_ktls: tcb update fails sometimes context id and port id should be filled while sending tcb update. Fixes: 5a4b9fe7fece ("cxgb4/chcr: complete record tx handling") Signed-off-by: Rohit Maheshwari Signed-off-by: Jakub Kicinski --- .../chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index b182c940b4a099..a732051b21e416 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -733,7 +733,8 @@ static int chcr_ktls_cpl_set_tcb_rpl(struct adapter *adap, unsigned char *input) } static void *__chcr_write_cpl_set_tcb_ulp(struct chcr_ktls_info *tx_info, - u32 tid, void *pos, u16 word, u64 mask, + u32 tid, void *pos, u16 word, + struct sge_eth_txq *q, u64 mask, u64 val, u32 reply) { struct cpl_set_tcb_field_core *cpl; @@ -742,7 +743,10 @@ static void *__chcr_write_cpl_set_tcb_ulp(struct chcr_ktls_info *tx_info, /* ULP_TXPKT */ txpkt = pos; - txpkt->cmd_dest = htonl(ULPTX_CMD_V(ULP_TX_PKT) | ULP_TXPKT_DEST_V(0)); + txpkt->cmd_dest = htonl(ULPTX_CMD_V(ULP_TX_PKT) | + ULP_TXPKT_CHANNELID_V(tx_info->port_id) | + ULP_TXPKT_FID_V(q->q.cntxt_id) | + ULP_TXPKT_RO_F); txpkt->len = htonl(DIV_ROUND_UP(CHCR_SET_TCB_FIELD_LEN, 16)); /* ULPTX_IDATA sub-command */ @@ -797,7 +801,7 @@ static void *chcr_write_cpl_set_tcb_ulp(struct chcr_ktls_info *tx_info, } else { u8 buf[48] = {0}; - __chcr_write_cpl_set_tcb_ulp(tx_info, tid, buf, word, + __chcr_write_cpl_set_tcb_ulp(tx_info, tid, buf, word, q, mask, val, reply); return chcr_copy_to_txd(buf, &q->q, pos, @@ -805,7 +809,7 @@ static void *chcr_write_cpl_set_tcb_ulp(struct chcr_ktls_info *tx_info, } } - pos = __chcr_write_cpl_set_tcb_ulp(tx_info, tid, pos, word, + pos = __chcr_write_cpl_set_tcb_ulp(tx_info, tid, pos, word, q, mask, val, reply); /* check again if we are at the end of the queue */ From 83a95df04bee77c74df5151c961b19d870a70180 Mon Sep 17 00:00:00 2001 From: Rohit Maheshwari Date: Mon, 9 Nov 2020 16:21:42 +0530 Subject: [PATCH 58/65] ch_ktls: stop the txq if reaches threshold Stop the queue and ask for the credits if queue reaches to threashold. Fixes: 5a4b9fe7fece ("cxgb4/chcr: complete record tx handling") Signed-off-by: Rohit Maheshwari Signed-off-by: Jakub Kicinski --- .../chelsio/inline_crypto/ch_ktls/chcr_ktls.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c index a732051b21e416..c24485c0d512aa 100644 --- a/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c +++ b/drivers/net/ethernet/chelsio/inline_crypto/ch_ktls/chcr_ktls.c @@ -835,7 +835,7 @@ static int chcr_ktls_xmit_tcb_cpls(struct chcr_ktls_info *tx_info, { bool first_wr = ((tx_info->prev_ack == 0) && (tx_info->prev_win == 0)); struct ch_ktls_port_stats_debug *port_stats; - u32 len, cpl = 0, ndesc, wr_len; + u32 len, cpl = 0, ndesc, wr_len, wr_mid = 0; struct fw_ulptx_wr *wr; int credits; void *pos; @@ -851,6 +851,11 @@ static int chcr_ktls_xmit_tcb_cpls(struct chcr_ktls_info *tx_info, return NETDEV_TX_BUSY; } + if (unlikely(credits < ETHTXQ_STOP_THRES)) { + chcr_eth_txq_stop(q); + wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F; + } + pos = &q->q.desc[q->q.pidx]; /* make space for WR, we'll fill it later when we know all the cpls * being sent out and have complete length. @@ -905,7 +910,8 @@ static int chcr_ktls_xmit_tcb_cpls(struct chcr_ktls_info *tx_info, wr->op_to_compl = htonl(FW_WR_OP_V(FW_ULPTX_WR)); wr->cookie = 0; /* fill len in wr field */ - wr->flowid_len16 = htonl(FW_WR_LEN16_V(DIV_ROUND_UP(len, 16))); + wr->flowid_len16 = htonl(wr_mid | + FW_WR_LEN16_V(DIV_ROUND_UP(len, 16))); ndesc = DIV_ROUND_UP(len, 64); chcr_txq_advance(&q->q, ndesc); @@ -986,6 +992,7 @@ chcr_ktls_write_tcp_options(struct chcr_ktls_info *tx_info, struct sk_buff *skb, struct tcphdr *tcp; int len16, pktlen; struct iphdr *ip; + u32 wr_mid = 0; int credits; u8 buf[150]; u64 cntrl1; @@ -1010,6 +1017,11 @@ chcr_ktls_write_tcp_options(struct chcr_ktls_info *tx_info, struct sk_buff *skb, return NETDEV_TX_BUSY; } + if (unlikely(credits < ETHTXQ_STOP_THRES)) { + chcr_eth_txq_stop(q); + wr_mid |= FW_WR_EQUEQ_F | FW_WR_EQUIQ_F; + } + pos = &q->q.desc[q->q.pidx]; wr = pos; @@ -1017,7 +1029,7 @@ chcr_ktls_write_tcp_options(struct chcr_ktls_info *tx_info, struct sk_buff *skb, wr->op_immdlen = htonl(FW_WR_OP_V(FW_ETH_TX_PKT_WR) | FW_WR_IMMDLEN_V(ctrl)); - wr->equiq_to_len16 = htonl(FW_WR_LEN16_V(len16)); + wr->equiq_to_len16 = htonl(wr_mid | FW_WR_LEN16_V(len16)); wr->r3 = 0; cpl = (void *)(wr + 1); From 460cd17e9f7d60eaa22028baa6a056c478fa7dc6 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 9 Nov 2020 19:51:20 -0800 Subject: [PATCH 59/65] net: switch to the kernel.org patchwork instance Move to the kernel.org patchwork instance, it has significantly lower latency for accessing from Europe and the US. Other quirks include the reply bot. Link: https://lore.kernel.org/r/20201110035120.642746-1-kuba@kernel.org Signed-off-by: Jakub Kicinski --- Documentation/networking/netdev-FAQ.rst | 4 ++-- Documentation/process/stable-kernel-rules.rst | 2 +- .../it_IT/process/stable-kernel-rules.rst | 2 +- MAINTAINERS | 20 +++++++++---------- 4 files changed, 14 insertions(+), 14 deletions(-) diff --git a/Documentation/networking/netdev-FAQ.rst b/Documentation/networking/netdev-FAQ.rst index d5c9320901c327..21537766be4d16 100644 --- a/Documentation/networking/netdev-FAQ.rst +++ b/Documentation/networking/netdev-FAQ.rst @@ -110,7 +110,7 @@ Q: I sent a patch and I'm wondering what happened to it? Q: How can I tell whether it got merged? A: Start by looking at the main patchworks queue for netdev: - http://patchwork.ozlabs.org/project/netdev/list/ + https://patchwork.kernel.org/project/netdevbpf/list/ The "State" field will tell you exactly where things are at with your patch. @@ -152,7 +152,7 @@ networking subsystem, and then hands them off to Greg. There is a patchworks queue that you can see here: - http://patchwork.ozlabs.org/bundle/davem/stable/?state=* + https://patchwork.kernel.org/bundle/netdev/stable/?state=* It contains the patches which Dave has selected, but not yet handed off to Greg. If Greg already has the patch, then it will be here: diff --git a/Documentation/process/stable-kernel-rules.rst b/Documentation/process/stable-kernel-rules.rst index 06f743b612c48b..3973556250e170 100644 --- a/Documentation/process/stable-kernel-rules.rst +++ b/Documentation/process/stable-kernel-rules.rst @@ -39,7 +39,7 @@ Procedure for submitting patches to the -stable tree submission guidelines as described in :ref:`Documentation/networking/netdev-FAQ.rst ` after first checking the stable networking queue at - https://patchwork.ozlabs.org/bundle/davem/stable/?series=&submitter=&state=*&q=&archive= + https://patchwork.kernel.org/bundle/netdev/stable/?state=* to ensure the requested patch is not already queued up. - Security patches should not be handled (solely) by the -stable review process but should follow the procedures in diff --git a/Documentation/translations/it_IT/process/stable-kernel-rules.rst b/Documentation/translations/it_IT/process/stable-kernel-rules.rst index 4f206cee31a74e..283d62541c4ff1 100644 --- a/Documentation/translations/it_IT/process/stable-kernel-rules.rst +++ b/Documentation/translations/it_IT/process/stable-kernel-rules.rst @@ -46,7 +46,7 @@ Procedura per sottomettere patch per i sorgenti -stable :ref:`Documentation/translations/it_IT/networking/netdev-FAQ.rst `; ma solo dopo aver verificato al seguente indirizzo che la patch non sia già in coda: - https://patchwork.ozlabs.org/bundle/davem/stable/?series=&submitter=&state=*&q=&archive= + https://patchwork.kernel.org/bundle/netdev/stable/?state=* - Una patch di sicurezza non dovrebbero essere gestite (solamente) dal processo di revisione -stable, ma dovrebbe seguire le procedure descritte in :ref:`Documentation/translations/it_IT/admin-guide/security-bugs.rst `. diff --git a/MAINTAINERS b/MAINTAINERS index fa11fe773df510..eaded39a3858f5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1279,7 +1279,7 @@ M: Igor Russkikh L: netdev@vger.kernel.org S: Supported W: https://www.marvell.com/ -Q: http://patchwork.ozlabs.org/project/netdev/list/ +Q: https://patchwork.kernel.org/project/netdevbpf/list/ F: Documentation/networking/device_drivers/ethernet/aquantia/atlantic.rst F: drivers/net/ethernet/aquantia/atlantic/ @@ -11173,7 +11173,7 @@ M: Tariq Toukan L: netdev@vger.kernel.org S: Supported W: http://www.mellanox.com -Q: http://patchwork.ozlabs.org/project/netdev/list/ +Q: https://patchwork.kernel.org/project/netdevbpf/list/ F: drivers/net/ethernet/mellanox/mlx4/en_* MELLANOX ETHERNET DRIVER (mlx5e) @@ -11181,7 +11181,7 @@ M: Saeed Mahameed L: netdev@vger.kernel.org S: Supported W: http://www.mellanox.com -Q: http://patchwork.ozlabs.org/project/netdev/list/ +Q: https://patchwork.kernel.org/project/netdevbpf/list/ F: drivers/net/ethernet/mellanox/mlx5/core/en_* MELLANOX ETHERNET INNOVA DRIVERS @@ -11189,7 +11189,7 @@ R: Boris Pismenny L: netdev@vger.kernel.org S: Supported W: http://www.mellanox.com -Q: http://patchwork.ozlabs.org/project/netdev/list/ +Q: https://patchwork.kernel.org/project/netdevbpf/list/ F: drivers/net/ethernet/mellanox/mlx5/core/accel/* F: drivers/net/ethernet/mellanox/mlx5/core/en_accel/* F: drivers/net/ethernet/mellanox/mlx5/core/fpga/* @@ -11201,7 +11201,7 @@ M: Ido Schimmel L: netdev@vger.kernel.org S: Supported W: http://www.mellanox.com -Q: http://patchwork.ozlabs.org/project/netdev/list/ +Q: https://patchwork.kernel.org/project/netdevbpf/list/ F: drivers/net/ethernet/mellanox/mlxsw/ F: tools/testing/selftests/drivers/net/mlxsw/ @@ -11210,7 +11210,7 @@ M: mlxsw@nvidia.com L: netdev@vger.kernel.org S: Supported W: http://www.mellanox.com -Q: http://patchwork.ozlabs.org/project/netdev/list/ +Q: https://patchwork.kernel.org/project/netdevbpf/list/ F: drivers/net/ethernet/mellanox/mlxfw/ MELLANOX HARDWARE PLATFORM SUPPORT @@ -11229,7 +11229,7 @@ L: netdev@vger.kernel.org L: linux-rdma@vger.kernel.org S: Supported W: http://www.mellanox.com -Q: http://patchwork.ozlabs.org/project/netdev/list/ +Q: https://patchwork.kernel.org/project/netdevbpf/list/ F: drivers/net/ethernet/mellanox/mlx4/ F: include/linux/mlx4/ @@ -11250,7 +11250,7 @@ L: netdev@vger.kernel.org L: linux-rdma@vger.kernel.org S: Supported W: http://www.mellanox.com -Q: http://patchwork.ozlabs.org/project/netdev/list/ +Q: https://patchwork.kernel.org/project/netdevbpf/list/ F: Documentation/networking/device_drivers/ethernet/mellanox/ F: drivers/net/ethernet/mellanox/mlx5/core/ F: include/linux/mlx5/ @@ -12130,7 +12130,7 @@ M: Jakub Kicinski L: netdev@vger.kernel.org S: Maintained W: http://www.linuxfoundation.org/en/Net -Q: http://patchwork.ozlabs.org/project/netdev/list/ +Q: https://patchwork.kernel.org/project/netdevbpf/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git F: Documentation/devicetree/bindings/net/ @@ -12175,7 +12175,7 @@ M: Jakub Kicinski L: netdev@vger.kernel.org S: Maintained W: http://www.linuxfoundation.org/en/Net -Q: http://patchwork.ozlabs.org/project/netdev/list/ +Q: https://patchwork.kernel.org/project/netdevbpf/list/ B: mailto:netdev@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/netdev/net-next.git From 52755b66ddcef2e897778fac5656df18817b59ab Mon Sep 17 00:00:00 2001 From: Wang Hai Date: Tue, 10 Nov 2020 22:46:14 +0800 Subject: [PATCH 60/65] cosa: Add missing kfree in error path of cosa_write If memory allocation for 'kbuf' succeed, cosa_write() doesn't have a corresponding kfree() in exception handling. Thus add kfree() for this function implementation. Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Reported-by: Hulk Robot Signed-off-by: Wang Hai Acked-by: Jan "Yenya" Kasprzak Link: https://lore.kernel.org/r/20201110144614.43194-1-wanghai38@huawei.com Signed-off-by: Jakub Kicinski --- drivers/net/wan/cosa.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wan/cosa.c b/drivers/net/wan/cosa.c index f8aed0696d7752..2369ca250cd65e 100644 --- a/drivers/net/wan/cosa.c +++ b/drivers/net/wan/cosa.c @@ -889,6 +889,7 @@ static ssize_t cosa_write(struct file *file, chan->tx_status = 1; spin_unlock_irqrestore(&cosa->lock, flags); up(&chan->wsem); + kfree(kbuf); return -ERESTARTSYS; } } From 9e2b7fa2df4365e99934901da4fb4af52d81e820 Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Fri, 6 Nov 2020 08:30:30 +0100 Subject: [PATCH 61/65] vrf: Fix fast path output packet handling with async Netfilter rules VRF devices use an optimized direct path on output if a default qdisc is involved, calling Netfilter hooks directly. This path, however, does not consider Netfilter rules completing asynchronously, such as with NFQUEUE. The Netfilter okfn() is called for asynchronously accepted packets, but the VRF never passes that packet down the stack to send it out over the slave device. Using the slower redirect path for this seems not feasible, as we do not know beforehand if a Netfilter hook has asynchronously completing rules. Fix the use of asynchronously completing Netfilter rules in OUTPUT and POSTROUTING by using a special completion function that additionally calls dst_output() to pass the packet down the stack. Also, slightly adjust the use of nf_reset_ct() so that is called in the asynchronous case, too. Fixes: dcdd43c41e60 ("net: vrf: performance improvements for IPv4") Fixes: a9ec54d1b0cd ("net: vrf: performance improvements for IPv6") Signed-off-by: Martin Willi Link: https://lore.kernel.org/r/20201106073030.3974927-1-martin@strongswan.org Signed-off-by: Jakub Kicinski --- drivers/net/vrf.c | 92 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 69 insertions(+), 23 deletions(-) diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c index 60c1aadece89a7..f2793ffde19134 100644 --- a/drivers/net/vrf.c +++ b/drivers/net/vrf.c @@ -608,8 +608,7 @@ static netdev_tx_t vrf_xmit(struct sk_buff *skb, struct net_device *dev) return ret; } -static int vrf_finish_direct(struct net *net, struct sock *sk, - struct sk_buff *skb) +static void vrf_finish_direct(struct sk_buff *skb) { struct net_device *vrf_dev = skb->dev; @@ -628,7 +627,8 @@ static int vrf_finish_direct(struct net *net, struct sock *sk, skb_pull(skb, ETH_HLEN); } - return 1; + /* reset skb device */ + nf_reset_ct(skb); } #if IS_ENABLED(CONFIG_IPV6) @@ -707,15 +707,41 @@ static struct sk_buff *vrf_ip6_out_redirect(struct net_device *vrf_dev, return skb; } +static int vrf_output6_direct_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + vrf_finish_direct(skb); + + return vrf_ip6_local_out(net, sk, skb); +} + static int vrf_output6_direct(struct net *net, struct sock *sk, struct sk_buff *skb) { + int err = 1; + skb->protocol = htons(ETH_P_IPV6); - return NF_HOOK_COND(NFPROTO_IPV6, NF_INET_POST_ROUTING, - net, sk, skb, NULL, skb->dev, - vrf_finish_direct, - !(IPCB(skb)->flags & IPSKB_REROUTED)); + if (!(IPCB(skb)->flags & IPSKB_REROUTED)) + err = nf_hook(NFPROTO_IPV6, NF_INET_POST_ROUTING, net, sk, skb, + NULL, skb->dev, vrf_output6_direct_finish); + + if (likely(err == 1)) + vrf_finish_direct(skb); + + return err; +} + +static int vrf_ip6_out_direct_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + int err; + + err = vrf_output6_direct(net, sk, skb); + if (likely(err == 1)) + err = vrf_ip6_local_out(net, sk, skb); + + return err; } static struct sk_buff *vrf_ip6_out_direct(struct net_device *vrf_dev, @@ -728,18 +754,15 @@ static struct sk_buff *vrf_ip6_out_direct(struct net_device *vrf_dev, skb->dev = vrf_dev; err = nf_hook(NFPROTO_IPV6, NF_INET_LOCAL_OUT, net, sk, - skb, NULL, vrf_dev, vrf_output6_direct); + skb, NULL, vrf_dev, vrf_ip6_out_direct_finish); if (likely(err == 1)) err = vrf_output6_direct(net, sk, skb); - /* reset skb device */ if (likely(err == 1)) - nf_reset_ct(skb); - else - skb = NULL; + return skb; - return skb; + return NULL; } static struct sk_buff *vrf_ip6_out(struct net_device *vrf_dev, @@ -919,15 +942,41 @@ static struct sk_buff *vrf_ip_out_redirect(struct net_device *vrf_dev, return skb; } +static int vrf_output_direct_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + vrf_finish_direct(skb); + + return vrf_ip_local_out(net, sk, skb); +} + static int vrf_output_direct(struct net *net, struct sock *sk, struct sk_buff *skb) { + int err = 1; + skb->protocol = htons(ETH_P_IP); - return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, - net, sk, skb, NULL, skb->dev, - vrf_finish_direct, - !(IPCB(skb)->flags & IPSKB_REROUTED)); + if (!(IPCB(skb)->flags & IPSKB_REROUTED)) + err = nf_hook(NFPROTO_IPV4, NF_INET_POST_ROUTING, net, sk, skb, + NULL, skb->dev, vrf_output_direct_finish); + + if (likely(err == 1)) + vrf_finish_direct(skb); + + return err; +} + +static int vrf_ip_out_direct_finish(struct net *net, struct sock *sk, + struct sk_buff *skb) +{ + int err; + + err = vrf_output_direct(net, sk, skb); + if (likely(err == 1)) + err = vrf_ip_local_out(net, sk, skb); + + return err; } static struct sk_buff *vrf_ip_out_direct(struct net_device *vrf_dev, @@ -940,18 +989,15 @@ static struct sk_buff *vrf_ip_out_direct(struct net_device *vrf_dev, skb->dev = vrf_dev; err = nf_hook(NFPROTO_IPV4, NF_INET_LOCAL_OUT, net, sk, - skb, NULL, vrf_dev, vrf_output_direct); + skb, NULL, vrf_dev, vrf_ip_out_direct_finish); if (likely(err == 1)) err = vrf_output_direct(net, sk, skb); - /* reset skb device */ if (likely(err == 1)) - nf_reset_ct(skb); - else - skb = NULL; + return skb; - return skb; + return NULL; } static struct sk_buff *vrf_ip_out(struct net_device *vrf_dev, From 9f73bd1c2c4c304b238051fc92b3f807326f0a89 Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 11 Nov 2020 05:47:44 +0200 Subject: [PATCH 62/65] devlink: Avoid overwriting port attributes of registered port Cited commit in fixes tag overwrites the port attributes for the registered port. Avoid such error by checking registered flag before setting attributes. Fixes: 71ad8d55f8e5 ("devlink: Replace devlink_port_attrs_set parameters with a struct") Signed-off-by: Parav Pandit Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20201111034744.35554-1-parav@nvidia.com Signed-off-by: Jakub Kicinski --- net/core/devlink.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/net/core/devlink.c b/net/core/devlink.c index a932d95be79847..ab4b1368904f4a 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8254,8 +8254,6 @@ static int __devlink_port_attrs_set(struct devlink_port *devlink_port, { struct devlink_port_attrs *attrs = &devlink_port->attrs; - if (WARN_ON(devlink_port->registered)) - return -EEXIST; devlink_port->attrs_set = true; attrs->flavour = flavour; if (attrs->switch_id.id_len) { @@ -8279,6 +8277,8 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port, { int ret; + if (WARN_ON(devlink_port->registered)) + return; devlink_port->attrs = *attrs; ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); if (ret) @@ -8301,6 +8301,8 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; + if (WARN_ON(devlink_port->registered)) + return; ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_PF); if (ret) @@ -8326,6 +8328,8 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; + if (WARN_ON(devlink_port->registered)) + return; ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF); if (ret) From 4b1a86281cc1d0de46df3ad2cb8c1f86ac07681c Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 11 Nov 2020 20:45:25 +0000 Subject: [PATCH 63/65] net: udp: fix UDP header access on Fast/frag0 UDP GRO UDP GRO uses udp_hdr(skb) in its .gro_receive() callback. While it's probably OK for non-frag0 paths (when all headers or even the entire frame are already in skb head), this inline points to junk when using Fast GRO (napi_gro_frags() or napi_gro_receive() with only Ethernet header in skb head and all the rest in the frags) and breaks GRO packet compilation and the packet flow itself. To support both modes, skb_gro_header_fast() + skb_gro_header_slow() are typically used. UDP even has an inline helper that makes use of them, udp_gro_udphdr(). Use that instead of troublemaking udp_hdr() to get rid of the out-of-order delivers. Present since the introduction of plain UDP GRO in 5.0-rc1. Fixes: e20cf8d3f1f7 ("udp: implement GRO for plain UDP sockets.") Cc: Eric Dumazet Signed-off-by: Alexander Lobakin Acked-by: Willem de Bruijn Signed-off-by: Jakub Kicinski --- net/ipv4/udp_offload.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index e67a66fbf27b83..13740e9fe6ec02 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -366,7 +366,7 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb, static struct sk_buff *udp_gro_receive_segment(struct list_head *head, struct sk_buff *skb) { - struct udphdr *uh = udp_hdr(skb); + struct udphdr *uh = udp_gro_udphdr(skb); struct sk_buff *pp = NULL; struct udphdr *uh2; struct sk_buff *p; From 55e729889bb07d68ab071660ce3f5e7a7872ebe8 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Wed, 11 Nov 2020 20:45:38 +0000 Subject: [PATCH 64/65] net: udp: fix IP header access and skb lookup on Fast/frag0 UDP GRO udp{4,6}_lib_lookup_skb() use ip{,v6}_hdr() to get IP header of the packet. While it's probably OK for non-frag0 paths, this helpers will also point to junk on Fast/frag0 GRO when all headers are located in frags. As a result, sk/skb lookup may fail or give wrong results. To support both GRO modes, skb_gro_network_header() might be used. To not modify original functions, add private versions of udp{4,6}_lib_lookup_skb() only to perform correct sk lookups on GRO. Present since the introduction of "application-level" UDP GRO in 4.7-rc1. Misc: replace totally unneeded ternaries with plain ifs. Fixes: a6024562ffd7 ("udp: Add GRO functions to UDP socket") Suggested-by: Willem de Bruijn Cc: Eric Dumazet Signed-off-by: Alexander Lobakin Acked-by: Willem de Bruijn Signed-off-by: Jakub Kicinski --- net/ipv4/udp_offload.c | 17 +++++++++++++++-- net/ipv6/udp_offload.c | 17 +++++++++++++++-- 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c index 13740e9fe6ec02..c62805cd31319f 100644 --- a/net/ipv4/udp_offload.c +++ b/net/ipv4/udp_offload.c @@ -500,12 +500,22 @@ struct sk_buff *udp_gro_receive(struct list_head *head, struct sk_buff *skb, } EXPORT_SYMBOL(udp_gro_receive); +static struct sock *udp4_gro_lookup_skb(struct sk_buff *skb, __be16 sport, + __be16 dport) +{ + const struct iphdr *iph = skb_gro_network_header(skb); + + return __udp4_lib_lookup(dev_net(skb->dev), iph->saddr, sport, + iph->daddr, dport, inet_iif(skb), + inet_sdif(skb), &udp_table, NULL); +} + INDIRECT_CALLABLE_SCOPE struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); + struct sock *sk = NULL; struct sk_buff *pp; - struct sock *sk; if (unlikely(!uh)) goto flush; @@ -523,7 +533,10 @@ struct sk_buff *udp4_gro_receive(struct list_head *head, struct sk_buff *skb) skip: NAPI_GRO_CB(skb)->is_ipv6 = 0; rcu_read_lock(); - sk = static_branch_unlikely(&udp_encap_needed_key) ? udp4_lib_lookup_skb(skb, uh->source, uh->dest) : NULL; + + if (static_branch_unlikely(&udp_encap_needed_key)) + sk = udp4_gro_lookup_skb(skb, uh->source, uh->dest); + pp = udp_gro_receive(head, skb, uh, sk); rcu_read_unlock(); return pp; diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c index 584157a0775969..f9e888d1b9af8b 100644 --- a/net/ipv6/udp_offload.c +++ b/net/ipv6/udp_offload.c @@ -111,12 +111,22 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb, return segs; } +static struct sock *udp6_gro_lookup_skb(struct sk_buff *skb, __be16 sport, + __be16 dport) +{ + const struct ipv6hdr *iph = skb_gro_network_header(skb); + + return __udp6_lib_lookup(dev_net(skb->dev), &iph->saddr, sport, + &iph->daddr, dport, inet6_iif(skb), + inet6_sdif(skb), &udp_table, NULL); +} + INDIRECT_CALLABLE_SCOPE struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) { struct udphdr *uh = udp_gro_udphdr(skb); + struct sock *sk = NULL; struct sk_buff *pp; - struct sock *sk; if (unlikely(!uh)) goto flush; @@ -135,7 +145,10 @@ struct sk_buff *udp6_gro_receive(struct list_head *head, struct sk_buff *skb) skip: NAPI_GRO_CB(skb)->is_ipv6 = 1; rcu_read_lock(); - sk = static_branch_unlikely(&udpv6_encap_needed_key) ? udp6_lib_lookup_skb(skb, uh->source, uh->dest) : NULL; + + if (static_branch_unlikely(&udpv6_encap_needed_key)) + sk = udp6_gro_lookup_skb(skb, uh->source, uh->dest); + pp = udp_gro_receive(head, skb, uh, sk); rcu_read_unlock(); return pp; From edbc21113bde13ca3d06eec24b621b1f628583dd Mon Sep 17 00:00:00 2001 From: Sven Van Asbroeck Date: Thu, 12 Nov 2020 10:25:13 -0500 Subject: [PATCH 65/65] lan743x: fix use of uninitialized variable When no devicetree is present, the driver will use an uninitialized variable. Fix by initializing this variable. Fixes: 902a66e08cea ("lan743x: correctly handle chips with internal PHY") Reported-by: kernel test robot Signed-off-by: Sven Van Asbroeck Link: https://lore.kernel.org/r/20201112152513.1941-1-TheSven73@gmail.com Signed-off-by: Jakub Kicinski --- drivers/net/ethernet/microchip/lan743x_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index 173158656559d7..e2c99d90924770 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -1014,8 +1014,8 @@ static void lan743x_phy_close(struct lan743x_adapter *adapter) static int lan743x_phy_open(struct lan743x_adapter *adapter) { struct lan743x_phy *phy = &adapter->phy; + struct phy_device *phydev = NULL; struct device_node *phynode; - struct phy_device *phydev; struct net_device *netdev; int ret = -EIO;