diff --git a/pkg/plugin/lib/common/libbpf/_include/asm/barrier.h b/pkg/plugin/lib/common/libbpf/_include/asm/barrier.h new file mode 100644 index 0000000000..1fc6aee1f3 --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_include/asm/barrier.h @@ -0,0 +1,7 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef __ASM_BARRIER_H +#define __ASM_BARRIER_H + +#include + +#endif diff --git a/pkg/plugin/lib/common/libbpf/_include/asm/placeholder.go b/pkg/plugin/lib/common/libbpf/_include/asm/placeholder.go new file mode 100644 index 0000000000..cb6a8c2bd2 --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_include/asm/placeholder.go @@ -0,0 +1,3 @@ +package asm //nolint:all + +// This file is a placeholder to make Go include this directory when vendoring. diff --git a/pkg/plugin/lib/common/libbpf/_include/linux/compiler.h b/pkg/plugin/lib/common/libbpf/_include/linux/compiler.h new file mode 100644 index 0000000000..26336dc702 --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_include/linux/compiler.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __LINUX_COMPILER_H +#define __LINUX_COMPILER_H + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +#define READ_ONCE(x) (*(volatile typeof(x) *)&x) +#define WRITE_ONCE(x, v) (*(volatile typeof(x) *)&x) = (v) + +#define barrier() asm volatile("" ::: "memory") + +#if defined(__x86_64__) + +# define smp_rmb() barrier() +# define smp_wmb() barrier() +# define smp_mb() asm volatile("lock; addl $0,-132(%%rsp)" ::: "memory", "cc") + +# define smp_store_release(p, v) \ +do { \ + barrier(); \ + WRITE_ONCE(*p, v); \ +} while (0) + +# define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p = READ_ONCE(*p); \ + barrier(); \ + ___p; \ +}) + +#elif defined(__aarch64__) + +# define smp_rmb() asm volatile("dmb ishld" ::: "memory") +# define smp_wmb() asm volatile("dmb ishst" ::: "memory") +# define smp_mb() asm volatile("dmb ish" ::: "memory") + +#endif + +#ifndef smp_mb +# define smp_mb() __sync_synchronize() +#endif + +#ifndef smp_rmb +# define smp_rmb() smp_mb() +#endif + +#ifndef smp_wmb +# define smp_wmb() smp_mb() +#endif + +#ifndef smp_store_release +# define smp_store_release(p, v) \ +do { \ + smp_mb(); \ + WRITE_ONCE(*p, v); \ +} while (0) +#endif + +#ifndef smp_load_acquire +# define smp_load_acquire(p) \ +({ \ + typeof(*p) ___p = READ_ONCE(*p); \ + smp_mb(); \ + ___p; \ +}) +#endif + +#endif /* __LINUX_COMPILER_H */ diff --git a/pkg/plugin/lib/common/libbpf/_include/linux/err.h b/pkg/plugin/lib/common/libbpf/_include/linux/err.h new file mode 100644 index 0000000000..1b1dafbcba --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_include/linux/err.h @@ -0,0 +1,38 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __LINUX_ERR_H +#define __LINUX_ERR_H + +#include +#include + +#define MAX_ERRNO 4095 + +#define IS_ERR_VALUE(x) ((x) >= (unsigned long)-MAX_ERRNO) + +static inline void * ERR_PTR(long error_) +{ + return (void *) error_; +} + +static inline long PTR_ERR(const void *ptr) +{ + return (long) ptr; +} + +static inline bool IS_ERR(const void *ptr) +{ + return IS_ERR_VALUE((unsigned long)ptr); +} + +static inline bool IS_ERR_OR_NULL(const void *ptr) +{ + return (!ptr) || IS_ERR_VALUE((unsigned long)ptr); +} + +static inline long PTR_ERR_OR_ZERO(const void *ptr) +{ + return IS_ERR(ptr) ? PTR_ERR(ptr) : 0; +} + +#endif diff --git a/pkg/plugin/lib/common/libbpf/_include/linux/filter.h b/pkg/plugin/lib/common/libbpf/_include/linux/filter.h new file mode 100644 index 0000000000..5e3f0552bf --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_include/linux/filter.h @@ -0,0 +1,142 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __LINUX_FILTER_H +#define __LINUX_FILTER_H + +#include + +#define BPF_RAW_INSN(CODE, DST, SRC, OFF, IMM) \ + ((struct bpf_insn) { \ + .code = CODE, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = IMM }) + +#define BPF_ALU32_IMM(OP, DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_ALU64_IMM(OP, DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_MOV64_IMM(DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_CALL_REL(DST) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_CALL, \ + .dst_reg = 0, \ + .src_reg = BPF_PSEUDO_CALL, \ + .off = 0, \ + .imm = DST }) + +#define BPF_EXIT_INSN() \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_EXIT, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = 0 }) + +#define BPF_EMIT_CALL(FUNC) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_CALL, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = 0, \ + .imm = ((FUNC) - BPF_FUNC_unspec) }) + +#define BPF_LDX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_LDX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +#define BPF_STX_MEM(SIZE, DST, SRC, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_STX | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF, \ + .imm = 0 }) + +#define BPF_ST_MEM(SIZE, DST, OFF, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ST | BPF_SIZE(SIZE) | BPF_MEM, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +#define BPF_MOV64_REG(DST, SRC) \ + ((struct bpf_insn) { \ + .code = BPF_ALU64 | BPF_MOV | BPF_X, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = 0, \ + .imm = 0 }) + +#define BPF_MOV32_IMM(DST, IMM) \ + ((struct bpf_insn) { \ + .code = BPF_ALU | BPF_MOV | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = 0, \ + .imm = IMM }) + +#define BPF_LD_IMM64_RAW_FULL(DST, SRC, OFF1, OFF2, IMM1, IMM2) \ + ((struct bpf_insn) { \ + .code = BPF_LD | BPF_DW | BPF_IMM, \ + .dst_reg = DST, \ + .src_reg = SRC, \ + .off = OFF1, \ + .imm = IMM1 }), \ + ((struct bpf_insn) { \ + .code = 0, \ + .dst_reg = 0, \ + .src_reg = 0, \ + .off = OFF2, \ + .imm = IMM2 }) + +#define BPF_LD_MAP_FD(DST, MAP_FD) \ + BPF_LD_IMM64_RAW_FULL(DST, BPF_PSEUDO_MAP_FD, 0, 0, \ + MAP_FD, 0) + +#define BPF_LD_MAP_VALUE(DST, MAP_FD, VALUE_OFF) \ + BPF_LD_IMM64_RAW_FULL(DST, BPF_PSEUDO_MAP_VALUE, 0, 0, \ + MAP_FD, VALUE_OFF) + +#define BPF_JMP_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +#define BPF_JMP32_IMM(OP, DST, IMM, OFF) \ + ((struct bpf_insn) { \ + .code = BPF_JMP32 | BPF_OP(OP) | BPF_K, \ + .dst_reg = DST, \ + .src_reg = 0, \ + .off = OFF, \ + .imm = IMM }) + +#endif diff --git a/pkg/plugin/lib/common/libbpf/_include/linux/kernel.h b/pkg/plugin/lib/common/libbpf/_include/linux/kernel.h new file mode 100644 index 0000000000..a26c9cf2b1 --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_include/linux/kernel.h @@ -0,0 +1,46 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __LINUX_KERNEL_H +#define __LINUX_KERNEL_H + +#include + +#ifndef offsetof +#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) +#endif + +#ifndef container_of +#define container_of(ptr, type, member) ({ \ + const typeof(((type *)0)->member) * __mptr = (ptr); \ + (type *)((char *)__mptr - offsetof(type, member)); }) +#endif + +#ifndef max +#define max(x, y) ({ \ + typeof(x) _max1 = (x); \ + typeof(y) _max2 = (y); \ + (void) (&_max1 == &_max2); \ + _max1 > _max2 ? _max1 : _max2; }) +#endif + +#ifndef min +#define min(x, y) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void) (&_min1 == &_min2); \ + _min1 < _min2 ? _min1 : _min2; }) +#endif + +#ifndef roundup +#define roundup(x, y) ( \ +{ \ + const typeof(y) __y = y; \ + (((x) + (__y - 1)) / __y) * __y; \ +} \ +) +#endif + +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) +#define __KERNEL_DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) + +#endif diff --git a/pkg/plugin/lib/common/libbpf/_include/linux/list.h b/pkg/plugin/lib/common/libbpf/_include/linux/list.h new file mode 100644 index 0000000000..fc91c34e1d --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_include/linux/list.h @@ -0,0 +1,91 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __LINUX_LIST_H +#define __LINUX_LIST_H + +#define LIST_HEAD_INIT(name) { &(name), &(name) } +#define LIST_HEAD(name) \ + struct list_head name = LIST_HEAD_INIT(name) + +#define POISON_POINTER_DELTA 0 +#define LIST_POISON1 ((void *) 0x100 + POISON_POINTER_DELTA) +#define LIST_POISON2 ((void *) 0x200 + POISON_POINTER_DELTA) + + +static inline void INIT_LIST_HEAD(struct list_head *list) +{ + list->next = list; + list->prev = list; +} + +static inline void __list_add(struct list_head *new, + struct list_head *prev, + struct list_head *next) +{ + next->prev = new; + new->next = next; + new->prev = prev; + prev->next = new; +} + +/** + * list_add - add a new entry + * @new: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static inline void list_add(struct list_head *new, struct list_head *head) +{ + __list_add(new, head, head->next); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_del(struct list_head * prev, struct list_head * next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty() on entry does not return true after this, the entry is + * in an undefined state. + */ +static inline void __list_del_entry(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); +} + +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->next = LIST_POISON1; + entry->prev = LIST_POISON2; +} + +static inline int list_empty(const struct list_head *head) +{ + return head->next == head; +} + +#define list_entry(ptr, type, member) \ + container_of(ptr, type, member) +#define list_first_entry(ptr, type, member) \ + list_entry((ptr)->next, type, member) +#define list_next_entry(pos, member) \ + list_entry((pos)->member.next, typeof(*(pos)), member) +#define list_for_each_entry(pos, head, member) \ + for (pos = list_first_entry(head, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_next_entry(pos, member)) + +#endif diff --git a/pkg/plugin/lib/common/libbpf/_include/linux/overflow.h b/pkg/plugin/lib/common/libbpf/_include/linux/overflow.h new file mode 100644 index 0000000000..53d758036c --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_include/linux/overflow.h @@ -0,0 +1,90 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __LINUX_OVERFLOW_H +#define __LINUX_OVERFLOW_H + +#define is_signed_type(type) (((type)(-1)) < (type)1) +#define __type_half_max(type) ((type)1 << (8*sizeof(type) - 1 - is_signed_type(type))) +#define type_max(T) ((T)((__type_half_max(T) - 1) + __type_half_max(T))) +#define type_min(T) ((T)((T)-type_max(T)-(T)1)) + +#ifndef unlikely +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif + +#ifdef __GNUC__ +#define GCC_VERSION (__GNUC__ * 10000 \ + + __GNUC_MINOR__ * 100 \ + + __GNUC_PATCHLEVEL__) +#if GCC_VERSION >= 50100 +#define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1 +#endif +#endif + +#ifdef COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW + +#define check_mul_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + __builtin_mul_overflow(__a, __b, __d); \ +}) + +#else + +/* + * If one of a or b is a compile-time constant, this avoids a division. + */ +#define __unsigned_mul_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = __a * __b; \ + __builtin_constant_p(__b) ? \ + __b > 0 && __a > type_max(typeof(__a)) / __b : \ + __a > 0 && __b > type_max(typeof(__b)) / __a; \ +}) + +/* + * Signed multiplication is rather hard. gcc always follows C99, so + * division is truncated towards 0. This means that we can write the + * overflow check like this: + * + * (a > 0 && (b > MAX/a || b < MIN/a)) || + * (a < -1 && (b > MIN/a || b < MAX/a) || + * (a == -1 && b == MIN) + * + * The redundant casts of -1 are to silence an annoying -Wtype-limits + * (included in -Wextra) warning: When the type is u8 or u16, the + * __b_c_e in check_mul_overflow obviously selects + * __unsigned_mul_overflow, but unfortunately gcc still parses this + * code and warns about the limited range of __b. + */ + +#define __signed_mul_overflow(a, b, d) ({ \ + typeof(a) __a = (a); \ + typeof(b) __b = (b); \ + typeof(d) __d = (d); \ + typeof(a) __tmax = type_max(typeof(a)); \ + typeof(a) __tmin = type_min(typeof(a)); \ + (void) (&__a == &__b); \ + (void) (&__a == __d); \ + *__d = (__u64)__a * (__u64)__b; \ + (__b > 0 && (__a > __tmax/__b || __a < __tmin/__b)) || \ + (__b < (typeof(__b))-1 && (__a > __tmin/__b || __a < __tmax/__b)) || \ + (__b == (typeof(__b))-1 && __a == __tmin); \ +}) + +#define check_mul_overflow(a, b, d) \ + __builtin_choose_expr(is_signed_type(typeof(a)), \ + __signed_mul_overflow(a, b, d), \ + __unsigned_mul_overflow(a, b, d)) + + +#endif /* COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW */ + +#endif diff --git a/pkg/plugin/lib/common/libbpf/_include/linux/placeholder.go b/pkg/plugin/lib/common/libbpf/_include/linux/placeholder.go new file mode 100644 index 0000000000..79ace600a5 --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_include/linux/placeholder.go @@ -0,0 +1,3 @@ +package linux //nolint:all + +// This file is a placeholder to make Go include this directory when vendoring. diff --git a/pkg/plugin/lib/common/libbpf/_include/linux/ring_buffer.h b/pkg/plugin/lib/common/libbpf/_include/linux/ring_buffer.h new file mode 100644 index 0000000000..fc4677bc0a --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_include/linux/ring_buffer.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ +#ifndef _TOOLS_LINUX_RING_BUFFER_H_ +#define _TOOLS_LINUX_RING_BUFFER_H_ + +#include + +static inline __u64 ring_buffer_read_head(struct perf_event_mmap_page *base) +{ + return smp_load_acquire(&base->data_head); +} + +static inline void ring_buffer_write_tail(struct perf_event_mmap_page *base, + __u64 tail) +{ + smp_store_release(&base->data_tail, tail); +} + +#endif /* _TOOLS_LINUX_RING_BUFFER_H_ */ diff --git a/pkg/plugin/lib/common/libbpf/_include/linux/types.h b/pkg/plugin/lib/common/libbpf/_include/linux/types.h new file mode 100644 index 0000000000..b15252a4ed --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_include/linux/types.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __LINUX_TYPES_H +#define __LINUX_TYPES_H + +#include +#include +#include + +#include + +#include +#include + +#define __bitwise__ +#define __bitwise __bitwise__ + +typedef __u16 __bitwise __le16; +typedef __u16 __bitwise __be16; +typedef __u32 __bitwise __le32; +typedef __u32 __bitwise __be32; +typedef __u64 __bitwise __le64; +typedef __u64 __bitwise __be64; + +#ifndef __aligned_u64 +# define __aligned_u64 __u64 __attribute__((aligned(8))) +#endif + +struct list_head { + struct list_head *next, *prev; +}; + +#endif diff --git a/pkg/plugin/lib/common/libbpf/_include/uapi/linux/bpf.h b/pkg/plugin/lib/common/libbpf/_include/uapi/linux/bpf.h new file mode 100644 index 0000000000..35bcf52dbc --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_include/uapi/linux/bpf.h @@ -0,0 +1,7515 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef _UAPI__LINUX_BPF_H__ +#define _UAPI__LINUX_BPF_H__ + +#include +#include + +/* Extended instruction set based on top of classic BPF */ + +/* instruction classes */ +#define BPF_JMP32 0x06 /* jmp mode in word width */ +#define BPF_ALU64 0x07 /* alu mode in double word width */ + +/* ld/ldx fields */ +#define BPF_DW 0x18 /* double word (64-bit) */ +#define BPF_MEMSX 0x80 /* load with sign extension */ +#define BPF_ATOMIC 0xc0 /* atomic memory ops - op type in immediate */ +#define BPF_XADD 0xc0 /* exclusive add - legacy name */ + +/* alu/jmp fields */ +#define BPF_MOV 0xb0 /* mov reg to reg */ +#define BPF_ARSH 0xc0 /* sign extending arithmetic shift right */ + +/* change endianness of a register */ +#define BPF_END 0xd0 /* flags for endianness conversion: */ +#define BPF_TO_LE 0x00 /* convert to little-endian */ +#define BPF_TO_BE 0x08 /* convert to big-endian */ +#define BPF_FROM_LE BPF_TO_LE +#define BPF_FROM_BE BPF_TO_BE + +/* jmp encodings */ +#define BPF_JNE 0x50 /* jump != */ +#define BPF_JLT 0xa0 /* LT is unsigned, '<' */ +#define BPF_JLE 0xb0 /* LE is unsigned, '<=' */ +#define BPF_JSGT 0x60 /* SGT is signed '>', GT in x86 */ +#define BPF_JSGE 0x70 /* SGE is signed '>=', GE in x86 */ +#define BPF_JSLT 0xc0 /* SLT is signed, '<' */ +#define BPF_JSLE 0xd0 /* SLE is signed, '<=' */ +#define BPF_JCOND 0xe0 /* conditional pseudo jumps: may_goto, goto_or_nop */ +#define BPF_CALL 0x80 /* function call */ +#define BPF_EXIT 0x90 /* function return */ + +/* atomic op type fields (stored in immediate) */ +#define BPF_FETCH 0x01 /* not an opcode on its own, used to build others */ +#define BPF_XCHG (0xe0 | BPF_FETCH) /* atomic exchange */ +#define BPF_CMPXCHG (0xf0 | BPF_FETCH) /* atomic compare-and-write */ + +enum bpf_cond_pseudo_jmp { + BPF_MAY_GOTO = 0, +}; + +/* Register numbers */ +enum { + BPF_REG_0 = 0, + BPF_REG_1, + BPF_REG_2, + BPF_REG_3, + BPF_REG_4, + BPF_REG_5, + BPF_REG_6, + BPF_REG_7, + BPF_REG_8, + BPF_REG_9, + BPF_REG_10, + __MAX_BPF_REG, +}; + +/* BPF has 10 general purpose 64-bit registers and stack frame. */ +#define MAX_BPF_REG __MAX_BPF_REG + +struct bpf_insn { + __u8 code; /* opcode */ + __u8 dst_reg:4; /* dest register */ + __u8 src_reg:4; /* source register */ + __s16 off; /* signed offset */ + __s32 imm; /* signed immediate constant */ +}; + +/* Deprecated: use struct bpf_lpm_trie_key_u8 (when the "data" member is needed for + * byte access) or struct bpf_lpm_trie_key_hdr (when using an alternative type for + * the trailing flexible array member) instead. + */ +struct bpf_lpm_trie_key { + __u32 prefixlen; /* up to 32 for AF_INET, 128 for AF_INET6 */ + __u8 data[0]; /* Arbitrary size */ +}; + +/* Header for bpf_lpm_trie_key structs */ +struct bpf_lpm_trie_key_hdr { + __u32 prefixlen; +}; + +/* Key of an a BPF_MAP_TYPE_LPM_TRIE entry, with trailing byte array. */ +struct bpf_lpm_trie_key_u8 { + union { + struct bpf_lpm_trie_key_hdr hdr; + __u32 prefixlen; + }; + __u8 data[]; /* Arbitrary size */ +}; + +struct bpf_cgroup_storage_key { + __u64 cgroup_inode_id; /* cgroup inode id */ + __u32 attach_type; /* program attach type (enum bpf_attach_type) */ +}; + +enum bpf_cgroup_iter_order { + BPF_CGROUP_ITER_ORDER_UNSPEC = 0, + BPF_CGROUP_ITER_SELF_ONLY, /* process only a single object. */ + BPF_CGROUP_ITER_DESCENDANTS_PRE, /* walk descendants in pre-order. */ + BPF_CGROUP_ITER_DESCENDANTS_POST, /* walk descendants in post-order. */ + BPF_CGROUP_ITER_ANCESTORS_UP, /* walk ancestors upward. */ +}; + +union bpf_iter_link_info { + struct { + __u32 map_fd; + } map; + struct { + enum bpf_cgroup_iter_order order; + + /* At most one of cgroup_fd and cgroup_id can be non-zero. If + * both are zero, the walk starts from the default cgroup v2 + * root. For walking v1 hierarchy, one should always explicitly + * specify cgroup_fd. + */ + __u32 cgroup_fd; + __u64 cgroup_id; + } cgroup; + /* Parameters of task iterators. */ + struct { + __u32 tid; + __u32 pid; + __u32 pid_fd; + } task; +}; + +/* BPF syscall commands, see bpf(2) man-page for more details. */ +/** + * DOC: eBPF Syscall Preamble + * + * The operation to be performed by the **bpf**\ () system call is determined + * by the *cmd* argument. Each operation takes an accompanying argument, + * provided via *attr*, which is a pointer to a union of type *bpf_attr* (see + * below). The size argument is the size of the union pointed to by *attr*. + */ +/** + * DOC: eBPF Syscall Commands + * + * BPF_MAP_CREATE + * Description + * Create a map and return a file descriptor that refers to the + * map. The close-on-exec file descriptor flag (see **fcntl**\ (2)) + * is automatically enabled for the new file descriptor. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_MAP_CREATE** will delete the map (but see NOTES). + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_MAP_LOOKUP_ELEM + * Description + * Look up an element with a given *key* in the map referred to + * by the file descriptor *map_fd*. + * + * The *flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_UPDATE_ELEM + * Description + * Create or update an element (key/value pair) in a specified map. + * + * The *flags* argument should be specified as one of the + * following: + * + * **BPF_ANY** + * Create a new element or update an existing element. + * **BPF_NOEXIST** + * Create a new element only if it did not exist. + * **BPF_EXIST** + * Update an existing element. + * **BPF_F_LOCK** + * Update a spin_lock-ed map element. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, + * **E2BIG**, **EEXIST**, or **ENOENT**. + * + * **E2BIG** + * The number of elements in the map reached the + * *max_entries* limit specified at map creation time. + * **EEXIST** + * If *flags* specifies **BPF_NOEXIST** and the element + * with *key* already exists in the map. + * **ENOENT** + * If *flags* specifies **BPF_EXIST** and the element with + * *key* does not exist in the map. + * + * BPF_MAP_DELETE_ELEM + * Description + * Look up and delete an element by key in a specified map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_GET_NEXT_KEY + * Description + * Look up an element by key in a specified map and return the key + * of the next element. Can be used to iterate over all elements + * in the map. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * The following cases can be used to iterate over all elements of + * the map: + * + * * If *key* is not found, the operation returns zero and sets + * the *next_key* pointer to the key of the first element. + * * If *key* is found, the operation returns zero and sets the + * *next_key* pointer to the key of the next element. + * * If *key* is the last element, returns -1 and *errno* is set + * to **ENOENT**. + * + * May set *errno* to **ENOMEM**, **EFAULT**, **EPERM**, or + * **EINVAL** on error. + * + * BPF_PROG_LOAD + * Description + * Verify and load an eBPF program, returning a new file + * descriptor associated with the program. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_PROG_LOAD** will unload the eBPF program (but see NOTES). + * + * The close-on-exec file descriptor flag (see **fcntl**\ (2)) is + * automatically enabled for the new file descriptor. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_OBJ_PIN + * Description + * Pin an eBPF program or map referred by the specified *bpf_fd* + * to the provided *pathname* on the filesystem. + * + * The *pathname* argument must not contain a dot ("."). + * + * On success, *pathname* retains a reference to the eBPF object, + * preventing deallocation of the object when the original + * *bpf_fd* is closed. This allow the eBPF object to live beyond + * **close**\ (\ *bpf_fd*\ ), and hence the lifetime of the parent + * process. + * + * Applying **unlink**\ (2) or similar calls to the *pathname* + * unpins the object from the filesystem, removing the reference. + * If no other file descriptors or filesystem nodes refer to the + * same object, it will be deallocated (see NOTES). + * + * The filesystem type for the parent directory of *pathname* must + * be **BPF_FS_MAGIC**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_OBJ_GET + * Description + * Open a file descriptor for the eBPF object pinned to the + * specified *pathname*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_PROG_ATTACH + * Description + * Attach an eBPF program to a *target_fd* at the specified + * *attach_type* hook. + * + * The *attach_type* specifies the eBPF attachment point to + * attach the program to, and must be one of *bpf_attach_type* + * (see below). + * + * The *attach_bpf_fd* must be a valid file descriptor for a + * loaded eBPF program of a cgroup, flow dissector, LIRC, sockmap + * or sock_ops type corresponding to the specified *attach_type*. + * + * The *target_fd* must be a valid file descriptor for a kernel + * object which depends on the attach type of *attach_bpf_fd*: + * + * **BPF_PROG_TYPE_CGROUP_DEVICE**, + * **BPF_PROG_TYPE_CGROUP_SKB**, + * **BPF_PROG_TYPE_CGROUP_SOCK**, + * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, + * **BPF_PROG_TYPE_CGROUP_SYSCTL**, + * **BPF_PROG_TYPE_SOCK_OPS** + * + * Control Group v2 hierarchy with the eBPF controller + * enabled. Requires the kernel to be compiled with + * **CONFIG_CGROUP_BPF**. + * + * **BPF_PROG_TYPE_FLOW_DISSECTOR** + * + * Network namespace (eg /proc/self/ns/net). + * + * **BPF_PROG_TYPE_LIRC_MODE2** + * + * LIRC device path (eg /dev/lircN). Requires the kernel + * to be compiled with **CONFIG_BPF_LIRC_MODE2**. + * + * **BPF_PROG_TYPE_SK_SKB**, + * **BPF_PROG_TYPE_SK_MSG** + * + * eBPF map of socket type (eg **BPF_MAP_TYPE_SOCKHASH**). + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_DETACH + * Description + * Detach the eBPF program associated with the *target_fd* at the + * hook specified by *attach_type*. The program must have been + * previously attached using **BPF_PROG_ATTACH**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_TEST_RUN + * Description + * Run the eBPF program associated with the *prog_fd* a *repeat* + * number of times against a provided program context *ctx_in* and + * data *data_in*, and return the modified program context + * *ctx_out*, *data_out* (for example, packet data), result of the + * execution *retval*, and *duration* of the test run. + * + * The sizes of the buffers provided as input and output + * parameters *ctx_in*, *ctx_out*, *data_in*, and *data_out* must + * be provided in the corresponding variables *ctx_size_in*, + * *ctx_size_out*, *data_size_in*, and/or *data_size_out*. If any + * of these parameters are not provided (ie set to NULL), the + * corresponding size field must be zero. + * + * Some program types have particular requirements: + * + * **BPF_PROG_TYPE_SK_LOOKUP** + * *data_in* and *data_out* must be NULL. + * + * **BPF_PROG_TYPE_RAW_TRACEPOINT**, + * **BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE** + * + * *ctx_out*, *data_in* and *data_out* must be NULL. + * *repeat* must be zero. + * + * BPF_PROG_RUN is an alias for BPF_PROG_TEST_RUN. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * **ENOSPC** + * Either *data_size_out* or *ctx_size_out* is too small. + * **ENOTSUPP** + * This command is not supported by the program type of + * the program referred to by *prog_fd*. + * + * BPF_PROG_GET_NEXT_ID + * Description + * Fetch the next eBPF program currently loaded into the kernel. + * + * Looks for the eBPF program with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF programs + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_MAP_GET_NEXT_ID + * Description + * Fetch the next eBPF map currently loaded into the kernel. + * + * Looks for the eBPF map with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF maps + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_PROG_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF program corresponding to + * *prog_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_MAP_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF map corresponding to + * *map_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_OBJ_GET_INFO_BY_FD + * Description + * Obtain information about the eBPF object corresponding to + * *bpf_fd*. + * + * Populates up to *info_len* bytes of *info*, which will be in + * one of the following formats depending on the eBPF object type + * of *bpf_fd*: + * + * * **struct bpf_prog_info** + * * **struct bpf_map_info** + * * **struct bpf_btf_info** + * * **struct bpf_link_info** + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_QUERY + * Description + * Obtain information about eBPF programs associated with the + * specified *attach_type* hook. + * + * The *target_fd* must be a valid file descriptor for a kernel + * object which depends on the attach type of *attach_bpf_fd*: + * + * **BPF_PROG_TYPE_CGROUP_DEVICE**, + * **BPF_PROG_TYPE_CGROUP_SKB**, + * **BPF_PROG_TYPE_CGROUP_SOCK**, + * **BPF_PROG_TYPE_CGROUP_SOCK_ADDR**, + * **BPF_PROG_TYPE_CGROUP_SOCKOPT**, + * **BPF_PROG_TYPE_CGROUP_SYSCTL**, + * **BPF_PROG_TYPE_SOCK_OPS** + * + * Control Group v2 hierarchy with the eBPF controller + * enabled. Requires the kernel to be compiled with + * **CONFIG_CGROUP_BPF**. + * + * **BPF_PROG_TYPE_FLOW_DISSECTOR** + * + * Network namespace (eg /proc/self/ns/net). + * + * **BPF_PROG_TYPE_LIRC_MODE2** + * + * LIRC device path (eg /dev/lircN). Requires the kernel + * to be compiled with **CONFIG_BPF_LIRC_MODE2**. + * + * **BPF_PROG_QUERY** always fetches the number of programs + * attached and the *attach_flags* which were used to attach those + * programs. Additionally, if *prog_ids* is nonzero and the number + * of attached programs is less than *prog_cnt*, populates + * *prog_ids* with the eBPF program ids of the programs attached + * at *target_fd*. + * + * The following flags may alter the result: + * + * **BPF_F_QUERY_EFFECTIVE** + * Only return information regarding programs which are + * currently effective at the specified *target_fd*. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_RAW_TRACEPOINT_OPEN + * Description + * Attach an eBPF program to a tracepoint *name* to access kernel + * internal arguments of the tracepoint in their raw form. + * + * The *prog_fd* must be a valid file descriptor associated with + * a loaded eBPF program of type **BPF_PROG_TYPE_RAW_TRACEPOINT**. + * + * No ABI guarantees are made about the content of tracepoint + * arguments exposed to the corresponding eBPF program. + * + * Applying **close**\ (2) to the file descriptor returned by + * **BPF_RAW_TRACEPOINT_OPEN** will delete the map (but see NOTES). + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_BTF_LOAD + * Description + * Verify and load BPF Type Format (BTF) metadata into the kernel, + * returning a new file descriptor associated with the metadata. + * BTF is described in more detail at + * https://www.kernel.org/doc/html/latest/bpf/btf.html. + * + * The *btf* parameter must point to valid memory providing + * *btf_size* bytes of BTF binary metadata. + * + * The returned file descriptor can be passed to other **bpf**\ () + * subcommands such as **BPF_PROG_LOAD** or **BPF_MAP_CREATE** to + * associate the BTF with those objects. + * + * Similar to **BPF_PROG_LOAD**, **BPF_BTF_LOAD** has optional + * parameters to specify a *btf_log_buf*, *btf_log_size* and + * *btf_log_level* which allow the kernel to return freeform log + * output regarding the BTF verification process. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_BTF_GET_FD_BY_ID + * Description + * Open a file descriptor for the BPF Type Format (BTF) + * corresponding to *btf_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_TASK_FD_QUERY + * Description + * Obtain information about eBPF programs associated with the + * target process identified by *pid* and *fd*. + * + * If the *pid* and *fd* are associated with a tracepoint, kprobe + * or uprobe perf event, then the *prog_id* and *fd_type* will + * be populated with the eBPF program id and file descriptor type + * of type **bpf_task_fd_type**. If associated with a kprobe or + * uprobe, the *probe_offset* and *probe_addr* will also be + * populated. Optionally, if *buf* is provided, then up to + * *buf_len* bytes of *buf* will be populated with the name of + * the tracepoint, kprobe or uprobe. + * + * The resulting *prog_id* may be introspected in deeper detail + * using **BPF_PROG_GET_FD_BY_ID** and **BPF_OBJ_GET_INFO_BY_FD**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_LOOKUP_AND_DELETE_ELEM + * Description + * Look up an element with the given *key* in the map referred to + * by the file descriptor *fd*, and if found, delete the element. + * + * For **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map + * types, the *flags* argument needs to be set to 0, but for other + * map types, it may be specified as: + * + * **BPF_F_LOCK** + * Look up and delete the value of a spin-locked map + * without returning the lock. This must be specified if + * the elements contain a spinlock. + * + * The **BPF_MAP_TYPE_QUEUE** and **BPF_MAP_TYPE_STACK** map types + * implement this command as a "pop" operation, deleting the top + * element rather than one corresponding to *key*. + * The *key* and *key_len* parameters should be zeroed when + * issuing this operation for these map types. + * + * This command is only valid for the following map types: + * * **BPF_MAP_TYPE_QUEUE** + * * **BPF_MAP_TYPE_STACK** + * * **BPF_MAP_TYPE_HASH** + * * **BPF_MAP_TYPE_PERCPU_HASH** + * * **BPF_MAP_TYPE_LRU_HASH** + * * **BPF_MAP_TYPE_LRU_PERCPU_HASH** + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_FREEZE + * Description + * Freeze the permissions of the specified map. + * + * Write permissions may be frozen by passing zero *flags*. + * Upon success, no future syscall invocations may alter the + * map state of *map_fd*. Write operations from eBPF programs + * are still possible for a frozen map. + * + * Not supported for maps of type **BPF_MAP_TYPE_STRUCT_OPS**. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_BTF_GET_NEXT_ID + * Description + * Fetch the next BPF Type Format (BTF) object currently loaded + * into the kernel. + * + * Looks for the BTF object with an id greater than *start_id* + * and updates *next_id* on success. If no other BTF objects + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_MAP_LOOKUP_BATCH + * Description + * Iterate and fetch multiple elements in a map. + * + * Two opaque values are used to manage batch operations, + * *in_batch* and *out_batch*. Initially, *in_batch* must be set + * to NULL to begin the batched operation. After each subsequent + * **BPF_MAP_LOOKUP_BATCH**, the caller should pass the resultant + * *out_batch* as the *in_batch* for the next operation to + * continue iteration from the current point. Both *in_batch* and + * *out_batch* must point to memory large enough to hold a key, + * except for maps of type **BPF_MAP_TYPE_{HASH, PERCPU_HASH, + * LRU_HASH, LRU_PERCPU_HASH}**, for which batch parameters + * must be at least 4 bytes wide regardless of key size. + * + * The *keys* and *values* are output parameters which must point + * to memory large enough to hold *count* items based on the key + * and value size of the map *map_fd*. The *keys* buffer must be + * of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. + * + * The *elem_flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * On success, *count* elements from the map are copied into the + * user buffer, with the keys copied into *keys* and the values + * copied into the corresponding indices in *values*. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **ENOSPC** to indicate that *keys* or + * *values* is too small to dump an entire bucket during + * iteration of a hash-based map type. + * + * BPF_MAP_LOOKUP_AND_DELETE_BATCH + * Description + * Iterate and delete all elements in a map. + * + * This operation has the same behavior as + * **BPF_MAP_LOOKUP_BATCH** with two exceptions: + * + * * Every element that is successfully returned is also deleted + * from the map. This is at least *count* elements. Note that + * *count* is both an input and an output parameter. + * * Upon returning with *errno* set to **EFAULT**, up to + * *count* elements may be deleted without returning the keys + * and values of the deleted elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_MAP_UPDATE_BATCH + * Description + * Update multiple elements in a map by *key*. + * + * The *keys* and *values* are input parameters which must point + * to memory large enough to hold *count* items based on the key + * and value size of the map *map_fd*. The *keys* buffer must be + * of *key_size* * *count*. The *values* buffer must be of + * *value_size* * *count*. + * + * Each element specified in *keys* is sequentially updated to the + * value in the corresponding index in *values*. The *in_batch* + * and *out_batch* parameters are ignored and should be zeroed. + * + * The *elem_flags* argument should be specified as one of the + * following: + * + * **BPF_ANY** + * Create new elements or update a existing elements. + * **BPF_NOEXIST** + * Create new elements only if they do not exist. + * **BPF_EXIST** + * Update existing elements. + * **BPF_F_LOCK** + * Update spin_lock-ed map elements. This must be + * specified if the map value contains a spinlock. + * + * On success, *count* elements from the map are updated. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * May set *errno* to **EINVAL**, **EPERM**, **ENOMEM**, or + * **E2BIG**. **E2BIG** indicates that the number of elements in + * the map reached the *max_entries* limit specified at map + * creation time. + * + * May set *errno* to one of the following error codes under + * specific circumstances: + * + * **EEXIST** + * If *flags* specifies **BPF_NOEXIST** and the element + * with *key* already exists in the map. + * **ENOENT** + * If *flags* specifies **BPF_EXIST** and the element with + * *key* does not exist in the map. + * + * BPF_MAP_DELETE_BATCH + * Description + * Delete multiple elements in a map by *key*. + * + * The *keys* parameter is an input parameter which must point + * to memory large enough to hold *count* items based on the key + * size of the map *map_fd*, that is, *key_size* * *count*. + * + * Each element specified in *keys* is sequentially deleted. The + * *in_batch*, *out_batch*, and *values* parameters are ignored + * and should be zeroed. + * + * The *elem_flags* argument may be specified as one of the + * following: + * + * **BPF_F_LOCK** + * Look up the value of a spin-locked map without + * returning the lock. This must be specified if the + * elements contain a spinlock. + * + * On success, *count* elements from the map are updated. + * + * If an error is returned and *errno* is not **EFAULT**, *count* + * is set to the number of successfully processed elements. If + * *errno* is **EFAULT**, up to *count* elements may be been + * deleted. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_LINK_CREATE + * Description + * Attach an eBPF program to a *target_fd* at the specified + * *attach_type* hook and return a file descriptor handle for + * managing the link. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_UPDATE + * Description + * Update the eBPF program in the specified *link_fd* to + * *new_prog_fd*. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_LINK_GET_FD_BY_ID + * Description + * Open a file descriptor for the eBPF Link corresponding to + * *link_id*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_GET_NEXT_ID + * Description + * Fetch the next eBPF link currently loaded into the kernel. + * + * Looks for the eBPF link with an id greater than *start_id* + * and updates *next_id* on success. If no other eBPF links + * remain with ids higher than *start_id*, returns -1 and sets + * *errno* to **ENOENT**. + * + * Return + * Returns zero on success. On error, or when no id remains, -1 + * is returned and *errno* is set appropriately. + * + * BPF_ENABLE_STATS + * Description + * Enable eBPF runtime statistics gathering. + * + * Runtime statistics gathering for the eBPF runtime is disabled + * by default to minimize the corresponding performance overhead. + * This command enables statistics globally. + * + * Multiple programs may independently enable statistics. + * After gathering the desired statistics, eBPF runtime statistics + * may be disabled again by calling **close**\ (2) for the file + * descriptor returned by this function. Statistics will only be + * disabled system-wide when all outstanding file descriptors + * returned by prior calls for this subcommand are closed. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_ITER_CREATE + * Description + * Create an iterator on top of the specified *link_fd* (as + * previously created using **BPF_LINK_CREATE**) and return a + * file descriptor that can be used to trigger the iteration. + * + * If the resulting file descriptor is pinned to the filesystem + * using **BPF_OBJ_PIN**, then subsequent **read**\ (2) syscalls + * for that path will trigger the iterator to read kernel state + * using the eBPF program attached to *link_fd*. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * BPF_LINK_DETACH + * Description + * Forcefully detach the specified *link_fd* from its + * corresponding attachment point. + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_PROG_BIND_MAP + * Description + * Bind a map to the lifetime of an eBPF program. + * + * The map identified by *map_fd* is bound to the program + * identified by *prog_fd* and only released when *prog_fd* is + * released. This may be used in cases where metadata should be + * associated with a program which otherwise does not contain any + * references to the map (for example, embedded in the eBPF + * program instructions). + * + * Return + * Returns zero on success. On error, -1 is returned and *errno* + * is set appropriately. + * + * BPF_TOKEN_CREATE + * Description + * Create BPF token with embedded information about what + * BPF-related functionality it allows: + * - a set of allowed bpf() syscall commands; + * - a set of allowed BPF map types to be created with + * BPF_MAP_CREATE command, if BPF_MAP_CREATE itself is allowed; + * - a set of allowed BPF program types and BPF program attach + * types to be loaded with BPF_PROG_LOAD command, if + * BPF_PROG_LOAD itself is allowed. + * + * BPF token is created (derived) from an instance of BPF FS, + * assuming it has necessary delegation mount options specified. + * This BPF token can be passed as an extra parameter to various + * bpf() syscall commands to grant BPF subsystem functionality to + * unprivileged processes. + * + * When created, BPF token is "associated" with the owning + * user namespace of BPF FS instance (super block) that it was + * derived from, and subsequent BPF operations performed with + * BPF token would be performing capabilities checks (i.e., + * CAP_BPF, CAP_PERFMON, CAP_NET_ADMIN, CAP_SYS_ADMIN) within + * that user namespace. Without BPF token, such capabilities + * have to be granted in init user namespace, making bpf() + * syscall incompatible with user namespace, for the most part. + * + * Return + * A new file descriptor (a nonnegative integer), or -1 if an + * error occurred (in which case, *errno* is set appropriately). + * + * NOTES + * eBPF objects (maps and programs) can be shared between processes. + * + * * After **fork**\ (2), the child inherits file descriptors + * referring to the same eBPF objects. + * * File descriptors referring to eBPF objects can be transferred over + * **unix**\ (7) domain sockets. + * * File descriptors referring to eBPF objects can be duplicated in the + * usual way, using **dup**\ (2) and similar calls. + * * File descriptors referring to eBPF objects can be pinned to the + * filesystem using the **BPF_OBJ_PIN** command of **bpf**\ (2). + * + * An eBPF object is deallocated only after all file descriptors referring + * to the object have been closed and no references remain pinned to the + * filesystem or attached (for example, bound to a program or device). + */ +enum bpf_cmd { + BPF_MAP_CREATE, + BPF_MAP_LOOKUP_ELEM, + BPF_MAP_UPDATE_ELEM, + BPF_MAP_DELETE_ELEM, + BPF_MAP_GET_NEXT_KEY, + BPF_PROG_LOAD, + BPF_OBJ_PIN, + BPF_OBJ_GET, + BPF_PROG_ATTACH, + BPF_PROG_DETACH, + BPF_PROG_TEST_RUN, + BPF_PROG_RUN = BPF_PROG_TEST_RUN, + BPF_PROG_GET_NEXT_ID, + BPF_MAP_GET_NEXT_ID, + BPF_PROG_GET_FD_BY_ID, + BPF_MAP_GET_FD_BY_ID, + BPF_OBJ_GET_INFO_BY_FD, + BPF_PROG_QUERY, + BPF_RAW_TRACEPOINT_OPEN, + BPF_BTF_LOAD, + BPF_BTF_GET_FD_BY_ID, + BPF_TASK_FD_QUERY, + BPF_MAP_LOOKUP_AND_DELETE_ELEM, + BPF_MAP_FREEZE, + BPF_BTF_GET_NEXT_ID, + BPF_MAP_LOOKUP_BATCH, + BPF_MAP_LOOKUP_AND_DELETE_BATCH, + BPF_MAP_UPDATE_BATCH, + BPF_MAP_DELETE_BATCH, + BPF_LINK_CREATE, + BPF_LINK_UPDATE, + BPF_LINK_GET_FD_BY_ID, + BPF_LINK_GET_NEXT_ID, + BPF_ENABLE_STATS, + BPF_ITER_CREATE, + BPF_LINK_DETACH, + BPF_PROG_BIND_MAP, + BPF_TOKEN_CREATE, + __MAX_BPF_CMD, +}; + +enum bpf_map_type { + BPF_MAP_TYPE_UNSPEC, + BPF_MAP_TYPE_HASH, + BPF_MAP_TYPE_ARRAY, + BPF_MAP_TYPE_PROG_ARRAY, + BPF_MAP_TYPE_PERF_EVENT_ARRAY, + BPF_MAP_TYPE_PERCPU_HASH, + BPF_MAP_TYPE_PERCPU_ARRAY, + BPF_MAP_TYPE_STACK_TRACE, + BPF_MAP_TYPE_CGROUP_ARRAY, + BPF_MAP_TYPE_LRU_HASH, + BPF_MAP_TYPE_LRU_PERCPU_HASH, + BPF_MAP_TYPE_LPM_TRIE, + BPF_MAP_TYPE_ARRAY_OF_MAPS, + BPF_MAP_TYPE_HASH_OF_MAPS, + BPF_MAP_TYPE_DEVMAP, + BPF_MAP_TYPE_SOCKMAP, + BPF_MAP_TYPE_CPUMAP, + BPF_MAP_TYPE_XSKMAP, + BPF_MAP_TYPE_SOCKHASH, + BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED, + /* BPF_MAP_TYPE_CGROUP_STORAGE is available to bpf programs attaching + * to a cgroup. The newer BPF_MAP_TYPE_CGRP_STORAGE is available to + * both cgroup-attached and other progs and supports all functionality + * provided by BPF_MAP_TYPE_CGROUP_STORAGE. So mark + * BPF_MAP_TYPE_CGROUP_STORAGE deprecated. + */ + BPF_MAP_TYPE_CGROUP_STORAGE = BPF_MAP_TYPE_CGROUP_STORAGE_DEPRECATED, + BPF_MAP_TYPE_REUSEPORT_SOCKARRAY, + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE_DEPRECATED, + /* BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE is available to bpf programs + * attaching to a cgroup. The new mechanism (BPF_MAP_TYPE_CGRP_STORAGE + + * local percpu kptr) supports all BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE + * functionality and more. So mark * BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE + * deprecated. + */ + BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE = BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE_DEPRECATED, + BPF_MAP_TYPE_QUEUE, + BPF_MAP_TYPE_STACK, + BPF_MAP_TYPE_SK_STORAGE, + BPF_MAP_TYPE_DEVMAP_HASH, + BPF_MAP_TYPE_STRUCT_OPS, + BPF_MAP_TYPE_RINGBUF, + BPF_MAP_TYPE_INODE_STORAGE, + BPF_MAP_TYPE_TASK_STORAGE, + BPF_MAP_TYPE_BLOOM_FILTER, + BPF_MAP_TYPE_USER_RINGBUF, + BPF_MAP_TYPE_CGRP_STORAGE, + BPF_MAP_TYPE_ARENA, + __MAX_BPF_MAP_TYPE +}; + +/* Note that tracing related programs such as + * BPF_PROG_TYPE_{KPROBE,TRACEPOINT,PERF_EVENT,RAW_TRACEPOINT} + * are not subject to a stable API since kernel internal data + * structures can change from release to release and may + * therefore break existing tracing BPF programs. Tracing BPF + * programs correspond to /a/ specific kernel which is to be + * analyzed, and not /a/ specific kernel /and/ all future ones. + */ +enum bpf_prog_type { + BPF_PROG_TYPE_UNSPEC, + BPF_PROG_TYPE_SOCKET_FILTER, + BPF_PROG_TYPE_KPROBE, + BPF_PROG_TYPE_SCHED_CLS, + BPF_PROG_TYPE_SCHED_ACT, + BPF_PROG_TYPE_TRACEPOINT, + BPF_PROG_TYPE_XDP, + BPF_PROG_TYPE_PERF_EVENT, + BPF_PROG_TYPE_CGROUP_SKB, + BPF_PROG_TYPE_CGROUP_SOCK, + BPF_PROG_TYPE_LWT_IN, + BPF_PROG_TYPE_LWT_OUT, + BPF_PROG_TYPE_LWT_XMIT, + BPF_PROG_TYPE_SOCK_OPS, + BPF_PROG_TYPE_SK_SKB, + BPF_PROG_TYPE_CGROUP_DEVICE, + BPF_PROG_TYPE_SK_MSG, + BPF_PROG_TYPE_RAW_TRACEPOINT, + BPF_PROG_TYPE_CGROUP_SOCK_ADDR, + BPF_PROG_TYPE_LWT_SEG6LOCAL, + BPF_PROG_TYPE_LIRC_MODE2, + BPF_PROG_TYPE_SK_REUSEPORT, + BPF_PROG_TYPE_FLOW_DISSECTOR, + BPF_PROG_TYPE_CGROUP_SYSCTL, + BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, + BPF_PROG_TYPE_CGROUP_SOCKOPT, + BPF_PROG_TYPE_TRACING, + BPF_PROG_TYPE_STRUCT_OPS, + BPF_PROG_TYPE_EXT, + BPF_PROG_TYPE_LSM, + BPF_PROG_TYPE_SK_LOOKUP, + BPF_PROG_TYPE_SYSCALL, /* a program that can execute syscalls */ + BPF_PROG_TYPE_NETFILTER, + __MAX_BPF_PROG_TYPE +}; + +enum bpf_attach_type { + BPF_CGROUP_INET_INGRESS, + BPF_CGROUP_INET_EGRESS, + BPF_CGROUP_INET_SOCK_CREATE, + BPF_CGROUP_SOCK_OPS, + BPF_SK_SKB_STREAM_PARSER, + BPF_SK_SKB_STREAM_VERDICT, + BPF_CGROUP_DEVICE, + BPF_SK_MSG_VERDICT, + BPF_CGROUP_INET4_BIND, + BPF_CGROUP_INET6_BIND, + BPF_CGROUP_INET4_CONNECT, + BPF_CGROUP_INET6_CONNECT, + BPF_CGROUP_INET4_POST_BIND, + BPF_CGROUP_INET6_POST_BIND, + BPF_CGROUP_UDP4_SENDMSG, + BPF_CGROUP_UDP6_SENDMSG, + BPF_LIRC_MODE2, + BPF_FLOW_DISSECTOR, + BPF_CGROUP_SYSCTL, + BPF_CGROUP_UDP4_RECVMSG, + BPF_CGROUP_UDP6_RECVMSG, + BPF_CGROUP_GETSOCKOPT, + BPF_CGROUP_SETSOCKOPT, + BPF_TRACE_RAW_TP, + BPF_TRACE_FENTRY, + BPF_TRACE_FEXIT, + BPF_MODIFY_RETURN, + BPF_LSM_MAC, + BPF_TRACE_ITER, + BPF_CGROUP_INET4_GETPEERNAME, + BPF_CGROUP_INET6_GETPEERNAME, + BPF_CGROUP_INET4_GETSOCKNAME, + BPF_CGROUP_INET6_GETSOCKNAME, + BPF_XDP_DEVMAP, + BPF_CGROUP_INET_SOCK_RELEASE, + BPF_XDP_CPUMAP, + BPF_SK_LOOKUP, + BPF_XDP, + BPF_SK_SKB_VERDICT, + BPF_SK_REUSEPORT_SELECT, + BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, + BPF_PERF_EVENT, + BPF_TRACE_KPROBE_MULTI, + BPF_LSM_CGROUP, + BPF_STRUCT_OPS, + BPF_NETFILTER, + BPF_TCX_INGRESS, + BPF_TCX_EGRESS, + BPF_TRACE_UPROBE_MULTI, + BPF_CGROUP_UNIX_CONNECT, + BPF_CGROUP_UNIX_SENDMSG, + BPF_CGROUP_UNIX_RECVMSG, + BPF_CGROUP_UNIX_GETPEERNAME, + BPF_CGROUP_UNIX_GETSOCKNAME, + BPF_NETKIT_PRIMARY, + BPF_NETKIT_PEER, + BPF_TRACE_KPROBE_SESSION, + __MAX_BPF_ATTACH_TYPE +}; + +#define MAX_BPF_ATTACH_TYPE __MAX_BPF_ATTACH_TYPE + +enum bpf_link_type { + BPF_LINK_TYPE_UNSPEC = 0, + BPF_LINK_TYPE_RAW_TRACEPOINT = 1, + BPF_LINK_TYPE_TRACING = 2, + BPF_LINK_TYPE_CGROUP = 3, + BPF_LINK_TYPE_ITER = 4, + BPF_LINK_TYPE_NETNS = 5, + BPF_LINK_TYPE_XDP = 6, + BPF_LINK_TYPE_PERF_EVENT = 7, + BPF_LINK_TYPE_KPROBE_MULTI = 8, + BPF_LINK_TYPE_STRUCT_OPS = 9, + BPF_LINK_TYPE_NETFILTER = 10, + BPF_LINK_TYPE_TCX = 11, + BPF_LINK_TYPE_UPROBE_MULTI = 12, + BPF_LINK_TYPE_NETKIT = 13, + BPF_LINK_TYPE_SOCKMAP = 14, + __MAX_BPF_LINK_TYPE, +}; + +#define MAX_BPF_LINK_TYPE __MAX_BPF_LINK_TYPE + +enum bpf_perf_event_type { + BPF_PERF_EVENT_UNSPEC = 0, + BPF_PERF_EVENT_UPROBE = 1, + BPF_PERF_EVENT_URETPROBE = 2, + BPF_PERF_EVENT_KPROBE = 3, + BPF_PERF_EVENT_KRETPROBE = 4, + BPF_PERF_EVENT_TRACEPOINT = 5, + BPF_PERF_EVENT_EVENT = 6, +}; + +/* cgroup-bpf attach flags used in BPF_PROG_ATTACH command + * + * NONE(default): No further bpf programs allowed in the subtree. + * + * BPF_F_ALLOW_OVERRIDE: If a sub-cgroup installs some bpf program, + * the program in this cgroup yields to sub-cgroup program. + * + * BPF_F_ALLOW_MULTI: If a sub-cgroup installs some bpf program, + * that cgroup program gets run in addition to the program in this cgroup. + * + * Only one program is allowed to be attached to a cgroup with + * NONE or BPF_F_ALLOW_OVERRIDE flag. + * Attaching another program on top of NONE or BPF_F_ALLOW_OVERRIDE will + * release old program and attach the new one. Attach flags has to match. + * + * Multiple programs are allowed to be attached to a cgroup with + * BPF_F_ALLOW_MULTI flag. They are executed in FIFO order + * (those that were attached first, run first) + * The programs of sub-cgroup are executed first, then programs of + * this cgroup and then programs of parent cgroup. + * When children program makes decision (like picking TCP CA or sock bind) + * parent program has a chance to override it. + * + * With BPF_F_ALLOW_MULTI a new program is added to the end of the list of + * programs for a cgroup. Though it's possible to replace an old program at + * any position by also specifying BPF_F_REPLACE flag and position itself in + * replace_bpf_fd attribute. Old program at this position will be released. + * + * A cgroup with MULTI or OVERRIDE flag allows any attach flags in sub-cgroups. + * A cgroup with NONE doesn't allow any programs in sub-cgroups. + * Ex1: + * cgrp1 (MULTI progs A, B) -> + * cgrp2 (OVERRIDE prog C) -> + * cgrp3 (MULTI prog D) -> + * cgrp4 (OVERRIDE prog E) -> + * cgrp5 (NONE prog F) + * the event in cgrp5 triggers execution of F,D,A,B in that order. + * if prog F is detached, the execution is E,D,A,B + * if prog F and D are detached, the execution is E,A,B + * if prog F, E and D are detached, the execution is C,A,B + * + * All eligible programs are executed regardless of return code from + * earlier programs. + */ +#define BPF_F_ALLOW_OVERRIDE (1U << 0) +#define BPF_F_ALLOW_MULTI (1U << 1) +/* Generic attachment flags. */ +#define BPF_F_REPLACE (1U << 2) +#define BPF_F_BEFORE (1U << 3) +#define BPF_F_AFTER (1U << 4) +#define BPF_F_ID (1U << 5) +#define BPF_F_LINK BPF_F_LINK /* 1 << 13 */ + +/* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the + * verifier will perform strict alignment checking as if the kernel + * has been built with CONFIG_EFFICIENT_UNALIGNED_ACCESS not set, + * and NET_IP_ALIGN defined to 2. + */ +#define BPF_F_STRICT_ALIGNMENT (1U << 0) + +/* If BPF_F_ANY_ALIGNMENT is used in BPF_PROG_LOAD command, the + * verifier will allow any alignment whatsoever. On platforms + * with strict alignment requirements for loads ands stores (such + * as sparc and mips) the verifier validates that all loads and + * stores provably follow this requirement. This flag turns that + * checking and enforcement off. + * + * It is mostly used for testing when we want to validate the + * context and memory access aspects of the verifier, but because + * of an unaligned access the alignment check would trigger before + * the one we are interested in. + */ +#define BPF_F_ANY_ALIGNMENT (1U << 1) + +/* BPF_F_TEST_RND_HI32 is used in BPF_PROG_LOAD command for testing purpose. + * Verifier does sub-register def/use analysis and identifies instructions whose + * def only matters for low 32-bit, high 32-bit is never referenced later + * through implicit zero extension. Therefore verifier notifies JIT back-ends + * that it is safe to ignore clearing high 32-bit for these instructions. This + * saves some back-ends a lot of code-gen. However such optimization is not + * necessary on some arches, for example x86_64, arm64 etc, whose JIT back-ends + * hence hasn't used verifier's analysis result. But, we really want to have a + * way to be able to verify the correctness of the described optimization on + * x86_64 on which testsuites are frequently exercised. + * + * So, this flag is introduced. Once it is set, verifier will randomize high + * 32-bit for those instructions who has been identified as safe to ignore them. + * Then, if verifier is not doing correct analysis, such randomization will + * regress tests to expose bugs. + */ +#define BPF_F_TEST_RND_HI32 (1U << 2) + +/* The verifier internal test flag. Behavior is undefined */ +#define BPF_F_TEST_STATE_FREQ (1U << 3) + +/* If BPF_F_SLEEPABLE is used in BPF_PROG_LOAD command, the verifier will + * restrict map and helper usage for such programs. Sleepable BPF programs can + * only be attached to hooks where kernel execution context allows sleeping. + * Such programs are allowed to use helpers that may sleep like + * bpf_copy_from_user(). + */ +#define BPF_F_SLEEPABLE (1U << 4) + +/* If BPF_F_XDP_HAS_FRAGS is used in BPF_PROG_LOAD command, the loaded program + * fully support xdp frags. + */ +#define BPF_F_XDP_HAS_FRAGS (1U << 5) + +/* If BPF_F_XDP_DEV_BOUND_ONLY is used in BPF_PROG_LOAD command, the loaded + * program becomes device-bound but can access XDP metadata. + */ +#define BPF_F_XDP_DEV_BOUND_ONLY (1U << 6) + +/* The verifier internal test flag. Behavior is undefined */ +#define BPF_F_TEST_REG_INVARIANTS (1U << 7) + +/* link_create.kprobe_multi.flags used in LINK_CREATE command for + * BPF_TRACE_KPROBE_MULTI attach type to create return probe. + */ +enum { + BPF_F_KPROBE_MULTI_RETURN = (1U << 0) +}; + +/* link_create.uprobe_multi.flags used in LINK_CREATE command for + * BPF_TRACE_UPROBE_MULTI attach type to create return probe. + */ +enum { + BPF_F_UPROBE_MULTI_RETURN = (1U << 0) +}; + +/* link_create.netfilter.flags used in LINK_CREATE command for + * BPF_PROG_TYPE_NETFILTER to enable IP packet defragmentation. + */ +#define BPF_F_NETFILTER_IP_DEFRAG (1U << 0) + +/* When BPF ldimm64's insn[0].src_reg != 0 then this can have + * the following extensions: + * + * insn[0].src_reg: BPF_PSEUDO_MAP_[FD|IDX] + * insn[0].imm: map fd or fd_idx + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map + * verifier type: CONST_PTR_TO_MAP + */ +#define BPF_PSEUDO_MAP_FD 1 +#define BPF_PSEUDO_MAP_IDX 5 + +/* insn[0].src_reg: BPF_PSEUDO_MAP_[IDX_]VALUE + * insn[0].imm: map fd or fd_idx + * insn[1].imm: offset into value + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of map[0]+offset + * verifier type: PTR_TO_MAP_VALUE + */ +#define BPF_PSEUDO_MAP_VALUE 2 +#define BPF_PSEUDO_MAP_IDX_VALUE 6 + +/* insn[0].src_reg: BPF_PSEUDO_BTF_ID + * insn[0].imm: kernel btd id of VAR + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the kernel variable + * verifier type: PTR_TO_BTF_ID or PTR_TO_MEM, depending on whether the var + * is struct/union. + */ +#define BPF_PSEUDO_BTF_ID 3 +/* insn[0].src_reg: BPF_PSEUDO_FUNC + * insn[0].imm: insn offset to the func + * insn[1].imm: 0 + * insn[0].off: 0 + * insn[1].off: 0 + * ldimm64 rewrite: address of the function + * verifier type: PTR_TO_FUNC. + */ +#define BPF_PSEUDO_FUNC 4 + +/* when bpf_call->src_reg == BPF_PSEUDO_CALL, bpf_call->imm == pc-relative + * offset to another bpf function + */ +#define BPF_PSEUDO_CALL 1 +/* when bpf_call->src_reg == BPF_PSEUDO_KFUNC_CALL, + * bpf_call->imm == btf_id of a BTF_KIND_FUNC in the running kernel + */ +#define BPF_PSEUDO_KFUNC_CALL 2 + +enum bpf_addr_space_cast { + BPF_ADDR_SPACE_CAST = 1, +}; + +/* flags for BPF_MAP_UPDATE_ELEM command */ +enum { + BPF_ANY = 0, /* create new element or update existing */ + BPF_NOEXIST = 1, /* create new element if it didn't exist */ + BPF_EXIST = 2, /* update existing element */ + BPF_F_LOCK = 4, /* spin_lock-ed map_lookup/map_update */ +}; + +/* flags for BPF_MAP_CREATE command */ +enum { + BPF_F_NO_PREALLOC = (1U << 0), +/* Instead of having one common LRU list in the + * BPF_MAP_TYPE_LRU_[PERCPU_]HASH map, use a percpu LRU list + * which can scale and perform better. + * Note, the LRU nodes (including free nodes) cannot be moved + * across different LRU lists. + */ + BPF_F_NO_COMMON_LRU = (1U << 1), +/* Specify numa node during map creation */ + BPF_F_NUMA_NODE = (1U << 2), + +/* Flags for accessing BPF object from syscall side. */ + BPF_F_RDONLY = (1U << 3), + BPF_F_WRONLY = (1U << 4), + +/* Flag for stack_map, store build_id+offset instead of pointer */ + BPF_F_STACK_BUILD_ID = (1U << 5), + +/* Zero-initialize hash function seed. This should only be used for testing. */ + BPF_F_ZERO_SEED = (1U << 6), + +/* Flags for accessing BPF object from program side. */ + BPF_F_RDONLY_PROG = (1U << 7), + BPF_F_WRONLY_PROG = (1U << 8), + +/* Clone map from listener for newly accepted socket */ + BPF_F_CLONE = (1U << 9), + +/* Enable memory-mapping BPF map */ + BPF_F_MMAPABLE = (1U << 10), + +/* Share perf_event among processes */ + BPF_F_PRESERVE_ELEMS = (1U << 11), + +/* Create a map that is suitable to be an inner map with dynamic max entries */ + BPF_F_INNER_MAP = (1U << 12), + +/* Create a map that will be registered/unregesitered by the backed bpf_link */ + BPF_F_LINK = (1U << 13), + +/* Get path from provided FD in BPF_OBJ_PIN/BPF_OBJ_GET commands */ + BPF_F_PATH_FD = (1U << 14), + +/* Flag for value_type_btf_obj_fd, the fd is available */ + BPF_F_VTYPE_BTF_OBJ_FD = (1U << 15), + +/* BPF token FD is passed in a corresponding command's token_fd field */ + BPF_F_TOKEN_FD = (1U << 16), + +/* When user space page faults in bpf_arena send SIGSEGV instead of inserting new page */ + BPF_F_SEGV_ON_FAULT = (1U << 17), + +/* Do not translate kernel bpf_arena pointers to user pointers */ + BPF_F_NO_USER_CONV = (1U << 18), +}; + +/* Flags for BPF_PROG_QUERY. */ + +/* Query effective (directly attached + inherited from ancestor cgroups) + * programs that will be executed for events within a cgroup. + * attach_flags with this flag are always returned 0. + */ +#define BPF_F_QUERY_EFFECTIVE (1U << 0) + +/* Flags for BPF_PROG_TEST_RUN */ + +/* If set, run the test on the cpu specified by bpf_attr.test.cpu */ +#define BPF_F_TEST_RUN_ON_CPU (1U << 0) +/* If set, XDP frames will be transmitted after processing */ +#define BPF_F_TEST_XDP_LIVE_FRAMES (1U << 1) +/* If set, apply CHECKSUM_COMPLETE to skb and validate the checksum */ +#define BPF_F_TEST_SKB_CHECKSUM_COMPLETE (1U << 2) + +/* type for BPF_ENABLE_STATS */ +enum bpf_stats_type { + /* enabled run_time_ns and run_cnt */ + BPF_STATS_RUN_TIME = 0, +}; + +enum bpf_stack_build_id_status { + /* user space need an empty entry to identify end of a trace */ + BPF_STACK_BUILD_ID_EMPTY = 0, + /* with valid build_id and offset */ + BPF_STACK_BUILD_ID_VALID = 1, + /* couldn't get build_id, fallback to ip */ + BPF_STACK_BUILD_ID_IP = 2, +}; + +#define BPF_BUILD_ID_SIZE 20 +struct bpf_stack_build_id { + __s32 status; + unsigned char build_id[BPF_BUILD_ID_SIZE]; + union { + __u64 offset; + __u64 ip; + }; +}; + +#define BPF_OBJ_NAME_LEN 16U + +union bpf_attr { + struct { /* anonymous struct used by BPF_MAP_CREATE command */ + __u32 map_type; /* one of enum bpf_map_type */ + __u32 key_size; /* size of key in bytes */ + __u32 value_size; /* size of value in bytes */ + __u32 max_entries; /* max number of entries in a map */ + __u32 map_flags; /* BPF_MAP_CREATE related + * flags defined above. + */ + __u32 inner_map_fd; /* fd pointing to the inner map */ + __u32 numa_node; /* numa node (effective only if + * BPF_F_NUMA_NODE is set). + */ + char map_name[BPF_OBJ_NAME_LEN]; + __u32 map_ifindex; /* ifindex of netdev to create on */ + __u32 btf_fd; /* fd pointing to a BTF type data */ + __u32 btf_key_type_id; /* BTF type_id of the key */ + __u32 btf_value_type_id; /* BTF type_id of the value */ + __u32 btf_vmlinux_value_type_id;/* BTF type_id of a kernel- + * struct stored as the + * map value + */ + /* Any per-map-type extra fields + * + * BPF_MAP_TYPE_BLOOM_FILTER - the lowest 4 bits indicate the + * number of hash functions (if 0, the bloom filter will default + * to using 5 hash functions). + * + * BPF_MAP_TYPE_ARENA - contains the address where user space + * is going to mmap() the arena. It has to be page aligned. + */ + __u64 map_extra; + + __s32 value_type_btf_obj_fd; /* fd pointing to a BTF + * type data for + * btf_vmlinux_value_type_id. + */ + /* BPF token FD to use with BPF_MAP_CREATE operation. + * If provided, map_flags should have BPF_F_TOKEN_FD flag set. + */ + __s32 map_token_fd; + }; + + struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ + __u32 map_fd; + __aligned_u64 key; + union { + __aligned_u64 value; + __aligned_u64 next_key; + }; + __u64 flags; + }; + + struct { /* struct used by BPF_MAP_*_BATCH commands */ + __aligned_u64 in_batch; /* start batch, + * NULL to start from beginning + */ + __aligned_u64 out_batch; /* output: next start batch */ + __aligned_u64 keys; + __aligned_u64 values; + __u32 count; /* input/output: + * input: # of key/value + * elements + * output: # of filled elements + */ + __u32 map_fd; + __u64 elem_flags; + __u64 flags; + } batch; + + struct { /* anonymous struct used by BPF_PROG_LOAD command */ + __u32 prog_type; /* one of enum bpf_prog_type */ + __u32 insn_cnt; + __aligned_u64 insns; + __aligned_u64 license; + __u32 log_level; /* verbosity level of verifier */ + __u32 log_size; /* size of user buffer */ + __aligned_u64 log_buf; /* user supplied buffer */ + __u32 kern_version; /* not used */ + __u32 prog_flags; + char prog_name[BPF_OBJ_NAME_LEN]; + __u32 prog_ifindex; /* ifindex of netdev to prep for */ + /* For some prog types expected attach type must be known at + * load time to verify attach type specific parts of prog + * (context accesses, allowed helpers, etc). + */ + __u32 expected_attach_type; + __u32 prog_btf_fd; /* fd pointing to BTF type data */ + __u32 func_info_rec_size; /* userspace bpf_func_info size */ + __aligned_u64 func_info; /* func info */ + __u32 func_info_cnt; /* number of bpf_func_info records */ + __u32 line_info_rec_size; /* userspace bpf_line_info size */ + __aligned_u64 line_info; /* line info */ + __u32 line_info_cnt; /* number of bpf_line_info records */ + __u32 attach_btf_id; /* in-kernel BTF type id to attach to */ + union { + /* valid prog_fd to attach to bpf prog */ + __u32 attach_prog_fd; + /* or valid module BTF object fd or 0 to attach to vmlinux */ + __u32 attach_btf_obj_fd; + }; + __u32 core_relo_cnt; /* number of bpf_core_relo */ + __aligned_u64 fd_array; /* array of FDs */ + __aligned_u64 core_relos; + __u32 core_relo_rec_size; /* sizeof(struct bpf_core_relo) */ + /* output: actual total log contents size (including termintaing zero). + * It could be both larger than original log_size (if log was + * truncated), or smaller (if log buffer wasn't filled completely). + */ + __u32 log_true_size; + /* BPF token FD to use with BPF_PROG_LOAD operation. + * If provided, prog_flags should have BPF_F_TOKEN_FD flag set. + */ + __s32 prog_token_fd; + }; + + struct { /* anonymous struct used by BPF_OBJ_* commands */ + __aligned_u64 pathname; + __u32 bpf_fd; + __u32 file_flags; + /* Same as dirfd in openat() syscall; see openat(2) + * manpage for details of path FD and pathname semantics; + * path_fd should accompanied by BPF_F_PATH_FD flag set in + * file_flags field, otherwise it should be set to zero; + * if BPF_F_PATH_FD flag is not set, AT_FDCWD is assumed. + */ + __s32 path_fd; + }; + + struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */ + union { + __u32 target_fd; /* target object to attach to or ... */ + __u32 target_ifindex; /* target ifindex */ + }; + __u32 attach_bpf_fd; + __u32 attach_type; + __u32 attach_flags; + __u32 replace_bpf_fd; + union { + __u32 relative_fd; + __u32 relative_id; + }; + __u64 expected_revision; + }; + + struct { /* anonymous struct used by BPF_PROG_TEST_RUN command */ + __u32 prog_fd; + __u32 retval; + __u32 data_size_in; /* input: len of data_in */ + __u32 data_size_out; /* input/output: len of data_out + * returns ENOSPC if data_out + * is too small. + */ + __aligned_u64 data_in; + __aligned_u64 data_out; + __u32 repeat; + __u32 duration; + __u32 ctx_size_in; /* input: len of ctx_in */ + __u32 ctx_size_out; /* input/output: len of ctx_out + * returns ENOSPC if ctx_out + * is too small. + */ + __aligned_u64 ctx_in; + __aligned_u64 ctx_out; + __u32 flags; + __u32 cpu; + __u32 batch_size; + } test; + + struct { /* anonymous struct used by BPF_*_GET_*_ID */ + union { + __u32 start_id; + __u32 prog_id; + __u32 map_id; + __u32 btf_id; + __u32 link_id; + }; + __u32 next_id; + __u32 open_flags; + }; + + struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */ + __u32 bpf_fd; + __u32 info_len; + __aligned_u64 info; + } info; + + struct { /* anonymous struct used by BPF_PROG_QUERY command */ + union { + __u32 target_fd; /* target object to query or ... */ + __u32 target_ifindex; /* target ifindex */ + }; + __u32 attach_type; + __u32 query_flags; + __u32 attach_flags; + __aligned_u64 prog_ids; + union { + __u32 prog_cnt; + __u32 count; + }; + __u32 :32; + /* output: per-program attach_flags. + * not allowed to be set during effective query. + */ + __aligned_u64 prog_attach_flags; + __aligned_u64 link_ids; + __aligned_u64 link_attach_flags; + __u64 revision; + } query; + + struct { /* anonymous struct used by BPF_RAW_TRACEPOINT_OPEN command */ + __u64 name; + __u32 prog_fd; + __u32 :32; + __aligned_u64 cookie; + } raw_tracepoint; + + struct { /* anonymous struct for BPF_BTF_LOAD */ + __aligned_u64 btf; + __aligned_u64 btf_log_buf; + __u32 btf_size; + __u32 btf_log_size; + __u32 btf_log_level; + /* output: actual total log contents size (including termintaing zero). + * It could be both larger than original log_size (if log was + * truncated), or smaller (if log buffer wasn't filled completely). + */ + __u32 btf_log_true_size; + __u32 btf_flags; + /* BPF token FD to use with BPF_BTF_LOAD operation. + * If provided, btf_flags should have BPF_F_TOKEN_FD flag set. + */ + __s32 btf_token_fd; + }; + + struct { + __u32 pid; /* input: pid */ + __u32 fd; /* input: fd */ + __u32 flags; /* input: flags */ + __u32 buf_len; /* input/output: buf len */ + __aligned_u64 buf; /* input/output: + * tp_name for tracepoint + * symbol for kprobe + * filename for uprobe + */ + __u32 prog_id; /* output: prod_id */ + __u32 fd_type; /* output: BPF_FD_TYPE_* */ + __u64 probe_offset; /* output: probe_offset */ + __u64 probe_addr; /* output: probe_addr */ + } task_fd_query; + + struct { /* struct used by BPF_LINK_CREATE command */ + union { + __u32 prog_fd; /* eBPF program to attach */ + __u32 map_fd; /* struct_ops to attach */ + }; + union { + __u32 target_fd; /* target object to attach to or ... */ + __u32 target_ifindex; /* target ifindex */ + }; + __u32 attach_type; /* attach type */ + __u32 flags; /* extra flags */ + union { + __u32 target_btf_id; /* btf_id of target to attach to */ + struct { + __aligned_u64 iter_info; /* extra bpf_iter_link_info */ + __u32 iter_info_len; /* iter_info length */ + }; + struct { + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 bpf_cookie; + } perf_event; + struct { + __u32 flags; + __u32 cnt; + __aligned_u64 syms; + __aligned_u64 addrs; + __aligned_u64 cookies; + } kprobe_multi; + struct { + /* this is overlaid with the target_btf_id above. */ + __u32 target_btf_id; + /* black box user-provided value passed through + * to BPF program at the execution time and + * accessible through bpf_get_attach_cookie() BPF helper + */ + __u64 cookie; + } tracing; + struct { + __u32 pf; + __u32 hooknum; + __s32 priority; + __u32 flags; + } netfilter; + struct { + union { + __u32 relative_fd; + __u32 relative_id; + }; + __u64 expected_revision; + } tcx; + struct { + __aligned_u64 path; + __aligned_u64 offsets; + __aligned_u64 ref_ctr_offsets; + __aligned_u64 cookies; + __u32 cnt; + __u32 flags; + __u32 pid; + } uprobe_multi; + struct { + union { + __u32 relative_fd; + __u32 relative_id; + }; + __u64 expected_revision; + } netkit; + }; + } link_create; + + struct { /* struct used by BPF_LINK_UPDATE command */ + __u32 link_fd; /* link fd */ + union { + /* new program fd to update link with */ + __u32 new_prog_fd; + /* new struct_ops map fd to update link with */ + __u32 new_map_fd; + }; + __u32 flags; /* extra flags */ + union { + /* expected link's program fd; is specified only if + * BPF_F_REPLACE flag is set in flags. + */ + __u32 old_prog_fd; + /* expected link's map fd; is specified only + * if BPF_F_REPLACE flag is set. + */ + __u32 old_map_fd; + }; + } link_update; + + struct { + __u32 link_fd; + } link_detach; + + struct { /* struct used by BPF_ENABLE_STATS command */ + __u32 type; + } enable_stats; + + struct { /* struct used by BPF_ITER_CREATE command */ + __u32 link_fd; + __u32 flags; + } iter_create; + + struct { /* struct used by BPF_PROG_BIND_MAP command */ + __u32 prog_fd; + __u32 map_fd; + __u32 flags; /* extra flags */ + } prog_bind_map; + + struct { /* struct used by BPF_TOKEN_CREATE command */ + __u32 flags; + __u32 bpffs_fd; + } token_create; + +} __attribute__((aligned(8))); + +/* The description below is an attempt at providing documentation to eBPF + * developers about the multiple available eBPF helper functions. It can be + * parsed and used to produce a manual page. The workflow is the following, + * and requires the rst2man utility: + * + * $ ./scripts/bpf_doc.py \ + * --filename include/uapi/linux/bpf.h > /tmp/bpf-helpers.rst + * $ rst2man /tmp/bpf-helpers.rst > /tmp/bpf-helpers.7 + * $ man /tmp/bpf-helpers.7 + * + * Note that in order to produce this external documentation, some RST + * formatting is used in the descriptions to get "bold" and "italics" in + * manual pages. Also note that the few trailing white spaces are + * intentional, removing them would break paragraphs for rst2man. + * + * Start of BPF helper function descriptions: + * + * void *bpf_map_lookup_elem(struct bpf_map *map, const void *key) + * Description + * Perform a lookup in *map* for an entry associated to *key*. + * Return + * Map value associated to *key*, or **NULL** if no entry was + * found. + * + * long bpf_map_update_elem(struct bpf_map *map, const void *key, const void *value, u64 flags) + * Description + * Add or update the value of the entry associated to *key* in + * *map* with *value*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * Flag value **BPF_NOEXIST** cannot be used for maps of types + * **BPF_MAP_TYPE_ARRAY** or **BPF_MAP_TYPE_PERCPU_ARRAY** (all + * elements always exist), the helper would return an error. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_map_delete_elem(struct bpf_map *map, const void *key) + * Description + * Delete entry with *key* from *map*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_probe_read(void *dst, u32 size, const void *unsafe_ptr) + * Description + * For tracing programs, safely attempt to read *size* bytes from + * kernel space address *unsafe_ptr* and store the data in *dst*. + * + * Generally, use **bpf_probe_read_user**\ () or + * **bpf_probe_read_kernel**\ () instead. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_ktime_get_ns(void) + * Description + * Return the time elapsed since system boot, in nanoseconds. + * Does not include time the system was suspended. + * See: **clock_gettime**\ (**CLOCK_MONOTONIC**) + * Return + * Current *ktime*. + * + * long bpf_trace_printk(const char *fmt, u32 fmt_size, ...) + * Description + * This helper is a "printk()-like" facility for debugging. It + * prints a message defined by format *fmt* (of size *fmt_size*) + * to file *\/sys/kernel/tracing/trace* from TraceFS, if + * available. It can take up to three additional **u64** + * arguments (as an eBPF helpers, the total number of arguments is + * limited to five). + * + * Each time the helper is called, it appends a line to the trace. + * Lines are discarded while *\/sys/kernel/tracing/trace* is + * open, use *\/sys/kernel/tracing/trace_pipe* to avoid this. + * The format of the trace is customizable, and the exact output + * one will get depends on the options set in + * *\/sys/kernel/tracing/trace_options* (see also the + * *README* file under the same directory). However, it usually + * defaults to something like: + * + * :: + * + * telnet-470 [001] .N.. 419421.045894: 0x00000001: + * + * In the above: + * + * * ``telnet`` is the name of the current task. + * * ``470`` is the PID of the current task. + * * ``001`` is the CPU number on which the task is + * running. + * * In ``.N..``, each character refers to a set of + * options (whether irqs are enabled, scheduling + * options, whether hard/softirqs are running, level of + * preempt_disabled respectively). **N** means that + * **TIF_NEED_RESCHED** and **PREEMPT_NEED_RESCHED** + * are set. + * * ``419421.045894`` is a timestamp. + * * ``0x00000001`` is a fake value used by BPF for the + * instruction pointer register. + * * ```` is the message formatted with + * *fmt*. + * + * The conversion specifiers supported by *fmt* are similar, but + * more limited than for printk(). They are **%d**, **%i**, + * **%u**, **%x**, **%ld**, **%li**, **%lu**, **%lx**, **%lld**, + * **%lli**, **%llu**, **%llx**, **%p**, **%s**. No modifier (size + * of field, padding with zeroes, etc.) is available, and the + * helper will return **-EINVAL** (but print nothing) if it + * encounters an unknown specifier. + * + * Also, note that **bpf_trace_printk**\ () is slow, and should + * only be used for debugging purposes. For this reason, a notice + * block (spanning several lines) is printed to kernel logs and + * states that the helper should not be used "for production use" + * the first time this helper is used (or more precisely, when + * **trace_printk**\ () buffers are allocated). For passing values + * to user space, perf events should be preferred. + * Return + * The number of bytes written to the buffer, or a negative error + * in case of failure. + * + * u32 bpf_get_prandom_u32(void) + * Description + * Get a pseudo-random number. + * + * From a security point of view, this helper uses its own + * pseudo-random internal state, and cannot be used to infer the + * seed of other random functions in the kernel. However, it is + * essential to note that the generator used by the helper is not + * cryptographically secure. + * Return + * A random 32-bit unsigned value. + * + * u32 bpf_get_smp_processor_id(void) + * Description + * Get the SMP (symmetric multiprocessing) processor id. Note that + * all programs run with migration disabled, which means that the + * SMP processor id is stable during all the execution of the + * program. + * Return + * The SMP id of the processor running the program. + * + * long bpf_skb_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len, u64 flags) + * Description + * Store *len* bytes from address *from* into the packet + * associated to *skb*, at *offset*. *flags* are a combination of + * **BPF_F_RECOMPUTE_CSUM** (automatically recompute the + * checksum for the packet after storing the bytes) and + * **BPF_F_INVALIDATE_HASH** (set *skb*\ **->hash**, *skb*\ + * **->swhash** and *skb*\ **->l4hash** to 0). + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_l3_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 size) + * Description + * Recompute the layer 3 (e.g. IP) checksum for the packet + * associated to *skb*. Computation is incremental, so the helper + * must know the former value of the header field that was + * modified (*from*), the new value of this field (*to*), and the + * number of bytes (2 or 4) for this field, stored in *size*. + * Alternatively, it is possible to store the difference between + * the previous and the new values of the header field in *to*, by + * setting *from* and *size* to 0. For both methods, *offset* + * indicates the location of the IP checksum within the packet. + * + * This helper works in combination with **bpf_csum_diff**\ (), + * which does not update the checksum in-place, but offers more + * flexibility and can handle sizes larger than 2 or 4 for the + * checksum to update. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_l4_csum_replace(struct sk_buff *skb, u32 offset, u64 from, u64 to, u64 flags) + * Description + * Recompute the layer 4 (e.g. TCP, UDP or ICMP) checksum for the + * packet associated to *skb*. Computation is incremental, so the + * helper must know the former value of the header field that was + * modified (*from*), the new value of this field (*to*), and the + * number of bytes (2 or 4) for this field, stored on the lowest + * four bits of *flags*. Alternatively, it is possible to store + * the difference between the previous and the new values of the + * header field in *to*, by setting *from* and the four lowest + * bits of *flags* to 0. For both methods, *offset* indicates the + * location of the IP checksum within the packet. In addition to + * the size of the field, *flags* can be added (bitwise OR) actual + * flags. With **BPF_F_MARK_MANGLED_0**, a null checksum is left + * untouched (unless **BPF_F_MARK_ENFORCE** is added as well), and + * for updates resulting in a null checksum the value is set to + * **CSUM_MANGLED_0** instead. Flag **BPF_F_PSEUDO_HDR** indicates + * the checksum is to be computed against a pseudo-header. + * + * This helper works in combination with **bpf_csum_diff**\ (), + * which does not update the checksum in-place, but offers more + * flexibility and can handle sizes larger than 2 or 4 for the + * checksum to update. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_tail_call(void *ctx, struct bpf_map *prog_array_map, u32 index) + * Description + * This special helper is used to trigger a "tail call", or in + * other words, to jump into another eBPF program. The same stack + * frame is used (but values on stack and in registers for the + * caller are not accessible to the callee). This mechanism allows + * for program chaining, either for raising the maximum number of + * available eBPF instructions, or to execute given programs in + * conditional blocks. For security reasons, there is an upper + * limit to the number of successive tail calls that can be + * performed. + * + * Upon call of this helper, the program attempts to jump into a + * program referenced at index *index* in *prog_array_map*, a + * special map of type **BPF_MAP_TYPE_PROG_ARRAY**, and passes + * *ctx*, a pointer to the context. + * + * If the call succeeds, the kernel immediately runs the first + * instruction of the new program. This is not a function call, + * and it never returns to the previous program. If the call + * fails, then the helper has no effect, and the caller continues + * to run its subsequent instructions. A call can fail if the + * destination program for the jump does not exist (i.e. *index* + * is superior to the number of entries in *prog_array_map*), or + * if the maximum number of tail calls has been reached for this + * chain of programs. This limit is defined in the kernel by the + * macro **MAX_TAIL_CALL_CNT** (not accessible to user space), + * which is currently set to 33. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_clone_redirect(struct sk_buff *skb, u32 ifindex, u64 flags) + * Description + * Clone and redirect the packet associated to *skb* to another + * net device of index *ifindex*. Both ingress and egress + * interfaces can be used for redirection. The **BPF_F_INGRESS** + * value in *flags* is used to make the distinction (ingress path + * is selected if the flag is present, egress path otherwise). + * This is the only flag supported for now. + * + * In comparison with **bpf_redirect**\ () helper, + * **bpf_clone_redirect**\ () has the associated cost of + * duplicating the packet buffer, but this can be executed out of + * the eBPF program. Conversely, **bpf_redirect**\ () is more + * efficient, but it is handled through an action code where the + * redirection happens only after the eBPF program has returned. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. Positive + * error indicates a potential drop or congestion in the target + * device. The particular positive error codes are not defined. + * + * u64 bpf_get_current_pid_tgid(void) + * Description + * Get the current pid and tgid. + * Return + * A 64-bit integer containing the current tgid and pid, and + * created as such: + * *current_task*\ **->tgid << 32 \|** + * *current_task*\ **->pid**. + * + * u64 bpf_get_current_uid_gid(void) + * Description + * Get the current uid and gid. + * Return + * A 64-bit integer containing the current GID and UID, and + * created as such: *current_gid* **<< 32 \|** *current_uid*. + * + * long bpf_get_current_comm(void *buf, u32 size_of_buf) + * Description + * Copy the **comm** attribute of the current task into *buf* of + * *size_of_buf*. The **comm** attribute contains the name of + * the executable (excluding the path) for the current task. The + * *size_of_buf* must be strictly positive. On success, the + * helper makes sure that the *buf* is NUL-terminated. On failure, + * it is filled with zeroes. + * Return + * 0 on success, or a negative error in case of failure. + * + * u32 bpf_get_cgroup_classid(struct sk_buff *skb) + * Description + * Retrieve the classid for the current task, i.e. for the net_cls + * cgroup to which *skb* belongs. + * + * This helper can be used on TC egress path, but not on ingress. + * + * The net_cls cgroup provides an interface to tag network packets + * based on a user-provided identifier for all traffic coming from + * the tasks belonging to the related cgroup. See also the related + * kernel documentation, available from the Linux sources in file + * *Documentation/admin-guide/cgroup-v1/net_cls.rst*. + * + * The Linux kernel has two versions for cgroups: there are + * cgroups v1 and cgroups v2. Both are available to users, who can + * use a mixture of them, but note that the net_cls cgroup is for + * cgroup v1 only. This makes it incompatible with BPF programs + * run on cgroups, which is a cgroup-v2-only feature (a socket can + * only hold data for one version of cgroups at a time). + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_CGROUP_NET_CLASSID** configuration option set to + * "**y**" or to "**m**". + * Return + * The classid, or 0 for the default unconfigured classid. + * + * long bpf_skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci) + * Description + * Push a *vlan_tci* (VLAN tag control information) of protocol + * *vlan_proto* to the packet associated to *skb*, then update + * the checksum. Note that if *vlan_proto* is different from + * **ETH_P_8021Q** and **ETH_P_8021AD**, it is considered to + * be **ETH_P_8021Q**. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_vlan_pop(struct sk_buff *skb) + * Description + * Pop a VLAN header from the packet associated to *skb*. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_get_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * Description + * Get tunnel metadata. This helper takes a pointer *key* to an + * empty **struct bpf_tunnel_key** of **size**, that will be + * filled with tunnel metadata for the packet associated to *skb*. + * The *flags* can be set to **BPF_F_TUNINFO_IPV6**, which + * indicates that the tunnel is based on IPv6 protocol instead of + * IPv4. + * + * The **struct bpf_tunnel_key** is an object that generalizes the + * principal parameters used by various tunneling protocols into a + * single struct. This way, it can be used to easily make a + * decision based on the contents of the encapsulation header, + * "summarized" in this struct. In particular, it holds the IP + * address of the remote end (IPv4 or IPv6, depending on the case) + * in *key*\ **->remote_ipv4** or *key*\ **->remote_ipv6**. Also, + * this struct exposes the *key*\ **->tunnel_id**, which is + * generally mapped to a VNI (Virtual Network Identifier), making + * it programmable together with the **bpf_skb_set_tunnel_key**\ + * () helper. + * + * Let's imagine that the following code is part of a program + * attached to the TC ingress interface, on one end of a GRE + * tunnel, and is supposed to filter out all messages coming from + * remote ends with IPv4 address other than 10.0.0.1: + * + * :: + * + * int ret; + * struct bpf_tunnel_key key = {}; + * + * ret = bpf_skb_get_tunnel_key(skb, &key, sizeof(key), 0); + * if (ret < 0) + * return TC_ACT_SHOT; // drop packet + * + * if (key.remote_ipv4 != 0x0a000001) + * return TC_ACT_SHOT; // drop packet + * + * return TC_ACT_OK; // accept packet + * + * This interface can also be used with all encapsulation devices + * that can operate in "collect metadata" mode: instead of having + * one network device per specific configuration, the "collect + * metadata" mode only requires a single device where the + * configuration can be extracted from this helper. + * + * This can be used together with various tunnels such as VXLan, + * Geneve, GRE or IP in IP (IPIP). + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_set_tunnel_key(struct sk_buff *skb, struct bpf_tunnel_key *key, u32 size, u64 flags) + * Description + * Populate tunnel metadata for packet associated to *skb.* The + * tunnel metadata is set to the contents of *key*, of *size*. The + * *flags* can be set to a combination of the following values: + * + * **BPF_F_TUNINFO_IPV6** + * Indicate that the tunnel is based on IPv6 protocol + * instead of IPv4. + * **BPF_F_ZERO_CSUM_TX** + * For IPv4 packets, add a flag to tunnel metadata + * indicating that checksum computation should be skipped + * and checksum set to zeroes. + * **BPF_F_DONT_FRAGMENT** + * Add a flag to tunnel metadata indicating that the + * packet should not be fragmented. + * **BPF_F_SEQ_NUMBER** + * Add a flag to tunnel metadata indicating that a + * sequence number should be added to tunnel header before + * sending the packet. This flag was added for GRE + * encapsulation, but might be used with other protocols + * as well in the future. + * **BPF_F_NO_TUNNEL_KEY** + * Add a flag to tunnel metadata indicating that no tunnel + * key should be set in the resulting tunnel header. + * + * Here is a typical usage on the transmit path: + * + * :: + * + * struct bpf_tunnel_key key; + * populate key ... + * bpf_skb_set_tunnel_key(skb, &key, sizeof(key), 0); + * bpf_clone_redirect(skb, vxlan_dev_ifindex, 0); + * + * See also the description of the **bpf_skb_get_tunnel_key**\ () + * helper for additional information. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_perf_event_read(struct bpf_map *map, u64 flags) + * Description + * Read the value of a perf event counter. This helper relies on a + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of + * the perf event counter is selected when *map* is updated with + * perf event file descriptors. The *map* is an array whose size + * is the number of available CPUs, and each cell contains a value + * relative to one CPU. The value to retrieve is indicated by + * *flags*, that contains the index of the CPU to look up, masked + * with **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to + * **BPF_F_CURRENT_CPU** to indicate that the value for the + * current CPU should be retrieved. + * + * Note that before Linux 4.13, only hardware perf event can be + * retrieved. + * + * Also, be aware that the newer helper + * **bpf_perf_event_read_value**\ () is recommended over + * **bpf_perf_event_read**\ () in general. The latter has some ABI + * quirks where error and counter value are used as a return code + * (which is wrong to do since ranges may overlap). This issue is + * fixed with **bpf_perf_event_read_value**\ (), which at the same + * time provides more features over the **bpf_perf_event_read**\ + * () interface. Please refer to the description of + * **bpf_perf_event_read_value**\ () for details. + * Return + * The value of the perf event counter read from the map, or a + * negative error code in case of failure. + * + * long bpf_redirect(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_clone_redirect**\ + * (), except that the packet is not cloned, which provides + * increased performance. + * + * Except for XDP, both ingress and egress interfaces can be used + * for redirection. The **BPF_F_INGRESS** value in *flags* is used + * to make the distinction (ingress path is selected if the flag + * is present, egress path otherwise). Currently, XDP only + * supports redirection to the egress interface, and accepts no + * flag at all. + * + * The same effect can also be attained with the more generic + * **bpf_redirect_map**\ (), which uses a BPF map to store the + * redirect target instead of providing it directly to the helper. + * Return + * For XDP, the helper returns **XDP_REDIRECT** on success or + * **XDP_ABORTED** on error. For other program types, the values + * are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on + * error. + * + * u32 bpf_get_route_realm(struct sk_buff *skb) + * Description + * Retrieve the realm or the route, that is to say the + * **tclassid** field of the destination for the *skb*. The + * identifier retrieved is a user-provided tag, similar to the + * one used with the net_cls cgroup (see description for + * **bpf_get_cgroup_classid**\ () helper), but here this tag is + * held by a route (a destination entry), not by a task. + * + * Retrieving this identifier works with the clsact TC egress hook + * (see also **tc-bpf(8)**), or alternatively on conventional + * classful egress qdiscs, but not on TC ingress path. In case of + * clsact TC egress hook, this has the advantage that, internally, + * the destination entry has not been dropped yet in the transmit + * path. Therefore, the destination entry does not need to be + * artificially held via **netif_keep_dst**\ () for a classful + * qdisc until the *skb* is freed. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_IP_ROUTE_CLASSID** configuration option. + * Return + * The realm of the route for the packet associated to *skb*, or 0 + * if none was found. + * + * long bpf_perf_event_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * The context of the program *ctx* needs also be passed to the + * helper. + * + * On user space, a program willing to read the values needs to + * call **perf_event_open**\ () on the perf event (either for + * one or for all CPUs) and to store the file descriptor into the + * *map*. This must be done before the eBPF program can send data + * into it. An example is available in file + * *samples/bpf/trace_output_user.c* in the Linux kernel source + * tree (the eBPF program counterpart is in + * *samples/bpf/trace_output_kern.c*). + * + * **bpf_perf_event_output**\ () achieves better performance + * than **bpf_trace_printk**\ () for sharing data with user + * space, and is much better suitable for streaming data from eBPF + * programs. + * + * Note that this helper is not restricted to tracing use cases + * and can be used with programs attached to TC or XDP as well, + * where it allows for passing data to user space listeners. Data + * can be: + * + * * Only custom structs, + * * Only the packet payload, or + * * A combination of both. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_load_bytes(const void *skb, u32 offset, void *to, u32 len) + * Description + * This helper was provided as an easy way to load data from a + * packet. It can be used to load *len* bytes from *offset* from + * the packet associated to *skb*, into the buffer pointed by + * *to*. + * + * Since Linux 4.7, usage of this helper has mostly been replaced + * by "direct packet access", enabling packet data to be + * manipulated with *skb*\ **->data** and *skb*\ **->data_end** + * pointing respectively to the first byte of packet data and to + * the byte after the last byte of packet data. However, it + * remains useful if one wishes to read large quantities of data + * at once from a packet into the eBPF stack. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_get_stackid(void *ctx, struct bpf_map *map, u64 flags) + * Description + * Walk a user or a kernel stack and return its id. To achieve + * this, the helper needs *ctx*, which is a pointer to the context + * on which the tracing program is executed, and a pointer to a + * *map* of type **BPF_MAP_TYPE_STACK_TRACE**. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * a combination of the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_FAST_STACK_CMP** + * Compare stacks by hash only. + * **BPF_F_REUSE_STACKID** + * If two different stacks hash into the same *stackid*, + * discard the old one. + * + * The stack id retrieved is a 32 bit long integer handle which + * can be further combined with other data (including other stack + * ids) and used as a key into maps. This can be useful for + * generating a variety of graphs (such as flame graphs or off-cpu + * graphs). + * + * For walking a stack, this helper is an improvement over + * **bpf_probe_read**\ (), which can be used with unrolled loops + * but is not efficient and consumes a lot of eBPF instructions. + * Instead, **bpf_get_stackid**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * Return + * The positive or null stack id on success, or a negative error + * in case of failure. + * + * s64 bpf_csum_diff(__be32 *from, u32 from_size, __be32 *to, u32 to_size, __wsum seed) + * Description + * Compute a checksum difference, from the raw buffer pointed by + * *from*, of length *from_size* (that must be a multiple of 4), + * towards the raw buffer pointed by *to*, of size *to_size* + * (same remark). An optional *seed* can be added to the value + * (this can be cascaded, the seed may come from a previous call + * to the helper). + * + * This is flexible enough to be used in several ways: + * + * * With *from_size* == 0, *to_size* > 0 and *seed* set to + * checksum, it can be used when pushing new data. + * * With *from_size* > 0, *to_size* == 0 and *seed* set to + * checksum, it can be used when removing data from a packet. + * * With *from_size* > 0, *to_size* > 0 and *seed* set to 0, it + * can be used to compute a diff. Note that *from_size* and + * *to_size* do not need to be equal. + * + * This helper can be used in combination with + * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ (), to + * which one can feed in the difference computed with + * **bpf_csum_diff**\ (). + * Return + * The checksum result, or a negative error code in case of + * failure. + * + * long bpf_skb_get_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) + * Description + * Retrieve tunnel options metadata for the packet associated to + * *skb*, and store the raw tunnel option data to the buffer *opt* + * of *size*. + * + * This helper can be used with encapsulation devices that can + * operate in "collect metadata" mode (please refer to the related + * note in the description of **bpf_skb_get_tunnel_key**\ () for + * more details). A particular example where this can be used is + * in combination with the Geneve encapsulation protocol, where it + * allows for pushing (with **bpf_skb_get_tunnel_opt**\ () helper) + * and retrieving arbitrary TLVs (Type-Length-Value headers) from + * the eBPF program. This allows for full customization of these + * headers. + * Return + * The size of the option data retrieved. + * + * long bpf_skb_set_tunnel_opt(struct sk_buff *skb, void *opt, u32 size) + * Description + * Set tunnel options metadata for the packet associated to *skb* + * to the option data contained in the raw buffer *opt* of *size*. + * + * See also the description of the **bpf_skb_get_tunnel_opt**\ () + * helper for additional information. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_change_proto(struct sk_buff *skb, __be16 proto, u64 flags) + * Description + * Change the protocol of the *skb* to *proto*. Currently + * supported are transition from IPv4 to IPv6, and from IPv6 to + * IPv4. The helper takes care of the groundwork for the + * transition, including resizing the socket buffer. The eBPF + * program is expected to fill the new headers, if any, via + * **skb_store_bytes**\ () and to recompute the checksums with + * **bpf_l3_csum_replace**\ () and **bpf_l4_csum_replace**\ + * (). The main case for this helper is to perform NAT64 + * operations out of an eBPF program. + * + * Internally, the GSO type is marked as dodgy so that headers are + * checked and segments are recalculated by the GSO/GRO engine. + * The size for GSO target is adapted as well. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_change_type(struct sk_buff *skb, u32 type) + * Description + * Change the packet type for the packet associated to *skb*. This + * comes down to setting *skb*\ **->pkt_type** to *type*, except + * the eBPF program does not have a write access to *skb*\ + * **->pkt_type** beside this helper. Using a helper here allows + * for graceful handling of errors. + * + * The major use case is to change incoming *skb*s to + * **PACKET_HOST** in a programmatic way instead of having to + * recirculate via **redirect**\ (..., **BPF_F_INGRESS**), for + * example. + * + * Note that *type* only allows certain values. At this time, they + * are: + * + * **PACKET_HOST** + * Packet is for us. + * **PACKET_BROADCAST** + * Send packet to all. + * **PACKET_MULTICAST** + * Send packet to group. + * **PACKET_OTHERHOST** + * Send packet to someone else. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_under_cgroup(struct sk_buff *skb, struct bpf_map *map, u32 index) + * Description + * Check whether *skb* is a descendant of the cgroup2 held by + * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. + * Return + * The return value depends on the result of the test, and can be: + * + * * 0, if the *skb* failed the cgroup2 descendant test. + * * 1, if the *skb* succeeded the cgroup2 descendant test. + * * A negative error code, if an error occurred. + * + * u32 bpf_get_hash_recalc(struct sk_buff *skb) + * Description + * Retrieve the hash of the packet, *skb*\ **->hash**. If it is + * not set, in particular if the hash was cleared due to mangling, + * recompute this hash. Later accesses to the hash can be done + * directly with *skb*\ **->hash**. + * + * Calling **bpf_set_hash_invalid**\ (), changing a packet + * prototype with **bpf_skb_change_proto**\ (), or calling + * **bpf_skb_store_bytes**\ () with the + * **BPF_F_INVALIDATE_HASH** are actions susceptible to clear + * the hash and to trigger a new computation for the next call to + * **bpf_get_hash_recalc**\ (). + * Return + * The 32-bit hash. + * + * u64 bpf_get_current_task(void) + * Description + * Get the current task. + * Return + * A pointer to the current task struct. + * + * long bpf_probe_write_user(void *dst, const void *src, u32 len) + * Description + * Attempt in a safe way to write *len* bytes from the buffer + * *src* to *dst* in memory. It only works for threads that are in + * user context, and *dst* must be a valid user space address. + * + * This helper should not be used to implement any kind of + * security mechanism because of TOC-TOU attacks, but rather to + * debug, divert, and manipulate execution of semi-cooperative + * processes. + * + * Keep in mind that this feature is meant for experiments, and it + * has a risk of crashing the system and running programs. + * Therefore, when an eBPF program using this helper is attached, + * a warning including PID and process name is printed to kernel + * logs. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_current_task_under_cgroup(struct bpf_map *map, u32 index) + * Description + * Check whether the probe is being run is the context of a given + * subset of the cgroup2 hierarchy. The cgroup2 to test is held by + * *map* of type **BPF_MAP_TYPE_CGROUP_ARRAY**, at *index*. + * Return + * The return value depends on the result of the test, and can be: + * + * * 1, if current task belongs to the cgroup2. + * * 0, if current task does not belong to the cgroup2. + * * A negative error code, if an error occurred. + * + * long bpf_skb_change_tail(struct sk_buff *skb, u32 len, u64 flags) + * Description + * Resize (trim or grow) the packet associated to *skb* to the + * new *len*. The *flags* are reserved for future usage, and must + * be left at zero. + * + * The basic idea is that the helper performs the needed work to + * change the size of the packet, then the eBPF program rewrites + * the rest via helpers like **bpf_skb_store_bytes**\ (), + * **bpf_l3_csum_replace**\ (), **bpf_l3_csum_replace**\ () + * and others. This helper is a slow path utility intended for + * replies with control messages. And because it is targeted for + * slow path, the helper itself can afford to be slow: it + * implicitly linearizes, unclones and drops offloads from the + * *skb*. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_pull_data(struct sk_buff *skb, u32 len) + * Description + * Pull in non-linear data in case the *skb* is non-linear and not + * all of *len* are part of the linear section. Make *len* bytes + * from *skb* readable and writable. If a zero value is passed for + * *len*, then all bytes in the linear part of *skb* will be made + * readable and writable. + * + * This helper is only needed for reading and writing with direct + * packet access. + * + * For direct packet access, testing that offsets to access + * are within packet boundaries (test on *skb*\ **->data_end**) is + * susceptible to fail if offsets are invalid, or if the requested + * data is in non-linear parts of the *skb*. On failure the + * program can just bail out, or in the case of a non-linear + * buffer, use a helper to make the data available. The + * **bpf_skb_load_bytes**\ () helper is a first solution to access + * the data. Another one consists in using **bpf_skb_pull_data** + * to pull in once the non-linear parts, then retesting and + * eventually access the data. + * + * At the same time, this also makes sure the *skb* is uncloned, + * which is a necessary condition for direct write. As this needs + * to be an invariant for the write part only, the verifier + * detects writes and adds a prologue that is calling + * **bpf_skb_pull_data()** to effectively unclone the *skb* from + * the very beginning in case it is indeed cloned. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * s64 bpf_csum_update(struct sk_buff *skb, __wsum csum) + * Description + * Add the checksum *csum* into *skb*\ **->csum** in case the + * driver has supplied a checksum for the entire packet into that + * field. Return an error otherwise. This helper is intended to be + * used in combination with **bpf_csum_diff**\ (), in particular + * when the checksum needs to be updated after data has been + * written into the packet through direct packet access. + * Return + * The checksum on success, or a negative error code in case of + * failure. + * + * void bpf_set_hash_invalid(struct sk_buff *skb) + * Description + * Invalidate the current *skb*\ **->hash**. It can be used after + * mangling on headers through direct packet access, in order to + * indicate that the hash is outdated and to trigger a + * recalculation the next time the kernel tries to access this + * hash or when the **bpf_get_hash_recalc**\ () helper is called. + * Return + * void. + * + * long bpf_get_numa_node_id(void) + * Description + * Return the id of the current NUMA node. The primary use case + * for this helper is the selection of sockets for the local NUMA + * node, when the program is attached to sockets using the + * **SO_ATTACH_REUSEPORT_EBPF** option (see also **socket(7)**), + * but the helper is also available to other eBPF program types, + * similarly to **bpf_get_smp_processor_id**\ (). + * Return + * The id of current NUMA node. + * + * long bpf_skb_change_head(struct sk_buff *skb, u32 len, u64 flags) + * Description + * Grows headroom of packet associated to *skb* and adjusts the + * offset of the MAC header accordingly, adding *len* bytes of + * space. It automatically extends and reallocates memory as + * required. + * + * This helper can be used on a layer 3 *skb* to push a MAC header + * for redirection into a layer 2 device. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_xdp_adjust_head(struct xdp_buff *xdp_md, int delta) + * Description + * Adjust (move) *xdp_md*\ **->data** by *delta* bytes. Note that + * it is possible to use a negative value for *delta*. This helper + * can be used to prepare the packet for pushing or popping + * headers. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_probe_read_str(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe kernel address + * *unsafe_ptr* to *dst*. See **bpf_probe_read_kernel_str**\ () for + * more details. + * + * Generally, use **bpf_probe_read_user_str**\ () or + * **bpf_probe_read_kernel_str**\ () instead. + * Return + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. + * + * u64 bpf_get_socket_cookie(struct sk_buff *skb) + * Description + * If the **struct sk_buff** pointed by *skb* has a known socket, + * retrieve the cookie (generated by the kernel) of this socket. + * If no cookie has been set yet, generate a new cookie. Once + * generated, the socket cookie remains stable for the life of the + * socket. This helper can be useful for monitoring per socket + * networking traffic statistics as it provides a global socket + * identifier that can be assumed unique. + * Return + * A 8-byte long unique number on success, or 0 if the socket + * field is missing inside *skb*. + * + * u64 bpf_get_socket_cookie(struct bpf_sock_addr *ctx) + * Description + * Equivalent to bpf_get_socket_cookie() helper that accepts + * *skb*, but gets socket from **struct bpf_sock_addr** context. + * Return + * A 8-byte long unique number. + * + * u64 bpf_get_socket_cookie(struct bpf_sock_ops *ctx) + * Description + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts + * *skb*, but gets socket from **struct bpf_sock_ops** context. + * Return + * A 8-byte long unique number. + * + * u64 bpf_get_socket_cookie(struct sock *sk) + * Description + * Equivalent to **bpf_get_socket_cookie**\ () helper that accepts + * *sk*, but gets socket from a BTF **struct sock**. This helper + * also works for sleepable programs. + * Return + * A 8-byte long unique number or 0 if *sk* is NULL. + * + * u32 bpf_get_socket_uid(struct sk_buff *skb) + * Description + * Get the owner UID of the socked associated to *skb*. + * Return + * The owner UID of the socket associated to *skb*. If the socket + * is **NULL**, or if it is not a full socket (i.e. if it is a + * time-wait or a request socket instead), **overflowuid** value + * is returned (note that **overflowuid** might also be the actual + * UID value for the socket). + * + * long bpf_set_hash(struct sk_buff *skb, u32 hash) + * Description + * Set the full hash for *skb* (set the field *skb*\ **->hash**) + * to value *hash*. + * Return + * 0 + * + * long bpf_setsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) + * Description + * Emulate a call to **setsockopt()** on the socket associated to + * *bpf_socket*, which must be a full socket. The *level* at + * which the option resides and the name *optname* of the option + * must be specified, see **setsockopt(2)** for more information. + * The option value of length *optlen* is pointed by *optval*. + * + * *bpf_socket* should be one of the following: + * + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**, + * **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**. + * + * This helper actually implements a subset of **setsockopt()**. + * It supports the following *level*\ s: + * + * * **SOL_SOCKET**, which supports the following *optname*\ s: + * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, + * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**, + * **SO_BINDTODEVICE**, **SO_KEEPALIVE**, **SO_REUSEADDR**, + * **SO_REUSEPORT**, **SO_BINDTOIFINDEX**, **SO_TXREHASH**. + * * **IPPROTO_TCP**, which supports the following *optname*\ s: + * **TCP_CONGESTION**, **TCP_BPF_IW**, + * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, + * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**, + * **TCP_NODELAY**, **TCP_MAXSEG**, **TCP_WINDOW_CLAMP**, + * **TCP_THIN_LINEAR_TIMEOUTS**, **TCP_BPF_DELACK_MAX**, + * **TCP_BPF_RTO_MIN**. + * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. + * * **IPPROTO_IPV6**, which supports the following *optname*\ s: + * **IPV6_TCLASS**, **IPV6_AUTOFLOWLABEL**. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_adjust_room(struct sk_buff *skb, s32 len_diff, u32 mode, u64 flags) + * Description + * Grow or shrink the room for data in the packet associated to + * *skb* by *len_diff*, and according to the selected *mode*. + * + * By default, the helper will reset any offloaded checksum + * indicator of the skb to CHECKSUM_NONE. This can be avoided + * by the following flag: + * + * * **BPF_F_ADJ_ROOM_NO_CSUM_RESET**: Do not reset offloaded + * checksum data of the skb to CHECKSUM_NONE. + * + * There are two supported modes at this time: + * + * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer + * (room space is added or removed between the layer 2 and + * layer 3 headers). + * + * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer + * (room space is added or removed between the layer 3 and + * layer 4 headers). + * + * The following flags are supported at this time: + * + * * **BPF_F_ADJ_ROOM_FIXED_GSO**: Do not adjust gso_size. + * Adjusting mss in this way is not allowed for datagrams. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV4**, + * **BPF_F_ADJ_ROOM_ENCAP_L3_IPV6**: + * Any new space is reserved to hold a tunnel header. + * Configure skb offsets and other fields accordingly. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L4_GRE**, + * **BPF_F_ADJ_ROOM_ENCAP_L4_UDP**: + * Use with ENCAP_L3 flags to further specify the tunnel type. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L2**\ (*len*): + * Use with ENCAP_L3/L4 flags to further specify the tunnel + * type; *len* is the length of the inner MAC header. + * + * * **BPF_F_ADJ_ROOM_ENCAP_L2_ETH**: + * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the + * L2 type as Ethernet. + * + * * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**, + * **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**: + * Indicate the new IP header version after decapsulating the outer + * IP header. Used when the inner and outer IP versions are different. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_redirect_map(struct bpf_map *map, u64 key, u64 flags) + * Description + * Redirect the packet to the endpoint referenced by *map* at + * index *key*. Depending on its type, this *map* can contain + * references to net devices (for forwarding packets through other + * ports), or to CPUs (for redirecting XDP frames to another CPU; + * but this is only implemented for native XDP (with driver + * support) as of this writing). + * + * The lower two bits of *flags* are used as the return code if + * the map lookup fails. This is so that the return value can be + * one of the XDP program return codes up to **XDP_TX**, as chosen + * by the caller. The higher bits of *flags* can be set to + * BPF_F_BROADCAST or BPF_F_EXCLUDE_INGRESS as defined below. + * + * With BPF_F_BROADCAST the packet will be broadcasted to all the + * interfaces in the map, with BPF_F_EXCLUDE_INGRESS the ingress + * interface will be excluded when do broadcasting. + * + * See also **bpf_redirect**\ (), which only supports redirecting + * to an ifindex, but doesn't require a map to do so. + * Return + * **XDP_REDIRECT** on success, or the value of the two lower bits + * of the *flags* argument on error. + * + * long bpf_sk_redirect_map(struct sk_buff *skb, struct bpf_map *map, u32 key, u64 flags) + * Description + * Redirect the packet to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * long bpf_sock_map_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * Description + * Add an entry to, or update a *map* referencing sockets. The + * *skops* is used as a new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. If the socket is + * already attached to eBPF programs, this results in an error. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_xdp_adjust_meta(struct xdp_buff *xdp_md, int delta) + * Description + * Adjust the address pointed by *xdp_md*\ **->data_meta** by + * *delta* (which can be positive or negative). Note that this + * operation modifies the address stored in *xdp_md*\ **->data**, + * so the latter must be loaded only after the helper has been + * called. + * + * The use of *xdp_md*\ **->data_meta** is optional and programs + * are not required to use it. The rationale is that when the + * packet is processed with XDP (e.g. as DoS filter), it is + * possible to push further meta data along with it before passing + * to the stack, and to give the guarantee that an ingress eBPF + * program attached as a TC classifier on the same device can pick + * this up for further post-processing. Since TC works with socket + * buffers, it remains possible to set from XDP the **mark** or + * **priority** pointers, or other pointers for the socket buffer. + * Having this scratch space generic and programmable allows for + * more flexibility as the user is free to store whatever meta + * data they need. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_perf_event_read_value(struct bpf_map *map, u64 flags, struct bpf_perf_event_value *buf, u32 buf_size) + * Description + * Read the value of a perf event counter, and store it into *buf* + * of size *buf_size*. This helper relies on a *map* of type + * **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. The nature of the perf event + * counter is selected when *map* is updated with perf event file + * descriptors. The *map* is an array whose size is the number of + * available CPUs, and each cell contains a value relative to one + * CPU. The value to retrieve is indicated by *flags*, that + * contains the index of the CPU to look up, masked with + * **BPF_F_INDEX_MASK**. Alternatively, *flags* can be set to + * **BPF_F_CURRENT_CPU** to indicate that the value for the + * current CPU should be retrieved. + * + * This helper behaves in a way close to + * **bpf_perf_event_read**\ () helper, save that instead of + * just returning the value observed, it fills the *buf* + * structure. This allows for additional data to be retrieved: in + * particular, the enabled and running times (in *buf*\ + * **->enabled** and *buf*\ **->running**, respectively) are + * copied. In general, **bpf_perf_event_read_value**\ () is + * recommended over **bpf_perf_event_read**\ (), which has some + * ABI issues and provides fewer functionalities. + * + * These values are interesting, because hardware PMU (Performance + * Monitoring Unit) counters are limited resources. When there are + * more PMU based perf events opened than available counters, + * kernel will multiplex these events so each event gets certain + * percentage (but not all) of the PMU time. In case that + * multiplexing happens, the number of samples or counter value + * will not reflect the case compared to when no multiplexing + * occurs. This makes comparison between different runs difficult. + * Typically, the counter value should be normalized before + * comparing to other experiments. The usual normalization is done + * as follows. + * + * :: + * + * normalized_counter = counter * t_enabled / t_running + * + * Where t_enabled is the time enabled for event and t_running is + * the time running for event since last normalization. The + * enabled and running times are accumulated since the perf event + * open. To achieve scaling factor between two invocations of an + * eBPF program, users can use CPU id as the key (which is + * typical for perf array usage model) to remember the previous + * value and do the calculation inside the eBPF program. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_perf_prog_read_value(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, u32 buf_size) + * Description + * For an eBPF program attached to a perf event, retrieve the + * value of the event counter associated to *ctx* and store it in + * the structure pointed by *buf* and of size *buf_size*. Enabled + * and running times are also stored in the structure (see + * description of helper **bpf_perf_event_read_value**\ () for + * more details). + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_getsockopt(void *bpf_socket, int level, int optname, void *optval, int optlen) + * Description + * Emulate a call to **getsockopt()** on the socket associated to + * *bpf_socket*, which must be a full socket. The *level* at + * which the option resides and the name *optname* of the option + * must be specified, see **getsockopt(2)** for more information. + * The retrieved value is stored in the structure pointed by + * *opval* and of length *optlen*. + * + * *bpf_socket* should be one of the following: + * + * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**, + * **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**. + * + * This helper actually implements a subset of **getsockopt()**. + * It supports the same set of *optname*\ s that is supported by + * the **bpf_setsockopt**\ () helper. The exceptions are + * **TCP_BPF_*** is **bpf_setsockopt**\ () only and + * **TCP_SAVED_SYN** is **bpf_getsockopt**\ () only. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_override_return(struct pt_regs *regs, u64 rc) + * Description + * Used for error injection, this helper uses kprobes to override + * the return value of the probed function, and to set it to *rc*. + * The first argument is the context *regs* on which the kprobe + * works. + * + * This helper works by setting the PC (program counter) + * to an override function which is run in place of the original + * probed function. This means the probed function is not run at + * all. The replacement function just returns with the required + * value. + * + * This helper has security implications, and thus is subject to + * restrictions. It is only available if the kernel was compiled + * with the **CONFIG_BPF_KPROBE_OVERRIDE** configuration + * option, and in this case it only works on functions tagged with + * **ALLOW_ERROR_INJECTION** in the kernel code. + * + * Also, the helper is only available for the architectures having + * the CONFIG_FUNCTION_ERROR_INJECTION option. As of this writing, + * x86 architecture is the only one to support this feature. + * Return + * 0 + * + * long bpf_sock_ops_cb_flags_set(struct bpf_sock_ops *bpf_sock, int argval) + * Description + * Attempt to set the value of the **bpf_sock_ops_cb_flags** field + * for the full TCP socket associated to *bpf_sock_ops* to + * *argval*. + * + * The primary use of this field is to determine if there should + * be calls to eBPF programs of type + * **BPF_PROG_TYPE_SOCK_OPS** at various points in the TCP + * code. A program of the same type can change its value, per + * connection and as necessary, when the connection is + * established. This field is directly accessible for reading, but + * this helper must be used for updates in order to return an + * error if an eBPF program tries to set a callback that is not + * supported in the current kernel. + * + * *argval* is a flag array which can combine these flags: + * + * * **BPF_SOCK_OPS_RTO_CB_FLAG** (retransmission time out) + * * **BPF_SOCK_OPS_RETRANS_CB_FLAG** (retransmission) + * * **BPF_SOCK_OPS_STATE_CB_FLAG** (TCP state change) + * * **BPF_SOCK_OPS_RTT_CB_FLAG** (every RTT) + * + * Therefore, this function can be used to clear a callback flag by + * setting the appropriate bit to zero. e.g. to disable the RTO + * callback: + * + * **bpf_sock_ops_cb_flags_set(bpf_sock,** + * **bpf_sock->bpf_sock_ops_cb_flags & ~BPF_SOCK_OPS_RTO_CB_FLAG)** + * + * Here are some examples of where one could call such eBPF + * program: + * + * * When RTO fires. + * * When a packet is retransmitted. + * * When the connection terminates. + * * When a packet is sent. + * * When a packet is received. + * Return + * Code **-EINVAL** if the socket is not a full TCP socket; + * otherwise, a positive number containing the bits that could not + * be set is returned (which comes down to 0 if all bits were set + * as required). + * + * long bpf_msg_redirect_map(struct sk_msg_buff *msg, struct bpf_map *map, u32 key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKMAP**) at index *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * long bpf_msg_apply_bytes(struct sk_msg_buff *msg, u32 bytes) + * Description + * For socket policies, apply the verdict of the eBPF program to + * the next *bytes* (number of bytes) of message *msg*. + * + * For example, this helper can be used in the following cases: + * + * * A single **sendmsg**\ () or **sendfile**\ () system call + * contains multiple logical messages that the eBPF program is + * supposed to read and for which it should apply a verdict. + * * An eBPF program only cares to read the first *bytes* of a + * *msg*. If the message has a large payload, then setting up + * and calling the eBPF program repeatedly for all bytes, even + * though the verdict is already known, would create unnecessary + * overhead. + * + * When called from within an eBPF program, the helper sets a + * counter internal to the BPF infrastructure, that is used to + * apply the last verdict to the next *bytes*. If *bytes* is + * smaller than the current data being processed from a + * **sendmsg**\ () or **sendfile**\ () system call, the first + * *bytes* will be sent and the eBPF program will be re-run with + * the pointer for start of data pointing to byte number *bytes* + * **+ 1**. If *bytes* is larger than the current data being + * processed, then the eBPF verdict will be applied to multiple + * **sendmsg**\ () or **sendfile**\ () calls until *bytes* are + * consumed. + * + * Note that if a socket closes with the internal counter holding + * a non-zero value, this is not a problem because data is not + * being buffered for *bytes* and is sent as it is received. + * Return + * 0 + * + * long bpf_msg_cork_bytes(struct sk_msg_buff *msg, u32 bytes) + * Description + * For socket policies, prevent the execution of the verdict eBPF + * program for message *msg* until *bytes* (byte number) have been + * accumulated. + * + * This can be used when one needs a specific number of bytes + * before a verdict can be assigned, even if the data spans + * multiple **sendmsg**\ () or **sendfile**\ () calls. The extreme + * case would be a user calling **sendmsg**\ () repeatedly with + * 1-byte long message segments. Obviously, this is bad for + * performance, but it is still valid. If the eBPF program needs + * *bytes* bytes to validate a header, this helper can be used to + * prevent the eBPF program to be called again until *bytes* have + * been accumulated. + * Return + * 0 + * + * long bpf_msg_pull_data(struct sk_msg_buff *msg, u32 start, u32 end, u64 flags) + * Description + * For socket policies, pull in non-linear data from user space + * for *msg* and set pointers *msg*\ **->data** and *msg*\ + * **->data_end** to *start* and *end* bytes offsets into *msg*, + * respectively. + * + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a + * *msg* it can only parse data that the (**data**, **data_end**) + * pointers have already consumed. For **sendmsg**\ () hooks this + * is likely the first scatterlist element. But for calls relying + * on the **sendpage** handler (e.g. **sendfile**\ ()) this will + * be the range (**0**, **0**) because the data is shared with + * user space and by default the objective is to avoid allowing + * user space to modify data while (or after) eBPF verdict is + * being decided. This helper can be used to pull in data and to + * set the start and end pointer to given values. Data will be + * copied if necessary (i.e. if data was not linear and if start + * and end pointers do not point to the same chunk). + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_bind(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) + * Description + * Bind the socket associated to *ctx* to the address pointed by + * *addr*, of length *addr_len*. This allows for making outgoing + * connection from the desired IP address, which can be useful for + * example when all processes inside a cgroup should use one + * single IP address on a host that has multiple IP configured. + * + * This helper works for IPv4 and IPv6, TCP and UDP sockets. The + * domain (*addr*\ **->sa_family**) must be **AF_INET** (or + * **AF_INET6**). It's advised to pass zero port (**sin_port** + * or **sin6_port**) which triggers IP_BIND_ADDRESS_NO_PORT-like + * behavior and lets the kernel efficiently pick up an unused + * port as long as 4-tuple is unique. Passing non-zero port might + * lead to degraded performance. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_xdp_adjust_tail(struct xdp_buff *xdp_md, int delta) + * Description + * Adjust (move) *xdp_md*\ **->data_end** by *delta* bytes. It is + * possible to both shrink and grow the packet tail. + * Shrink done via *delta* being a negative integer. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_skb_get_xfrm_state(struct sk_buff *skb, u32 index, struct bpf_xfrm_state *xfrm_state, u32 size, u64 flags) + * Description + * Retrieve the XFRM state (IP transform framework, see also + * **ip-xfrm(8)**) at *index* in XFRM "security path" for *skb*. + * + * The retrieved value is stored in the **struct bpf_xfrm_state** + * pointed by *xfrm_state* and of length *size*. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_XFRM** configuration option. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_get_stack(void *ctx, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * To achieve this, the helper needs *ctx*, which is a pointer + * to the context on which the tracing program is executed. + * To store the stacktrace, the bpf program provides *buf* with + * a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * **BPF_F_USER_BUILD_ID** + * Collect (build_id, file_offset) instead of ips for user + * stack, only valid if **BPF_F_USER_STACK** is also + * specified. + * + * *file_offset* is an offset relative to the beginning + * of the executable or shared object file backing the vma + * which the *ip* falls in. It is *not* an offset relative + * to that object's base address. Accordingly, it must be + * adjusted by adding (sh_addr - sh_offset), where + * sh_{addr,offset} correspond to the executable section + * containing *file_offset* in the object, for comparisons + * to symbols' st_value to be valid. + * + * **bpf_get_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * Return + * The non-negative copied *buf* length equal to or less than + * *size* on success, or a negative error in case of failure. + * + * long bpf_skb_load_bytes_relative(const void *skb, u32 offset, void *to, u32 len, u32 start_header) + * Description + * This helper is similar to **bpf_skb_load_bytes**\ () in that + * it provides an easy way to load *len* bytes from *offset* + * from the packet associated to *skb*, into the buffer pointed + * by *to*. The difference to **bpf_skb_load_bytes**\ () is that + * a fifth argument *start_header* exists in order to select a + * base offset to start from. *start_header* can be one of: + * + * **BPF_HDR_START_MAC** + * Base offset to load data from is *skb*'s mac header. + * **BPF_HDR_START_NET** + * Base offset to load data from is *skb*'s network header. + * + * In general, "direct packet access" is the preferred method to + * access packet data, however, this helper is in particular useful + * in socket filters where *skb*\ **->data** does not always point + * to the start of the mac header and where "direct packet access" + * is not available. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_fib_lookup(void *ctx, struct bpf_fib_lookup *params, int plen, u32 flags) + * Description + * Do FIB lookup in kernel tables using parameters in *params*. + * If lookup is successful and result shows packet is to be + * forwarded, the neighbor tables are searched for the nexthop. + * If successful (ie., FIB lookup shows forwarding and nexthop + * is resolved), the nexthop address is returned in ipv4_dst + * or ipv6_dst based on family, smac is set to mac address of + * egress device, dmac is set to nexthop mac address, rt_metric + * is set to metric from route (IPv4/IPv6 only), and ifindex + * is set to the device index of the nexthop from the FIB lookup. + * + * *plen* argument is the size of the passed in struct. + * *flags* argument can be a combination of one or more of the + * following values: + * + * **BPF_FIB_LOOKUP_DIRECT** + * Do a direct table lookup vs full lookup using FIB + * rules. + * **BPF_FIB_LOOKUP_TBID** + * Used with BPF_FIB_LOOKUP_DIRECT. + * Use the routing table ID present in *params*->tbid + * for the fib lookup. + * **BPF_FIB_LOOKUP_OUTPUT** + * Perform lookup from an egress perspective (default is + * ingress). + * **BPF_FIB_LOOKUP_SKIP_NEIGH** + * Skip the neighbour table lookup. *params*->dmac + * and *params*->smac will not be set as output. A common + * use case is to call **bpf_redirect_neigh**\ () after + * doing **bpf_fib_lookup**\ (). + * **BPF_FIB_LOOKUP_SRC** + * Derive and set source IP addr in *params*->ipv{4,6}_src + * for the nexthop. If the src addr cannot be derived, + * **BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this + * case, *params*->dmac and *params*->smac are not set either. + * **BPF_FIB_LOOKUP_MARK** + * Use the mark present in *params*->mark for the fib lookup. + * This option should not be used with BPF_FIB_LOOKUP_DIRECT, + * as it only has meaning for full lookups. + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** tc cls_act programs. + * Return + * * < 0 if any input argument is invalid + * * 0 on success (packet is forwarded, nexthop neighbor exists) + * * > 0 one of **BPF_FIB_LKUP_RET_** codes explaining why the + * packet is not forwarded or needs assist from full stack + * + * If lookup fails with BPF_FIB_LKUP_RET_FRAG_NEEDED, then the MTU + * was exceeded and output params->mtu_result contains the MTU. + * + * long bpf_sock_hash_update(struct bpf_sock_ops *skops, struct bpf_map *map, void *key, u64 flags) + * Description + * Add an entry to, or update a sockhash *map* referencing sockets. + * The *skops* is used as a new value for the entry associated to + * *key*. *flags* is one of: + * + * **BPF_NOEXIST** + * The entry for *key* must not exist in the map. + * **BPF_EXIST** + * The entry for *key* must already exist in the map. + * **BPF_ANY** + * No condition on the existence of the entry for *key*. + * + * If the *map* has eBPF programs (parser and verdict), those will + * be inherited by the socket being added. If the socket is + * already attached to eBPF programs, this results in an error. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_msg_redirect_hash(struct sk_msg_buff *msg, struct bpf_map *map, void *key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * socket level. If the message *msg* is allowed to pass (i.e. if + * the verdict eBPF program returns **SK_PASS**), redirect it to + * the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress path otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * long bpf_sk_redirect_hash(struct sk_buff *skb, struct bpf_map *map, void *key, u64 flags) + * Description + * This helper is used in programs implementing policies at the + * skb socket level. If the sk_buff *skb* is allowed to pass (i.e. + * if the verdict eBPF program returns **SK_PASS**), redirect it + * to the socket referenced by *map* (of type + * **BPF_MAP_TYPE_SOCKHASH**) using hash *key*. Both ingress and + * egress interfaces can be used for redirection. The + * **BPF_F_INGRESS** value in *flags* is used to make the + * distinction (ingress path is selected if the flag is present, + * egress otherwise). This is the only flag supported for now. + * Return + * **SK_PASS** on success, or **SK_DROP** on error. + * + * long bpf_lwt_push_encap(struct sk_buff *skb, u32 type, void *hdr, u32 len) + * Description + * Encapsulate the packet associated to *skb* within a Layer 3 + * protocol header. This header is provided in the buffer at + * address *hdr*, with *len* its size in bytes. *type* indicates + * the protocol of the header and can be one of: + * + * **BPF_LWT_ENCAP_SEG6** + * IPv6 encapsulation with Segment Routing Header + * (**struct ipv6_sr_hdr**). *hdr* only contains the SRH, + * the IPv6 header is computed by the kernel. + * **BPF_LWT_ENCAP_SEG6_INLINE** + * Only works if *skb* contains an IPv6 packet. Insert a + * Segment Routing Header (**struct ipv6_sr_hdr**) inside + * the IPv6 header. + * **BPF_LWT_ENCAP_IP** + * IP encapsulation (GRE/GUE/IPIP/etc). The outer header + * must be IPv4 or IPv6, followed by zero or more + * additional headers, up to **LWT_BPF_MAX_HEADROOM** + * total bytes in all prepended headers. Please note that + * if **skb_is_gso**\ (*skb*) is true, no more than two + * headers can be prepended, and the inner header, if + * present, should be either GRE or UDP/GUE. + * + * **BPF_LWT_ENCAP_SEG6**\ \* types can be called by BPF programs + * of type **BPF_PROG_TYPE_LWT_IN**; **BPF_LWT_ENCAP_IP** type can + * be called by bpf programs of types **BPF_PROG_TYPE_LWT_IN** and + * **BPF_PROG_TYPE_LWT_XMIT**. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_lwt_seg6_store_bytes(struct sk_buff *skb, u32 offset, const void *from, u32 len) + * Description + * Store *len* bytes from address *from* into the packet + * associated to *skb*, at *offset*. Only the flags, tag and TLVs + * inside the outermost IPv6 Segment Routing Header can be + * modified through this helper. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_lwt_seg6_adjust_srh(struct sk_buff *skb, u32 offset, s32 delta) + * Description + * Adjust the size allocated to TLVs in the outermost IPv6 + * Segment Routing Header contained in the packet associated to + * *skb*, at position *offset* by *delta* bytes. Only offsets + * after the segments are accepted. *delta* can be as well + * positive (growing) as negative (shrinking). + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_lwt_seg6_action(struct sk_buff *skb, u32 action, void *param, u32 param_len) + * Description + * Apply an IPv6 Segment Routing action of type *action* to the + * packet associated to *skb*. Each action takes a parameter + * contained at address *param*, and of length *param_len* bytes. + * *action* can be one of: + * + * **SEG6_LOCAL_ACTION_END_X** + * End.X action: Endpoint with Layer-3 cross-connect. + * Type of *param*: **struct in6_addr**. + * **SEG6_LOCAL_ACTION_END_T** + * End.T action: Endpoint with specific IPv6 table lookup. + * Type of *param*: **int**. + * **SEG6_LOCAL_ACTION_END_B6** + * End.B6 action: Endpoint bound to an SRv6 policy. + * Type of *param*: **struct ipv6_sr_hdr**. + * **SEG6_LOCAL_ACTION_END_B6_ENCAP** + * End.B6.Encap action: Endpoint bound to an SRv6 + * encapsulation policy. + * Type of *param*: **struct ipv6_sr_hdr**. + * + * A call to this helper is susceptible to change the underlying + * packet buffer. Therefore, at load time, all checks on pointers + * previously done by the verifier are invalidated and must be + * performed again, if the helper is used in combination with + * direct packet access. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_rc_repeat(void *ctx) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded repeat key message. This delays + * the generation of a key up event for previously generated + * key down event. + * + * Some IR protocols like NEC have a special IR message for + * repeating last button, for when a button is held down. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * Return + * 0 + * + * long bpf_rc_keydown(void *ctx, u32 protocol, u64 scancode, u32 toggle) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded key press with *scancode*, + * *toggle* value in the given *protocol*. The scancode will be + * translated to a keycode using the rc keymap, and reported as + * an input key down event. After a period a key up event is + * generated. This period can be extended by calling either + * **bpf_rc_keydown**\ () again with the same values, or calling + * **bpf_rc_repeat**\ (). + * + * Some protocols include a toggle bit, in case the button was + * released and pressed again between consecutive scancodes. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * The *protocol* is the decoded protocol number (see + * **enum rc_proto** for some predefined values). + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * Return + * 0 + * + * u64 bpf_skb_cgroup_id(struct sk_buff *skb) + * Description + * Return the cgroup v2 id of the socket associated with the *skb*. + * This is roughly similar to the **bpf_get_cgroup_classid**\ () + * helper for cgroup v1 by providing a tag resp. identifier that + * can be matched on or used for map lookups e.g. to implement + * policy. The cgroup v2 id of a given path in the hierarchy is + * exposed in user space through the f_handle API in order to get + * to the same 64-bit id. + * + * This helper can be used on TC egress path, but not on ingress, + * and is available only if the kernel was compiled with the + * **CONFIG_SOCK_CGROUP_DATA** configuration option. + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * u64 bpf_get_current_cgroup_id(void) + * Description + * Get the current cgroup id based on the cgroup within which + * the current task is running. + * Return + * A 64-bit integer containing the current cgroup id based + * on the cgroup within which the current task is running. + * + * void *bpf_get_local_storage(void *map, u64 flags) + * Description + * Get the pointer to the local storage area. + * The type and the size of the local storage is defined + * by the *map* argument. + * The *flags* meaning is specific for each map type, + * and has to be 0 for cgroup local storage. + * + * Depending on the BPF program type, a local storage area + * can be shared between multiple instances of the BPF program, + * running simultaneously. + * + * A user should care about the synchronization by himself. + * For example, by using the **BPF_ATOMIC** instructions to alter + * the shared data. + * Return + * A pointer to the local storage area. + * + * long bpf_sk_select_reuseport(struct sk_reuseport_md *reuse, struct bpf_map *map, void *key, u64 flags) + * Description + * Select a **SO_REUSEPORT** socket from a + * **BPF_MAP_TYPE_REUSEPORT_SOCKARRAY** *map*. + * It checks the selected socket is matching the incoming + * request in the socket buffer. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_skb_ancestor_cgroup_id(struct sk_buff *skb, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *skb* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *skb*, then return value will be same as that + * of **bpf_skb_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *skb*. + * + * The format of returned id and helper limitations are same as in + * **bpf_skb_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * struct bpf_sock *bpf_sk_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) + * Description + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is a negative signed 32-bit integer, then the + * socket lookup table in the netns associated with the *ctx* + * will be used. For the TC hooks, this is the netns of the device + * in the skb. For socket hooks, this is the netns of the socket. + * If *netns* is any other signed 32-bit value greater than or + * equal to zero then it specifies the ID of the netns relative to + * the netns associated with the *ctx*. *netns* values beyond the + * range of 32-bit integers are reserved for future use. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. + * + * struct bpf_sock *bpf_sk_lookup_udp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) + * Description + * Look for UDP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * The *ctx* should point to the context of the program, such as + * the skb or socket (depending on the hook in use). This is used + * to determine the base network namespace for the lookup. + * + * *tuple_size* must be one of: + * + * **sizeof**\ (*tuple*\ **->ipv4**) + * Look for an IPv4 socket. + * **sizeof**\ (*tuple*\ **->ipv6**) + * Look for an IPv6 socket. + * + * If the *netns* is a negative signed 32-bit integer, then the + * socket lookup table in the netns associated with the *ctx* + * will be used. For the TC hooks, this is the netns of the device + * in the skb. For socket hooks, this is the netns of the socket. + * If *netns* is any other signed 32-bit value greater than or + * equal to zero then it specifies the ID of the netns relative to + * the netns associated with the *ctx*. *netns* values beyond the + * range of 32-bit integers are reserved for future use. + * + * All values for *flags* are reserved for future usage, and must + * be left at zero. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. + * + * long bpf_sk_release(void *sock) + * Description + * Release the reference held by *sock*. *sock* must be a + * non-**NULL** pointer that was returned from + * **bpf_sk_lookup_xxx**\ (). + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_map_push_elem(struct bpf_map *map, const void *value, u64 flags) + * Description + * Push an element *value* in *map*. *flags* is one of: + * + * **BPF_EXIST** + * If the queue/stack is full, the oldest element is + * removed to make room for this. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_map_pop_elem(struct bpf_map *map, void *value) + * Description + * Pop an element from *map*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_map_peek_elem(struct bpf_map *map, void *value) + * Description + * Get an element from *map* without removing it. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_msg_push_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * Description + * For socket policies, insert *len* bytes into *msg* at offset + * *start*. + * + * If a program of type **BPF_PROG_TYPE_SK_MSG** is run on a + * *msg* it may want to insert metadata or options into the *msg*. + * This can later be read and used by any of the lower layer BPF + * hooks. + * + * This helper may fail if under memory pressure (a malloc + * fails) in these cases BPF programs will get an appropriate + * error and BPF programs will need to handle them. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_msg_pop_data(struct sk_msg_buff *msg, u32 start, u32 len, u64 flags) + * Description + * Will remove *len* bytes from a *msg* starting at byte *start*. + * This may result in **ENOMEM** errors under certain situations if + * an allocation and copy are required due to a full ring buffer. + * However, the helper will try to avoid doing the allocation + * if possible. Other errors can occur if input parameters are + * invalid either due to *start* byte not being valid part of *msg* + * payload and/or *pop* value being to large. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_rc_pointer_rel(void *ctx, s32 rel_x, s32 rel_y) + * Description + * This helper is used in programs implementing IR decoding, to + * report a successfully decoded pointer movement. + * + * The *ctx* should point to the lirc sample as passed into + * the program. + * + * This helper is only available is the kernel was compiled with + * the **CONFIG_BPF_LIRC_MODE2** configuration option set to + * "**y**". + * Return + * 0 + * + * long bpf_spin_lock(struct bpf_spin_lock *lock) + * Description + * Acquire a spinlock represented by the pointer *lock*, which is + * stored as part of a value of a map. Taking the lock allows to + * safely update the rest of the fields in that value. The + * spinlock can (and must) later be released with a call to + * **bpf_spin_unlock**\ (\ *lock*\ ). + * + * Spinlocks in BPF programs come with a number of restrictions + * and constraints: + * + * * **bpf_spin_lock** objects are only allowed inside maps of + * types **BPF_MAP_TYPE_HASH** and **BPF_MAP_TYPE_ARRAY** (this + * list could be extended in the future). + * * BTF description of the map is mandatory. + * * The BPF program can take ONE lock at a time, since taking two + * or more could cause dead locks. + * * Only one **struct bpf_spin_lock** is allowed per map element. + * * When the lock is taken, calls (either BPF to BPF or helpers) + * are not allowed. + * * The **BPF_LD_ABS** and **BPF_LD_IND** instructions are not + * allowed inside a spinlock-ed region. + * * The BPF program MUST call **bpf_spin_unlock**\ () to release + * the lock, on all execution paths, before it returns. + * * The BPF program can access **struct bpf_spin_lock** only via + * the **bpf_spin_lock**\ () and **bpf_spin_unlock**\ () + * helpers. Loading or storing data into the **struct + * bpf_spin_lock** *lock*\ **;** field of a map is not allowed. + * * To use the **bpf_spin_lock**\ () helper, the BTF description + * of the map value must be a struct and have **struct + * bpf_spin_lock** *anyname*\ **;** field at the top level. + * Nested lock inside another struct is not allowed. + * * The **struct bpf_spin_lock** *lock* field in a map value must + * be aligned on a multiple of 4 bytes in that value. + * * Syscall with command **BPF_MAP_LOOKUP_ELEM** does not copy + * the **bpf_spin_lock** field to user space. + * * Syscall with command **BPF_MAP_UPDATE_ELEM**, or update from + * a BPF program, do not update the **bpf_spin_lock** field. + * * **bpf_spin_lock** cannot be on the stack or inside a + * networking packet (it can only be inside of a map values). + * * **bpf_spin_lock** is available to root only. + * * Tracing programs and socket filter programs cannot use + * **bpf_spin_lock**\ () due to insufficient preemption checks + * (but this may change in the future). + * * **bpf_spin_lock** is not allowed in inner maps of map-in-map. + * Return + * 0 + * + * long bpf_spin_unlock(struct bpf_spin_lock *lock) + * Description + * Release the *lock* previously locked by a call to + * **bpf_spin_lock**\ (\ *lock*\ ). + * Return + * 0 + * + * struct bpf_sock *bpf_sk_fullsock(struct bpf_sock *sk) + * Description + * This helper gets a **struct bpf_sock** pointer such + * that all the fields in this **bpf_sock** can be accessed. + * Return + * A **struct bpf_sock** pointer on success, or **NULL** in + * case of failure. + * + * struct bpf_tcp_sock *bpf_tcp_sock(struct bpf_sock *sk) + * Description + * This helper gets a **struct bpf_tcp_sock** pointer from a + * **struct bpf_sock** pointer. + * Return + * A **struct bpf_tcp_sock** pointer on success, or **NULL** in + * case of failure. + * + * long bpf_skb_ecn_set_ce(struct sk_buff *skb) + * Description + * Set ECN (Explicit Congestion Notification) field of IP header + * to **CE** (Congestion Encountered) if current value is **ECT** + * (ECN Capable Transport). Otherwise, do nothing. Works with IPv6 + * and IPv4. + * Return + * 1 if the **CE** flag is set (either by the current helper call + * or because it was already present), 0 if it is not set. + * + * struct bpf_sock *bpf_get_listener_sock(struct bpf_sock *sk) + * Description + * Return a **struct bpf_sock** pointer in **TCP_LISTEN** state. + * **bpf_sk_release**\ () is unnecessary and not allowed. + * Return + * A **struct bpf_sock** pointer on success, or **NULL** in + * case of failure. + * + * struct bpf_sock *bpf_skc_lookup_tcp(void *ctx, struct bpf_sock_tuple *tuple, u32 tuple_size, u64 netns, u64 flags) + * Description + * Look for TCP socket matching *tuple*, optionally in a child + * network namespace *netns*. The return value must be checked, + * and if non-**NULL**, released via **bpf_sk_release**\ (). + * + * This function is identical to **bpf_sk_lookup_tcp**\ (), except + * that it also returns timewait or request sockets. Use + * **bpf_sk_fullsock**\ () or **bpf_tcp_sock**\ () to access the + * full structure. + * + * This helper is available only if the kernel was compiled with + * **CONFIG_NET** configuration option. + * Return + * Pointer to **struct bpf_sock**, or **NULL** in case of failure. + * For sockets with reuseport option, the **struct bpf_sock** + * result is from *reuse*\ **->socks**\ [] using the hash of the + * tuple. + * + * long bpf_tcp_check_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * Description + * Check whether *iph* and *th* contain a valid SYN cookie ACK for + * the listening socket in *sk*. + * + * *iph* points to the start of the IPv4 or IPv6 header, while + * *iph_len* contains **sizeof**\ (**struct iphdr**) or + * **sizeof**\ (**struct ipv6hdr**). + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * Return + * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative + * error otherwise. + * + * long bpf_sysctl_get_name(struct bpf_sysctl *ctx, char *buf, size_t buf_len, u64 flags) + * Description + * Get name of sysctl in /proc/sys/ and copy it into provided by + * program buffer *buf* of size *buf_len*. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * + * If *flags* is zero, full name (e.g. "net/ipv4/tcp_mem") is + * copied. Use **BPF_F_SYSCTL_BASE_NAME** flag to copy base name + * only (e.g. "tcp_mem"). + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * long bpf_sysctl_get_current_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * Description + * Get current value of sysctl as it is presented in /proc/sys + * (incl. newline, etc), and copy it as a string into provided + * by program buffer *buf* of size *buf_len*. + * + * The whole value is copied, no matter what file position user + * space issued e.g. sys_read at. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * **-EINVAL** if current value was unavailable, e.g. because + * sysctl is uninitialized and read returns -EIO for it. + * + * long bpf_sysctl_get_new_value(struct bpf_sysctl *ctx, char *buf, size_t buf_len) + * Description + * Get new value being written by user space to sysctl (before + * the actual write happens) and copy it as a string into + * provided by program buffer *buf* of size *buf_len*. + * + * User space may write new value at file position > 0. + * + * The buffer is always NUL terminated, unless it's zero-sized. + * Return + * Number of character copied (not including the trailing NUL). + * + * **-E2BIG** if the buffer wasn't big enough (*buf* will contain + * truncated name in this case). + * + * **-EINVAL** if sysctl is being read. + * + * long bpf_sysctl_set_new_value(struct bpf_sysctl *ctx, const char *buf, size_t buf_len) + * Description + * Override new value being written by user space to sysctl with + * value provided by program in buffer *buf* of size *buf_len*. + * + * *buf* should contain a string in same form as provided by user + * space on sysctl write. + * + * User space may write new value at file position > 0. To override + * the whole sysctl value file position should be set to zero. + * Return + * 0 on success. + * + * **-E2BIG** if the *buf_len* is too big. + * + * **-EINVAL** if sysctl is being read. + * + * long bpf_strtol(const char *buf, size_t buf_len, u64 flags, long *res) + * Description + * Convert the initial part of the string from buffer *buf* of + * size *buf_len* to a long integer according to the given base + * and save the result in *res*. + * + * The string may begin with an arbitrary amount of white space + * (as determined by **isspace**\ (3)) followed by a single + * optional '**-**' sign. + * + * Five least significant bits of *flags* encode base, other bits + * are currently unused. + * + * Base must be either 8, 10, 16 or 0 to detect it automatically + * similar to user space **strtol**\ (3). + * Return + * Number of characters consumed on success. Must be positive but + * no more than *buf_len*. + * + * **-EINVAL** if no valid digits were found or unsupported base + * was provided. + * + * **-ERANGE** if resulting value was out of range. + * + * long bpf_strtoul(const char *buf, size_t buf_len, u64 flags, unsigned long *res) + * Description + * Convert the initial part of the string from buffer *buf* of + * size *buf_len* to an unsigned long integer according to the + * given base and save the result in *res*. + * + * The string may begin with an arbitrary amount of white space + * (as determined by **isspace**\ (3)). + * + * Five least significant bits of *flags* encode base, other bits + * are currently unused. + * + * Base must be either 8, 10, 16 or 0 to detect it automatically + * similar to user space **strtoul**\ (3). + * Return + * Number of characters consumed on success. Must be positive but + * no more than *buf_len*. + * + * **-EINVAL** if no valid digits were found or unsupported base + * was provided. + * + * **-ERANGE** if resulting value was out of range. + * + * void *bpf_sk_storage_get(struct bpf_map *map, void *sk, void *value, u64 flags) + * Description + * Get a bpf-local-storage from a *sk*. + * + * Logically, it could be thought of getting the value from + * a *map* with *sk* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *sk*) except this + * helper enforces the key must be a full socket and the map must + * be a **BPF_MAP_TYPE_SK_STORAGE** also. + * + * Underneath, the value is stored locally at *sk* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf-local-storages residing at *sk*. + * + * *sk* is a kernel **struct sock** pointer for LSM program. + * *sk* is a **struct bpf_sock** pointer for other program types. + * + * An optional *flags* (**BPF_SK_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf-local-storage will be + * created if one does not exist. *value* can be used + * together with **BPF_SK_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf-local-storage. If *value* is + * **NULL**, the new bpf-local-storage will be zero initialized. + * Return + * A bpf-local-storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf-local-storage. + * + * long bpf_sk_storage_delete(struct bpf_map *map, void *sk) + * Description + * Delete a bpf-local-storage from a *sk*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf-local-storage cannot be found. + * **-EINVAL** if sk is not a fullsock (e.g. a request_sock). + * + * long bpf_send_signal(u32 sig) + * Description + * Send signal *sig* to the process of the current task. + * The signal may be delivered to any of this process's threads. + * Return + * 0 on success or successfully queued. + * + * **-EBUSY** if work queue under nmi is full. + * + * **-EINVAL** if *sig* is invalid. + * + * **-EPERM** if no permission to send the *sig*. + * + * **-EAGAIN** if bpf program can try again. + * + * s64 bpf_tcp_gen_syncookie(void *sk, void *iph, u32 iph_len, struct tcphdr *th, u32 th_len) + * Description + * Try to issue a SYN cookie for the packet with corresponding + * IP/TCP headers, *iph* and *th*, on the listening socket in *sk*. + * + * *iph* points to the start of the IPv4 or IPv6 header, while + * *iph_len* contains **sizeof**\ (**struct iphdr**) or + * **sizeof**\ (**struct ipv6hdr**). + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header with options (at least + * **sizeof**\ (**struct tcphdr**)). + * Return + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** SYN cookie cannot be issued due to error + * + * **-ENOENT** SYN cookie should not be issued (no SYN flood) + * + * **-EOPNOTSUPP** kernel configuration does not enable SYN cookies + * + * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 + * + * long bpf_skb_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct sk_buff. + * + * This helper is similar to **bpf_perf_event_output**\ () but + * restricted to raw_tracepoint bpf programs. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_probe_read_user(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Safely attempt to read *size* bytes from user space address + * *unsafe_ptr* and store the data in *dst*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_probe_read_kernel(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Safely attempt to read *size* bytes from kernel space address + * *unsafe_ptr* and store the data in *dst*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_probe_read_user_str(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe user address + * *unsafe_ptr* to *dst*. The *size* should include the + * terminating NUL byte. In case the string length is smaller than + * *size*, the target is not padded with further NUL bytes. If the + * string length is larger than *size*, just *size*-1 bytes are + * copied and the last byte is set to NUL. + * + * On success, returns the number of bytes that were written, + * including the terminal NUL. This makes this helper useful in + * tracing programs for reading strings, and more importantly to + * get its length at runtime. See the following snippet: + * + * :: + * + * SEC("kprobe/sys_open") + * void bpf_sys_open(struct pt_regs *ctx) + * { + * char buf[PATHLEN]; // PATHLEN is defined to 256 + * int res = bpf_probe_read_user_str(buf, sizeof(buf), + * ctx->di); + * + * // Consume buf, for example push it to + * // userspace via bpf_perf_event_output(); we + * // can use res (the string length) as event + * // size, after checking its boundaries. + * } + * + * In comparison, using **bpf_probe_read_user**\ () helper here + * instead to read the string would require to estimate the length + * at compile time, and would often result in copying more memory + * than necessary. + * + * Another useful use case is when parsing individual process + * arguments or individual environment variables navigating + * *current*\ **->mm->arg_start** and *current*\ + * **->mm->env_start**: using this helper and the return value, + * one can quickly iterate at the right offset of the memory area. + * Return + * On success, the strictly positive length of the output string, + * including the trailing NUL character. On error, a negative + * value. + * + * long bpf_probe_read_kernel_str(void *dst, u32 size, const void *unsafe_ptr) + * Description + * Copy a NUL terminated string from an unsafe kernel address *unsafe_ptr* + * to *dst*. Same semantics as with **bpf_probe_read_user_str**\ () apply. + * Return + * On success, the strictly positive length of the string, including + * the trailing NUL character. On error, a negative value. + * + * long bpf_tcp_send_ack(void *tp, u32 rcv_nxt) + * Description + * Send out a tcp-ack. *tp* is the in-kernel struct **tcp_sock**. + * *rcv_nxt* is the ack_seq to be sent out. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_send_signal_thread(u32 sig) + * Description + * Send signal *sig* to the thread corresponding to the current task. + * Return + * 0 on success or successfully queued. + * + * **-EBUSY** if work queue under nmi is full. + * + * **-EINVAL** if *sig* is invalid. + * + * **-EPERM** if no permission to send the *sig*. + * + * **-EAGAIN** if bpf program can try again. + * + * u64 bpf_jiffies64(void) + * Description + * Obtain the 64bit jiffies + * Return + * The 64 bit jiffies + * + * long bpf_read_branch_records(struct bpf_perf_event_data *ctx, void *buf, u32 size, u64 flags) + * Description + * For an eBPF program attached to a perf event, retrieve the + * branch records (**struct perf_branch_entry**) associated to *ctx* + * and store it in the buffer pointed by *buf* up to size + * *size* bytes. + * Return + * On success, number of bytes written to *buf*. On error, a + * negative value. + * + * The *flags* can be set to **BPF_F_GET_BRANCH_RECORDS_SIZE** to + * instead return the number of bytes required to store all the + * branch entries. If this flag is set, *buf* may be NULL. + * + * **-EINVAL** if arguments invalid or **size** not a multiple + * of **sizeof**\ (**struct perf_branch_entry**\ ). + * + * **-ENOENT** if architecture does not support branch records. + * + * long bpf_get_ns_current_pid_tgid(u64 dev, u64 ino, struct bpf_pidns_info *nsdata, u32 size) + * Description + * Returns 0 on success, values for *pid* and *tgid* as seen from the current + * *namespace* will be returned in *nsdata*. + * Return + * 0 on success, or one of the following in case of failure: + * + * **-EINVAL** if dev and inum supplied don't match dev_t and inode number + * with nsfs of current task, or if dev conversion to dev_t lost high bits. + * + * **-ENOENT** if pidns does not exists for the current task. + * + * long bpf_xdp_output(void *ctx, struct bpf_map *map, u64 flags, void *data, u64 size) + * Description + * Write raw *data* blob into a special BPF perf event held by + * *map* of type **BPF_MAP_TYPE_PERF_EVENT_ARRAY**. This perf + * event must have the following attributes: **PERF_SAMPLE_RAW** + * as **sample_type**, **PERF_TYPE_SOFTWARE** as **type**, and + * **PERF_COUNT_SW_BPF_OUTPUT** as **config**. + * + * The *flags* are used to indicate the index in *map* for which + * the value must be put, masked with **BPF_F_INDEX_MASK**. + * Alternatively, *flags* can be set to **BPF_F_CURRENT_CPU** + * to indicate that the index of the current CPU core should be + * used. + * + * The value to write, of *size*, is passed through eBPF stack and + * pointed by *data*. + * + * *ctx* is a pointer to in-kernel struct xdp_buff. + * + * This helper is similar to **bpf_perf_eventoutput**\ () but + * restricted to raw_tracepoint bpf programs. + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_get_netns_cookie(void *ctx) + * Description + * Retrieve the cookie (generated by the kernel) of the network + * namespace the input *ctx* is associated with. The network + * namespace cookie remains stable for its lifetime and provides + * a global identifier that can be assumed unique. If *ctx* is + * NULL, then the helper returns the cookie for the initial + * network namespace. The cookie itself is very similar to that + * of **bpf_get_socket_cookie**\ () helper, but for network + * namespaces instead of sockets. + * Return + * A 8-byte long opaque number. + * + * u64 bpf_get_current_ancestor_cgroup_id(int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of the cgroup associated + * with the current task at the *ancestor_level*. The root cgroup + * is at *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with the current task, then return value will be the + * same as that of **bpf_get_current_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with the current task. + * + * The format of returned id and helper limitations are same as in + * **bpf_get_current_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * long bpf_sk_assign(struct sk_buff *skb, void *sk, u64 flags) + * Description + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SCHED_CLS** and + * **BPF_PROG_TYPE_SCHED_ACT** programs. + * + * Assign the *sk* to the *skb*. When combined with appropriate + * routing configuration to receive the packet towards the socket, + * will cause *skb* to be delivered to the specified socket. + * Subsequent redirection of *skb* via **bpf_redirect**\ (), + * **bpf_clone_redirect**\ () or other methods outside of BPF may + * interfere with successful delivery to the socket. + * + * This operation is only valid from TC ingress path. + * + * The *flags* argument must be zero. + * Return + * 0 on success, or a negative error in case of failure: + * + * **-EINVAL** if specified *flags* are not supported. + * + * **-ENOENT** if the socket is unavailable for assignment. + * + * **-ENETUNREACH** if the socket is unreachable (wrong netns). + * + * **-EOPNOTSUPP** if the operation is not supported, for example + * a call from outside of TC ingress. + * + * long bpf_sk_assign(struct bpf_sk_lookup *ctx, struct bpf_sock *sk, u64 flags) + * Description + * Helper is overloaded depending on BPF program type. This + * description applies to **BPF_PROG_TYPE_SK_LOOKUP** programs. + * + * Select the *sk* as a result of a socket lookup. + * + * For the operation to succeed passed socket must be compatible + * with the packet description provided by the *ctx* object. + * + * L4 protocol (**IPPROTO_TCP** or **IPPROTO_UDP**) must + * be an exact match. While IP family (**AF_INET** or + * **AF_INET6**) must be compatible, that is IPv6 sockets + * that are not v6-only can be selected for IPv4 packets. + * + * Only TCP listeners and UDP unconnected sockets can be + * selected. *sk* can also be NULL to reset any previous + * selection. + * + * *flags* argument can combination of following values: + * + * * **BPF_SK_LOOKUP_F_REPLACE** to override the previous + * socket selection, potentially done by a BPF program + * that ran before us. + * + * * **BPF_SK_LOOKUP_F_NO_REUSEPORT** to skip + * load-balancing within reuseport group for the socket + * being selected. + * + * On success *ctx->sk* will point to the selected socket. + * + * Return + * 0 on success, or a negative errno in case of failure. + * + * * **-EAFNOSUPPORT** if socket family (*sk->family*) is + * not compatible with packet family (*ctx->family*). + * + * * **-EEXIST** if socket has been already selected, + * potentially by another program, and + * **BPF_SK_LOOKUP_F_REPLACE** flag was not specified. + * + * * **-EINVAL** if unsupported flags were specified. + * + * * **-EPROTOTYPE** if socket L4 protocol + * (*sk->protocol*) doesn't match packet protocol + * (*ctx->protocol*). + * + * * **-ESOCKTNOSUPPORT** if socket is not in allowed + * state (TCP listening or UDP unconnected). + * + * u64 bpf_ktime_get_boot_ns(void) + * Description + * Return the time elapsed since system boot, in nanoseconds. + * Does include the time the system was suspended. + * See: **clock_gettime**\ (**CLOCK_BOOTTIME**) + * Return + * Current *ktime*. + * + * long bpf_seq_printf(struct seq_file *m, const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * Description + * **bpf_seq_printf**\ () uses seq_file **seq_printf**\ () to print + * out the format string. + * The *m* represents the seq_file. The *fmt* and *fmt_size* are for + * the format string itself. The *data* and *data_len* are format string + * arguments. The *data* are a **u64** array and corresponding format string + * values are stored in the array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* array. + * The *data_len* is the size of *data* in bytes - must be a multiple of 8. + * + * Formats **%s**, **%p{i,I}{4,6}** requires to read kernel memory. + * Reading kernel memory may fail due to either invalid address or + * valid address but requiring a major memory fault. If reading kernel memory + * fails, the string for **%s** will be an empty string, and the ip + * address for **%p{i,I}{4,6}** will be 0. Not returning error to + * bpf program is consistent with what **bpf_trace_printk**\ () does for now. + * Return + * 0 on success, or a negative error in case of failure: + * + * **-EBUSY** if per-CPU memory copy buffer is busy, can try again + * by returning 1 from bpf program. + * + * **-EINVAL** if arguments are invalid, or if *fmt* is invalid/unsupported. + * + * **-E2BIG** if *fmt* contains too many format specifiers. + * + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + * + * long bpf_seq_write(struct seq_file *m, const void *data, u32 len) + * Description + * **bpf_seq_write**\ () uses seq_file **seq_write**\ () to write the data. + * The *m* represents the seq_file. The *data* and *len* represent the + * data to write in bytes. + * Return + * 0 on success, or a negative error in case of failure: + * + * **-EOVERFLOW** if an overflow happened: The same object will be tried again. + * + * u64 bpf_sk_cgroup_id(void *sk) + * Description + * Return the cgroup v2 id of the socket *sk*. + * + * *sk* must be a non-**NULL** pointer to a socket, e.g. one + * returned from **bpf_sk_lookup_xxx**\ (), + * **bpf_sk_fullsock**\ (), etc. The format of returned id is + * same as in **bpf_skb_cgroup_id**\ (). + * + * This helper is available only if the kernel was compiled with + * the **CONFIG_SOCK_CGROUP_DATA** configuration option. + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * u64 bpf_sk_ancestor_cgroup_id(void *sk, int ancestor_level) + * Description + * Return id of cgroup v2 that is ancestor of cgroup associated + * with the *sk* at the *ancestor_level*. The root cgroup is at + * *ancestor_level* zero and each step down the hierarchy + * increments the level. If *ancestor_level* == level of cgroup + * associated with *sk*, then return value will be same as that + * of **bpf_sk_cgroup_id**\ (). + * + * The helper is useful to implement policies based on cgroups + * that are upper in hierarchy than immediate cgroup associated + * with *sk*. + * + * The format of returned id and helper limitations are same as in + * **bpf_sk_cgroup_id**\ (). + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * long bpf_ringbuf_output(void *ringbuf, void *data, u64 size, u64 flags) + * Description + * Copy *size* bytes from *data* into a ring buffer *ringbuf*. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * An adaptive notification is a notification sent whenever the user-space + * process has caught up and consumed all available payloads. In case the user-space + * process is still processing a previous payload, then no notification is needed + * as it will process the newly added payload automatically. + * Return + * 0 on success, or a negative error in case of failure. + * + * void *bpf_ringbuf_reserve(void *ringbuf, u64 size, u64 flags) + * Description + * Reserve *size* bytes of payload in a ring buffer *ringbuf*. + * *flags* must be 0. + * Return + * Valid pointer with *size* bytes of memory available; NULL, + * otherwise. + * + * void bpf_ringbuf_submit(void *data, u64 flags) + * Description + * Submit reserved ring buffer sample, pointed to by *data*. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. + * Return + * Nothing. Always succeeds. + * + * void bpf_ringbuf_discard(void *data, u64 flags) + * Description + * Discard reserved ring buffer sample, pointed to by *data*. + * If **BPF_RB_NO_WAKEUP** is specified in *flags*, no notification + * of new data availability is sent. + * If **BPF_RB_FORCE_WAKEUP** is specified in *flags*, notification + * of new data availability is sent unconditionally. + * If **0** is specified in *flags*, an adaptive notification + * of new data availability is sent. + * + * See 'bpf_ringbuf_output()' for the definition of adaptive notification. + * Return + * Nothing. Always succeeds. + * + * u64 bpf_ringbuf_query(void *ringbuf, u64 flags) + * Description + * Query various characteristics of provided ring buffer. What + * exactly is queries is determined by *flags*: + * + * * **BPF_RB_AVAIL_DATA**: Amount of data not yet consumed. + * * **BPF_RB_RING_SIZE**: The size of ring buffer. + * * **BPF_RB_CONS_POS**: Consumer position (can wrap around). + * * **BPF_RB_PROD_POS**: Producer(s) position (can wrap around). + * + * Data returned is just a momentary snapshot of actual values + * and could be inaccurate, so this facility should be used to + * power heuristics and for reporting, not to make 100% correct + * calculation. + * Return + * Requested value, or 0, if *flags* are not recognized. + * + * long bpf_csum_level(struct sk_buff *skb, u64 level) + * Description + * Change the skbs checksum level by one layer up or down, or + * reset it entirely to none in order to have the stack perform + * checksum validation. The level is applicable to the following + * protocols: TCP, UDP, GRE, SCTP, FCOE. For example, a decap of + * | ETH | IP | UDP | GUE | IP | TCP | into | ETH | IP | TCP | + * through **bpf_skb_adjust_room**\ () helper with passing in + * **BPF_F_ADJ_ROOM_NO_CSUM_RESET** flag would require one call + * to **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_DEC** since + * the UDP header is removed. Similarly, an encap of the latter + * into the former could be accompanied by a helper call to + * **bpf_csum_level**\ () with **BPF_CSUM_LEVEL_INC** if the + * skb is still intended to be processed in higher layers of the + * stack instead of just egressing at tc. + * + * There are three supported level settings at this time: + * + * * **BPF_CSUM_LEVEL_INC**: Increases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_DEC**: Decreases skb->csum_level for skbs + * with CHECKSUM_UNNECESSARY. + * * **BPF_CSUM_LEVEL_RESET**: Resets skb->csum_level to 0 and + * sets CHECKSUM_NONE to force checksum validation by the stack. + * * **BPF_CSUM_LEVEL_QUERY**: No-op, returns the current + * skb->csum_level. + * Return + * 0 on success, or a negative error in case of failure. In the + * case of **BPF_CSUM_LEVEL_QUERY**, the current skb->csum_level + * is returned or the error code -EACCES in case the skb is not + * subject to CHECKSUM_UNNECESSARY. + * + * struct tcp6_sock *bpf_skc_to_tcp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp6_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * struct tcp_sock *bpf_skc_to_tcp_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * struct tcp_timewait_sock *bpf_skc_to_tcp_timewait_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_timewait_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * struct tcp_request_sock *bpf_skc_to_tcp_request_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *tcp_request_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * struct udp6_sock *bpf_skc_to_udp6_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *udp6_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * long bpf_get_task_stack(struct task_struct *task, void *buf, u32 size, u64 flags) + * Description + * Return a user or a kernel stack in bpf program provided buffer. + * Note: the user stack will only be populated if the *task* is + * the current task; all other tasks will return -EOPNOTSUPP. + * To achieve this, the helper needs *task*, which is a valid + * pointer to **struct task_struct**. To store the stacktrace, the + * bpf program provides *buf* with a nonnegative *size*. + * + * The last argument, *flags*, holds the number of stack frames to + * skip (from 0 to 255), masked with + * **BPF_F_SKIP_FIELD_MASK**. The next bits can be used to set + * the following flags: + * + * **BPF_F_USER_STACK** + * Collect a user space stack instead of a kernel stack. + * The *task* must be the current task. + * **BPF_F_USER_BUILD_ID** + * Collect buildid+offset instead of ips for user stack, + * only valid if **BPF_F_USER_STACK** is also specified. + * + * **bpf_get_task_stack**\ () can collect up to + * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject + * to sufficient large buffer size. Note that + * this limit can be controlled with the **sysctl** program, and + * that it should be manually increased in order to profile long + * user stacks (such as stacks for Java programs). To do so, use: + * + * :: + * + * # sysctl kernel.perf_event_max_stack= + * Return + * The non-negative copied *buf* length equal to or less than + * *size* on success, or a negative error in case of failure. + * + * long bpf_load_hdr_opt(struct bpf_sock_ops *skops, void *searchby_res, u32 len, u64 flags) + * Description + * Load header option. Support reading a particular TCP header + * option for bpf program (**BPF_PROG_TYPE_SOCK_OPS**). + * + * If *flags* is 0, it will search the option from the + * *skops*\ **->skb_data**. The comment in **struct bpf_sock_ops** + * has details on what skb_data contains under different + * *skops*\ **->op**. + * + * The first byte of the *searchby_res* specifies the + * kind that it wants to search. + * + * If the searching kind is an experimental kind + * (i.e. 253 or 254 according to RFC6994). It also + * needs to specify the "magic" which is either + * 2 bytes or 4 bytes. It then also needs to + * specify the size of the magic by using + * the 2nd byte which is "kind-length" of a TCP + * header option and the "kind-length" also + * includes the first 2 bytes "kind" and "kind-length" + * itself as a normal TCP header option also does. + * + * For example, to search experimental kind 254 with + * 2 byte magic 0xeB9F, the searchby_res should be + * [ 254, 4, 0xeB, 0x9F, 0, 0, .... 0 ]. + * + * To search for the standard window scale option (3), + * the *searchby_res* should be [ 3, 0, 0, .... 0 ]. + * Note, kind-length must be 0 for regular option. + * + * Searching for No-Op (0) and End-of-Option-List (1) are + * not supported. + * + * *len* must be at least 2 bytes which is the minimal size + * of a header option. + * + * Supported flags: + * + * * **BPF_LOAD_HDR_OPT_TCP_SYN** to search from the + * saved_syn packet or the just-received syn packet. + * + * Return + * > 0 when found, the header option is copied to *searchby_res*. + * The return value is the total length copied. On failure, a + * negative error code is returned: + * + * **-EINVAL** if a parameter is invalid. + * + * **-ENOMSG** if the option is not found. + * + * **-ENOENT** if no syn packet is available when + * **BPF_LOAD_HDR_OPT_TCP_SYN** is used. + * + * **-ENOSPC** if there is not enough space. Only *len* number of + * bytes are copied. + * + * **-EFAULT** on failure to parse the header options in the + * packet. + * + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. + * + * long bpf_store_hdr_opt(struct bpf_sock_ops *skops, const void *from, u32 len, u64 flags) + * Description + * Store header option. The data will be copied + * from buffer *from* with length *len* to the TCP header. + * + * The buffer *from* should have the whole option that + * includes the kind, kind-length, and the actual + * option data. The *len* must be at least kind-length + * long. The kind-length does not have to be 4 byte + * aligned. The kernel will take care of the padding + * and setting the 4 bytes aligned value to th->doff. + * + * This helper will check for duplicated option + * by searching the same option in the outgoing skb. + * + * This helper can only be called during + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** If param is invalid. + * + * **-ENOSPC** if there is not enough space in the header. + * Nothing has been written + * + * **-EEXIST** if the option already exists. + * + * **-EFAULT** on failure to parse the existing header options. + * + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. + * + * long bpf_reserve_hdr_opt(struct bpf_sock_ops *skops, u32 len, u64 flags) + * Description + * Reserve *len* bytes for the bpf header option. The + * space will be used by **bpf_store_hdr_opt**\ () later in + * **BPF_SOCK_OPS_WRITE_HDR_OPT_CB**. + * + * If **bpf_reserve_hdr_opt**\ () is called multiple times, + * the total number of bytes will be reserved. + * + * This helper can only be called during + * **BPF_SOCK_OPS_HDR_OPT_LEN_CB**. + * + * Return + * 0 on success, or negative error in case of failure: + * + * **-EINVAL** if a parameter is invalid. + * + * **-ENOSPC** if there is not enough space in the header. + * + * **-EPERM** if the helper cannot be used under the current + * *skops*\ **->op**. + * + * void *bpf_inode_storage_get(struct bpf_map *map, void *inode, void *value, u64 flags) + * Description + * Get a bpf_local_storage from an *inode*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *inode* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *inode*) except this + * helper enforces the key must be an inode and the map must also + * be a **BPF_MAP_TYPE_INODE_STORAGE**. + * + * Underneath, the value is stored locally at *inode* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *inode*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * int bpf_inode_storage_delete(struct bpf_map *map, void *inode) + * Description + * Delete a bpf_local_storage from an *inode*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + * + * long bpf_d_path(struct path *path, char *buf, u32 sz) + * Description + * Return full path for given **struct path** object, which + * needs to be the kernel BTF *path* object. The path is + * returned in the provided buffer *buf* of size *sz* and + * is zero terminated. + * + * Return + * On success, the strictly positive length of the string, + * including the trailing NUL character. On error, a negative + * value. + * + * long bpf_copy_from_user(void *dst, u32 size, const void *user_ptr) + * Description + * Read *size* bytes from user space address *user_ptr* and store + * the data in *dst*. This is a wrapper of **copy_from_user**\ (). + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_snprintf_btf(char *str, u32 str_size, struct btf_ptr *ptr, u32 btf_ptr_size, u64 flags) + * Description + * Use BTF to store a string representation of *ptr*->ptr in *str*, + * using *ptr*->type_id. This value should specify the type + * that *ptr*->ptr points to. LLVM __builtin_btf_type_id(type, 1) + * can be used to look up vmlinux BTF type ids. Traversing the + * data structure using BTF, the type information and values are + * stored in the first *str_size* - 1 bytes of *str*. Safe copy of + * the pointer data is carried out to avoid kernel crashes during + * operation. Smaller types can use string space on the stack; + * larger programs can use map data to store the string + * representation. + * + * The string can be subsequently shared with userspace via + * bpf_perf_event_output() or ring buffer interfaces. + * bpf_trace_printk() is to be avoided as it places too small + * a limit on string size to be useful. + * + * *flags* is a combination of + * + * **BTF_F_COMPACT** + * no formatting around type information + * **BTF_F_NONAME** + * no struct/union member names/types + * **BTF_F_PTR_RAW** + * show raw (unobfuscated) pointer values; + * equivalent to printk specifier %px. + * **BTF_F_ZERO** + * show zero-valued struct/union members; they + * are not displayed by default + * + * Return + * The number of bytes that were written (or would have been + * written if output had to be truncated due to string size), + * or a negative error in cases of failure. + * + * long bpf_seq_printf_btf(struct seq_file *m, struct btf_ptr *ptr, u32 ptr_size, u64 flags) + * Description + * Use BTF to write to seq_write a string representation of + * *ptr*->ptr, using *ptr*->type_id as per bpf_snprintf_btf(). + * *flags* are identical to those used for bpf_snprintf_btf. + * Return + * 0 on success or a negative error in case of failure. + * + * u64 bpf_skb_cgroup_classid(struct sk_buff *skb) + * Description + * See **bpf_get_cgroup_classid**\ () for the main description. + * This helper differs from **bpf_get_cgroup_classid**\ () in that + * the cgroup v1 net_cls class is retrieved only from the *skb*'s + * associated socket instead of the current process. + * Return + * The id is returned or 0 in case the id could not be retrieved. + * + * long bpf_redirect_neigh(u32 ifindex, struct bpf_redir_neigh *params, int plen, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex* + * and fill in L2 addresses from neighboring subsystem. This helper + * is somewhat similar to **bpf_redirect**\ (), except that it + * populates L2 addresses as well, meaning, internally, the helper + * relies on the neighbor lookup for the L2 address of the nexthop. + * + * The helper will perform a FIB lookup based on the skb's + * networking header to get the address of the next hop, unless + * this is supplied by the caller in the *params* argument. The + * *plen* argument indicates the len of *params* and should be set + * to 0 if *params* is NULL. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types, and enabled + * for IPv4 and IPv6 protocols. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. + * + * void *bpf_per_cpu_ptr(const void *percpu_ptr, u32 cpu) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on *cpu*. A ksym is an + * extern variable decorated with '__ksym'. For ksym, there is a + * global var (either static or global) defined of the same name + * in the kernel. The ksym is percpu if the global var is percpu. + * The returned pointer points to the global percpu var on *cpu*. + * + * bpf_per_cpu_ptr() has the same semantic as per_cpu_ptr() in the + * kernel, except that bpf_per_cpu_ptr() may return NULL. This + * happens if *cpu* is larger than nr_cpu_ids. The caller of + * bpf_per_cpu_ptr() must check the returned value. + * Return + * A pointer pointing to the kernel percpu variable on *cpu*, or + * NULL, if *cpu* is invalid. + * + * void *bpf_this_cpu_ptr(const void *percpu_ptr) + * Description + * Take a pointer to a percpu ksym, *percpu_ptr*, and return a + * pointer to the percpu kernel variable on this cpu. See the + * description of 'ksym' in **bpf_per_cpu_ptr**\ (). + * + * bpf_this_cpu_ptr() has the same semantic as this_cpu_ptr() in + * the kernel. Different from **bpf_per_cpu_ptr**\ (), it would + * never return NULL. + * Return + * A pointer pointing to the kernel percpu variable on this cpu. + * + * long bpf_redirect_peer(u32 ifindex, u64 flags) + * Description + * Redirect the packet to another net device of index *ifindex*. + * This helper is somewhat similar to **bpf_redirect**\ (), except + * that the redirection happens to the *ifindex*' peer device and + * the netns switch takes place from ingress to ingress without + * going through the CPU's backlog queue. + * + * The *flags* argument is reserved and must be 0. The helper is + * currently only supported for tc BPF program types at the + * ingress hook and for veth and netkit target device types. The + * peer device must reside in a different network namespace. + * Return + * The helper returns **TC_ACT_REDIRECT** on success or + * **TC_ACT_SHOT** on error. + * + * void *bpf_task_storage_get(struct bpf_map *map, struct task_struct *task, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *task*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *task* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this + * helper enforces the key must be a task_struct and the map must also + * be a **BPF_MAP_TYPE_TASK_STORAGE**. + * + * Underneath, the value is stored locally at *task* instead of + * the *map*. The *map* is used as the bpf-local-storage + * "type". The bpf-local-storage "type" (i.e. the *map*) is + * searched against all bpf_local_storage residing at *task*. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_task_storage_delete(struct bpf_map *map, struct task_struct *task) + * Description + * Delete a bpf_local_storage from a *task*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + * + * struct task_struct *bpf_get_current_task_btf(void) + * Description + * Return a BTF pointer to the "current" task. + * This pointer can also be used in helpers that accept an + * *ARG_PTR_TO_BTF_ID* of type *task_struct*. + * Return + * Pointer to the current task. + * + * long bpf_bprm_opts_set(struct linux_binprm *bprm, u64 flags) + * Description + * Set or clear certain options on *bprm*: + * + * **BPF_F_BPRM_SECUREEXEC** Set the secureexec bit + * which sets the **AT_SECURE** auxv for glibc. The bit + * is cleared if the flag is not specified. + * Return + * **-EINVAL** if invalid *flags* are passed, zero otherwise. + * + * u64 bpf_ktime_get_coarse_ns(void) + * Description + * Return a coarse-grained version of the time elapsed since + * system boot, in nanoseconds. Does not include time the system + * was suspended. + * + * See: **clock_gettime**\ (**CLOCK_MONOTONIC_COARSE**) + * Return + * Current *ktime*. + * + * long bpf_ima_inode_hash(struct inode *inode, void *dst, u32 size) + * Description + * Returns the stored IMA hash of the *inode* (if it's available). + * If the hash is larger than *size*, then only *size* + * bytes will be copied to *dst* + * Return + * The **hash_algo** is returned on success, + * **-EOPNOTSUPP** if IMA is disabled or **-EINVAL** if + * invalid arguments are passed. + * + * struct socket *bpf_sock_from_file(struct file *file) + * Description + * If the given file represents a socket, returns the associated + * socket. + * Return + * A pointer to a struct socket on success or NULL if the file is + * not a socket. + * + * long bpf_check_mtu(void *ctx, u32 ifindex, u32 *mtu_len, s32 len_diff, u64 flags) + * Description + * Check packet size against exceeding MTU of net device (based + * on *ifindex*). This helper will likely be used in combination + * with helpers that adjust/change the packet size. + * + * The argument *len_diff* can be used for querying with a planned + * size change. This allows to check MTU prior to changing packet + * ctx. Providing a *len_diff* adjustment that is larger than the + * actual packet size (resulting in negative packet size) will in + * principle not exceed the MTU, which is why it is not considered + * a failure. Other BPF helpers are needed for performing the + * planned size change; therefore the responsibility for catching + * a negative packet size belongs in those helpers. + * + * Specifying *ifindex* zero means the MTU check is performed + * against the current net device. This is practical if this isn't + * used prior to redirect. + * + * On input *mtu_len* must be a valid pointer, else verifier will + * reject BPF program. If the value *mtu_len* is initialized to + * zero then the ctx packet size is use. When value *mtu_len* is + * provided as input this specify the L3 length that the MTU check + * is done against. Remember XDP and TC length operate at L2, but + * this value is L3 as this correlate to MTU and IP-header tot_len + * values which are L3 (similar behavior as bpf_fib_lookup). + * + * The Linux kernel route table can configure MTUs on a more + * specific per route level, which is not provided by this helper. + * For route level MTU checks use the **bpf_fib_lookup**\ () + * helper. + * + * *ctx* is either **struct xdp_md** for XDP programs or + * **struct sk_buff** for tc cls_act programs. + * + * The *flags* argument can be a combination of one or more of the + * following values: + * + * **BPF_MTU_CHK_SEGS** + * This flag will only works for *ctx* **struct sk_buff**. + * If packet context contains extra packet segment buffers + * (often knows as GSO skb), then MTU check is harder to + * check at this point, because in transmit path it is + * possible for the skb packet to get re-segmented + * (depending on net device features). This could still be + * a MTU violation, so this flag enables performing MTU + * check against segments, with a different violation + * return code to tell it apart. Check cannot use len_diff. + * + * On return *mtu_len* pointer contains the MTU value of the net + * device. Remember the net device configured MTU is the L3 size, + * which is returned here and XDP and TC length operate at L2. + * Helper take this into account for you, but remember when using + * MTU value in your BPF-code. + * + * Return + * * 0 on success, and populate MTU value in *mtu_len* pointer. + * + * * < 0 if any input argument is invalid (*mtu_len* not updated) + * + * MTU violations return positive values, but also populate MTU + * value in *mtu_len* pointer, as this can be needed for + * implementing PMTU handing: + * + * * **BPF_MTU_CHK_RET_FRAG_NEEDED** + * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** + * + * long bpf_for_each_map_elem(struct bpf_map *map, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For each element in **map**, call **callback_fn** function with + * **map**, **callback_ctx** and other map-specific parameters. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. + * + * The following are a list of supported map types and their + * respective expected callback signatures: + * + * BPF_MAP_TYPE_HASH, BPF_MAP_TYPE_PERCPU_HASH, + * BPF_MAP_TYPE_LRU_HASH, BPF_MAP_TYPE_LRU_PERCPU_HASH, + * BPF_MAP_TYPE_ARRAY, BPF_MAP_TYPE_PERCPU_ARRAY + * + * long (\*callback_fn)(struct bpf_map \*map, const void \*key, void \*value, void \*ctx); + * + * For per_cpu maps, the map_value is the value on the cpu where the + * bpf_prog is running. + * + * If **callback_fn** return 0, the helper will continue to the next + * element. If return value is 1, the helper will skip the rest of + * elements and return. Other return values are not used now. + * + * Return + * The number of traversed map elements for success, **-EINVAL** for + * invalid **flags**. + * + * long bpf_snprintf(char *str, u32 str_size, const char *fmt, u64 *data, u32 data_len) + * Description + * Outputs a string into the **str** buffer of size **str_size** + * based on a format string stored in a read-only map pointed by + * **fmt**. + * + * Each format specifier in **fmt** corresponds to one u64 element + * in the **data** array. For strings and pointers where pointees + * are accessed, only the pointer values are stored in the *data* + * array. The *data_len* is the size of *data* in bytes - must be + * a multiple of 8. + * + * Formats **%s** and **%p{i,I}{4,6}** require to read kernel + * memory. Reading kernel memory may fail due to either invalid + * address or valid address but requiring a major memory fault. If + * reading kernel memory fails, the string for **%s** will be an + * empty string, and the ip address for **%p{i,I}{4,6}** will be 0. + * Not returning error to bpf program is consistent with what + * **bpf_trace_printk**\ () does for now. + * + * Return + * The strictly positive length of the formatted string, including + * the trailing zero character. If the return value is greater than + * **str_size**, **str** contains a truncated string, guaranteed to + * be zero-terminated except when **str_size** is 0. + * + * Or **-EBUSY** if the per-CPU memory copy buffer is busy. + * + * long bpf_sys_bpf(u32 cmd, void *attr, u32 attr_size) + * Description + * Execute bpf syscall with given arguments. + * Return + * A syscall result. + * + * long bpf_btf_find_by_name_kind(char *name, int name_sz, u32 kind, int flags) + * Description + * Find BTF type with given name and kind in vmlinux BTF or in module's BTFs. + * Return + * Returns btf_id and btf_obj_fd in lower and upper 32 bits. + * + * long bpf_sys_close(u32 fd) + * Description + * Execute close syscall for given FD. + * Return + * A syscall result. + * + * long bpf_timer_init(struct bpf_timer *timer, struct bpf_map *map, u64 flags) + * Description + * Initialize the timer. + * First 4 bits of *flags* specify clockid. + * Only CLOCK_MONOTONIC, CLOCK_REALTIME, CLOCK_BOOTTIME are allowed. + * All other bits of *flags* are reserved. + * The verifier will reject the program if *timer* is not from + * the same *map*. + * Return + * 0 on success. + * **-EBUSY** if *timer* is already initialized. + * **-EINVAL** if invalid *flags* are passed. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + * + * long bpf_timer_set_callback(struct bpf_timer *timer, void *callback_fn) + * Description + * Configure the timer to call *callback_fn* static function. + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EPERM** if *timer* is in a map that doesn't have any user references. + * The user space should either hold a file descriptor to a map with timers + * or pin such map in bpffs. When map is unpinned or file descriptor is + * closed all timers in the map will be cancelled and freed. + * + * long bpf_timer_start(struct bpf_timer *timer, u64 nsecs, u64 flags) + * Description + * Set timer expiration N nanoseconds from the current time. The + * configured callback will be invoked in soft irq context on some cpu + * and will not repeat unless another bpf_timer_start() is made. + * In such case the next invocation can migrate to a different cpu. + * Since struct bpf_timer is a field inside map element the map + * owns the timer. The bpf_timer_set_callback() will increment refcnt + * of BPF program to make sure that callback_fn code stays valid. + * When user space reference to a map reaches zero all timers + * in a map are cancelled and corresponding program's refcnts are + * decremented. This is done to make sure that Ctrl-C of a user + * process doesn't leave any timers running. If map is pinned in + * bpffs the callback_fn can re-arm itself indefinitely. + * bpf_map_update/delete_elem() helpers and user space sys_bpf commands + * cancel and free the timer in the given map element. + * The map can contain timers that invoke callback_fn-s from different + * programs. The same callback_fn can serve different timers from + * different maps if key/value layout matches across maps. + * Every bpf_timer_set_callback() can have different callback_fn. + * + * *flags* can be one of: + * + * **BPF_F_TIMER_ABS** + * Start the timer in absolute expire value instead of the + * default relative one. + * **BPF_F_TIMER_CPU_PIN** + * Timer will be pinned to the CPU of the caller. + * + * Return + * 0 on success. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier + * or invalid *flags* are passed. + * + * long bpf_timer_cancel(struct bpf_timer *timer) + * Description + * Cancel the timer and wait for callback_fn to finish if it was running. + * Return + * 0 if the timer was not active. + * 1 if the timer was active. + * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier. + * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its + * own timer which would have led to a deadlock otherwise. + * + * u64 bpf_get_func_ip(void *ctx) + * Description + * Get address of the traced function (for tracing and kprobe programs). + * + * When called for kprobe program attached as uprobe it returns + * probe address for both entry and return uprobe. + * + * Return + * Address of the traced function for kprobe. + * 0 for kprobes placed within the function (not at the entry). + * Address of the probe for uprobe and return uprobe. + * + * u64 bpf_get_attach_cookie(void *ctx) + * Description + * Get bpf_cookie value provided (optionally) during the program + * attachment. It might be different for each individual + * attachment, even if BPF program itself is the same. + * Expects BPF program context *ctx* as a first argument. + * + * Supported for the following program types: + * - kprobe/uprobe; + * - tracepoint; + * - perf_event. + * Return + * Value specified by user at BPF link creation/attachment time + * or 0, if it was not specified. + * + * long bpf_task_pt_regs(struct task_struct *task) + * Description + * Get the struct pt_regs associated with **task**. + * Return + * A pointer to struct pt_regs. + * + * long bpf_get_branch_snapshot(void *entries, u32 size, u64 flags) + * Description + * Get branch trace from hardware engines like Intel LBR. The + * hardware engine is stopped shortly after the helper is + * called. Therefore, the user need to filter branch entries + * based on the actual use case. To capture branch trace + * before the trigger point of the BPF program, the helper + * should be called at the beginning of the BPF program. + * + * The data is stored as struct perf_branch_entry into output + * buffer *entries*. *size* is the size of *entries* in bytes. + * *flags* is reserved for now and must be zero. + * + * Return + * On success, number of bytes written to *buf*. On error, a + * negative value. + * + * **-EINVAL** if *flags* is not zero. + * + * **-ENOENT** if architecture does not support branch records. + * + * long bpf_trace_vprintk(const char *fmt, u32 fmt_size, const void *data, u32 data_len) + * Description + * Behaves like **bpf_trace_printk**\ () helper, but takes an array of u64 + * to format and can handle more format args as a result. + * + * Arguments are to be used as in **bpf_seq_printf**\ () helper. + * Return + * The number of bytes written to the buffer, or a negative error + * in case of failure. + * + * struct unix_sock *bpf_skc_to_unix_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *unix_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * long bpf_kallsyms_lookup_name(const char *name, int name_sz, int flags, u64 *res) + * Description + * Get the address of a kernel symbol, returned in *res*. *res* is + * set to 0 if the symbol is not found. + * Return + * On success, zero. On error, a negative value. + * + * **-EINVAL** if *flags* is not zero. + * + * **-EINVAL** if string *name* is not the same size as *name_sz*. + * + * **-ENOENT** if symbol is not found. + * + * **-EPERM** if caller does not have permission to obtain kernel address. + * + * long bpf_find_vma(struct task_struct *task, u64 addr, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * Find vma of *task* that contains *addr*, call *callback_fn* + * function with *task*, *vma*, and *callback_ctx*. + * The *callback_fn* should be a static function and + * the *callback_ctx* should be a pointer to the stack. + * The *flags* is used to control certain aspects of the helper. + * Currently, the *flags* must be 0. + * + * The expected callback signature is + * + * long (\*callback_fn)(struct task_struct \*task, struct vm_area_struct \*vma, void \*callback_ctx); + * + * Return + * 0 on success. + * **-ENOENT** if *task->mm* is NULL, or no vma contains *addr*. + * **-EBUSY** if failed to try lock mmap_lock. + * **-EINVAL** for invalid **flags**. + * + * long bpf_loop(u32 nr_loops, void *callback_fn, void *callback_ctx, u64 flags) + * Description + * For **nr_loops**, call **callback_fn** function + * with **callback_ctx** as the context parameter. + * The **callback_fn** should be a static function and + * the **callback_ctx** should be a pointer to the stack. + * The **flags** is used to control certain aspects of the helper. + * Currently, the **flags** must be 0. Currently, nr_loops is + * limited to 1 << 23 (~8 million) loops. + * + * long (\*callback_fn)(u32 index, void \*ctx); + * + * where **index** is the current index in the loop. The index + * is zero-indexed. + * + * If **callback_fn** returns 0, the helper will continue to the next + * loop. If return value is 1, the helper will skip the rest of + * the loops and return. Other return values are not used now, + * and will be rejected by the verifier. + * + * Return + * The number of loops performed, **-EINVAL** for invalid **flags**, + * **-E2BIG** if **nr_loops** exceeds the maximum number of loops. + * + * long bpf_strncmp(const char *s1, u32 s1_sz, const char *s2) + * Description + * Do strncmp() between **s1** and **s2**. **s1** doesn't need + * to be null-terminated and **s1_sz** is the maximum storage + * size of **s1**. **s2** must be a read-only string. + * Return + * An integer less than, equal to, or greater than zero + * if the first **s1_sz** bytes of **s1** is found to be + * less than, to match, or be greater than **s2**. + * + * long bpf_get_func_arg(void *ctx, u32 n, u64 *value) + * Description + * Get **n**-th argument register (zero based) of the traced function (for tracing programs) + * returned in **value**. + * + * Return + * 0 on success. + * **-EINVAL** if n >= argument register count of traced function. + * + * long bpf_get_func_ret(void *ctx, u64 *value) + * Description + * Get return value of the traced function (for tracing programs) + * in **value**. + * + * Return + * 0 on success. + * **-EOPNOTSUPP** for tracing programs other than BPF_TRACE_FEXIT or BPF_MODIFY_RETURN. + * + * long bpf_get_func_arg_cnt(void *ctx) + * Description + * Get number of registers of the traced function (for tracing programs) where + * function arguments are stored in these registers. + * + * Return + * The number of argument registers of the traced function. + * + * int bpf_get_retval(void) + * Description + * Get the BPF program's return value that will be returned to the upper layers. + * + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. + * Return + * The BPF program's return value. + * + * int bpf_set_retval(int retval) + * Description + * Set the BPF program's return value that will be returned to the upper layers. + * + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. + * + * Note that there is the following corner case where the program exports an error + * via bpf_set_retval but signals success via 'return 1': + * + * bpf_set_retval(-EPERM); + * return 1; + * + * In this case, the BPF program's return value will use helper's -EPERM. This + * still holds true for cgroup/bind{4,6} which supports extra 'return 3' success case. + * + * Return + * 0 on success, or a negative error in case of failure. + * + * u64 bpf_xdp_get_buff_len(struct xdp_buff *xdp_md) + * Description + * Get the total size of a given xdp buff (linear and paged area) + * Return + * The total size of a given xdp buffer. + * + * long bpf_xdp_load_bytes(struct xdp_buff *xdp_md, u32 offset, void *buf, u32 len) + * Description + * This helper is provided as an easy way to load data from a + * xdp buffer. It can be used to load *len* bytes from *offset* from + * the frame associated to *xdp_md*, into the buffer pointed by + * *buf*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_xdp_store_bytes(struct xdp_buff *xdp_md, u32 offset, void *buf, u32 len) + * Description + * Store *len* bytes from buffer *buf* into the frame + * associated to *xdp_md*, at *offset*. + * Return + * 0 on success, or a negative error in case of failure. + * + * long bpf_copy_from_user_task(void *dst, u32 size, const void *user_ptr, struct task_struct *tsk, u64 flags) + * Description + * Read *size* bytes from user space address *user_ptr* in *tsk*'s + * address space, and stores the data in *dst*. *flags* is not + * used yet and is provided for future extensibility. This helper + * can only be used by sleepable programs. + * Return + * 0 on success, or a negative error in case of failure. On error + * *dst* buffer is zeroed out. + * + * long bpf_skb_set_tstamp(struct sk_buff *skb, u64 tstamp, u32 tstamp_type) + * Description + * Change the __sk_buff->tstamp_type to *tstamp_type* + * and set *tstamp* to the __sk_buff->tstamp together. + * + * If there is no need to change the __sk_buff->tstamp_type, + * the tstamp value can be directly written to __sk_buff->tstamp + * instead. + * + * BPF_SKB_TSTAMP_DELIVERY_MONO is the only tstamp that + * will be kept during bpf_redirect_*(). A non zero + * *tstamp* must be used with the BPF_SKB_TSTAMP_DELIVERY_MONO + * *tstamp_type*. + * + * A BPF_SKB_TSTAMP_UNSPEC *tstamp_type* can only be used + * with a zero *tstamp*. + * + * Only IPv4 and IPv6 skb->protocol are supported. + * + * This function is most useful when it needs to set a + * mono delivery time to __sk_buff->tstamp and then + * bpf_redirect_*() to the egress of an iface. For example, + * changing the (rcv) timestamp in __sk_buff->tstamp at + * ingress to a mono delivery time and then bpf_redirect_*() + * to sch_fq@phy-dev. + * Return + * 0 on success. + * **-EINVAL** for invalid input + * **-EOPNOTSUPP** for unsupported protocol + * + * long bpf_ima_file_hash(struct file *file, void *dst, u32 size) + * Description + * Returns a calculated IMA hash of the *file*. + * If the hash is larger than *size*, then only *size* + * bytes will be copied to *dst* + * Return + * The **hash_algo** is returned on success, + * **-EOPNOTSUPP** if the hash calculation failed or **-EINVAL** if + * invalid arguments are passed. + * + * void *bpf_kptr_xchg(void *map_value, void *ptr) + * Description + * Exchange kptr at pointer *map_value* with *ptr*, and return the + * old value. *ptr* can be NULL, otherwise it must be a referenced + * pointer which will be released when this helper is called. + * Return + * The old value of kptr (which can be NULL). The returned pointer + * if not NULL, is a reference which must be released using its + * corresponding release function, or moved into a BPF map before + * program exit. + * + * void *bpf_map_lookup_percpu_elem(struct bpf_map *map, const void *key, u32 cpu) + * Description + * Perform a lookup in *percpu map* for an entry associated to + * *key* on *cpu*. + * Return + * Map value associated to *key* on *cpu*, or **NULL** if no entry + * was found or *cpu* is invalid. + * + * struct mptcp_sock *bpf_skc_to_mptcp_sock(void *sk) + * Description + * Dynamically cast a *sk* pointer to a *mptcp_sock* pointer. + * Return + * *sk* if casting is valid, or **NULL** otherwise. + * + * long bpf_dynptr_from_mem(void *data, u32 size, u64 flags, struct bpf_dynptr *ptr) + * Description + * Get a dynptr to local memory *data*. + * + * *data* must be a ptr to a map value. + * The maximum *size* supported is DYNPTR_MAX_SIZE. + * *flags* is currently unused. + * Return + * 0 on success, -E2BIG if the size exceeds DYNPTR_MAX_SIZE, + * -EINVAL if flags is not 0. + * + * long bpf_ringbuf_reserve_dynptr(void *ringbuf, u32 size, u64 flags, struct bpf_dynptr *ptr) + * Description + * Reserve *size* bytes of payload in a ring buffer *ringbuf* + * through the dynptr interface. *flags* must be 0. + * + * Please note that a corresponding bpf_ringbuf_submit_dynptr or + * bpf_ringbuf_discard_dynptr must be called on *ptr*, even if the + * reservation fails. This is enforced by the verifier. + * Return + * 0 on success, or a negative error in case of failure. + * + * void bpf_ringbuf_submit_dynptr(struct bpf_dynptr *ptr, u64 flags) + * Description + * Submit reserved ring buffer sample, pointed to by *data*, + * through the dynptr interface. This is a no-op if the dynptr is + * invalid/null. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_submit'. + * Return + * Nothing. Always succeeds. + * + * void bpf_ringbuf_discard_dynptr(struct bpf_dynptr *ptr, u64 flags) + * Description + * Discard reserved ring buffer sample through the dynptr + * interface. This is a no-op if the dynptr is invalid/null. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_discard'. + * Return + * Nothing. Always succeeds. + * + * long bpf_dynptr_read(void *dst, u32 len, const struct bpf_dynptr *src, u32 offset, u64 flags) + * Description + * Read *len* bytes from *src* into *dst*, starting from *offset* + * into *src*. + * *flags* is currently unused. + * Return + * 0 on success, -E2BIG if *offset* + *len* exceeds the length + * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if + * *flags* is not 0. + * + * long bpf_dynptr_write(const struct bpf_dynptr *dst, u32 offset, void *src, u32 len, u64 flags) + * Description + * Write *len* bytes from *src* into *dst*, starting from *offset* + * into *dst*. + * + * *flags* must be 0 except for skb-type dynptrs. + * + * For skb-type dynptrs: + * * All data slices of the dynptr are automatically + * invalidated after **bpf_dynptr_write**\ (). This is + * because writing may pull the skb and change the + * underlying packet buffer. + * + * * For *flags*, please see the flags accepted by + * **bpf_skb_store_bytes**\ (). + * Return + * 0 on success, -E2BIG if *offset* + *len* exceeds the length + * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* + * is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs, + * other errors correspond to errors returned by **bpf_skb_store_bytes**\ (). + * + * void *bpf_dynptr_data(const struct bpf_dynptr *ptr, u32 offset, u32 len) + * Description + * Get a pointer to the underlying dynptr data. + * + * *len* must be a statically known value. The returned data slice + * is invalidated whenever the dynptr is invalidated. + * + * skb and xdp type dynptrs may not use bpf_dynptr_data. They should + * instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr. + * Return + * Pointer to the underlying dynptr data, NULL if the dynptr is + * read-only, if the dynptr is invalid, or if the offset and length + * is out of bounds. + * + * s64 bpf_tcp_raw_gen_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th, u32 th_len) + * Description + * Try to issue a SYN cookie for the packet with corresponding + * IPv4/TCP headers, *iph* and *th*, without depending on a + * listening socket. + * + * *iph* points to the IPv4 header. + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * Return + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if *th_len* is invalid. + * + * s64 bpf_tcp_raw_gen_syncookie_ipv6(struct ipv6hdr *iph, struct tcphdr *th, u32 th_len) + * Description + * Try to issue a SYN cookie for the packet with corresponding + * IPv6/TCP headers, *iph* and *th*, without depending on a + * listening socket. + * + * *iph* points to the IPv6 header. + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * Return + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if *th_len* is invalid. + * + * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + * + * long bpf_tcp_raw_check_syncookie_ipv4(struct iphdr *iph, struct tcphdr *th) + * Description + * Check whether *iph* and *th* contain a valid SYN cookie ACK + * without depending on a listening socket. + * + * *iph* points to the IPv4 header. + * + * *th* points to the TCP header. + * Return + * 0 if *iph* and *th* are a valid SYN cookie ACK. + * + * On failure, the returned value is one of the following: + * + * **-EACCES** if the SYN cookie is not valid. + * + * long bpf_tcp_raw_check_syncookie_ipv6(struct ipv6hdr *iph, struct tcphdr *th) + * Description + * Check whether *iph* and *th* contain a valid SYN cookie ACK + * without depending on a listening socket. + * + * *iph* points to the IPv6 header. + * + * *th* points to the TCP header. + * Return + * 0 if *iph* and *th* are a valid SYN cookie ACK. + * + * On failure, the returned value is one of the following: + * + * **-EACCES** if the SYN cookie is not valid. + * + * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + * + * u64 bpf_ktime_get_tai_ns(void) + * Description + * A nonsettable system-wide clock derived from wall-clock time but + * ignoring leap seconds. This clock does not experience + * discontinuities and backwards jumps caused by NTP inserting leap + * seconds as CLOCK_REALTIME does. + * + * See: **clock_gettime**\ (**CLOCK_TAI**) + * Return + * Current *ktime*. + * + * long bpf_user_ringbuf_drain(struct bpf_map *map, void *callback_fn, void *ctx, u64 flags) + * Description + * Drain samples from the specified user ring buffer, and invoke + * the provided callback for each such sample: + * + * long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx); + * + * If **callback_fn** returns 0, the helper will continue to try + * and drain the next sample, up to a maximum of + * BPF_MAX_USER_RINGBUF_SAMPLES samples. If the return value is 1, + * the helper will skip the rest of the samples and return. Other + * return values are not used now, and will be rejected by the + * verifier. + * Return + * The number of drained samples if no error was encountered while + * draining samples, or 0 if no samples were present in the ring + * buffer. If a user-space producer was epoll-waiting on this map, + * and at least one sample was drained, they will receive an event + * notification notifying them of available space in the ring + * buffer. If the BPF_RB_NO_WAKEUP flag is passed to this + * function, no wakeup notification will be sent. If the + * BPF_RB_FORCE_WAKEUP flag is passed, a wakeup notification will + * be sent even if no sample was drained. + * + * On failure, the returned value is one of the following: + * + * **-EBUSY** if the ring buffer is contended, and another calling + * context was concurrently draining the ring buffer. + * + * **-EINVAL** if user-space is not properly tracking the ring + * buffer due to the producer position not being aligned to 8 + * bytes, a sample not being aligned to 8 bytes, or the producer + * position not matching the advertised length of a sample. + * + * **-E2BIG** if user-space has tried to publish a sample which is + * larger than the size of the ring buffer, or which cannot fit + * within a struct bpf_dynptr. + * + * void *bpf_cgrp_storage_get(struct bpf_map *map, struct cgroup *cgroup, void *value, u64 flags) + * Description + * Get a bpf_local_storage from the *cgroup*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *cgroup* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *cgroup*) except this + * helper enforces the key must be a cgroup struct and the map must also + * be a **BPF_MAP_TYPE_CGRP_STORAGE**. + * + * In reality, the local-storage value is embedded directly inside of the + * *cgroup* object itself, rather than being located in the + * **BPF_MAP_TYPE_CGRP_STORAGE** map. When the local-storage value is + * queried for some *map* on a *cgroup* object, the kernel will perform an + * O(n) iteration over all of the live local-storage values for that + * *cgroup* object until the local-storage value for the *map* is found. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * Return + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + * + * long bpf_cgrp_storage_delete(struct bpf_map *map, struct cgroup *cgroup) + * Description + * Delete a bpf_local_storage from a *cgroup*. + * Return + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + */ +#define ___BPF_FUNC_MAPPER(FN, ctx...) \ + FN(unspec, 0, ##ctx) \ + FN(map_lookup_elem, 1, ##ctx) \ + FN(map_update_elem, 2, ##ctx) \ + FN(map_delete_elem, 3, ##ctx) \ + FN(probe_read, 4, ##ctx) \ + FN(ktime_get_ns, 5, ##ctx) \ + FN(trace_printk, 6, ##ctx) \ + FN(get_prandom_u32, 7, ##ctx) \ + FN(get_smp_processor_id, 8, ##ctx) \ + FN(skb_store_bytes, 9, ##ctx) \ + FN(l3_csum_replace, 10, ##ctx) \ + FN(l4_csum_replace, 11, ##ctx) \ + FN(tail_call, 12, ##ctx) \ + FN(clone_redirect, 13, ##ctx) \ + FN(get_current_pid_tgid, 14, ##ctx) \ + FN(get_current_uid_gid, 15, ##ctx) \ + FN(get_current_comm, 16, ##ctx) \ + FN(get_cgroup_classid, 17, ##ctx) \ + FN(skb_vlan_push, 18, ##ctx) \ + FN(skb_vlan_pop, 19, ##ctx) \ + FN(skb_get_tunnel_key, 20, ##ctx) \ + FN(skb_set_tunnel_key, 21, ##ctx) \ + FN(perf_event_read, 22, ##ctx) \ + FN(redirect, 23, ##ctx) \ + FN(get_route_realm, 24, ##ctx) \ + FN(perf_event_output, 25, ##ctx) \ + FN(skb_load_bytes, 26, ##ctx) \ + FN(get_stackid, 27, ##ctx) \ + FN(csum_diff, 28, ##ctx) \ + FN(skb_get_tunnel_opt, 29, ##ctx) \ + FN(skb_set_tunnel_opt, 30, ##ctx) \ + FN(skb_change_proto, 31, ##ctx) \ + FN(skb_change_type, 32, ##ctx) \ + FN(skb_under_cgroup, 33, ##ctx) \ + FN(get_hash_recalc, 34, ##ctx) \ + FN(get_current_task, 35, ##ctx) \ + FN(probe_write_user, 36, ##ctx) \ + FN(current_task_under_cgroup, 37, ##ctx) \ + FN(skb_change_tail, 38, ##ctx) \ + FN(skb_pull_data, 39, ##ctx) \ + FN(csum_update, 40, ##ctx) \ + FN(set_hash_invalid, 41, ##ctx) \ + FN(get_numa_node_id, 42, ##ctx) \ + FN(skb_change_head, 43, ##ctx) \ + FN(xdp_adjust_head, 44, ##ctx) \ + FN(probe_read_str, 45, ##ctx) \ + FN(get_socket_cookie, 46, ##ctx) \ + FN(get_socket_uid, 47, ##ctx) \ + FN(set_hash, 48, ##ctx) \ + FN(setsockopt, 49, ##ctx) \ + FN(skb_adjust_room, 50, ##ctx) \ + FN(redirect_map, 51, ##ctx) \ + FN(sk_redirect_map, 52, ##ctx) \ + FN(sock_map_update, 53, ##ctx) \ + FN(xdp_adjust_meta, 54, ##ctx) \ + FN(perf_event_read_value, 55, ##ctx) \ + FN(perf_prog_read_value, 56, ##ctx) \ + FN(getsockopt, 57, ##ctx) \ + FN(override_return, 58, ##ctx) \ + FN(sock_ops_cb_flags_set, 59, ##ctx) \ + FN(msg_redirect_map, 60, ##ctx) \ + FN(msg_apply_bytes, 61, ##ctx) \ + FN(msg_cork_bytes, 62, ##ctx) \ + FN(msg_pull_data, 63, ##ctx) \ + FN(bind, 64, ##ctx) \ + FN(xdp_adjust_tail, 65, ##ctx) \ + FN(skb_get_xfrm_state, 66, ##ctx) \ + FN(get_stack, 67, ##ctx) \ + FN(skb_load_bytes_relative, 68, ##ctx) \ + FN(fib_lookup, 69, ##ctx) \ + FN(sock_hash_update, 70, ##ctx) \ + FN(msg_redirect_hash, 71, ##ctx) \ + FN(sk_redirect_hash, 72, ##ctx) \ + FN(lwt_push_encap, 73, ##ctx) \ + FN(lwt_seg6_store_bytes, 74, ##ctx) \ + FN(lwt_seg6_adjust_srh, 75, ##ctx) \ + FN(lwt_seg6_action, 76, ##ctx) \ + FN(rc_repeat, 77, ##ctx) \ + FN(rc_keydown, 78, ##ctx) \ + FN(skb_cgroup_id, 79, ##ctx) \ + FN(get_current_cgroup_id, 80, ##ctx) \ + FN(get_local_storage, 81, ##ctx) \ + FN(sk_select_reuseport, 82, ##ctx) \ + FN(skb_ancestor_cgroup_id, 83, ##ctx) \ + FN(sk_lookup_tcp, 84, ##ctx) \ + FN(sk_lookup_udp, 85, ##ctx) \ + FN(sk_release, 86, ##ctx) \ + FN(map_push_elem, 87, ##ctx) \ + FN(map_pop_elem, 88, ##ctx) \ + FN(map_peek_elem, 89, ##ctx) \ + FN(msg_push_data, 90, ##ctx) \ + FN(msg_pop_data, 91, ##ctx) \ + FN(rc_pointer_rel, 92, ##ctx) \ + FN(spin_lock, 93, ##ctx) \ + FN(spin_unlock, 94, ##ctx) \ + FN(sk_fullsock, 95, ##ctx) \ + FN(tcp_sock, 96, ##ctx) \ + FN(skb_ecn_set_ce, 97, ##ctx) \ + FN(get_listener_sock, 98, ##ctx) \ + FN(skc_lookup_tcp, 99, ##ctx) \ + FN(tcp_check_syncookie, 100, ##ctx) \ + FN(sysctl_get_name, 101, ##ctx) \ + FN(sysctl_get_current_value, 102, ##ctx) \ + FN(sysctl_get_new_value, 103, ##ctx) \ + FN(sysctl_set_new_value, 104, ##ctx) \ + FN(strtol, 105, ##ctx) \ + FN(strtoul, 106, ##ctx) \ + FN(sk_storage_get, 107, ##ctx) \ + FN(sk_storage_delete, 108, ##ctx) \ + FN(send_signal, 109, ##ctx) \ + FN(tcp_gen_syncookie, 110, ##ctx) \ + FN(skb_output, 111, ##ctx) \ + FN(probe_read_user, 112, ##ctx) \ + FN(probe_read_kernel, 113, ##ctx) \ + FN(probe_read_user_str, 114, ##ctx) \ + FN(probe_read_kernel_str, 115, ##ctx) \ + FN(tcp_send_ack, 116, ##ctx) \ + FN(send_signal_thread, 117, ##ctx) \ + FN(jiffies64, 118, ##ctx) \ + FN(read_branch_records, 119, ##ctx) \ + FN(get_ns_current_pid_tgid, 120, ##ctx) \ + FN(xdp_output, 121, ##ctx) \ + FN(get_netns_cookie, 122, ##ctx) \ + FN(get_current_ancestor_cgroup_id, 123, ##ctx) \ + FN(sk_assign, 124, ##ctx) \ + FN(ktime_get_boot_ns, 125, ##ctx) \ + FN(seq_printf, 126, ##ctx) \ + FN(seq_write, 127, ##ctx) \ + FN(sk_cgroup_id, 128, ##ctx) \ + FN(sk_ancestor_cgroup_id, 129, ##ctx) \ + FN(ringbuf_output, 130, ##ctx) \ + FN(ringbuf_reserve, 131, ##ctx) \ + FN(ringbuf_submit, 132, ##ctx) \ + FN(ringbuf_discard, 133, ##ctx) \ + FN(ringbuf_query, 134, ##ctx) \ + FN(csum_level, 135, ##ctx) \ + FN(skc_to_tcp6_sock, 136, ##ctx) \ + FN(skc_to_tcp_sock, 137, ##ctx) \ + FN(skc_to_tcp_timewait_sock, 138, ##ctx) \ + FN(skc_to_tcp_request_sock, 139, ##ctx) \ + FN(skc_to_udp6_sock, 140, ##ctx) \ + FN(get_task_stack, 141, ##ctx) \ + FN(load_hdr_opt, 142, ##ctx) \ + FN(store_hdr_opt, 143, ##ctx) \ + FN(reserve_hdr_opt, 144, ##ctx) \ + FN(inode_storage_get, 145, ##ctx) \ + FN(inode_storage_delete, 146, ##ctx) \ + FN(d_path, 147, ##ctx) \ + FN(copy_from_user, 148, ##ctx) \ + FN(snprintf_btf, 149, ##ctx) \ + FN(seq_printf_btf, 150, ##ctx) \ + FN(skb_cgroup_classid, 151, ##ctx) \ + FN(redirect_neigh, 152, ##ctx) \ + FN(per_cpu_ptr, 153, ##ctx) \ + FN(this_cpu_ptr, 154, ##ctx) \ + FN(redirect_peer, 155, ##ctx) \ + FN(task_storage_get, 156, ##ctx) \ + FN(task_storage_delete, 157, ##ctx) \ + FN(get_current_task_btf, 158, ##ctx) \ + FN(bprm_opts_set, 159, ##ctx) \ + FN(ktime_get_coarse_ns, 160, ##ctx) \ + FN(ima_inode_hash, 161, ##ctx) \ + FN(sock_from_file, 162, ##ctx) \ + FN(check_mtu, 163, ##ctx) \ + FN(for_each_map_elem, 164, ##ctx) \ + FN(snprintf, 165, ##ctx) \ + FN(sys_bpf, 166, ##ctx) \ + FN(btf_find_by_name_kind, 167, ##ctx) \ + FN(sys_close, 168, ##ctx) \ + FN(timer_init, 169, ##ctx) \ + FN(timer_set_callback, 170, ##ctx) \ + FN(timer_start, 171, ##ctx) \ + FN(timer_cancel, 172, ##ctx) \ + FN(get_func_ip, 173, ##ctx) \ + FN(get_attach_cookie, 174, ##ctx) \ + FN(task_pt_regs, 175, ##ctx) \ + FN(get_branch_snapshot, 176, ##ctx) \ + FN(trace_vprintk, 177, ##ctx) \ + FN(skc_to_unix_sock, 178, ##ctx) \ + FN(kallsyms_lookup_name, 179, ##ctx) \ + FN(find_vma, 180, ##ctx) \ + FN(loop, 181, ##ctx) \ + FN(strncmp, 182, ##ctx) \ + FN(get_func_arg, 183, ##ctx) \ + FN(get_func_ret, 184, ##ctx) \ + FN(get_func_arg_cnt, 185, ##ctx) \ + FN(get_retval, 186, ##ctx) \ + FN(set_retval, 187, ##ctx) \ + FN(xdp_get_buff_len, 188, ##ctx) \ + FN(xdp_load_bytes, 189, ##ctx) \ + FN(xdp_store_bytes, 190, ##ctx) \ + FN(copy_from_user_task, 191, ##ctx) \ + FN(skb_set_tstamp, 192, ##ctx) \ + FN(ima_file_hash, 193, ##ctx) \ + FN(kptr_xchg, 194, ##ctx) \ + FN(map_lookup_percpu_elem, 195, ##ctx) \ + FN(skc_to_mptcp_sock, 196, ##ctx) \ + FN(dynptr_from_mem, 197, ##ctx) \ + FN(ringbuf_reserve_dynptr, 198, ##ctx) \ + FN(ringbuf_submit_dynptr, 199, ##ctx) \ + FN(ringbuf_discard_dynptr, 200, ##ctx) \ + FN(dynptr_read, 201, ##ctx) \ + FN(dynptr_write, 202, ##ctx) \ + FN(dynptr_data, 203, ##ctx) \ + FN(tcp_raw_gen_syncookie_ipv4, 204, ##ctx) \ + FN(tcp_raw_gen_syncookie_ipv6, 205, ##ctx) \ + FN(tcp_raw_check_syncookie_ipv4, 206, ##ctx) \ + FN(tcp_raw_check_syncookie_ipv6, 207, ##ctx) \ + FN(ktime_get_tai_ns, 208, ##ctx) \ + FN(user_ringbuf_drain, 209, ##ctx) \ + FN(cgrp_storage_get, 210, ##ctx) \ + FN(cgrp_storage_delete, 211, ##ctx) \ + /* */ + +/* backwards-compatibility macros for users of __BPF_FUNC_MAPPER that don't + * know or care about integer value that is now passed as second argument + */ +#define __BPF_FUNC_MAPPER_APPLY(name, value, FN) FN(name), +#define __BPF_FUNC_MAPPER(FN) ___BPF_FUNC_MAPPER(__BPF_FUNC_MAPPER_APPLY, FN) + +/* integer value in 'imm' field of BPF_CALL instruction selects which helper + * function eBPF program intends to call + */ +#define __BPF_ENUM_FN(x, y) BPF_FUNC_ ## x = y, +enum bpf_func_id { + ___BPF_FUNC_MAPPER(__BPF_ENUM_FN) + __BPF_FUNC_MAX_ID, +}; +#undef __BPF_ENUM_FN + +/* All flags used by eBPF helper functions, placed here. */ + +/* BPF_FUNC_skb_store_bytes flags. */ +enum { + BPF_F_RECOMPUTE_CSUM = (1ULL << 0), + BPF_F_INVALIDATE_HASH = (1ULL << 1), +}; + +/* BPF_FUNC_l3_csum_replace and BPF_FUNC_l4_csum_replace flags. + * First 4 bits are for passing the header field size. + */ +enum { + BPF_F_HDR_FIELD_MASK = 0xfULL, +}; + +/* BPF_FUNC_l4_csum_replace flags. */ +enum { + BPF_F_PSEUDO_HDR = (1ULL << 4), + BPF_F_MARK_MANGLED_0 = (1ULL << 5), + BPF_F_MARK_ENFORCE = (1ULL << 6), +}; + +/* BPF_FUNC_clone_redirect and BPF_FUNC_redirect flags. */ +enum { + BPF_F_INGRESS = (1ULL << 0), +}; + +/* BPF_FUNC_skb_set_tunnel_key and BPF_FUNC_skb_get_tunnel_key flags. */ +enum { + BPF_F_TUNINFO_IPV6 = (1ULL << 0), +}; + +/* flags for both BPF_FUNC_get_stackid and BPF_FUNC_get_stack. */ +enum { + BPF_F_SKIP_FIELD_MASK = 0xffULL, + BPF_F_USER_STACK = (1ULL << 8), +/* flags used by BPF_FUNC_get_stackid only. */ + BPF_F_FAST_STACK_CMP = (1ULL << 9), + BPF_F_REUSE_STACKID = (1ULL << 10), +/* flags used by BPF_FUNC_get_stack only. */ + BPF_F_USER_BUILD_ID = (1ULL << 11), +}; + +/* BPF_FUNC_skb_set_tunnel_key flags. */ +enum { + BPF_F_ZERO_CSUM_TX = (1ULL << 1), + BPF_F_DONT_FRAGMENT = (1ULL << 2), + BPF_F_SEQ_NUMBER = (1ULL << 3), + BPF_F_NO_TUNNEL_KEY = (1ULL << 4), +}; + +/* BPF_FUNC_skb_get_tunnel_key flags. */ +enum { + BPF_F_TUNINFO_FLAGS = (1ULL << 4), +}; + +/* BPF_FUNC_perf_event_output, BPF_FUNC_perf_event_read and + * BPF_FUNC_perf_event_read_value flags. + */ +enum { + BPF_F_INDEX_MASK = 0xffffffffULL, + BPF_F_CURRENT_CPU = BPF_F_INDEX_MASK, +/* BPF_FUNC_perf_event_output for sk_buff input context. */ + BPF_F_CTXLEN_MASK = (0xfffffULL << 32), +}; + +/* Current network namespace */ +enum { + BPF_F_CURRENT_NETNS = (-1L), +}; + +/* BPF_FUNC_csum_level level values. */ +enum { + BPF_CSUM_LEVEL_QUERY, + BPF_CSUM_LEVEL_INC, + BPF_CSUM_LEVEL_DEC, + BPF_CSUM_LEVEL_RESET, +}; + +/* BPF_FUNC_skb_adjust_room flags. */ +enum { + BPF_F_ADJ_ROOM_FIXED_GSO = (1ULL << 0), + BPF_F_ADJ_ROOM_ENCAP_L3_IPV4 = (1ULL << 1), + BPF_F_ADJ_ROOM_ENCAP_L3_IPV6 = (1ULL << 2), + BPF_F_ADJ_ROOM_ENCAP_L4_GRE = (1ULL << 3), + BPF_F_ADJ_ROOM_ENCAP_L4_UDP = (1ULL << 4), + BPF_F_ADJ_ROOM_NO_CSUM_RESET = (1ULL << 5), + BPF_F_ADJ_ROOM_ENCAP_L2_ETH = (1ULL << 6), + BPF_F_ADJ_ROOM_DECAP_L3_IPV4 = (1ULL << 7), + BPF_F_ADJ_ROOM_DECAP_L3_IPV6 = (1ULL << 8), +}; + +enum { + BPF_ADJ_ROOM_ENCAP_L2_MASK = 0xff, + BPF_ADJ_ROOM_ENCAP_L2_SHIFT = 56, +}; + +#define BPF_F_ADJ_ROOM_ENCAP_L2(len) (((__u64)len & \ + BPF_ADJ_ROOM_ENCAP_L2_MASK) \ + << BPF_ADJ_ROOM_ENCAP_L2_SHIFT) + +/* BPF_FUNC_sysctl_get_name flags. */ +enum { + BPF_F_SYSCTL_BASE_NAME = (1ULL << 0), +}; + +/* BPF_FUNC__storage_get flags */ +enum { + BPF_LOCAL_STORAGE_GET_F_CREATE = (1ULL << 0), + /* BPF_SK_STORAGE_GET_F_CREATE is only kept for backward compatibility + * and BPF_LOCAL_STORAGE_GET_F_CREATE must be used instead. + */ + BPF_SK_STORAGE_GET_F_CREATE = BPF_LOCAL_STORAGE_GET_F_CREATE, +}; + +/* BPF_FUNC_read_branch_records flags. */ +enum { + BPF_F_GET_BRANCH_RECORDS_SIZE = (1ULL << 0), +}; + +/* BPF_FUNC_bpf_ringbuf_commit, BPF_FUNC_bpf_ringbuf_discard, and + * BPF_FUNC_bpf_ringbuf_output flags. + */ +enum { + BPF_RB_NO_WAKEUP = (1ULL << 0), + BPF_RB_FORCE_WAKEUP = (1ULL << 1), +}; + +/* BPF_FUNC_bpf_ringbuf_query flags */ +enum { + BPF_RB_AVAIL_DATA = 0, + BPF_RB_RING_SIZE = 1, + BPF_RB_CONS_POS = 2, + BPF_RB_PROD_POS = 3, +}; + +/* BPF ring buffer constants */ +enum { + BPF_RINGBUF_BUSY_BIT = (1U << 31), + BPF_RINGBUF_DISCARD_BIT = (1U << 30), + BPF_RINGBUF_HDR_SZ = 8, +}; + +/* BPF_FUNC_sk_assign flags in bpf_sk_lookup context. */ +enum { + BPF_SK_LOOKUP_F_REPLACE = (1ULL << 0), + BPF_SK_LOOKUP_F_NO_REUSEPORT = (1ULL << 1), +}; + +/* Mode for BPF_FUNC_skb_adjust_room helper. */ +enum bpf_adj_room_mode { + BPF_ADJ_ROOM_NET, + BPF_ADJ_ROOM_MAC, +}; + +/* Mode for BPF_FUNC_skb_load_bytes_relative helper. */ +enum bpf_hdr_start_off { + BPF_HDR_START_MAC, + BPF_HDR_START_NET, +}; + +/* Encapsulation type for BPF_FUNC_lwt_push_encap helper. */ +enum bpf_lwt_encap_mode { + BPF_LWT_ENCAP_SEG6, + BPF_LWT_ENCAP_SEG6_INLINE, + BPF_LWT_ENCAP_IP, +}; + +/* Flags for bpf_bprm_opts_set helper */ +enum { + BPF_F_BPRM_SECUREEXEC = (1ULL << 0), +}; + +/* Flags for bpf_redirect_map helper */ +enum { + BPF_F_BROADCAST = (1ULL << 3), + BPF_F_EXCLUDE_INGRESS = (1ULL << 4), +}; + +#define __bpf_md_ptr(type, name) \ +union { \ + type name; \ + __u64 :64; \ +} __attribute__((aligned(8))) + +/* The enum used in skb->tstamp_type. It specifies the clock type + * of the time stored in the skb->tstamp. + */ +enum { + BPF_SKB_TSTAMP_UNSPEC = 0, /* DEPRECATED */ + BPF_SKB_TSTAMP_DELIVERY_MONO = 1, /* DEPRECATED */ + BPF_SKB_CLOCK_REALTIME = 0, + BPF_SKB_CLOCK_MONOTONIC = 1, + BPF_SKB_CLOCK_TAI = 2, + /* For any future BPF_SKB_CLOCK_* that the bpf prog cannot handle, + * the bpf prog can try to deduce it by ingress/egress/skb->sk->sk_clockid. + */ +}; + +/* user accessible mirror of in-kernel sk_buff. + * new fields can only be added to the end of this structure + */ +struct __sk_buff { + __u32 len; + __u32 pkt_type; + __u32 mark; + __u32 queue_mapping; + __u32 protocol; + __u32 vlan_present; + __u32 vlan_tci; + __u32 vlan_proto; + __u32 priority; + __u32 ingress_ifindex; + __u32 ifindex; + __u32 tc_index; + __u32 cb[5]; + __u32 hash; + __u32 tc_classid; + __u32 data; + __u32 data_end; + __u32 napi_id; + + /* Accessed by BPF_PROG_TYPE_sk_skb types from here to ... */ + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ + /* ... here. */ + + __u32 data_meta; + __bpf_md_ptr(struct bpf_flow_keys *, flow_keys); + __u64 tstamp; + __u32 wire_len; + __u32 gso_segs; + __bpf_md_ptr(struct bpf_sock *, sk); + __u32 gso_size; + __u8 tstamp_type; + __u32 :24; /* Padding, future use. */ + __u64 hwtstamp; +}; + +struct bpf_tunnel_key { + __u32 tunnel_id; + union { + __u32 remote_ipv4; + __u32 remote_ipv6[4]; + }; + __u8 tunnel_tos; + __u8 tunnel_ttl; + union { + __u16 tunnel_ext; /* compat */ + __be16 tunnel_flags; + }; + __u32 tunnel_label; + union { + __u32 local_ipv4; + __u32 local_ipv6[4]; + }; +}; + +/* user accessible mirror of in-kernel xfrm_state. + * new fields can only be added to the end of this structure + */ +struct bpf_xfrm_state { + __u32 reqid; + __u32 spi; /* Stored in network byte order */ + __u16 family; + __u16 ext; /* Padding, future use. */ + union { + __u32 remote_ipv4; /* Stored in network byte order */ + __u32 remote_ipv6[4]; /* Stored in network byte order */ + }; +}; + +/* Generic BPF return codes which all BPF program types may support. + * The values are binary compatible with their TC_ACT_* counter-part to + * provide backwards compatibility with existing SCHED_CLS and SCHED_ACT + * programs. + * + * XDP is handled seprately, see XDP_*. + */ +enum bpf_ret_code { + BPF_OK = 0, + /* 1 reserved */ + BPF_DROP = 2, + /* 3-6 reserved */ + BPF_REDIRECT = 7, + /* >127 are reserved for prog type specific return codes. + * + * BPF_LWT_REROUTE: used by BPF_PROG_TYPE_LWT_IN and + * BPF_PROG_TYPE_LWT_XMIT to indicate that skb had been + * changed and should be routed based on its new L3 header. + * (This is an L3 redirect, as opposed to L2 redirect + * represented by BPF_REDIRECT above). + */ + BPF_LWT_REROUTE = 128, + /* BPF_FLOW_DISSECTOR_CONTINUE: used by BPF_PROG_TYPE_FLOW_DISSECTOR + * to indicate that no custom dissection was performed, and + * fallback to standard dissector is requested. + */ + BPF_FLOW_DISSECTOR_CONTINUE = 129, +}; + +struct bpf_sock { + __u32 bound_dev_if; + __u32 family; + __u32 type; + __u32 protocol; + __u32 mark; + __u32 priority; + /* IP address also allows 1 and 2 bytes access */ + __u32 src_ip4; + __u32 src_ip6[4]; + __u32 src_port; /* host byte order */ + __be16 dst_port; /* network byte order */ + __u16 :16; /* zero padding */ + __u32 dst_ip4; + __u32 dst_ip6[4]; + __u32 state; + __s32 rx_queue_mapping; +}; + +struct bpf_tcp_sock { + __u32 snd_cwnd; /* Sending congestion window */ + __u32 srtt_us; /* smoothed round trip time << 3 in usecs */ + __u32 rtt_min; + __u32 snd_ssthresh; /* Slow start size threshold */ + __u32 rcv_nxt; /* What we want to receive next */ + __u32 snd_nxt; /* Next sequence we send */ + __u32 snd_una; /* First byte we want an ack for */ + __u32 mss_cache; /* Cached effective mss, not including SACKS */ + __u32 ecn_flags; /* ECN status bits. */ + __u32 rate_delivered; /* saved rate sample: packets delivered */ + __u32 rate_interval_us; /* saved rate sample: time elapsed */ + __u32 packets_out; /* Packets which are "in flight" */ + __u32 retrans_out; /* Retransmitted packets out */ + __u32 total_retrans; /* Total retransmits for entire connection */ + __u32 segs_in; /* RFC4898 tcpEStatsPerfSegsIn + * total number of segments in. + */ + __u32 data_segs_in; /* RFC4898 tcpEStatsPerfDataSegsIn + * total number of data segments in. + */ + __u32 segs_out; /* RFC4898 tcpEStatsPerfSegsOut + * The total number of segments sent. + */ + __u32 data_segs_out; /* RFC4898 tcpEStatsPerfDataSegsOut + * total number of data segments sent. + */ + __u32 lost_out; /* Lost packets */ + __u32 sacked_out; /* SACK'd packets */ + __u64 bytes_received; /* RFC4898 tcpEStatsAppHCThruOctetsReceived + * sum(delta(rcv_nxt)), or how many bytes + * were acked. + */ + __u64 bytes_acked; /* RFC4898 tcpEStatsAppHCThruOctetsAcked + * sum(delta(snd_una)), or how many bytes + * were acked. + */ + __u32 dsack_dups; /* RFC4898 tcpEStatsStackDSACKDups + * total number of DSACK blocks received + */ + __u32 delivered; /* Total data packets delivered incl. rexmits */ + __u32 delivered_ce; /* Like the above but only ECE marked packets */ + __u32 icsk_retransmits; /* Number of unrecovered [RTO] timeouts */ +}; + +struct bpf_sock_tuple { + union { + struct { + __be32 saddr; + __be32 daddr; + __be16 sport; + __be16 dport; + } ipv4; + struct { + __be32 saddr[4]; + __be32 daddr[4]; + __be16 sport; + __be16 dport; + } ipv6; + }; +}; + +/* (Simplified) user return codes for tcx prog type. + * A valid tcx program must return one of these defined values. All other + * return codes are reserved for future use. Must remain compatible with + * their TC_ACT_* counter-parts. For compatibility in behavior, unknown + * return codes are mapped to TCX_NEXT. + */ +enum tcx_action_base { + TCX_NEXT = -1, + TCX_PASS = 0, + TCX_DROP = 2, + TCX_REDIRECT = 7, +}; + +struct bpf_xdp_sock { + __u32 queue_id; +}; + +#define XDP_PACKET_HEADROOM 256 + +/* User return codes for XDP prog type. + * A valid XDP program must return one of these defined values. All other + * return codes are reserved for future use. Unknown return codes will + * result in packet drops and a warning via bpf_warn_invalid_xdp_action(). + */ +enum xdp_action { + XDP_ABORTED = 0, + XDP_DROP, + XDP_PASS, + XDP_TX, + XDP_REDIRECT, +}; + +/* user accessible metadata for XDP packet hook + * new fields must be added to the end of this structure + */ +struct xdp_md { + __u32 data; + __u32 data_end; + __u32 data_meta; + /* Below access go through struct xdp_rxq_info */ + __u32 ingress_ifindex; /* rxq->dev->ifindex */ + __u32 rx_queue_index; /* rxq->queue_index */ + + __u32 egress_ifindex; /* txq->dev->ifindex */ +}; + +/* DEVMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. + */ +struct bpf_devmap_val { + __u32 ifindex; /* device index */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; +}; + +/* CPUMAP map-value layout + * + * The struct data-layout of map-value is a configuration interface. + * New members can only be added to the end of this structure. + */ +struct bpf_cpumap_val { + __u32 qsize; /* queue size to remote target CPU */ + union { + int fd; /* prog fd on map write */ + __u32 id; /* prog id on map read */ + } bpf_prog; +}; + +enum sk_action { + SK_DROP = 0, + SK_PASS, +}; + +/* user accessible metadata for SK_MSG packet hook, new fields must + * be added to the end of this structure + */ +struct sk_msg_md { + __bpf_md_ptr(void *, data); + __bpf_md_ptr(void *, data_end); + + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ + __u32 size; /* Total size of sk_msg */ + + __bpf_md_ptr(struct bpf_sock *, sk); /* current socket */ +}; + +struct sk_reuseport_md { + /* + * Start of directly accessible data. It begins from + * the tcp/udp header. + */ + __bpf_md_ptr(void *, data); + /* End of directly accessible data */ + __bpf_md_ptr(void *, data_end); + /* + * Total length of packet (starting from the tcp/udp header). + * Note that the directly accessible bytes (data_end - data) + * could be less than this "len". Those bytes could be + * indirectly read by a helper "bpf_skb_load_bytes()". + */ + __u32 len; + /* + * Eth protocol in the mac header (network byte order). e.g. + * ETH_P_IP(0x0800) and ETH_P_IPV6(0x86DD) + */ + __u32 eth_protocol; + __u32 ip_protocol; /* IP protocol. e.g. IPPROTO_TCP, IPPROTO_UDP */ + __u32 bind_inany; /* Is sock bound to an INANY address? */ + __u32 hash; /* A hash of the packet 4 tuples */ + /* When reuse->migrating_sk is NULL, it is selecting a sk for the + * new incoming connection request (e.g. selecting a listen sk for + * the received SYN in the TCP case). reuse->sk is one of the sk + * in the reuseport group. The bpf prog can use reuse->sk to learn + * the local listening ip/port without looking into the skb. + * + * When reuse->migrating_sk is not NULL, reuse->sk is closed and + * reuse->migrating_sk is the socket that needs to be migrated + * to another listening socket. migrating_sk could be a fullsock + * sk that is fully established or a reqsk that is in-the-middle + * of 3-way handshake. + */ + __bpf_md_ptr(struct bpf_sock *, sk); + __bpf_md_ptr(struct bpf_sock *, migrating_sk); +}; + +#define BPF_TAG_SIZE 8 + +struct bpf_prog_info { + __u32 type; + __u32 id; + __u8 tag[BPF_TAG_SIZE]; + __u32 jited_prog_len; + __u32 xlated_prog_len; + __aligned_u64 jited_prog_insns; + __aligned_u64 xlated_prog_insns; + __u64 load_time; /* ns since boottime */ + __u32 created_by_uid; + __u32 nr_map_ids; + __aligned_u64 map_ids; + char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u32 gpl_compatible:1; + __u32 :31; /* alignment pad */ + __u64 netns_dev; + __u64 netns_ino; + __u32 nr_jited_ksyms; + __u32 nr_jited_func_lens; + __aligned_u64 jited_ksyms; + __aligned_u64 jited_func_lens; + __u32 btf_id; + __u32 func_info_rec_size; + __aligned_u64 func_info; + __u32 nr_func_info; + __u32 nr_line_info; + __aligned_u64 line_info; + __aligned_u64 jited_line_info; + __u32 nr_jited_line_info; + __u32 line_info_rec_size; + __u32 jited_line_info_rec_size; + __u32 nr_prog_tags; + __aligned_u64 prog_tags; + __u64 run_time_ns; + __u64 run_cnt; + __u64 recursion_misses; + __u32 verified_insns; + __u32 attach_btf_obj_id; + __u32 attach_btf_id; +} __attribute__((aligned(8))); + +struct bpf_map_info { + __u32 type; + __u32 id; + __u32 key_size; + __u32 value_size; + __u32 max_entries; + __u32 map_flags; + char name[BPF_OBJ_NAME_LEN]; + __u32 ifindex; + __u32 btf_vmlinux_value_type_id; + __u64 netns_dev; + __u64 netns_ino; + __u32 btf_id; + __u32 btf_key_type_id; + __u32 btf_value_type_id; + __u32 btf_vmlinux_id; + __u64 map_extra; +} __attribute__((aligned(8))); + +struct bpf_btf_info { + __aligned_u64 btf; + __u32 btf_size; + __u32 id; + __aligned_u64 name; + __u32 name_len; + __u32 kernel_btf; +} __attribute__((aligned(8))); + +struct bpf_link_info { + __u32 type; + __u32 id; + __u32 prog_id; + union { + struct { + __aligned_u64 tp_name; /* in/out: tp_name buffer ptr */ + __u32 tp_name_len; /* in/out: tp_name buffer len */ + } raw_tracepoint; + struct { + __u32 attach_type; + __u32 target_obj_id; /* prog_id for PROG_EXT, otherwise btf object id */ + __u32 target_btf_id; /* BTF type id inside the object */ + } tracing; + struct { + __u64 cgroup_id; + __u32 attach_type; + } cgroup; + struct { + __aligned_u64 target_name; /* in/out: target_name buffer ptr */ + __u32 target_name_len; /* in/out: target_name buffer len */ + + /* If the iter specific field is 32 bits, it can be put + * in the first or second union. Otherwise it should be + * put in the second union. + */ + union { + struct { + __u32 map_id; + } map; + }; + union { + struct { + __u64 cgroup_id; + __u32 order; + } cgroup; + struct { + __u32 tid; + __u32 pid; + } task; + }; + } iter; + struct { + __u32 netns_ino; + __u32 attach_type; + } netns; + struct { + __u32 ifindex; + } xdp; + struct { + __u32 map_id; + } struct_ops; + struct { + __u32 pf; + __u32 hooknum; + __s32 priority; + __u32 flags; + } netfilter; + struct { + __aligned_u64 addrs; + __u32 count; /* in/out: kprobe_multi function count */ + __u32 flags; + __u64 missed; + __aligned_u64 cookies; + } kprobe_multi; + struct { + __aligned_u64 path; + __aligned_u64 offsets; + __aligned_u64 ref_ctr_offsets; + __aligned_u64 cookies; + __u32 path_size; /* in/out: real path size on success, including zero byte */ + __u32 count; /* in/out: uprobe_multi offsets/ref_ctr_offsets/cookies count */ + __u32 flags; + __u32 pid; + } uprobe_multi; + struct { + __u32 type; /* enum bpf_perf_event_type */ + __u32 :32; + union { + struct { + __aligned_u64 file_name; /* in/out */ + __u32 name_len; + __u32 offset; /* offset from file_name */ + __u64 cookie; + } uprobe; /* BPF_PERF_EVENT_UPROBE, BPF_PERF_EVENT_URETPROBE */ + struct { + __aligned_u64 func_name; /* in/out */ + __u32 name_len; + __u32 offset; /* offset from func_name */ + __u64 addr; + __u64 missed; + __u64 cookie; + } kprobe; /* BPF_PERF_EVENT_KPROBE, BPF_PERF_EVENT_KRETPROBE */ + struct { + __aligned_u64 tp_name; /* in/out */ + __u32 name_len; + __u32 :32; + __u64 cookie; + } tracepoint; /* BPF_PERF_EVENT_TRACEPOINT */ + struct { + __u64 config; + __u32 type; + __u32 :32; + __u64 cookie; + } event; /* BPF_PERF_EVENT_EVENT */ + }; + } perf_event; + struct { + __u32 ifindex; + __u32 attach_type; + } tcx; + struct { + __u32 ifindex; + __u32 attach_type; + } netkit; + struct { + __u32 map_id; + __u32 attach_type; + } sockmap; + }; +} __attribute__((aligned(8))); + +/* User bpf_sock_addr struct to access socket fields and sockaddr struct passed + * by user and intended to be used by socket (e.g. to bind to, depends on + * attach type). + */ +struct bpf_sock_addr { + __u32 user_family; /* Allows 4-byte read, but no write. */ + __u32 user_ip4; /* Allows 1,2,4-byte read and 4-byte write. + * Stored in network byte order. + */ + __u32 user_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. + * Stored in network byte order. + */ + __u32 user_port; /* Allows 1,2,4-byte read and 4-byte write. + * Stored in network byte order + */ + __u32 family; /* Allows 4-byte read, but no write */ + __u32 type; /* Allows 4-byte read, but no write */ + __u32 protocol; /* Allows 4-byte read, but no write */ + __u32 msg_src_ip4; /* Allows 1,2,4-byte read and 4-byte write. + * Stored in network byte order. + */ + __u32 msg_src_ip6[4]; /* Allows 1,2,4,8-byte read and 4,8-byte write. + * Stored in network byte order. + */ + __bpf_md_ptr(struct bpf_sock *, sk); +}; + +/* User bpf_sock_ops struct to access socket values and specify request ops + * and their replies. + * Some of this fields are in network (bigendian) byte order and may need + * to be converted before use (bpf_ntohl() defined in samples/bpf/bpf_endian.h). + * New fields can only be added at the end of this structure + */ +struct bpf_sock_ops { + __u32 op; + union { + __u32 args[4]; /* Optionally passed to bpf program */ + __u32 reply; /* Returned by bpf program */ + __u32 replylong[4]; /* Optionally returned by bpf prog */ + }; + __u32 family; + __u32 remote_ip4; /* Stored in network byte order */ + __u32 local_ip4; /* Stored in network byte order */ + __u32 remote_ip6[4]; /* Stored in network byte order */ + __u32 local_ip6[4]; /* Stored in network byte order */ + __u32 remote_port; /* Stored in network byte order */ + __u32 local_port; /* stored in host byte order */ + __u32 is_fullsock; /* Some TCP fields are only valid if + * there is a full socket. If not, the + * fields read as zero. + */ + __u32 snd_cwnd; + __u32 srtt_us; /* Averaged RTT << 3 in usecs */ + __u32 bpf_sock_ops_cb_flags; /* flags defined in uapi/linux/tcp.h */ + __u32 state; + __u32 rtt_min; + __u32 snd_ssthresh; + __u32 rcv_nxt; + __u32 snd_nxt; + __u32 snd_una; + __u32 mss_cache; + __u32 ecn_flags; + __u32 rate_delivered; + __u32 rate_interval_us; + __u32 packets_out; + __u32 retrans_out; + __u32 total_retrans; + __u32 segs_in; + __u32 data_segs_in; + __u32 segs_out; + __u32 data_segs_out; + __u32 lost_out; + __u32 sacked_out; + __u32 sk_txhash; + __u64 bytes_received; + __u64 bytes_acked; + __bpf_md_ptr(struct bpf_sock *, sk); + /* [skb_data, skb_data_end) covers the whole TCP header. + * + * BPF_SOCK_OPS_PARSE_HDR_OPT_CB: The packet received + * BPF_SOCK_OPS_HDR_OPT_LEN_CB: Not useful because the + * header has not been written. + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB: The header and options have + * been written so far. + * BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB: The SYNACK that concludes + * the 3WHS. + * BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB: The ACK that concludes + * the 3WHS. + * + * bpf_load_hdr_opt() can also be used to read a particular option. + */ + __bpf_md_ptr(void *, skb_data); + __bpf_md_ptr(void *, skb_data_end); + __u32 skb_len; /* The total length of a packet. + * It includes the header, options, + * and payload. + */ + __u32 skb_tcp_flags; /* tcp_flags of the header. It provides + * an easy way to check for tcp_flags + * without parsing skb_data. + * + * In particular, the skb_tcp_flags + * will still be available in + * BPF_SOCK_OPS_HDR_OPT_LEN even though + * the outgoing header has not + * been written yet. + */ + __u64 skb_hwtstamp; +}; + +/* Definitions for bpf_sock_ops_cb_flags */ +enum { + BPF_SOCK_OPS_RTO_CB_FLAG = (1<<0), + BPF_SOCK_OPS_RETRANS_CB_FLAG = (1<<1), + BPF_SOCK_OPS_STATE_CB_FLAG = (1<<2), + BPF_SOCK_OPS_RTT_CB_FLAG = (1<<3), + /* Call bpf for all received TCP headers. The bpf prog will be + * called under sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + * + * It could be used at the client/active side (i.e. connect() side) + * when the server told it that the server was in syncookie + * mode and required the active side to resend the bpf-written + * options. The active side can keep writing the bpf-options until + * it received a valid packet from the server side to confirm + * the earlier packet (and options) has been received. The later + * example patch is using it like this at the active side when the + * server is in syncookie mode. + * + * The bpf prog will usually turn this off in the common cases. + */ + BPF_SOCK_OPS_PARSE_ALL_HDR_OPT_CB_FLAG = (1<<4), + /* Call bpf when kernel has received a header option that + * the kernel cannot handle. The bpf prog will be called under + * sock_ops->op == BPF_SOCK_OPS_PARSE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_PARSE_HDR_OPT_CB + * for the header option related helpers that will be useful + * to the bpf programs. + */ + BPF_SOCK_OPS_PARSE_UNKNOWN_HDR_OPT_CB_FLAG = (1<<5), + /* Call bpf when the kernel is writing header options for the + * outgoing packet. The bpf prog will first be called + * to reserve space in a skb under + * sock_ops->op == BPF_SOCK_OPS_HDR_OPT_LEN_CB. Then + * the bpf prog will be called to write the header option(s) + * under sock_ops->op == BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * + * Please refer to the comment in BPF_SOCK_OPS_HDR_OPT_LEN_CB + * and BPF_SOCK_OPS_WRITE_HDR_OPT_CB for the header option + * related helpers that will be useful to the bpf programs. + * + * The kernel gets its chance to reserve space and write + * options first before the BPF program does. + */ + BPF_SOCK_OPS_WRITE_HDR_OPT_CB_FLAG = (1<<6), +/* Mask of all currently supported cb flags */ + BPF_SOCK_OPS_ALL_CB_FLAGS = 0x7F, +}; + +/* List of known BPF sock_ops operators. + * New entries can only be added at the end + */ +enum { + BPF_SOCK_OPS_VOID, + BPF_SOCK_OPS_TIMEOUT_INIT, /* Should return SYN-RTO value to use or + * -1 if default value should be used + */ + BPF_SOCK_OPS_RWND_INIT, /* Should return initial advertized + * window (in packets) or -1 if default + * value should be used + */ + BPF_SOCK_OPS_TCP_CONNECT_CB, /* Calls BPF program right before an + * active connection is initialized + */ + BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB, /* Calls BPF program when an + * active connection is + * established + */ + BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB, /* Calls BPF program when a + * passive connection is + * established + */ + BPF_SOCK_OPS_NEEDS_ECN, /* If connection's congestion control + * needs ECN + */ + BPF_SOCK_OPS_BASE_RTT, /* Get base RTT. The correct value is + * based on the path and may be + * dependent on the congestion control + * algorithm. In general it indicates + * a congestion threshold. RTTs above + * this indicate congestion + */ + BPF_SOCK_OPS_RTO_CB, /* Called when an RTO has triggered. + * Arg1: value of icsk_retransmits + * Arg2: value of icsk_rto + * Arg3: whether RTO has expired + */ + BPF_SOCK_OPS_RETRANS_CB, /* Called when skb is retransmitted. + * Arg1: sequence number of 1st byte + * Arg2: # segments + * Arg3: return value of + * tcp_transmit_skb (0 => success) + */ + BPF_SOCK_OPS_STATE_CB, /* Called when TCP changes state. + * Arg1: old_state + * Arg2: new_state + */ + BPF_SOCK_OPS_TCP_LISTEN_CB, /* Called on listen(2), right after + * socket transition to LISTEN state. + */ + BPF_SOCK_OPS_RTT_CB, /* Called on every RTT. + * Arg1: measured RTT input (mrtt) + * Arg2: updated srtt + */ + BPF_SOCK_OPS_PARSE_HDR_OPT_CB, /* Parse the header option. + * It will be called to handle + * the packets received at + * an already established + * connection. + * + * sock_ops->skb_data: + * Referring to the received skb. + * It covers the TCP header only. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option. + */ + BPF_SOCK_OPS_HDR_OPT_LEN_CB, /* Reserve space for writing the + * header option later in + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Not available because no header has + * been written yet. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the + * outgoing skb. (e.g. SYN, ACK, FIN). + * + * bpf_reserve_hdr_opt() should + * be used to reserve space. + */ + BPF_SOCK_OPS_WRITE_HDR_OPT_CB, /* Write the header options + * Arg1: bool want_cookie. (in + * writing SYNACK only) + * + * sock_ops->skb_data: + * Referring to the outgoing skb. + * It covers the TCP header + * that has already been written + * by the kernel and the + * earlier bpf-progs. + * + * sock_ops->skb_tcp_flags: + * The tcp_flags of the outgoing + * skb. (e.g. SYN, ACK, FIN). + * + * bpf_store_hdr_opt() should + * be used to write the + * option. + * + * bpf_load_hdr_opt() can also + * be used to search for a + * particular option that + * has already been written + * by the kernel or the + * earlier bpf-progs. + */ +}; + +/* List of TCP states. There is a build check in net/ipv4/tcp.c to detect + * changes between the TCP and BPF versions. Ideally this should never happen. + * If it does, we need to add code to convert them before calling + * the BPF sock_ops function. + */ +enum { + BPF_TCP_ESTABLISHED = 1, + BPF_TCP_SYN_SENT, + BPF_TCP_SYN_RECV, + BPF_TCP_FIN_WAIT1, + BPF_TCP_FIN_WAIT2, + BPF_TCP_TIME_WAIT, + BPF_TCP_CLOSE, + BPF_TCP_CLOSE_WAIT, + BPF_TCP_LAST_ACK, + BPF_TCP_LISTEN, + BPF_TCP_CLOSING, /* Now a valid state */ + BPF_TCP_NEW_SYN_RECV, + BPF_TCP_BOUND_INACTIVE, + + BPF_TCP_MAX_STATES /* Leave at the end! */ +}; + +enum { + TCP_BPF_IW = 1001, /* Set TCP initial congestion window */ + TCP_BPF_SNDCWND_CLAMP = 1002, /* Set sndcwnd_clamp */ + TCP_BPF_DELACK_MAX = 1003, /* Max delay ack in usecs */ + TCP_BPF_RTO_MIN = 1004, /* Min delay ack in usecs */ + /* Copy the SYN pkt to optval + * + * BPF_PROG_TYPE_SOCK_OPS only. It is similar to the + * bpf_getsockopt(TCP_SAVED_SYN) but it does not limit + * to only getting from the saved_syn. It can either get the + * syn packet from: + * + * 1. the just-received SYN packet (only available when writing the + * SYNACK). It will be useful when it is not necessary to + * save the SYN packet for latter use. It is also the only way + * to get the SYN during syncookie mode because the syn + * packet cannot be saved during syncookie. + * + * OR + * + * 2. the earlier saved syn which was done by + * bpf_setsockopt(TCP_SAVE_SYN). + * + * The bpf_getsockopt(TCP_BPF_SYN*) option will hide where the + * SYN packet is obtained. + * + * If the bpf-prog does not need the IP[46] header, the + * bpf-prog can avoid parsing the IP header by using + * TCP_BPF_SYN. Otherwise, the bpf-prog can get both + * IP[46] and TCP header by using TCP_BPF_SYN_IP. + * + * >0: Total number of bytes copied + * -ENOSPC: Not enough space in optval. Only optlen number of + * bytes is copied. + * -ENOENT: The SYN skb is not available now and the earlier SYN pkt + * is not saved by setsockopt(TCP_SAVE_SYN). + */ + TCP_BPF_SYN = 1005, /* Copy the TCP header */ + TCP_BPF_SYN_IP = 1006, /* Copy the IP[46] and TCP header */ + TCP_BPF_SYN_MAC = 1007, /* Copy the MAC, IP[46], and TCP header */ +}; + +enum { + BPF_LOAD_HDR_OPT_TCP_SYN = (1ULL << 0), +}; + +/* args[0] value during BPF_SOCK_OPS_HDR_OPT_LEN_CB and + * BPF_SOCK_OPS_WRITE_HDR_OPT_CB. + */ +enum { + BPF_WRITE_HDR_TCP_CURRENT_MSS = 1, /* Kernel is finding the + * total option spaces + * required for an established + * sk in order to calculate the + * MSS. No skb is actually + * sent. + */ + BPF_WRITE_HDR_TCP_SYNACK_COOKIE = 2, /* Kernel is in syncookie mode + * when sending a SYN. + */ +}; + +struct bpf_perf_event_value { + __u64 counter; + __u64 enabled; + __u64 running; +}; + +enum { + BPF_DEVCG_ACC_MKNOD = (1ULL << 0), + BPF_DEVCG_ACC_READ = (1ULL << 1), + BPF_DEVCG_ACC_WRITE = (1ULL << 2), +}; + +enum { + BPF_DEVCG_DEV_BLOCK = (1ULL << 0), + BPF_DEVCG_DEV_CHAR = (1ULL << 1), +}; + +struct bpf_cgroup_dev_ctx { + /* access_type encoded as (BPF_DEVCG_ACC_* << 16) | BPF_DEVCG_DEV_* */ + __u32 access_type; + __u32 major; + __u32 minor; +}; + +struct bpf_raw_tracepoint_args { + __u64 args[0]; +}; + +/* DIRECT: Skip the FIB rules and go to FIB table associated with device + * OUTPUT: Do lookup from egress perspective; default is ingress + */ +enum { + BPF_FIB_LOOKUP_DIRECT = (1U << 0), + BPF_FIB_LOOKUP_OUTPUT = (1U << 1), + BPF_FIB_LOOKUP_SKIP_NEIGH = (1U << 2), + BPF_FIB_LOOKUP_TBID = (1U << 3), + BPF_FIB_LOOKUP_SRC = (1U << 4), + BPF_FIB_LOOKUP_MARK = (1U << 5), +}; + +enum { + BPF_FIB_LKUP_RET_SUCCESS, /* lookup successful */ + BPF_FIB_LKUP_RET_BLACKHOLE, /* dest is blackholed; can be dropped */ + BPF_FIB_LKUP_RET_UNREACHABLE, /* dest is unreachable; can be dropped */ + BPF_FIB_LKUP_RET_PROHIBIT, /* dest not allowed; can be dropped */ + BPF_FIB_LKUP_RET_NOT_FWDED, /* packet is not forwarded */ + BPF_FIB_LKUP_RET_FWD_DISABLED, /* fwding is not enabled on ingress */ + BPF_FIB_LKUP_RET_UNSUPP_LWT, /* fwd requires encapsulation */ + BPF_FIB_LKUP_RET_NO_NEIGH, /* no neighbor entry for nh */ + BPF_FIB_LKUP_RET_FRAG_NEEDED, /* fragmentation required to fwd */ + BPF_FIB_LKUP_RET_NO_SRC_ADDR, /* failed to derive IP src addr */ +}; + +struct bpf_fib_lookup { + /* input: network family for lookup (AF_INET, AF_INET6) + * output: network family of egress nexthop + */ + __u8 family; + + /* set if lookup is to consider L4 data - e.g., FIB rules */ + __u8 l4_protocol; + __be16 sport; + __be16 dport; + + union { /* used for MTU check */ + /* input to lookup */ + __u16 tot_len; /* L3 length from network hdr (iph->tot_len) */ + + /* output: MTU value */ + __u16 mtu_result; + } __attribute__((packed, aligned(2))); + /* input: L3 device index for lookup + * output: device index from FIB lookup + */ + __u32 ifindex; + + union { + /* inputs to lookup */ + __u8 tos; /* AF_INET */ + __be32 flowinfo; /* AF_INET6, flow_label + priority */ + + /* output: metric of fib result (IPv4/IPv6 only) */ + __u32 rt_metric; + }; + + /* input: source address to consider for lookup + * output: source address result from lookup + */ + union { + __be32 ipv4_src; + __u32 ipv6_src[4]; /* in6_addr; network order */ + }; + + /* input to bpf_fib_lookup, ipv{4,6}_dst is destination address in + * network header. output: bpf_fib_lookup sets to gateway address + * if FIB lookup returns gateway route + */ + union { + __be32 ipv4_dst; + __u32 ipv6_dst[4]; /* in6_addr; network order */ + }; + + union { + struct { + /* output */ + __be16 h_vlan_proto; + __be16 h_vlan_TCI; + }; + /* input: when accompanied with the + * 'BPF_FIB_LOOKUP_DIRECT | BPF_FIB_LOOKUP_TBID` flags, a + * specific routing table to use for the fib lookup. + */ + __u32 tbid; + }; + + union { + /* input */ + struct { + __u32 mark; /* policy routing */ + /* 2 4-byte holes for input */ + }; + + /* output: source and dest mac */ + struct { + __u8 smac[6]; /* ETH_ALEN */ + __u8 dmac[6]; /* ETH_ALEN */ + }; + }; +}; + +struct bpf_redir_neigh { + /* network family for lookup (AF_INET, AF_INET6) */ + __u32 nh_family; + /* network address of nexthop; skips fib lookup to find gateway */ + union { + __be32 ipv4_nh; + __u32 ipv6_nh[4]; /* in6_addr; network order */ + }; +}; + +/* bpf_check_mtu flags*/ +enum bpf_check_mtu_flags { + BPF_MTU_CHK_SEGS = (1U << 0), +}; + +enum bpf_check_mtu_ret { + BPF_MTU_CHK_RET_SUCCESS, /* check and lookup successful */ + BPF_MTU_CHK_RET_FRAG_NEEDED, /* fragmentation required to fwd */ + BPF_MTU_CHK_RET_SEGS_TOOBIG, /* GSO re-segmentation needed to fwd */ +}; + +enum bpf_task_fd_type { + BPF_FD_TYPE_RAW_TRACEPOINT, /* tp name */ + BPF_FD_TYPE_TRACEPOINT, /* tp name */ + BPF_FD_TYPE_KPROBE, /* (symbol + offset) or addr */ + BPF_FD_TYPE_KRETPROBE, /* (symbol + offset) or addr */ + BPF_FD_TYPE_UPROBE, /* filename + offset */ + BPF_FD_TYPE_URETPROBE, /* filename + offset */ +}; + +enum { + BPF_FLOW_DISSECTOR_F_PARSE_1ST_FRAG = (1U << 0), + BPF_FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL = (1U << 1), + BPF_FLOW_DISSECTOR_F_STOP_AT_ENCAP = (1U << 2), +}; + +struct bpf_flow_keys { + __u16 nhoff; + __u16 thoff; + __u16 addr_proto; /* ETH_P_* of valid addrs */ + __u8 is_frag; + __u8 is_first_frag; + __u8 is_encap; + __u8 ip_proto; + __be16 n_proto; + __be16 sport; + __be16 dport; + union { + struct { + __be32 ipv4_src; + __be32 ipv4_dst; + }; + struct { + __u32 ipv6_src[4]; /* in6_addr; network order */ + __u32 ipv6_dst[4]; /* in6_addr; network order */ + }; + }; + __u32 flags; + __be32 flow_label; +}; + +struct bpf_func_info { + __u32 insn_off; + __u32 type_id; +}; + +#define BPF_LINE_INFO_LINE_NUM(line_col) ((line_col) >> 10) +#define BPF_LINE_INFO_LINE_COL(line_col) ((line_col) & 0x3ff) + +struct bpf_line_info { + __u32 insn_off; + __u32 file_name_off; + __u32 line_off; + __u32 line_col; +}; + +struct bpf_spin_lock { + __u32 val; +}; + +struct bpf_timer { + __u64 __opaque[2]; +} __attribute__((aligned(8))); + +struct bpf_wq { + __u64 __opaque[2]; +} __attribute__((aligned(8))); + +struct bpf_dynptr { + __u64 __opaque[2]; +} __attribute__((aligned(8))); + +struct bpf_list_head { + __u64 __opaque[2]; +} __attribute__((aligned(8))); + +struct bpf_list_node { + __u64 __opaque[3]; +} __attribute__((aligned(8))); + +struct bpf_rb_root { + __u64 __opaque[2]; +} __attribute__((aligned(8))); + +struct bpf_rb_node { + __u64 __opaque[4]; +} __attribute__((aligned(8))); + +struct bpf_refcount { + __u32 __opaque[1]; +} __attribute__((aligned(4))); + +struct bpf_sysctl { + __u32 write; /* Sysctl is being read (= 0) or written (= 1). + * Allows 1,2,4-byte read, but no write. + */ + __u32 file_pos; /* Sysctl file position to read from, write to. + * Allows 1,2,4-byte read an 4-byte write. + */ +}; + +struct bpf_sockopt { + __bpf_md_ptr(struct bpf_sock *, sk); + __bpf_md_ptr(void *, optval); + __bpf_md_ptr(void *, optval_end); + + __s32 level; + __s32 optname; + __s32 optlen; + __s32 retval; +}; + +struct bpf_pidns_info { + __u32 pid; + __u32 tgid; +}; + +/* User accessible data for SK_LOOKUP programs. Add new fields at the end. */ +struct bpf_sk_lookup { + union { + __bpf_md_ptr(struct bpf_sock *, sk); /* Selected socket */ + __u64 cookie; /* Non-zero if socket was selected in PROG_TEST_RUN */ + }; + + __u32 family; /* Protocol family (AF_INET, AF_INET6) */ + __u32 protocol; /* IP protocol (IPPROTO_TCP, IPPROTO_UDP) */ + __u32 remote_ip4; /* Network byte order */ + __u32 remote_ip6[4]; /* Network byte order */ + __be16 remote_port; /* Network byte order */ + __u16 :16; /* Zero padding */ + __u32 local_ip4; /* Network byte order */ + __u32 local_ip6[4]; /* Network byte order */ + __u32 local_port; /* Host byte order */ + __u32 ingress_ifindex; /* The arriving interface. Determined by inet_iif. */ +}; + +/* + * struct btf_ptr is used for typed pointer representation; the + * type id is used to render the pointer data as the appropriate type + * via the bpf_snprintf_btf() helper described above. A flags field - + * potentially to specify additional details about the BTF pointer + * (rather than its mode of display) - is included for future use. + * Display flags - BTF_F_* - are passed to bpf_snprintf_btf separately. + */ +struct btf_ptr { + void *ptr; + __u32 type_id; + __u32 flags; /* BTF ptr flags; unused at present. */ +}; + +/* + * Flags to control bpf_snprintf_btf() behaviour. + * - BTF_F_COMPACT: no formatting around type information + * - BTF_F_NONAME: no struct/union member names/types + * - BTF_F_PTR_RAW: show raw (unobfuscated) pointer values; + * equivalent to %px. + * - BTF_F_ZERO: show zero-valued struct/union members; they + * are not displayed by default + */ +enum { + BTF_F_COMPACT = (1ULL << 0), + BTF_F_NONAME = (1ULL << 1), + BTF_F_PTR_RAW = (1ULL << 2), + BTF_F_ZERO = (1ULL << 3), +}; + +/* bpf_core_relo_kind encodes which aspect of captured field/type/enum value + * has to be adjusted by relocations. It is emitted by llvm and passed to + * libbpf and later to the kernel. + */ +enum bpf_core_relo_kind { + BPF_CORE_FIELD_BYTE_OFFSET = 0, /* field byte offset */ + BPF_CORE_FIELD_BYTE_SIZE = 1, /* field size in bytes */ + BPF_CORE_FIELD_EXISTS = 2, /* field existence in target kernel */ + BPF_CORE_FIELD_SIGNED = 3, /* field signedness (0 - unsigned, 1 - signed) */ + BPF_CORE_FIELD_LSHIFT_U64 = 4, /* bitfield-specific left bitshift */ + BPF_CORE_FIELD_RSHIFT_U64 = 5, /* bitfield-specific right bitshift */ + BPF_CORE_TYPE_ID_LOCAL = 6, /* type ID in local BPF object */ + BPF_CORE_TYPE_ID_TARGET = 7, /* type ID in target kernel */ + BPF_CORE_TYPE_EXISTS = 8, /* type existence in target kernel */ + BPF_CORE_TYPE_SIZE = 9, /* type size in bytes */ + BPF_CORE_ENUMVAL_EXISTS = 10, /* enum value existence in target kernel */ + BPF_CORE_ENUMVAL_VALUE = 11, /* enum value integer value */ + BPF_CORE_TYPE_MATCHES = 12, /* type match in target kernel */ +}; + +/* + * "struct bpf_core_relo" is used to pass relocation data form LLVM to libbpf + * and from libbpf to the kernel. + * + * CO-RE relocation captures the following data: + * - insn_off - instruction offset (in bytes) within a BPF program that needs + * its insn->imm field to be relocated with actual field info; + * - type_id - BTF type ID of the "root" (containing) entity of a relocatable + * type or field; + * - access_str_off - offset into corresponding .BTF string section. String + * interpretation depends on specific relocation kind: + * - for field-based relocations, string encodes an accessed field using + * a sequence of field and array indices, separated by colon (:). It's + * conceptually very close to LLVM's getelementptr ([0]) instruction's + * arguments for identifying offset to a field. + * - for type-based relocations, strings is expected to be just "0"; + * - for enum value-based relocations, string contains an index of enum + * value within its enum type; + * - kind - one of enum bpf_core_relo_kind; + * + * Example: + * struct sample { + * int a; + * struct { + * int b[10]; + * }; + * }; + * + * struct sample *s = ...; + * int *x = &s->a; // encoded as "0:0" (a is field #0) + * int *y = &s->b[5]; // encoded as "0:1:0:5" (anon struct is field #1, + * // b is field #0 inside anon struct, accessing elem #5) + * int *z = &s[10]->b; // encoded as "10:1" (ptr is used as an array) + * + * type_id for all relocs in this example will capture BTF type id of + * `struct sample`. + * + * Such relocation is emitted when using __builtin_preserve_access_index() + * Clang built-in, passing expression that captures field address, e.g.: + * + * bpf_probe_read(&dst, sizeof(dst), + * __builtin_preserve_access_index(&src->a.b.c)); + * + * In this case Clang will emit field relocation recording necessary data to + * be able to find offset of embedded `a.b.c` field within `src` struct. + * + * [0] https://llvm.org/docs/LangRef.html#getelementptr-instruction + */ +struct bpf_core_relo { + __u32 insn_off; + __u32 type_id; + __u32 access_str_off; + enum bpf_core_relo_kind kind; +}; + +/* + * Flags to control bpf_timer_start() behaviour. + * - BPF_F_TIMER_ABS: Timeout passed is absolute time, by default it is + * relative to current time. + * - BPF_F_TIMER_CPU_PIN: Timer will be pinned to the CPU of the caller. + */ +enum { + BPF_F_TIMER_ABS = (1ULL << 0), + BPF_F_TIMER_CPU_PIN = (1ULL << 1), +}; + +/* BPF numbers iterator state */ +struct bpf_iter_num { + /* opaque iterator state; having __u64 here allows to preserve correct + * alignment requirements in vmlinux.h, generated from BTF + */ + __u64 __opaque[1]; +} __attribute__((aligned(8))); + +#endif /* _UAPI__LINUX_BPF_H__ */ diff --git a/pkg/plugin/lib/common/libbpf/_include/uapi/linux/bpf_common.h b/pkg/plugin/lib/common/libbpf/_include/uapi/linux/bpf_common.h new file mode 100644 index 0000000000..ee97668bda --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_include/uapi/linux/bpf_common.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI__LINUX_BPF_COMMON_H__ +#define _UAPI__LINUX_BPF_COMMON_H__ + +/* Instruction classes */ +#define BPF_CLASS(code) ((code) & 0x07) +#define BPF_LD 0x00 +#define BPF_LDX 0x01 +#define BPF_ST 0x02 +#define BPF_STX 0x03 +#define BPF_ALU 0x04 +#define BPF_JMP 0x05 +#define BPF_RET 0x06 +#define BPF_MISC 0x07 + +/* ld/ldx fields */ +#define BPF_SIZE(code) ((code) & 0x18) +#define BPF_W 0x00 /* 32-bit */ +#define BPF_H 0x08 /* 16-bit */ +#define BPF_B 0x10 /* 8-bit */ +/* eBPF BPF_DW 0x18 64-bit */ +#define BPF_MODE(code) ((code) & 0xe0) +#define BPF_IMM 0x00 +#define BPF_ABS 0x20 +#define BPF_IND 0x40 +#define BPF_MEM 0x60 +#define BPF_LEN 0x80 +#define BPF_MSH 0xa0 + +/* alu/jmp fields */ +#define BPF_OP(code) ((code) & 0xf0) +#define BPF_ADD 0x00 +#define BPF_SUB 0x10 +#define BPF_MUL 0x20 +#define BPF_DIV 0x30 +#define BPF_OR 0x40 +#define BPF_AND 0x50 +#define BPF_LSH 0x60 +#define BPF_RSH 0x70 +#define BPF_NEG 0x80 +#define BPF_MOD 0x90 +#define BPF_XOR 0xa0 + +#define BPF_JA 0x00 +#define BPF_JEQ 0x10 +#define BPF_JGT 0x20 +#define BPF_JGE 0x30 +#define BPF_JSET 0x40 +#define BPF_SRC(code) ((code) & 0x08) +#define BPF_K 0x00 +#define BPF_X 0x08 + +#ifndef BPF_MAXINSNS +#define BPF_MAXINSNS 4096 +#endif + +#endif /* _UAPI__LINUX_BPF_COMMON_H__ */ diff --git a/pkg/plugin/lib/common/libbpf/_include/uapi/linux/btf.h b/pkg/plugin/lib/common/libbpf/_include/uapi/linux/btf.h new file mode 100644 index 0000000000..ec1798b6d3 --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_include/uapi/linux/btf.h @@ -0,0 +1,200 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* Copyright (c) 2018 Facebook */ +#ifndef _UAPI__LINUX_BTF_H__ +#define _UAPI__LINUX_BTF_H__ + +#include + +#define BTF_MAGIC 0xeB9F +#define BTF_VERSION 1 + +struct btf_header { + __u16 magic; + __u8 version; + __u8 flags; + __u32 hdr_len; + + /* All offsets are in bytes relative to the end of this header */ + __u32 type_off; /* offset of type section */ + __u32 type_len; /* length of type section */ + __u32 str_off; /* offset of string section */ + __u32 str_len; /* length of string section */ +}; + +/* Max # of type identifier */ +#define BTF_MAX_TYPE 0x000fffff +/* Max offset into the string section */ +#define BTF_MAX_NAME_OFFSET 0x00ffffff +/* Max # of struct/union/enum members or func args */ +#define BTF_MAX_VLEN 0xffff + +struct btf_type { + __u32 name_off; + /* "info" bits arrangement + * bits 0-15: vlen (e.g. # of struct's members) + * bits 16-23: unused + * bits 24-28: kind (e.g. int, ptr, array...etc) + * bits 29-30: unused + * bit 31: kind_flag, currently used by + * struct, union, enum, fwd and enum64 + */ + __u32 info; + /* "size" is used by INT, ENUM, STRUCT, UNION, DATASEC and ENUM64. + * "size" tells the size of the type it is describing. + * + * "type" is used by PTR, TYPEDEF, VOLATILE, CONST, RESTRICT, + * FUNC, FUNC_PROTO, VAR, DECL_TAG and TYPE_TAG. + * "type" is a type_id referring to another type. + */ + union { + __u32 size; + __u32 type; + }; +}; + +#define BTF_INFO_KIND(info) (((info) >> 24) & 0x1f) +#define BTF_INFO_VLEN(info) ((info) & 0xffff) +#define BTF_INFO_KFLAG(info) ((info) >> 31) + +enum { + BTF_KIND_UNKN = 0, /* Unknown */ + BTF_KIND_INT = 1, /* Integer */ + BTF_KIND_PTR = 2, /* Pointer */ + BTF_KIND_ARRAY = 3, /* Array */ + BTF_KIND_STRUCT = 4, /* Struct */ + BTF_KIND_UNION = 5, /* Union */ + BTF_KIND_ENUM = 6, /* Enumeration up to 32-bit values */ + BTF_KIND_FWD = 7, /* Forward */ + BTF_KIND_TYPEDEF = 8, /* Typedef */ + BTF_KIND_VOLATILE = 9, /* Volatile */ + BTF_KIND_CONST = 10, /* Const */ + BTF_KIND_RESTRICT = 11, /* Restrict */ + BTF_KIND_FUNC = 12, /* Function */ + BTF_KIND_FUNC_PROTO = 13, /* Function Proto */ + BTF_KIND_VAR = 14, /* Variable */ + BTF_KIND_DATASEC = 15, /* Section */ + BTF_KIND_FLOAT = 16, /* Floating point */ + BTF_KIND_DECL_TAG = 17, /* Decl Tag */ + BTF_KIND_TYPE_TAG = 18, /* Type Tag */ + BTF_KIND_ENUM64 = 19, /* Enumeration up to 64-bit values */ + + NR_BTF_KINDS, + BTF_KIND_MAX = NR_BTF_KINDS - 1, +}; + +/* For some specific BTF_KIND, "struct btf_type" is immediately + * followed by extra data. + */ + +/* BTF_KIND_INT is followed by a u32 and the following + * is the 32 bits arrangement: + */ +#define BTF_INT_ENCODING(VAL) (((VAL) & 0x0f000000) >> 24) +#define BTF_INT_OFFSET(VAL) (((VAL) & 0x00ff0000) >> 16) +#define BTF_INT_BITS(VAL) ((VAL) & 0x000000ff) + +/* Attributes stored in the BTF_INT_ENCODING */ +#define BTF_INT_SIGNED (1 << 0) +#define BTF_INT_CHAR (1 << 1) +#define BTF_INT_BOOL (1 << 2) + +/* BTF_KIND_ENUM is followed by multiple "struct btf_enum". + * The exact number of btf_enum is stored in the vlen (of the + * info in "struct btf_type"). + */ +struct btf_enum { + __u32 name_off; + __s32 val; +}; + +/* BTF_KIND_ARRAY is followed by one "struct btf_array" */ +struct btf_array { + __u32 type; + __u32 index_type; + __u32 nelems; +}; + +/* BTF_KIND_STRUCT and BTF_KIND_UNION are followed + * by multiple "struct btf_member". The exact number + * of btf_member is stored in the vlen (of the info in + * "struct btf_type"). + */ +struct btf_member { + __u32 name_off; + __u32 type; + /* If the type info kind_flag is set, the btf_member offset + * contains both member bitfield size and bit offset. The + * bitfield size is set for bitfield members. If the type + * info kind_flag is not set, the offset contains only bit + * offset. + */ + __u32 offset; +}; + +/* If the struct/union type info kind_flag is set, the + * following two macros are used to access bitfield_size + * and bit_offset from btf_member.offset. + */ +#define BTF_MEMBER_BITFIELD_SIZE(val) ((val) >> 24) +#define BTF_MEMBER_BIT_OFFSET(val) ((val) & 0xffffff) + +/* BTF_KIND_FUNC_PROTO is followed by multiple "struct btf_param". + * The exact number of btf_param is stored in the vlen (of the + * info in "struct btf_type"). + */ +struct btf_param { + __u32 name_off; + __u32 type; +}; + +enum { + BTF_VAR_STATIC = 0, + BTF_VAR_GLOBAL_ALLOCATED = 1, + BTF_VAR_GLOBAL_EXTERN = 2, +}; + +enum btf_func_linkage { + BTF_FUNC_STATIC = 0, + BTF_FUNC_GLOBAL = 1, + BTF_FUNC_EXTERN = 2, +}; + +/* BTF_KIND_VAR is followed by a single "struct btf_var" to describe + * additional information related to the variable such as its linkage. + */ +struct btf_var { + __u32 linkage; +}; + +/* BTF_KIND_DATASEC is followed by multiple "struct btf_var_secinfo" + * to describe all BTF_KIND_VAR types it contains along with it's + * in-section offset as well as size. + */ +struct btf_var_secinfo { + __u32 type; + __u32 offset; + __u32 size; +}; + +/* BTF_KIND_DECL_TAG is followed by a single "struct btf_decl_tag" to describe + * additional information related to the tag applied location. + * If component_idx == -1, the tag is applied to a struct, union, + * variable or function. Otherwise, it is applied to a struct/union + * member or a func argument, and component_idx indicates which member + * or argument (0 ... vlen-1). + */ +struct btf_decl_tag { + __s32 component_idx; +}; + +/* BTF_KIND_ENUM64 is followed by multiple "struct btf_enum64". + * The exact number of btf_enum64 is stored in the vlen (of the + * info in "struct btf_type"). + */ +struct btf_enum64 { + __u32 name_off; + __u32 val_lo32; + __u32 val_hi32; +}; + +#endif /* _UAPI__LINUX_BTF_H__ */ diff --git a/pkg/plugin/lib/common/libbpf/_include/uapi/linux/if_link.h b/pkg/plugin/lib/common/libbpf/_include/uapi/linux/if_link.h new file mode 100644 index 0000000000..f0d71b2a3f --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_include/uapi/linux/if_link.h @@ -0,0 +1,1426 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI_LINUX_IF_LINK_H +#define _UAPI_LINUX_IF_LINK_H + +#include +#include + +/* This struct should be in sync with struct rtnl_link_stats64 */ +struct rtnl_link_stats { + __u32 rx_packets; + __u32 tx_packets; + __u32 rx_bytes; + __u32 tx_bytes; + __u32 rx_errors; + __u32 tx_errors; + __u32 rx_dropped; + __u32 tx_dropped; + __u32 multicast; + __u32 collisions; + /* detailed rx_errors: */ + __u32 rx_length_errors; + __u32 rx_over_errors; + __u32 rx_crc_errors; + __u32 rx_frame_errors; + __u32 rx_fifo_errors; + __u32 rx_missed_errors; + + /* detailed tx_errors */ + __u32 tx_aborted_errors; + __u32 tx_carrier_errors; + __u32 tx_fifo_errors; + __u32 tx_heartbeat_errors; + __u32 tx_window_errors; + + /* for cslip etc */ + __u32 rx_compressed; + __u32 tx_compressed; + + __u32 rx_nohandler; +}; + +/** + * struct rtnl_link_stats64 - The main device statistics structure. + * + * @rx_packets: Number of good packets received by the interface. + * For hardware interfaces counts all good packets received from the device + * by the host, including packets which host had to drop at various stages + * of processing (even in the driver). + * + * @tx_packets: Number of packets successfully transmitted. + * For hardware interfaces counts packets which host was able to successfully + * hand over to the device, which does not necessarily mean that packets + * had been successfully transmitted out of the device, only that device + * acknowledged it copied them out of host memory. + * + * @rx_bytes: Number of good received bytes, corresponding to @rx_packets. + * + * For IEEE 802.3 devices should count the length of Ethernet Frames + * excluding the FCS. + * + * @tx_bytes: Number of good transmitted bytes, corresponding to @tx_packets. + * + * For IEEE 802.3 devices should count the length of Ethernet Frames + * excluding the FCS. + * + * @rx_errors: Total number of bad packets received on this network device. + * This counter must include events counted by @rx_length_errors, + * @rx_crc_errors, @rx_frame_errors and other errors not otherwise + * counted. + * + * @tx_errors: Total number of transmit problems. + * This counter must include events counter by @tx_aborted_errors, + * @tx_carrier_errors, @tx_fifo_errors, @tx_heartbeat_errors, + * @tx_window_errors and other errors not otherwise counted. + * + * @rx_dropped: Number of packets received but not processed, + * e.g. due to lack of resources or unsupported protocol. + * For hardware interfaces this counter may include packets discarded + * due to L2 address filtering but should not include packets dropped + * by the device due to buffer exhaustion which are counted separately in + * @rx_missed_errors (since procfs folds those two counters together). + * + * @tx_dropped: Number of packets dropped on their way to transmission, + * e.g. due to lack of resources. + * + * @multicast: Multicast packets received. + * For hardware interfaces this statistic is commonly calculated + * at the device level (unlike @rx_packets) and therefore may include + * packets which did not reach the host. + * + * For IEEE 802.3 devices this counter may be equivalent to: + * + * - 30.3.1.1.21 aMulticastFramesReceivedOK + * + * @collisions: Number of collisions during packet transmissions. + * + * @rx_length_errors: Number of packets dropped due to invalid length. + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter should be equivalent to a sum + * of the following attributes: + * + * - 30.3.1.1.23 aInRangeLengthErrors + * - 30.3.1.1.24 aOutOfRangeLengthField + * - 30.3.1.1.25 aFrameTooLongErrors + * + * @rx_over_errors: Receiver FIFO overflow event counter. + * + * Historically the count of overflow events. Such events may be + * reported in the receive descriptors or via interrupts, and may + * not correspond one-to-one with dropped packets. + * + * The recommended interpretation for high speed interfaces is - + * number of packets dropped because they did not fit into buffers + * provided by the host, e.g. packets larger than MTU or next buffer + * in the ring was not available for a scatter transfer. + * + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * This statistics was historically used interchangeably with + * @rx_fifo_errors. + * + * This statistic corresponds to hardware events and is not commonly used + * on software devices. + * + * @rx_crc_errors: Number of packets received with a CRC error. + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter must be equivalent to: + * + * - 30.3.1.1.6 aFrameCheckSequenceErrors + * + * @rx_frame_errors: Receiver frame alignment errors. + * Part of aggregate "frame" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter should be equivalent to: + * + * - 30.3.1.1.7 aAlignmentErrors + * + * @rx_fifo_errors: Receiver FIFO error counter. + * + * Historically the count of overflow events. Those events may be + * reported in the receive descriptors or via interrupts, and may + * not correspond one-to-one with dropped packets. + * + * This statistics was used interchangeably with @rx_over_errors. + * Not recommended for use in drivers for high speed interfaces. + * + * This statistic is used on software devices, e.g. to count software + * packet queue overflow (can) or sequencing errors (GRE). + * + * @rx_missed_errors: Count of packets missed by the host. + * Folded into the "drop" counter in `/proc/net/dev`. + * + * Counts number of packets dropped by the device due to lack + * of buffer space. This usually indicates that the host interface + * is slower than the network interface, or host is not keeping up + * with the receive packet rate. + * + * This statistic corresponds to hardware events and is not used + * on software devices. + * + * @tx_aborted_errors: + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * For IEEE 802.3 devices capable of half-duplex operation this counter + * must be equivalent to: + * + * - 30.3.1.1.11 aFramesAbortedDueToXSColls + * + * High speed interfaces may use this counter as a general device + * discard counter. + * + * @tx_carrier_errors: Number of frame transmission errors due to loss + * of carrier during transmission. + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter must be equivalent to: + * + * - 30.3.1.1.13 aCarrierSenseErrors + * + * @tx_fifo_errors: Number of frame transmission errors due to device + * FIFO underrun / underflow. This condition occurs when the device + * begins transmission of a frame but is unable to deliver the + * entire frame to the transmitter in time for transmission. + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * @tx_heartbeat_errors: Number of Heartbeat / SQE Test errors for + * old half-duplex Ethernet. + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices possibly equivalent to: + * + * - 30.3.2.1.4 aSQETestErrors + * + * @tx_window_errors: Number of frame transmission errors due + * to late collisions (for Ethernet - after the first 64B of transmission). + * Part of aggregate "carrier" errors in `/proc/net/dev`. + * + * For IEEE 802.3 devices this counter must be equivalent to: + * + * - 30.3.1.1.10 aLateCollisions + * + * @rx_compressed: Number of correctly received compressed packets. + * This counters is only meaningful for interfaces which support + * packet compression (e.g. CSLIP, PPP). + * + * @tx_compressed: Number of transmitted compressed packets. + * This counters is only meaningful for interfaces which support + * packet compression (e.g. CSLIP, PPP). + * + * @rx_nohandler: Number of packets received on the interface + * but dropped by the networking stack because the device is + * not designated to receive packets (e.g. backup link in a bond). + * + * @rx_otherhost_dropped: Number of packets dropped due to mismatch + * in destination MAC address. + */ +struct rtnl_link_stats64 { + __u64 rx_packets; + __u64 tx_packets; + __u64 rx_bytes; + __u64 tx_bytes; + __u64 rx_errors; + __u64 tx_errors; + __u64 rx_dropped; + __u64 tx_dropped; + __u64 multicast; + __u64 collisions; + + /* detailed rx_errors: */ + __u64 rx_length_errors; + __u64 rx_over_errors; + __u64 rx_crc_errors; + __u64 rx_frame_errors; + __u64 rx_fifo_errors; + __u64 rx_missed_errors; + + /* detailed tx_errors */ + __u64 tx_aborted_errors; + __u64 tx_carrier_errors; + __u64 tx_fifo_errors; + __u64 tx_heartbeat_errors; + __u64 tx_window_errors; + + /* for cslip etc */ + __u64 rx_compressed; + __u64 tx_compressed; + __u64 rx_nohandler; + + __u64 rx_otherhost_dropped; +}; + +/* Subset of link stats useful for in-HW collection. Meaning of the fields is as + * for struct rtnl_link_stats64. + */ +struct rtnl_hw_stats64 { + __u64 rx_packets; + __u64 tx_packets; + __u64 rx_bytes; + __u64 tx_bytes; + __u64 rx_errors; + __u64 tx_errors; + __u64 rx_dropped; + __u64 tx_dropped; + __u64 multicast; +}; + +/* The struct should be in sync with struct ifmap */ +struct rtnl_link_ifmap { + __u64 mem_start; + __u64 mem_end; + __u64 base_addr; + __u16 irq; + __u8 dma; + __u8 port; +}; + +/* + * IFLA_AF_SPEC + * Contains nested attributes for address family specific attributes. + * Each address family may create a attribute with the address family + * number as type and create its own attribute structure in it. + * + * Example: + * [IFLA_AF_SPEC] = { + * [AF_INET] = { + * [IFLA_INET_CONF] = ..., + * }, + * [AF_INET6] = { + * [IFLA_INET6_FLAGS] = ..., + * [IFLA_INET6_CONF] = ..., + * } + * } + */ + +enum { + IFLA_UNSPEC, + IFLA_ADDRESS, + IFLA_BROADCAST, + IFLA_IFNAME, + IFLA_MTU, + IFLA_LINK, + IFLA_QDISC, + IFLA_STATS, + IFLA_COST, +#define IFLA_COST IFLA_COST + IFLA_PRIORITY, +#define IFLA_PRIORITY IFLA_PRIORITY + IFLA_MASTER, +#define IFLA_MASTER IFLA_MASTER + IFLA_WIRELESS, /* Wireless Extension event - see wireless.h */ +#define IFLA_WIRELESS IFLA_WIRELESS + IFLA_PROTINFO, /* Protocol specific information for a link */ +#define IFLA_PROTINFO IFLA_PROTINFO + IFLA_TXQLEN, +#define IFLA_TXQLEN IFLA_TXQLEN + IFLA_MAP, +#define IFLA_MAP IFLA_MAP + IFLA_WEIGHT, +#define IFLA_WEIGHT IFLA_WEIGHT + IFLA_OPERSTATE, + IFLA_LINKMODE, + IFLA_LINKINFO, +#define IFLA_LINKINFO IFLA_LINKINFO + IFLA_NET_NS_PID, + IFLA_IFALIAS, + IFLA_NUM_VF, /* Number of VFs if device is SR-IOV PF */ + IFLA_VFINFO_LIST, + IFLA_STATS64, + IFLA_VF_PORTS, + IFLA_PORT_SELF, + IFLA_AF_SPEC, + IFLA_GROUP, /* Group the device belongs to */ + IFLA_NET_NS_FD, + IFLA_EXT_MASK, /* Extended info mask, VFs, etc */ + IFLA_PROMISCUITY, /* Promiscuity count: > 0 means acts PROMISC */ +#define IFLA_PROMISCUITY IFLA_PROMISCUITY + IFLA_NUM_TX_QUEUES, + IFLA_NUM_RX_QUEUES, + IFLA_CARRIER, + IFLA_PHYS_PORT_ID, + IFLA_CARRIER_CHANGES, + IFLA_PHYS_SWITCH_ID, + IFLA_LINK_NETNSID, + IFLA_PHYS_PORT_NAME, + IFLA_PROTO_DOWN, + IFLA_GSO_MAX_SEGS, + IFLA_GSO_MAX_SIZE, + IFLA_PAD, + IFLA_XDP, + IFLA_EVENT, + IFLA_NEW_NETNSID, + IFLA_IF_NETNSID, + IFLA_TARGET_NETNSID = IFLA_IF_NETNSID, /* new alias */ + IFLA_CARRIER_UP_COUNT, + IFLA_CARRIER_DOWN_COUNT, + IFLA_NEW_IFINDEX, + IFLA_MIN_MTU, + IFLA_MAX_MTU, + IFLA_PROP_LIST, + IFLA_ALT_IFNAME, /* Alternative ifname */ + IFLA_PERM_ADDRESS, + IFLA_PROTO_DOWN_REASON, + + /* device (sysfs) name as parent, used instead + * of IFLA_LINK where there's no parent netdev + */ + IFLA_PARENT_DEV_NAME, + IFLA_PARENT_DEV_BUS_NAME, + IFLA_GRO_MAX_SIZE, + IFLA_TSO_MAX_SIZE, + IFLA_TSO_MAX_SEGS, + IFLA_ALLMULTI, /* Allmulti count: > 0 means acts ALLMULTI */ + + IFLA_DEVLINK_PORT, + + IFLA_GSO_IPV4_MAX_SIZE, + IFLA_GRO_IPV4_MAX_SIZE, + IFLA_DPLL_PIN, + __IFLA_MAX +}; + + +#define IFLA_MAX (__IFLA_MAX - 1) + +enum { + IFLA_PROTO_DOWN_REASON_UNSPEC, + IFLA_PROTO_DOWN_REASON_MASK, /* u32, mask for reason bits */ + IFLA_PROTO_DOWN_REASON_VALUE, /* u32, reason bit value */ + + __IFLA_PROTO_DOWN_REASON_CNT, + IFLA_PROTO_DOWN_REASON_MAX = __IFLA_PROTO_DOWN_REASON_CNT - 1 +}; + +/* backwards compatibility for userspace */ +#ifndef __KERNEL__ +#define IFLA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct ifinfomsg)))) +#define IFLA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct ifinfomsg)) +#endif + +enum { + IFLA_INET_UNSPEC, + IFLA_INET_CONF, + __IFLA_INET_MAX, +}; + +#define IFLA_INET_MAX (__IFLA_INET_MAX - 1) + +/* ifi_flags. + + IFF_* flags. + + The only change is: + IFF_LOOPBACK, IFF_BROADCAST and IFF_POINTOPOINT are + more not changeable by user. They describe link media + characteristics and set by device driver. + + Comments: + - Combination IFF_BROADCAST|IFF_POINTOPOINT is invalid + - If neither of these three flags are set; + the interface is NBMA. + + - IFF_MULTICAST does not mean anything special: + multicasts can be used on all not-NBMA links. + IFF_MULTICAST means that this media uses special encapsulation + for multicast frames. Apparently, all IFF_POINTOPOINT and + IFF_BROADCAST devices are able to use multicasts too. + */ + +/* IFLA_LINK. + For usual devices it is equal ifi_index. + If it is a "virtual interface" (f.e. tunnel), ifi_link + can point to real physical interface (f.e. for bandwidth calculations), + or maybe 0, what means, that real media is unknown (usual + for IPIP tunnels, when route to endpoint is allowed to change) + */ + +/* Subtype attributes for IFLA_PROTINFO */ +enum { + IFLA_INET6_UNSPEC, + IFLA_INET6_FLAGS, /* link flags */ + IFLA_INET6_CONF, /* sysctl parameters */ + IFLA_INET6_STATS, /* statistics */ + IFLA_INET6_MCAST, /* MC things. What of them? */ + IFLA_INET6_CACHEINFO, /* time values and max reasm size */ + IFLA_INET6_ICMP6STATS, /* statistics (icmpv6) */ + IFLA_INET6_TOKEN, /* device token */ + IFLA_INET6_ADDR_GEN_MODE, /* implicit address generator mode */ + IFLA_INET6_RA_MTU, /* mtu carried in the RA message */ + __IFLA_INET6_MAX +}; + +#define IFLA_INET6_MAX (__IFLA_INET6_MAX - 1) + +enum in6_addr_gen_mode { + IN6_ADDR_GEN_MODE_EUI64, + IN6_ADDR_GEN_MODE_NONE, + IN6_ADDR_GEN_MODE_STABLE_PRIVACY, + IN6_ADDR_GEN_MODE_RANDOM, +}; + +/* Bridge section */ + +enum { + IFLA_BR_UNSPEC, + IFLA_BR_FORWARD_DELAY, + IFLA_BR_HELLO_TIME, + IFLA_BR_MAX_AGE, + IFLA_BR_AGEING_TIME, + IFLA_BR_STP_STATE, + IFLA_BR_PRIORITY, + IFLA_BR_VLAN_FILTERING, + IFLA_BR_VLAN_PROTOCOL, + IFLA_BR_GROUP_FWD_MASK, + IFLA_BR_ROOT_ID, + IFLA_BR_BRIDGE_ID, + IFLA_BR_ROOT_PORT, + IFLA_BR_ROOT_PATH_COST, + IFLA_BR_TOPOLOGY_CHANGE, + IFLA_BR_TOPOLOGY_CHANGE_DETECTED, + IFLA_BR_HELLO_TIMER, + IFLA_BR_TCN_TIMER, + IFLA_BR_TOPOLOGY_CHANGE_TIMER, + IFLA_BR_GC_TIMER, + IFLA_BR_GROUP_ADDR, + IFLA_BR_FDB_FLUSH, + IFLA_BR_MCAST_ROUTER, + IFLA_BR_MCAST_SNOOPING, + IFLA_BR_MCAST_QUERY_USE_IFADDR, + IFLA_BR_MCAST_QUERIER, + IFLA_BR_MCAST_HASH_ELASTICITY, + IFLA_BR_MCAST_HASH_MAX, + IFLA_BR_MCAST_LAST_MEMBER_CNT, + IFLA_BR_MCAST_STARTUP_QUERY_CNT, + IFLA_BR_MCAST_LAST_MEMBER_INTVL, + IFLA_BR_MCAST_MEMBERSHIP_INTVL, + IFLA_BR_MCAST_QUERIER_INTVL, + IFLA_BR_MCAST_QUERY_INTVL, + IFLA_BR_MCAST_QUERY_RESPONSE_INTVL, + IFLA_BR_MCAST_STARTUP_QUERY_INTVL, + IFLA_BR_NF_CALL_IPTABLES, + IFLA_BR_NF_CALL_IP6TABLES, + IFLA_BR_NF_CALL_ARPTABLES, + IFLA_BR_VLAN_DEFAULT_PVID, + IFLA_BR_PAD, + IFLA_BR_VLAN_STATS_ENABLED, + IFLA_BR_MCAST_STATS_ENABLED, + IFLA_BR_MCAST_IGMP_VERSION, + IFLA_BR_MCAST_MLD_VERSION, + IFLA_BR_VLAN_STATS_PER_PORT, + IFLA_BR_MULTI_BOOLOPT, + IFLA_BR_MCAST_QUERIER_STATE, + __IFLA_BR_MAX, +}; + +#define IFLA_BR_MAX (__IFLA_BR_MAX - 1) + +struct ifla_bridge_id { + __u8 prio[2]; + __u8 addr[6]; /* ETH_ALEN */ +}; + +enum { + BRIDGE_MODE_UNSPEC, + BRIDGE_MODE_HAIRPIN, +}; + +enum { + IFLA_BRPORT_UNSPEC, + IFLA_BRPORT_STATE, /* Spanning tree state */ + IFLA_BRPORT_PRIORITY, /* " priority */ + IFLA_BRPORT_COST, /* " cost */ + IFLA_BRPORT_MODE, /* mode (hairpin) */ + IFLA_BRPORT_GUARD, /* bpdu guard */ + IFLA_BRPORT_PROTECT, /* root port protection */ + IFLA_BRPORT_FAST_LEAVE, /* multicast fast leave */ + IFLA_BRPORT_LEARNING, /* mac learning */ + IFLA_BRPORT_UNICAST_FLOOD, /* flood unicast traffic */ + IFLA_BRPORT_PROXYARP, /* proxy ARP */ + IFLA_BRPORT_LEARNING_SYNC, /* mac learning sync from device */ + IFLA_BRPORT_PROXYARP_WIFI, /* proxy ARP for Wi-Fi */ + IFLA_BRPORT_ROOT_ID, /* designated root */ + IFLA_BRPORT_BRIDGE_ID, /* designated bridge */ + IFLA_BRPORT_DESIGNATED_PORT, + IFLA_BRPORT_DESIGNATED_COST, + IFLA_BRPORT_ID, + IFLA_BRPORT_NO, + IFLA_BRPORT_TOPOLOGY_CHANGE_ACK, + IFLA_BRPORT_CONFIG_PENDING, + IFLA_BRPORT_MESSAGE_AGE_TIMER, + IFLA_BRPORT_FORWARD_DELAY_TIMER, + IFLA_BRPORT_HOLD_TIMER, + IFLA_BRPORT_FLUSH, + IFLA_BRPORT_MULTICAST_ROUTER, + IFLA_BRPORT_PAD, + IFLA_BRPORT_MCAST_FLOOD, + IFLA_BRPORT_MCAST_TO_UCAST, + IFLA_BRPORT_VLAN_TUNNEL, + IFLA_BRPORT_BCAST_FLOOD, + IFLA_BRPORT_GROUP_FWD_MASK, + IFLA_BRPORT_NEIGH_SUPPRESS, + IFLA_BRPORT_ISOLATED, + IFLA_BRPORT_BACKUP_PORT, + IFLA_BRPORT_MRP_RING_OPEN, + IFLA_BRPORT_MRP_IN_OPEN, + IFLA_BRPORT_MCAST_EHT_HOSTS_LIMIT, + IFLA_BRPORT_MCAST_EHT_HOSTS_CNT, + IFLA_BRPORT_LOCKED, + IFLA_BRPORT_MAB, + IFLA_BRPORT_MCAST_N_GROUPS, + IFLA_BRPORT_MCAST_MAX_GROUPS, + IFLA_BRPORT_NEIGH_VLAN_SUPPRESS, + IFLA_BRPORT_BACKUP_NHID, + __IFLA_BRPORT_MAX +}; +#define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1) + +struct ifla_cacheinfo { + __u32 max_reasm_len; + __u32 tstamp; /* ipv6InterfaceTable updated timestamp */ + __u32 reachable_time; + __u32 retrans_time; +}; + +enum { + IFLA_INFO_UNSPEC, + IFLA_INFO_KIND, + IFLA_INFO_DATA, + IFLA_INFO_XSTATS, + IFLA_INFO_SLAVE_KIND, + IFLA_INFO_SLAVE_DATA, + __IFLA_INFO_MAX, +}; + +#define IFLA_INFO_MAX (__IFLA_INFO_MAX - 1) + +/* VLAN section */ + +enum { + IFLA_VLAN_UNSPEC, + IFLA_VLAN_ID, + IFLA_VLAN_FLAGS, + IFLA_VLAN_EGRESS_QOS, + IFLA_VLAN_INGRESS_QOS, + IFLA_VLAN_PROTOCOL, + __IFLA_VLAN_MAX, +}; + +#define IFLA_VLAN_MAX (__IFLA_VLAN_MAX - 1) + +struct ifla_vlan_flags { + __u32 flags; + __u32 mask; +}; + +enum { + IFLA_VLAN_QOS_UNSPEC, + IFLA_VLAN_QOS_MAPPING, + __IFLA_VLAN_QOS_MAX +}; + +#define IFLA_VLAN_QOS_MAX (__IFLA_VLAN_QOS_MAX - 1) + +struct ifla_vlan_qos_mapping { + __u32 from; + __u32 to; +}; + +/* MACVLAN section */ +enum { + IFLA_MACVLAN_UNSPEC, + IFLA_MACVLAN_MODE, + IFLA_MACVLAN_FLAGS, + IFLA_MACVLAN_MACADDR_MODE, + IFLA_MACVLAN_MACADDR, + IFLA_MACVLAN_MACADDR_DATA, + IFLA_MACVLAN_MACADDR_COUNT, + IFLA_MACVLAN_BC_QUEUE_LEN, + IFLA_MACVLAN_BC_QUEUE_LEN_USED, + IFLA_MACVLAN_BC_CUTOFF, + __IFLA_MACVLAN_MAX, +}; + +#define IFLA_MACVLAN_MAX (__IFLA_MACVLAN_MAX - 1) + +enum macvlan_mode { + MACVLAN_MODE_PRIVATE = 1, /* don't talk to other macvlans */ + MACVLAN_MODE_VEPA = 2, /* talk to other ports through ext bridge */ + MACVLAN_MODE_BRIDGE = 4, /* talk to bridge ports directly */ + MACVLAN_MODE_PASSTHRU = 8,/* take over the underlying device */ + MACVLAN_MODE_SOURCE = 16,/* use source MAC address list to assign */ +}; + +enum macvlan_macaddr_mode { + MACVLAN_MACADDR_ADD, + MACVLAN_MACADDR_DEL, + MACVLAN_MACADDR_FLUSH, + MACVLAN_MACADDR_SET, +}; + +#define MACVLAN_FLAG_NOPROMISC 1 +#define MACVLAN_FLAG_NODST 2 /* skip dst macvlan if matching src macvlan */ + +/* VRF section */ +enum { + IFLA_VRF_UNSPEC, + IFLA_VRF_TABLE, + __IFLA_VRF_MAX +}; + +#define IFLA_VRF_MAX (__IFLA_VRF_MAX - 1) + +enum { + IFLA_VRF_PORT_UNSPEC, + IFLA_VRF_PORT_TABLE, + __IFLA_VRF_PORT_MAX +}; + +#define IFLA_VRF_PORT_MAX (__IFLA_VRF_PORT_MAX - 1) + +/* MACSEC section */ +enum { + IFLA_MACSEC_UNSPEC, + IFLA_MACSEC_SCI, + IFLA_MACSEC_PORT, + IFLA_MACSEC_ICV_LEN, + IFLA_MACSEC_CIPHER_SUITE, + IFLA_MACSEC_WINDOW, + IFLA_MACSEC_ENCODING_SA, + IFLA_MACSEC_ENCRYPT, + IFLA_MACSEC_PROTECT, + IFLA_MACSEC_INC_SCI, + IFLA_MACSEC_ES, + IFLA_MACSEC_SCB, + IFLA_MACSEC_REPLAY_PROTECT, + IFLA_MACSEC_VALIDATION, + IFLA_MACSEC_PAD, + IFLA_MACSEC_OFFLOAD, + __IFLA_MACSEC_MAX, +}; + +#define IFLA_MACSEC_MAX (__IFLA_MACSEC_MAX - 1) + +/* XFRM section */ +enum { + IFLA_XFRM_UNSPEC, + IFLA_XFRM_LINK, + IFLA_XFRM_IF_ID, + IFLA_XFRM_COLLECT_METADATA, + __IFLA_XFRM_MAX +}; + +#define IFLA_XFRM_MAX (__IFLA_XFRM_MAX - 1) + +enum macsec_validation_type { + MACSEC_VALIDATE_DISABLED = 0, + MACSEC_VALIDATE_CHECK = 1, + MACSEC_VALIDATE_STRICT = 2, + __MACSEC_VALIDATE_END, + MACSEC_VALIDATE_MAX = __MACSEC_VALIDATE_END - 1, +}; + +enum macsec_offload { + MACSEC_OFFLOAD_OFF = 0, + MACSEC_OFFLOAD_PHY = 1, + MACSEC_OFFLOAD_MAC = 2, + __MACSEC_OFFLOAD_END, + MACSEC_OFFLOAD_MAX = __MACSEC_OFFLOAD_END - 1, +}; + +/* IPVLAN section */ +enum { + IFLA_IPVLAN_UNSPEC, + IFLA_IPVLAN_MODE, + IFLA_IPVLAN_FLAGS, + __IFLA_IPVLAN_MAX +}; + +#define IFLA_IPVLAN_MAX (__IFLA_IPVLAN_MAX - 1) + +enum ipvlan_mode { + IPVLAN_MODE_L2 = 0, + IPVLAN_MODE_L3, + IPVLAN_MODE_L3S, + IPVLAN_MODE_MAX +}; + +#define IPVLAN_F_PRIVATE 0x01 +#define IPVLAN_F_VEPA 0x02 + +/* Tunnel RTM header */ +struct tunnel_msg { + __u8 family; + __u8 flags; + __u16 reserved2; + __u32 ifindex; +}; + +/* netkit section */ +enum netkit_action { + NETKIT_NEXT = -1, + NETKIT_PASS = 0, + NETKIT_DROP = 2, + NETKIT_REDIRECT = 7, +}; + +enum netkit_mode { + NETKIT_L2, + NETKIT_L3, +}; + +enum { + IFLA_NETKIT_UNSPEC, + IFLA_NETKIT_PEER_INFO, + IFLA_NETKIT_PRIMARY, + IFLA_NETKIT_POLICY, + IFLA_NETKIT_PEER_POLICY, + IFLA_NETKIT_MODE, + __IFLA_NETKIT_MAX, +}; +#define IFLA_NETKIT_MAX (__IFLA_NETKIT_MAX - 1) + +/* VXLAN section */ + +/* include statistics in the dump */ +#define TUNNEL_MSG_FLAG_STATS 0x01 + +#define TUNNEL_MSG_VALID_USER_FLAGS TUNNEL_MSG_FLAG_STATS + +/* Embedded inside VXLAN_VNIFILTER_ENTRY_STATS */ +enum { + VNIFILTER_ENTRY_STATS_UNSPEC, + VNIFILTER_ENTRY_STATS_RX_BYTES, + VNIFILTER_ENTRY_STATS_RX_PKTS, + VNIFILTER_ENTRY_STATS_RX_DROPS, + VNIFILTER_ENTRY_STATS_RX_ERRORS, + VNIFILTER_ENTRY_STATS_TX_BYTES, + VNIFILTER_ENTRY_STATS_TX_PKTS, + VNIFILTER_ENTRY_STATS_TX_DROPS, + VNIFILTER_ENTRY_STATS_TX_ERRORS, + VNIFILTER_ENTRY_STATS_PAD, + __VNIFILTER_ENTRY_STATS_MAX +}; +#define VNIFILTER_ENTRY_STATS_MAX (__VNIFILTER_ENTRY_STATS_MAX - 1) + +enum { + VXLAN_VNIFILTER_ENTRY_UNSPEC, + VXLAN_VNIFILTER_ENTRY_START, + VXLAN_VNIFILTER_ENTRY_END, + VXLAN_VNIFILTER_ENTRY_GROUP, + VXLAN_VNIFILTER_ENTRY_GROUP6, + VXLAN_VNIFILTER_ENTRY_STATS, + __VXLAN_VNIFILTER_ENTRY_MAX +}; +#define VXLAN_VNIFILTER_ENTRY_MAX (__VXLAN_VNIFILTER_ENTRY_MAX - 1) + +enum { + VXLAN_VNIFILTER_UNSPEC, + VXLAN_VNIFILTER_ENTRY, + __VXLAN_VNIFILTER_MAX +}; +#define VXLAN_VNIFILTER_MAX (__VXLAN_VNIFILTER_MAX - 1) + +enum { + IFLA_VXLAN_UNSPEC, + IFLA_VXLAN_ID, + IFLA_VXLAN_GROUP, /* group or remote address */ + IFLA_VXLAN_LINK, + IFLA_VXLAN_LOCAL, + IFLA_VXLAN_TTL, + IFLA_VXLAN_TOS, + IFLA_VXLAN_LEARNING, + IFLA_VXLAN_AGEING, + IFLA_VXLAN_LIMIT, + IFLA_VXLAN_PORT_RANGE, /* source port */ + IFLA_VXLAN_PROXY, + IFLA_VXLAN_RSC, + IFLA_VXLAN_L2MISS, + IFLA_VXLAN_L3MISS, + IFLA_VXLAN_PORT, /* destination port */ + IFLA_VXLAN_GROUP6, + IFLA_VXLAN_LOCAL6, + IFLA_VXLAN_UDP_CSUM, + IFLA_VXLAN_UDP_ZERO_CSUM6_TX, + IFLA_VXLAN_UDP_ZERO_CSUM6_RX, + IFLA_VXLAN_REMCSUM_TX, + IFLA_VXLAN_REMCSUM_RX, + IFLA_VXLAN_GBP, + IFLA_VXLAN_REMCSUM_NOPARTIAL, + IFLA_VXLAN_COLLECT_METADATA, + IFLA_VXLAN_LABEL, + IFLA_VXLAN_GPE, + IFLA_VXLAN_TTL_INHERIT, + IFLA_VXLAN_DF, + IFLA_VXLAN_VNIFILTER, /* only applicable with COLLECT_METADATA mode */ + IFLA_VXLAN_LOCALBYPASS, + __IFLA_VXLAN_MAX +}; +#define IFLA_VXLAN_MAX (__IFLA_VXLAN_MAX - 1) + +struct ifla_vxlan_port_range { + __be16 low; + __be16 high; +}; + +enum ifla_vxlan_df { + VXLAN_DF_UNSET = 0, + VXLAN_DF_SET, + VXLAN_DF_INHERIT, + __VXLAN_DF_END, + VXLAN_DF_MAX = __VXLAN_DF_END - 1, +}; + +/* GENEVE section */ +enum { + IFLA_GENEVE_UNSPEC, + IFLA_GENEVE_ID, + IFLA_GENEVE_REMOTE, + IFLA_GENEVE_TTL, + IFLA_GENEVE_TOS, + IFLA_GENEVE_PORT, /* destination port */ + IFLA_GENEVE_COLLECT_METADATA, + IFLA_GENEVE_REMOTE6, + IFLA_GENEVE_UDP_CSUM, + IFLA_GENEVE_UDP_ZERO_CSUM6_TX, + IFLA_GENEVE_UDP_ZERO_CSUM6_RX, + IFLA_GENEVE_LABEL, + IFLA_GENEVE_TTL_INHERIT, + IFLA_GENEVE_DF, + IFLA_GENEVE_INNER_PROTO_INHERIT, + __IFLA_GENEVE_MAX +}; +#define IFLA_GENEVE_MAX (__IFLA_GENEVE_MAX - 1) + +enum ifla_geneve_df { + GENEVE_DF_UNSET = 0, + GENEVE_DF_SET, + GENEVE_DF_INHERIT, + __GENEVE_DF_END, + GENEVE_DF_MAX = __GENEVE_DF_END - 1, +}; + +/* Bareudp section */ +enum { + IFLA_BAREUDP_UNSPEC, + IFLA_BAREUDP_PORT, + IFLA_BAREUDP_ETHERTYPE, + IFLA_BAREUDP_SRCPORT_MIN, + IFLA_BAREUDP_MULTIPROTO_MODE, + __IFLA_BAREUDP_MAX +}; + +#define IFLA_BAREUDP_MAX (__IFLA_BAREUDP_MAX - 1) + +/* PPP section */ +enum { + IFLA_PPP_UNSPEC, + IFLA_PPP_DEV_FD, + __IFLA_PPP_MAX +}; +#define IFLA_PPP_MAX (__IFLA_PPP_MAX - 1) + +/* GTP section */ + +enum ifla_gtp_role { + GTP_ROLE_GGSN = 0, + GTP_ROLE_SGSN, +}; + +enum { + IFLA_GTP_UNSPEC, + IFLA_GTP_FD0, + IFLA_GTP_FD1, + IFLA_GTP_PDP_HASHSIZE, + IFLA_GTP_ROLE, + IFLA_GTP_CREATE_SOCKETS, + IFLA_GTP_RESTART_COUNT, + __IFLA_GTP_MAX, +}; +#define IFLA_GTP_MAX (__IFLA_GTP_MAX - 1) + +/* Bonding section */ + +enum { + IFLA_BOND_UNSPEC, + IFLA_BOND_MODE, + IFLA_BOND_ACTIVE_SLAVE, + IFLA_BOND_MIIMON, + IFLA_BOND_UPDELAY, + IFLA_BOND_DOWNDELAY, + IFLA_BOND_USE_CARRIER, + IFLA_BOND_ARP_INTERVAL, + IFLA_BOND_ARP_IP_TARGET, + IFLA_BOND_ARP_VALIDATE, + IFLA_BOND_ARP_ALL_TARGETS, + IFLA_BOND_PRIMARY, + IFLA_BOND_PRIMARY_RESELECT, + IFLA_BOND_FAIL_OVER_MAC, + IFLA_BOND_XMIT_HASH_POLICY, + IFLA_BOND_RESEND_IGMP, + IFLA_BOND_NUM_PEER_NOTIF, + IFLA_BOND_ALL_SLAVES_ACTIVE, + IFLA_BOND_MIN_LINKS, + IFLA_BOND_LP_INTERVAL, + IFLA_BOND_PACKETS_PER_SLAVE, + IFLA_BOND_AD_LACP_RATE, + IFLA_BOND_AD_SELECT, + IFLA_BOND_AD_INFO, + IFLA_BOND_AD_ACTOR_SYS_PRIO, + IFLA_BOND_AD_USER_PORT_KEY, + IFLA_BOND_AD_ACTOR_SYSTEM, + IFLA_BOND_TLB_DYNAMIC_LB, + IFLA_BOND_PEER_NOTIF_DELAY, + IFLA_BOND_AD_LACP_ACTIVE, + IFLA_BOND_MISSED_MAX, + IFLA_BOND_NS_IP6_TARGET, + IFLA_BOND_COUPLED_CONTROL, + __IFLA_BOND_MAX, +}; + +#define IFLA_BOND_MAX (__IFLA_BOND_MAX - 1) + +enum { + IFLA_BOND_AD_INFO_UNSPEC, + IFLA_BOND_AD_INFO_AGGREGATOR, + IFLA_BOND_AD_INFO_NUM_PORTS, + IFLA_BOND_AD_INFO_ACTOR_KEY, + IFLA_BOND_AD_INFO_PARTNER_KEY, + IFLA_BOND_AD_INFO_PARTNER_MAC, + __IFLA_BOND_AD_INFO_MAX, +}; + +#define IFLA_BOND_AD_INFO_MAX (__IFLA_BOND_AD_INFO_MAX - 1) + +enum { + IFLA_BOND_SLAVE_UNSPEC, + IFLA_BOND_SLAVE_STATE, + IFLA_BOND_SLAVE_MII_STATUS, + IFLA_BOND_SLAVE_LINK_FAILURE_COUNT, + IFLA_BOND_SLAVE_PERM_HWADDR, + IFLA_BOND_SLAVE_QUEUE_ID, + IFLA_BOND_SLAVE_AD_AGGREGATOR_ID, + IFLA_BOND_SLAVE_AD_ACTOR_OPER_PORT_STATE, + IFLA_BOND_SLAVE_AD_PARTNER_OPER_PORT_STATE, + IFLA_BOND_SLAVE_PRIO, + __IFLA_BOND_SLAVE_MAX, +}; + +#define IFLA_BOND_SLAVE_MAX (__IFLA_BOND_SLAVE_MAX - 1) + +/* SR-IOV virtual function management section */ + +enum { + IFLA_VF_INFO_UNSPEC, + IFLA_VF_INFO, + __IFLA_VF_INFO_MAX, +}; + +#define IFLA_VF_INFO_MAX (__IFLA_VF_INFO_MAX - 1) + +enum { + IFLA_VF_UNSPEC, + IFLA_VF_MAC, /* Hardware queue specific attributes */ + IFLA_VF_VLAN, /* VLAN ID and QoS */ + IFLA_VF_TX_RATE, /* Max TX Bandwidth Allocation */ + IFLA_VF_SPOOFCHK, /* Spoof Checking on/off switch */ + IFLA_VF_LINK_STATE, /* link state enable/disable/auto switch */ + IFLA_VF_RATE, /* Min and Max TX Bandwidth Allocation */ + IFLA_VF_RSS_QUERY_EN, /* RSS Redirection Table and Hash Key query + * on/off switch + */ + IFLA_VF_STATS, /* network device statistics */ + IFLA_VF_TRUST, /* Trust VF */ + IFLA_VF_IB_NODE_GUID, /* VF Infiniband node GUID */ + IFLA_VF_IB_PORT_GUID, /* VF Infiniband port GUID */ + IFLA_VF_VLAN_LIST, /* nested list of vlans, option for QinQ */ + IFLA_VF_BROADCAST, /* VF broadcast */ + __IFLA_VF_MAX, +}; + +#define IFLA_VF_MAX (__IFLA_VF_MAX - 1) + +struct ifla_vf_mac { + __u32 vf; + __u8 mac[32]; /* MAX_ADDR_LEN */ +}; + +struct ifla_vf_broadcast { + __u8 broadcast[32]; +}; + +struct ifla_vf_vlan { + __u32 vf; + __u32 vlan; /* 0 - 4095, 0 disables VLAN filter */ + __u32 qos; +}; + +enum { + IFLA_VF_VLAN_INFO_UNSPEC, + IFLA_VF_VLAN_INFO, /* VLAN ID, QoS and VLAN protocol */ + __IFLA_VF_VLAN_INFO_MAX, +}; + +#define IFLA_VF_VLAN_INFO_MAX (__IFLA_VF_VLAN_INFO_MAX - 1) +#define MAX_VLAN_LIST_LEN 1 + +struct ifla_vf_vlan_info { + __u32 vf; + __u32 vlan; /* 0 - 4095, 0 disables VLAN filter */ + __u32 qos; + __be16 vlan_proto; /* VLAN protocol either 802.1Q or 802.1ad */ +}; + +struct ifla_vf_tx_rate { + __u32 vf; + __u32 rate; /* Max TX bandwidth in Mbps, 0 disables throttling */ +}; + +struct ifla_vf_rate { + __u32 vf; + __u32 min_tx_rate; /* Min Bandwidth in Mbps */ + __u32 max_tx_rate; /* Max Bandwidth in Mbps */ +}; + +struct ifla_vf_spoofchk { + __u32 vf; + __u32 setting; +}; + +struct ifla_vf_guid { + __u32 vf; + __u64 guid; +}; + +enum { + IFLA_VF_LINK_STATE_AUTO, /* link state of the uplink */ + IFLA_VF_LINK_STATE_ENABLE, /* link always up */ + IFLA_VF_LINK_STATE_DISABLE, /* link always down */ + __IFLA_VF_LINK_STATE_MAX, +}; + +struct ifla_vf_link_state { + __u32 vf; + __u32 link_state; +}; + +struct ifla_vf_rss_query_en { + __u32 vf; + __u32 setting; +}; + +enum { + IFLA_VF_STATS_RX_PACKETS, + IFLA_VF_STATS_TX_PACKETS, + IFLA_VF_STATS_RX_BYTES, + IFLA_VF_STATS_TX_BYTES, + IFLA_VF_STATS_BROADCAST, + IFLA_VF_STATS_MULTICAST, + IFLA_VF_STATS_PAD, + IFLA_VF_STATS_RX_DROPPED, + IFLA_VF_STATS_TX_DROPPED, + __IFLA_VF_STATS_MAX, +}; + +#define IFLA_VF_STATS_MAX (__IFLA_VF_STATS_MAX - 1) + +struct ifla_vf_trust { + __u32 vf; + __u32 setting; +}; + +/* VF ports management section + * + * Nested layout of set/get msg is: + * + * [IFLA_NUM_VF] + * [IFLA_VF_PORTS] + * [IFLA_VF_PORT] + * [IFLA_PORT_*], ... + * [IFLA_VF_PORT] + * [IFLA_PORT_*], ... + * ... + * [IFLA_PORT_SELF] + * [IFLA_PORT_*], ... + */ + +enum { + IFLA_VF_PORT_UNSPEC, + IFLA_VF_PORT, /* nest */ + __IFLA_VF_PORT_MAX, +}; + +#define IFLA_VF_PORT_MAX (__IFLA_VF_PORT_MAX - 1) + +enum { + IFLA_PORT_UNSPEC, + IFLA_PORT_VF, /* __u32 */ + IFLA_PORT_PROFILE, /* string */ + IFLA_PORT_VSI_TYPE, /* 802.1Qbg (pre-)standard VDP */ + IFLA_PORT_INSTANCE_UUID, /* binary UUID */ + IFLA_PORT_HOST_UUID, /* binary UUID */ + IFLA_PORT_REQUEST, /* __u8 */ + IFLA_PORT_RESPONSE, /* __u16, output only */ + __IFLA_PORT_MAX, +}; + +#define IFLA_PORT_MAX (__IFLA_PORT_MAX - 1) + +#define PORT_PROFILE_MAX 40 +#define PORT_UUID_MAX 16 +#define PORT_SELF_VF -1 + +enum { + PORT_REQUEST_PREASSOCIATE = 0, + PORT_REQUEST_PREASSOCIATE_RR, + PORT_REQUEST_ASSOCIATE, + PORT_REQUEST_DISASSOCIATE, +}; + +enum { + PORT_VDP_RESPONSE_SUCCESS = 0, + PORT_VDP_RESPONSE_INVALID_FORMAT, + PORT_VDP_RESPONSE_INSUFFICIENT_RESOURCES, + PORT_VDP_RESPONSE_UNUSED_VTID, + PORT_VDP_RESPONSE_VTID_VIOLATION, + PORT_VDP_RESPONSE_VTID_VERSION_VIOALTION, + PORT_VDP_RESPONSE_OUT_OF_SYNC, + /* 0x08-0xFF reserved for future VDP use */ + PORT_PROFILE_RESPONSE_SUCCESS = 0x100, + PORT_PROFILE_RESPONSE_INPROGRESS, + PORT_PROFILE_RESPONSE_INVALID, + PORT_PROFILE_RESPONSE_BADSTATE, + PORT_PROFILE_RESPONSE_INSUFFICIENT_RESOURCES, + PORT_PROFILE_RESPONSE_ERROR, +}; + +struct ifla_port_vsi { + __u8 vsi_mgr_id; + __u8 vsi_type_id[3]; + __u8 vsi_type_version; + __u8 pad[3]; +}; + + +/* IPoIB section */ + +enum { + IFLA_IPOIB_UNSPEC, + IFLA_IPOIB_PKEY, + IFLA_IPOIB_MODE, + IFLA_IPOIB_UMCAST, + __IFLA_IPOIB_MAX +}; + +enum { + IPOIB_MODE_DATAGRAM = 0, /* using unreliable datagram QPs */ + IPOIB_MODE_CONNECTED = 1, /* using connected QPs */ +}; + +#define IFLA_IPOIB_MAX (__IFLA_IPOIB_MAX - 1) + + +/* HSR/PRP section, both uses same interface */ + +/* Different redundancy protocols for hsr device */ +enum { + HSR_PROTOCOL_HSR, + HSR_PROTOCOL_PRP, + HSR_PROTOCOL_MAX, +}; + +enum { + IFLA_HSR_UNSPEC, + IFLA_HSR_SLAVE1, + IFLA_HSR_SLAVE2, + IFLA_HSR_MULTICAST_SPEC, /* Last byte of supervision addr */ + IFLA_HSR_SUPERVISION_ADDR, /* Supervision frame multicast addr */ + IFLA_HSR_SEQ_NR, + IFLA_HSR_VERSION, /* HSR version */ + IFLA_HSR_PROTOCOL, /* Indicate different protocol than + * HSR. For example PRP. + */ + __IFLA_HSR_MAX, +}; + +#define IFLA_HSR_MAX (__IFLA_HSR_MAX - 1) + +/* STATS section */ + +struct if_stats_msg { + __u8 family; + __u8 pad1; + __u16 pad2; + __u32 ifindex; + __u32 filter_mask; +}; + +/* A stats attribute can be netdev specific or a global stat. + * For netdev stats, lets use the prefix IFLA_STATS_LINK_* + */ +enum { + IFLA_STATS_UNSPEC, /* also used as 64bit pad attribute */ + IFLA_STATS_LINK_64, + IFLA_STATS_LINK_XSTATS, + IFLA_STATS_LINK_XSTATS_SLAVE, + IFLA_STATS_LINK_OFFLOAD_XSTATS, + IFLA_STATS_AF_SPEC, + __IFLA_STATS_MAX, +}; + +#define IFLA_STATS_MAX (__IFLA_STATS_MAX - 1) + +#define IFLA_STATS_FILTER_BIT(ATTR) (1 << (ATTR - 1)) + +enum { + IFLA_STATS_GETSET_UNSPEC, + IFLA_STATS_GET_FILTERS, /* Nest of IFLA_STATS_LINK_xxx, each a u32 with + * a filter mask for the corresponding group. + */ + IFLA_STATS_SET_OFFLOAD_XSTATS_L3_STATS, /* 0 or 1 as u8 */ + __IFLA_STATS_GETSET_MAX, +}; + +#define IFLA_STATS_GETSET_MAX (__IFLA_STATS_GETSET_MAX - 1) + +/* These are embedded into IFLA_STATS_LINK_XSTATS: + * [IFLA_STATS_LINK_XSTATS] + * -> [LINK_XSTATS_TYPE_xxx] + * -> [rtnl link type specific attributes] + */ +enum { + LINK_XSTATS_TYPE_UNSPEC, + LINK_XSTATS_TYPE_BRIDGE, + LINK_XSTATS_TYPE_BOND, + __LINK_XSTATS_TYPE_MAX +}; +#define LINK_XSTATS_TYPE_MAX (__LINK_XSTATS_TYPE_MAX - 1) + +/* These are stats embedded into IFLA_STATS_LINK_OFFLOAD_XSTATS */ +enum { + IFLA_OFFLOAD_XSTATS_UNSPEC, + IFLA_OFFLOAD_XSTATS_CPU_HIT, /* struct rtnl_link_stats64 */ + IFLA_OFFLOAD_XSTATS_HW_S_INFO, /* HW stats info. A nest */ + IFLA_OFFLOAD_XSTATS_L3_STATS, /* struct rtnl_hw_stats64 */ + __IFLA_OFFLOAD_XSTATS_MAX +}; +#define IFLA_OFFLOAD_XSTATS_MAX (__IFLA_OFFLOAD_XSTATS_MAX - 1) + +enum { + IFLA_OFFLOAD_XSTATS_HW_S_INFO_UNSPEC, + IFLA_OFFLOAD_XSTATS_HW_S_INFO_REQUEST, /* u8 */ + IFLA_OFFLOAD_XSTATS_HW_S_INFO_USED, /* u8 */ + __IFLA_OFFLOAD_XSTATS_HW_S_INFO_MAX, +}; +#define IFLA_OFFLOAD_XSTATS_HW_S_INFO_MAX \ + (__IFLA_OFFLOAD_XSTATS_HW_S_INFO_MAX - 1) + +/* XDP section */ + +#define XDP_FLAGS_UPDATE_IF_NOEXIST (1U << 0) +#define XDP_FLAGS_SKB_MODE (1U << 1) +#define XDP_FLAGS_DRV_MODE (1U << 2) +#define XDP_FLAGS_HW_MODE (1U << 3) +#define XDP_FLAGS_REPLACE (1U << 4) +#define XDP_FLAGS_MODES (XDP_FLAGS_SKB_MODE | \ + XDP_FLAGS_DRV_MODE | \ + XDP_FLAGS_HW_MODE) +#define XDP_FLAGS_MASK (XDP_FLAGS_UPDATE_IF_NOEXIST | \ + XDP_FLAGS_MODES | XDP_FLAGS_REPLACE) + +/* These are stored into IFLA_XDP_ATTACHED on dump. */ +enum { + XDP_ATTACHED_NONE = 0, + XDP_ATTACHED_DRV, + XDP_ATTACHED_SKB, + XDP_ATTACHED_HW, + XDP_ATTACHED_MULTI, +}; + +enum { + IFLA_XDP_UNSPEC, + IFLA_XDP_FD, + IFLA_XDP_ATTACHED, + IFLA_XDP_FLAGS, + IFLA_XDP_PROG_ID, + IFLA_XDP_DRV_PROG_ID, + IFLA_XDP_SKB_PROG_ID, + IFLA_XDP_HW_PROG_ID, + IFLA_XDP_EXPECTED_FD, + __IFLA_XDP_MAX, +}; + +#define IFLA_XDP_MAX (__IFLA_XDP_MAX - 1) + +enum { + IFLA_EVENT_NONE, + IFLA_EVENT_REBOOT, /* internal reset / reboot */ + IFLA_EVENT_FEATURES, /* change in offload features */ + IFLA_EVENT_BONDING_FAILOVER, /* change in active slave */ + IFLA_EVENT_NOTIFY_PEERS, /* re-sent grat. arp/ndisc */ + IFLA_EVENT_IGMP_RESEND, /* re-sent IGMP JOIN */ + IFLA_EVENT_BONDING_OPTIONS, /* change in bonding options */ +}; + +/* tun section */ + +enum { + IFLA_TUN_UNSPEC, + IFLA_TUN_OWNER, + IFLA_TUN_GROUP, + IFLA_TUN_TYPE, + IFLA_TUN_PI, + IFLA_TUN_VNET_HDR, + IFLA_TUN_PERSIST, + IFLA_TUN_MULTI_QUEUE, + IFLA_TUN_NUM_QUEUES, + IFLA_TUN_NUM_DISABLED_QUEUES, + __IFLA_TUN_MAX, +}; + +#define IFLA_TUN_MAX (__IFLA_TUN_MAX - 1) + +/* rmnet section */ + +#define RMNET_FLAGS_INGRESS_DEAGGREGATION (1U << 0) +#define RMNET_FLAGS_INGRESS_MAP_COMMANDS (1U << 1) +#define RMNET_FLAGS_INGRESS_MAP_CKSUMV4 (1U << 2) +#define RMNET_FLAGS_EGRESS_MAP_CKSUMV4 (1U << 3) +#define RMNET_FLAGS_INGRESS_MAP_CKSUMV5 (1U << 4) +#define RMNET_FLAGS_EGRESS_MAP_CKSUMV5 (1U << 5) + +enum { + IFLA_RMNET_UNSPEC, + IFLA_RMNET_MUX_ID, + IFLA_RMNET_FLAGS, + __IFLA_RMNET_MAX, +}; + +#define IFLA_RMNET_MAX (__IFLA_RMNET_MAX - 1) + +struct ifla_rmnet_flags { + __u32 flags; + __u32 mask; +}; + +/* MCTP section */ + +enum { + IFLA_MCTP_UNSPEC, + IFLA_MCTP_NET, + __IFLA_MCTP_MAX, +}; + +#define IFLA_MCTP_MAX (__IFLA_MCTP_MAX - 1) + +/* DSA section */ + +enum { + IFLA_DSA_UNSPEC, + IFLA_DSA_MASTER, + __IFLA_DSA_MAX, +}; + +#define IFLA_DSA_MAX (__IFLA_DSA_MAX - 1) + +#endif /* _UAPI_LINUX_IF_LINK_H */ diff --git a/pkg/plugin/lib/common/libbpf/_include/uapi/linux/if_xdp.h b/pkg/plugin/lib/common/libbpf/_include/uapi/linux/if_xdp.h new file mode 100644 index 0000000000..638c606dfa --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_include/uapi/linux/if_xdp.h @@ -0,0 +1,169 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * if_xdp: XDP socket user-space interface + * Copyright(c) 2018 Intel Corporation. + * + * Author(s): Björn Töpel + * Magnus Karlsson + */ + +#ifndef _LINUX_IF_XDP_H +#define _LINUX_IF_XDP_H + +#include + +/* Options for the sxdp_flags field */ +#define XDP_SHARED_UMEM (1 << 0) +#define XDP_COPY (1 << 1) /* Force copy-mode */ +#define XDP_ZEROCOPY (1 << 2) /* Force zero-copy mode */ +/* If this option is set, the driver might go sleep and in that case + * the XDP_RING_NEED_WAKEUP flag in the fill and/or Tx rings will be + * set. If it is set, the application need to explicitly wake up the + * driver with a poll() (Rx and Tx) or sendto() (Tx only). If you are + * running the driver and the application on the same core, you should + * use this option so that the kernel will yield to the user space + * application. + */ +#define XDP_USE_NEED_WAKEUP (1 << 3) +/* By setting this option, userspace application indicates that it can + * handle multiple descriptors per packet thus enabling AF_XDP to split + * multi-buffer XDP frames into multiple Rx descriptors. Without this set + * such frames will be dropped. + */ +#define XDP_USE_SG (1 << 4) + +/* Flags for xsk_umem_config flags */ +#define XDP_UMEM_UNALIGNED_CHUNK_FLAG (1 << 0) + +/* Force checksum calculation in software. Can be used for testing or + * working around potential HW issues. This option causes performance + * degradation and only works in XDP_COPY mode. + */ +#define XDP_UMEM_TX_SW_CSUM (1 << 1) + +struct sockaddr_xdp { + __u16 sxdp_family; + __u16 sxdp_flags; + __u32 sxdp_ifindex; + __u32 sxdp_queue_id; + __u32 sxdp_shared_umem_fd; +}; + +/* XDP_RING flags */ +#define XDP_RING_NEED_WAKEUP (1 << 0) + +struct xdp_ring_offset { + __u64 producer; + __u64 consumer; + __u64 desc; + __u64 flags; +}; + +struct xdp_mmap_offsets { + struct xdp_ring_offset rx; + struct xdp_ring_offset tx; + struct xdp_ring_offset fr; /* Fill */ + struct xdp_ring_offset cr; /* Completion */ +}; + +/* XDP socket options */ +#define XDP_MMAP_OFFSETS 1 +#define XDP_RX_RING 2 +#define XDP_TX_RING 3 +#define XDP_UMEM_REG 4 +#define XDP_UMEM_FILL_RING 5 +#define XDP_UMEM_COMPLETION_RING 6 +#define XDP_STATISTICS 7 +#define XDP_OPTIONS 8 + +struct xdp_umem_reg { + __u64 addr; /* Start of packet data area */ + __u64 len; /* Length of packet data area */ + __u32 chunk_size; + __u32 headroom; + __u32 flags; + __u32 tx_metadata_len; +}; + +struct xdp_statistics { + __u64 rx_dropped; /* Dropped for other reasons */ + __u64 rx_invalid_descs; /* Dropped due to invalid descriptor */ + __u64 tx_invalid_descs; /* Dropped due to invalid descriptor */ + __u64 rx_ring_full; /* Dropped due to rx ring being full */ + __u64 rx_fill_ring_empty_descs; /* Failed to retrieve item from fill ring */ + __u64 tx_ring_empty_descs; /* Failed to retrieve item from tx ring */ +}; + +struct xdp_options { + __u32 flags; +}; + +/* Flags for the flags field of struct xdp_options */ +#define XDP_OPTIONS_ZEROCOPY (1 << 0) + +/* Pgoff for mmaping the rings */ +#define XDP_PGOFF_RX_RING 0 +#define XDP_PGOFF_TX_RING 0x80000000 +#define XDP_UMEM_PGOFF_FILL_RING 0x100000000ULL +#define XDP_UMEM_PGOFF_COMPLETION_RING 0x180000000ULL + +/* Masks for unaligned chunks mode */ +#define XSK_UNALIGNED_BUF_OFFSET_SHIFT 48 +#define XSK_UNALIGNED_BUF_ADDR_MASK \ + ((1ULL << XSK_UNALIGNED_BUF_OFFSET_SHIFT) - 1) + +/* Request transmit timestamp. Upon completion, put it into tx_timestamp + * field of union xsk_tx_metadata. + */ +#define XDP_TXMD_FLAGS_TIMESTAMP (1 << 0) + +/* Request transmit checksum offload. Checksum start position and offset + * are communicated via csum_start and csum_offset fields of union + * xsk_tx_metadata. + */ +#define XDP_TXMD_FLAGS_CHECKSUM (1 << 1) + +/* AF_XDP offloads request. 'request' union member is consumed by the driver + * when the packet is being transmitted. 'completion' union member is + * filled by the driver when the transmit completion arrives. + */ +struct xsk_tx_metadata { + __u64 flags; + + union { + struct { + /* XDP_TXMD_FLAGS_CHECKSUM */ + + /* Offset from desc->addr where checksumming should start. */ + __u16 csum_start; + /* Offset from csum_start where checksum should be stored. */ + __u16 csum_offset; + } request; + + struct { + /* XDP_TXMD_FLAGS_TIMESTAMP */ + __u64 tx_timestamp; + } completion; + }; +}; + +/* Rx/Tx descriptor */ +struct xdp_desc { + __u64 addr; + __u32 len; + __u32 options; +}; + +/* UMEM descriptor is __u64 */ + +/* Flag indicating that the packet continues with the buffer pointed out by the + * next frame in the ring. The end of the packet is signalled by setting this + * bit to zero. For single buffer packets, every descriptor has 'options' set + * to 0 and this maintains backward compatibility. + */ +#define XDP_PKT_CONTD (1 << 0) + +/* TX packet carries valid metadata. */ +#define XDP_TX_METADATA (1 << 1) + +#endif /* _LINUX_IF_XDP_H */ diff --git a/pkg/plugin/lib/common/libbpf/_include/uapi/linux/netdev.h b/pkg/plugin/lib/common/libbpf/_include/uapi/linux/netdev.h new file mode 100644 index 0000000000..43742ac5b0 --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_include/uapi/linux/netdev.h @@ -0,0 +1,197 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/netdev.yaml */ +/* YNL-GEN uapi header */ + +#ifndef _UAPI_LINUX_NETDEV_H +#define _UAPI_LINUX_NETDEV_H + +#define NETDEV_FAMILY_NAME "netdev" +#define NETDEV_FAMILY_VERSION 1 + +/** + * enum netdev_xdp_act + * @NETDEV_XDP_ACT_BASIC: XDP features set supported by all drivers + * (XDP_ABORTED, XDP_DROP, XDP_PASS, XDP_TX) + * @NETDEV_XDP_ACT_REDIRECT: The netdev supports XDP_REDIRECT + * @NETDEV_XDP_ACT_NDO_XMIT: This feature informs if netdev implements + * ndo_xdp_xmit callback. + * @NETDEV_XDP_ACT_XSK_ZEROCOPY: This feature informs if netdev supports AF_XDP + * in zero copy mode. + * @NETDEV_XDP_ACT_HW_OFFLOAD: This feature informs if netdev supports XDP hw + * offloading. + * @NETDEV_XDP_ACT_RX_SG: This feature informs if netdev implements non-linear + * XDP buffer support in the driver napi callback. + * @NETDEV_XDP_ACT_NDO_XMIT_SG: This feature informs if netdev implements + * non-linear XDP buffer support in ndo_xdp_xmit callback. + */ +enum netdev_xdp_act { + NETDEV_XDP_ACT_BASIC = 1, + NETDEV_XDP_ACT_REDIRECT = 2, + NETDEV_XDP_ACT_NDO_XMIT = 4, + NETDEV_XDP_ACT_XSK_ZEROCOPY = 8, + NETDEV_XDP_ACT_HW_OFFLOAD = 16, + NETDEV_XDP_ACT_RX_SG = 32, + NETDEV_XDP_ACT_NDO_XMIT_SG = 64, + + /* private: */ + NETDEV_XDP_ACT_MASK = 127, +}; + +/** + * enum netdev_xdp_rx_metadata + * @NETDEV_XDP_RX_METADATA_TIMESTAMP: Device is capable of exposing receive HW + * timestamp via bpf_xdp_metadata_rx_timestamp(). + * @NETDEV_XDP_RX_METADATA_HASH: Device is capable of exposing receive packet + * hash via bpf_xdp_metadata_rx_hash(). + * @NETDEV_XDP_RX_METADATA_VLAN_TAG: Device is capable of exposing receive + * packet VLAN tag via bpf_xdp_metadata_rx_vlan_tag(). + */ +enum netdev_xdp_rx_metadata { + NETDEV_XDP_RX_METADATA_TIMESTAMP = 1, + NETDEV_XDP_RX_METADATA_HASH = 2, + NETDEV_XDP_RX_METADATA_VLAN_TAG = 4, +}; + +/** + * enum netdev_xsk_flags + * @NETDEV_XSK_FLAGS_TX_TIMESTAMP: HW timestamping egress packets is supported + * by the driver. + * @NETDEV_XSK_FLAGS_TX_CHECKSUM: L3 checksum HW offload is supported by the + * driver. + */ +enum netdev_xsk_flags { + NETDEV_XSK_FLAGS_TX_TIMESTAMP = 1, + NETDEV_XSK_FLAGS_TX_CHECKSUM = 2, +}; + +enum netdev_queue_type { + NETDEV_QUEUE_TYPE_RX, + NETDEV_QUEUE_TYPE_TX, +}; + +enum netdev_qstats_scope { + NETDEV_QSTATS_SCOPE_QUEUE = 1, +}; + +enum { + NETDEV_A_DEV_IFINDEX = 1, + NETDEV_A_DEV_PAD, + NETDEV_A_DEV_XDP_FEATURES, + NETDEV_A_DEV_XDP_ZC_MAX_SEGS, + NETDEV_A_DEV_XDP_RX_METADATA_FEATURES, + NETDEV_A_DEV_XSK_FEATURES, + + __NETDEV_A_DEV_MAX, + NETDEV_A_DEV_MAX = (__NETDEV_A_DEV_MAX - 1) +}; + +enum { + NETDEV_A_PAGE_POOL_ID = 1, + NETDEV_A_PAGE_POOL_IFINDEX, + NETDEV_A_PAGE_POOL_NAPI_ID, + NETDEV_A_PAGE_POOL_INFLIGHT, + NETDEV_A_PAGE_POOL_INFLIGHT_MEM, + NETDEV_A_PAGE_POOL_DETACH_TIME, + + __NETDEV_A_PAGE_POOL_MAX, + NETDEV_A_PAGE_POOL_MAX = (__NETDEV_A_PAGE_POOL_MAX - 1) +}; + +enum { + NETDEV_A_PAGE_POOL_STATS_INFO = 1, + NETDEV_A_PAGE_POOL_STATS_ALLOC_FAST = 8, + NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW, + NETDEV_A_PAGE_POOL_STATS_ALLOC_SLOW_HIGH_ORDER, + NETDEV_A_PAGE_POOL_STATS_ALLOC_EMPTY, + NETDEV_A_PAGE_POOL_STATS_ALLOC_REFILL, + NETDEV_A_PAGE_POOL_STATS_ALLOC_WAIVE, + NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHED, + NETDEV_A_PAGE_POOL_STATS_RECYCLE_CACHE_FULL, + NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING, + NETDEV_A_PAGE_POOL_STATS_RECYCLE_RING_FULL, + NETDEV_A_PAGE_POOL_STATS_RECYCLE_RELEASED_REFCNT, + + __NETDEV_A_PAGE_POOL_STATS_MAX, + NETDEV_A_PAGE_POOL_STATS_MAX = (__NETDEV_A_PAGE_POOL_STATS_MAX - 1) +}; + +enum { + NETDEV_A_NAPI_IFINDEX = 1, + NETDEV_A_NAPI_ID, + NETDEV_A_NAPI_IRQ, + NETDEV_A_NAPI_PID, + + __NETDEV_A_NAPI_MAX, + NETDEV_A_NAPI_MAX = (__NETDEV_A_NAPI_MAX - 1) +}; + +enum { + NETDEV_A_QUEUE_ID = 1, + NETDEV_A_QUEUE_IFINDEX, + NETDEV_A_QUEUE_TYPE, + NETDEV_A_QUEUE_NAPI_ID, + + __NETDEV_A_QUEUE_MAX, + NETDEV_A_QUEUE_MAX = (__NETDEV_A_QUEUE_MAX - 1) +}; + +enum { + NETDEV_A_QSTATS_IFINDEX = 1, + NETDEV_A_QSTATS_QUEUE_TYPE, + NETDEV_A_QSTATS_QUEUE_ID, + NETDEV_A_QSTATS_SCOPE, + NETDEV_A_QSTATS_RX_PACKETS = 8, + NETDEV_A_QSTATS_RX_BYTES, + NETDEV_A_QSTATS_TX_PACKETS, + NETDEV_A_QSTATS_TX_BYTES, + NETDEV_A_QSTATS_RX_ALLOC_FAIL, + NETDEV_A_QSTATS_RX_HW_DROPS, + NETDEV_A_QSTATS_RX_HW_DROP_OVERRUNS, + NETDEV_A_QSTATS_RX_CSUM_COMPLETE, + NETDEV_A_QSTATS_RX_CSUM_UNNECESSARY, + NETDEV_A_QSTATS_RX_CSUM_NONE, + NETDEV_A_QSTATS_RX_CSUM_BAD, + NETDEV_A_QSTATS_RX_HW_GRO_PACKETS, + NETDEV_A_QSTATS_RX_HW_GRO_BYTES, + NETDEV_A_QSTATS_RX_HW_GRO_WIRE_PACKETS, + NETDEV_A_QSTATS_RX_HW_GRO_WIRE_BYTES, + NETDEV_A_QSTATS_RX_HW_DROP_RATELIMITS, + NETDEV_A_QSTATS_TX_HW_DROPS, + NETDEV_A_QSTATS_TX_HW_DROP_ERRORS, + NETDEV_A_QSTATS_TX_CSUM_NONE, + NETDEV_A_QSTATS_TX_NEEDS_CSUM, + NETDEV_A_QSTATS_TX_HW_GSO_PACKETS, + NETDEV_A_QSTATS_TX_HW_GSO_BYTES, + NETDEV_A_QSTATS_TX_HW_GSO_WIRE_PACKETS, + NETDEV_A_QSTATS_TX_HW_GSO_WIRE_BYTES, + NETDEV_A_QSTATS_TX_HW_DROP_RATELIMITS, + NETDEV_A_QSTATS_TX_STOP, + NETDEV_A_QSTATS_TX_WAKE, + + __NETDEV_A_QSTATS_MAX, + NETDEV_A_QSTATS_MAX = (__NETDEV_A_QSTATS_MAX - 1) +}; + +enum { + NETDEV_CMD_DEV_GET = 1, + NETDEV_CMD_DEV_ADD_NTF, + NETDEV_CMD_DEV_DEL_NTF, + NETDEV_CMD_DEV_CHANGE_NTF, + NETDEV_CMD_PAGE_POOL_GET, + NETDEV_CMD_PAGE_POOL_ADD_NTF, + NETDEV_CMD_PAGE_POOL_DEL_NTF, + NETDEV_CMD_PAGE_POOL_CHANGE_NTF, + NETDEV_CMD_PAGE_POOL_STATS_GET, + NETDEV_CMD_QUEUE_GET, + NETDEV_CMD_NAPI_GET, + NETDEV_CMD_QSTATS_GET, + + __NETDEV_CMD_MAX, + NETDEV_CMD_MAX = (__NETDEV_CMD_MAX - 1) +}; + +#define NETDEV_MCGRP_MGMT "mgmt" +#define NETDEV_MCGRP_PAGE_POOL "page-pool" + +#endif /* _UAPI_LINUX_NETDEV_H */ diff --git a/pkg/plugin/lib/common/libbpf/_include/uapi/linux/netlink.h b/pkg/plugin/lib/common/libbpf/_include/uapi/linux/netlink.h new file mode 100644 index 0000000000..0a4d733177 --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_include/uapi/linux/netlink.h @@ -0,0 +1,252 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI__LINUX_NETLINK_H +#define _UAPI__LINUX_NETLINK_H + +#include +#include /* for __kernel_sa_family_t */ +#include + +#define NETLINK_ROUTE 0 /* Routing/device hook */ +#define NETLINK_UNUSED 1 /* Unused number */ +#define NETLINK_USERSOCK 2 /* Reserved for user mode socket protocols */ +#define NETLINK_FIREWALL 3 /* Unused number, formerly ip_queue */ +#define NETLINK_SOCK_DIAG 4 /* socket monitoring */ +#define NETLINK_NFLOG 5 /* netfilter/iptables ULOG */ +#define NETLINK_XFRM 6 /* ipsec */ +#define NETLINK_SELINUX 7 /* SELinux event notifications */ +#define NETLINK_ISCSI 8 /* Open-iSCSI */ +#define NETLINK_AUDIT 9 /* auditing */ +#define NETLINK_FIB_LOOKUP 10 +#define NETLINK_CONNECTOR 11 +#define NETLINK_NETFILTER 12 /* netfilter subsystem */ +#define NETLINK_IP6_FW 13 +#define NETLINK_DNRTMSG 14 /* DECnet routing messages */ +#define NETLINK_KOBJECT_UEVENT 15 /* Kernel messages to userspace */ +#define NETLINK_GENERIC 16 +/* leave room for NETLINK_DM (DM Events) */ +#define NETLINK_SCSITRANSPORT 18 /* SCSI Transports */ +#define NETLINK_ECRYPTFS 19 +#define NETLINK_RDMA 20 +#define NETLINK_CRYPTO 21 /* Crypto layer */ +#define NETLINK_SMC 22 /* SMC monitoring */ + +#define NETLINK_INET_DIAG NETLINK_SOCK_DIAG + +#define MAX_LINKS 32 + +struct sockaddr_nl { + __kernel_sa_family_t nl_family; /* AF_NETLINK */ + unsigned short nl_pad; /* zero */ + __u32 nl_pid; /* port ID */ + __u32 nl_groups; /* multicast groups mask */ +}; + +struct nlmsghdr { + __u32 nlmsg_len; /* Length of message including header */ + __u16 nlmsg_type; /* Message content */ + __u16 nlmsg_flags; /* Additional flags */ + __u32 nlmsg_seq; /* Sequence number */ + __u32 nlmsg_pid; /* Sending process port ID */ +}; + +/* Flags values */ + +#define NLM_F_REQUEST 0x01 /* It is request message. */ +#define NLM_F_MULTI 0x02 /* Multipart message, terminated by NLMSG_DONE */ +#define NLM_F_ACK 0x04 /* Reply with ack, with zero or error code */ +#define NLM_F_ECHO 0x08 /* Echo this request */ +#define NLM_F_DUMP_INTR 0x10 /* Dump was inconsistent due to sequence change */ +#define NLM_F_DUMP_FILTERED 0x20 /* Dump was filtered as requested */ + +/* Modifiers to GET request */ +#define NLM_F_ROOT 0x100 /* specify tree root */ +#define NLM_F_MATCH 0x200 /* return all matching */ +#define NLM_F_ATOMIC 0x400 /* atomic GET */ +#define NLM_F_DUMP (NLM_F_ROOT|NLM_F_MATCH) + +/* Modifiers to NEW request */ +#define NLM_F_REPLACE 0x100 /* Override existing */ +#define NLM_F_EXCL 0x200 /* Do not touch, if it exists */ +#define NLM_F_CREATE 0x400 /* Create, if it does not exist */ +#define NLM_F_APPEND 0x800 /* Add to end of list */ + +/* Modifiers to DELETE request */ +#define NLM_F_NONREC 0x100 /* Do not delete recursively */ + +/* Flags for ACK message */ +#define NLM_F_CAPPED 0x100 /* request was capped */ +#define NLM_F_ACK_TLVS 0x200 /* extended ACK TVLs were included */ + +/* + 4.4BSD ADD NLM_F_CREATE|NLM_F_EXCL + 4.4BSD CHANGE NLM_F_REPLACE + + True CHANGE NLM_F_CREATE|NLM_F_REPLACE + Append NLM_F_CREATE + Check NLM_F_EXCL + */ + +#define NLMSG_ALIGNTO 4U +#define NLMSG_ALIGN(len) ( ((len)+NLMSG_ALIGNTO-1) & ~(NLMSG_ALIGNTO-1) ) +#define NLMSG_HDRLEN ((int) NLMSG_ALIGN(sizeof(struct nlmsghdr))) +#define NLMSG_LENGTH(len) ((len) + NLMSG_HDRLEN) +#define NLMSG_SPACE(len) NLMSG_ALIGN(NLMSG_LENGTH(len)) +#define NLMSG_DATA(nlh) ((void*)(((char*)nlh) + NLMSG_LENGTH(0))) +#define NLMSG_NEXT(nlh,len) ((len) -= NLMSG_ALIGN((nlh)->nlmsg_len), \ + (struct nlmsghdr*)(((char*)(nlh)) + NLMSG_ALIGN((nlh)->nlmsg_len))) +#define NLMSG_OK(nlh,len) ((len) >= (int)sizeof(struct nlmsghdr) && \ + (nlh)->nlmsg_len >= sizeof(struct nlmsghdr) && \ + (nlh)->nlmsg_len <= (len)) +#define NLMSG_PAYLOAD(nlh,len) ((nlh)->nlmsg_len - NLMSG_SPACE((len))) + +#define NLMSG_NOOP 0x1 /* Nothing. */ +#define NLMSG_ERROR 0x2 /* Error */ +#define NLMSG_DONE 0x3 /* End of a dump */ +#define NLMSG_OVERRUN 0x4 /* Data lost */ + +#define NLMSG_MIN_TYPE 0x10 /* < 0x10: reserved control messages */ + +struct nlmsgerr { + int error; + struct nlmsghdr msg; + /* + * followed by the message contents unless NETLINK_CAP_ACK was set + * or the ACK indicates success (error == 0) + * message length is aligned with NLMSG_ALIGN() + */ + /* + * followed by TLVs defined in enum nlmsgerr_attrs + * if NETLINK_EXT_ACK was set + */ +}; + +/** + * enum nlmsgerr_attrs - nlmsgerr attributes + * @NLMSGERR_ATTR_UNUSED: unused + * @NLMSGERR_ATTR_MSG: error message string (string) + * @NLMSGERR_ATTR_OFFS: offset of the invalid attribute in the original + * message, counting from the beginning of the header (u32) + * @NLMSGERR_ATTR_COOKIE: arbitrary subsystem specific cookie to + * be used - in the success case - to identify a created + * object or operation or similar (binary) + * @__NLMSGERR_ATTR_MAX: number of attributes + * @NLMSGERR_ATTR_MAX: highest attribute number + */ +enum nlmsgerr_attrs { + NLMSGERR_ATTR_UNUSED, + NLMSGERR_ATTR_MSG, + NLMSGERR_ATTR_OFFS, + NLMSGERR_ATTR_COOKIE, + + __NLMSGERR_ATTR_MAX, + NLMSGERR_ATTR_MAX = __NLMSGERR_ATTR_MAX - 1 +}; + +#define NETLINK_ADD_MEMBERSHIP 1 +#define NETLINK_DROP_MEMBERSHIP 2 +#define NETLINK_PKTINFO 3 +#define NETLINK_BROADCAST_ERROR 4 +#define NETLINK_NO_ENOBUFS 5 +#ifndef __KERNEL__ +#define NETLINK_RX_RING 6 +#define NETLINK_TX_RING 7 +#endif +#define NETLINK_LISTEN_ALL_NSID 8 +#define NETLINK_LIST_MEMBERSHIPS 9 +#define NETLINK_CAP_ACK 10 +#define NETLINK_EXT_ACK 11 +#define NETLINK_GET_STRICT_CHK 12 + +struct nl_pktinfo { + __u32 group; +}; + +struct nl_mmap_req { + unsigned int nm_block_size; + unsigned int nm_block_nr; + unsigned int nm_frame_size; + unsigned int nm_frame_nr; +}; + +struct nl_mmap_hdr { + unsigned int nm_status; + unsigned int nm_len; + __u32 nm_group; + /* credentials */ + __u32 nm_pid; + __u32 nm_uid; + __u32 nm_gid; +}; + +#ifndef __KERNEL__ +enum nl_mmap_status { + NL_MMAP_STATUS_UNUSED, + NL_MMAP_STATUS_RESERVED, + NL_MMAP_STATUS_VALID, + NL_MMAP_STATUS_COPY, + NL_MMAP_STATUS_SKIP, +}; + +#define NL_MMAP_MSG_ALIGNMENT NLMSG_ALIGNTO +#define NL_MMAP_MSG_ALIGN(sz) __ALIGN_KERNEL(sz, NL_MMAP_MSG_ALIGNMENT) +#define NL_MMAP_HDRLEN NL_MMAP_MSG_ALIGN(sizeof(struct nl_mmap_hdr)) +#endif + +#define NET_MAJOR 36 /* Major 36 is reserved for networking */ + +enum { + NETLINK_UNCONNECTED = 0, + NETLINK_CONNECTED, +}; + +/* + * <------- NLA_HDRLEN ------> <-- NLA_ALIGN(payload)--> + * +---------------------+- - -+- - - - - - - - - -+- - -+ + * | Header | Pad | Payload | Pad | + * | (struct nlattr) | ing | | ing | + * +---------------------+- - -+- - - - - - - - - -+- - -+ + * <-------------- nlattr->nla_len --------------> + */ + +struct nlattr { + __u16 nla_len; + __u16 nla_type; +}; + +/* + * nla_type (16 bits) + * +---+---+-------------------------------+ + * | N | O | Attribute Type | + * +---+---+-------------------------------+ + * N := Carries nested attributes + * O := Payload stored in network byte order + * + * Note: The N and O flag are mutually exclusive. + */ +#define NLA_F_NESTED (1 << 15) +#define NLA_F_NET_BYTEORDER (1 << 14) +#define NLA_TYPE_MASK ~(NLA_F_NESTED | NLA_F_NET_BYTEORDER) + +#define NLA_ALIGNTO 4 +#define NLA_ALIGN(len) (((len) + NLA_ALIGNTO - 1) & ~(NLA_ALIGNTO - 1)) +#define NLA_HDRLEN ((int) NLA_ALIGN(sizeof(struct nlattr))) + +/* Generic 32 bitflags attribute content sent to the kernel. + * + * The value is a bitmap that defines the values being set + * The selector is a bitmask that defines which value is legit + * + * Examples: + * value = 0x0, and selector = 0x1 + * implies we are selecting bit 1 and we want to set its value to 0. + * + * value = 0x2, and selector = 0x2 + * implies we are selecting bit 2 and we want to set its value to 1. + * + */ +struct nla_bitfield32 { + __u32 value; + __u32 selector; +}; + +#endif /* _UAPI__LINUX_NETLINK_H */ diff --git a/pkg/plugin/lib/common/libbpf/_include/uapi/linux/perf_event.h b/pkg/plugin/lib/common/libbpf/_include/uapi/linux/perf_event.h new file mode 100644 index 0000000000..3a64499b0f --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_include/uapi/linux/perf_event.h @@ -0,0 +1,1462 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Performance events: + * + * Copyright (C) 2008-2009, Thomas Gleixner + * Copyright (C) 2008-2011, Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2011, Red Hat, Inc., Peter Zijlstra + * + * Data type definitions, declarations, prototypes. + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ +#ifndef _UAPI_LINUX_PERF_EVENT_H +#define _UAPI_LINUX_PERF_EVENT_H + +#include +#include +#include + +/* + * User-space ABI bits: + */ + +/* + * attr.type + */ +enum perf_type_id { + PERF_TYPE_HARDWARE = 0, + PERF_TYPE_SOFTWARE = 1, + PERF_TYPE_TRACEPOINT = 2, + PERF_TYPE_HW_CACHE = 3, + PERF_TYPE_RAW = 4, + PERF_TYPE_BREAKPOINT = 5, + + PERF_TYPE_MAX, /* non-ABI */ +}; + +/* + * attr.config layout for type PERF_TYPE_HARDWARE and PERF_TYPE_HW_CACHE + * PERF_TYPE_HARDWARE: 0xEEEEEEEE000000AA + * AA: hardware event ID + * EEEEEEEE: PMU type ID + * PERF_TYPE_HW_CACHE: 0xEEEEEEEE00DDCCBB + * BB: hardware cache ID + * CC: hardware cache op ID + * DD: hardware cache op result ID + * EEEEEEEE: PMU type ID + * If the PMU type ID is 0, the PERF_TYPE_RAW will be applied. + */ +#define PERF_PMU_TYPE_SHIFT 32 +#define PERF_HW_EVENT_MASK 0xffffffff + +/* + * Generalized performance event event_id types, used by the + * attr.event_id parameter of the sys_perf_event_open() + * syscall: + */ +enum perf_hw_id { + /* + * Common hardware events, generalized by the kernel: + */ + PERF_COUNT_HW_CPU_CYCLES = 0, + PERF_COUNT_HW_INSTRUCTIONS = 1, + PERF_COUNT_HW_CACHE_REFERENCES = 2, + PERF_COUNT_HW_CACHE_MISSES = 3, + PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_HW_BRANCH_MISSES = 5, + PERF_COUNT_HW_BUS_CYCLES = 6, + PERF_COUNT_HW_STALLED_CYCLES_FRONTEND = 7, + PERF_COUNT_HW_STALLED_CYCLES_BACKEND = 8, + PERF_COUNT_HW_REF_CPU_CYCLES = 9, + + PERF_COUNT_HW_MAX, /* non-ABI */ +}; + +/* + * Generalized hardware cache events: + * + * { L1-D, L1-I, LLC, ITLB, DTLB, BPU, NODE } x + * { read, write, prefetch } x + * { accesses, misses } + */ +enum perf_hw_cache_id { + PERF_COUNT_HW_CACHE_L1D = 0, + PERF_COUNT_HW_CACHE_L1I = 1, + PERF_COUNT_HW_CACHE_LL = 2, + PERF_COUNT_HW_CACHE_DTLB = 3, + PERF_COUNT_HW_CACHE_ITLB = 4, + PERF_COUNT_HW_CACHE_BPU = 5, + PERF_COUNT_HW_CACHE_NODE = 6, + + PERF_COUNT_HW_CACHE_MAX, /* non-ABI */ +}; + +enum perf_hw_cache_op_id { + PERF_COUNT_HW_CACHE_OP_READ = 0, + PERF_COUNT_HW_CACHE_OP_WRITE = 1, + PERF_COUNT_HW_CACHE_OP_PREFETCH = 2, + + PERF_COUNT_HW_CACHE_OP_MAX, /* non-ABI */ +}; + +enum perf_hw_cache_op_result_id { + PERF_COUNT_HW_CACHE_RESULT_ACCESS = 0, + PERF_COUNT_HW_CACHE_RESULT_MISS = 1, + + PERF_COUNT_HW_CACHE_RESULT_MAX, /* non-ABI */ +}; + +/* + * Special "software" events provided by the kernel, even if the hardware + * does not support performance events. These events measure various + * physical and sw events of the kernel (and allow the profiling of them as + * well): + */ +enum perf_sw_ids { + PERF_COUNT_SW_CPU_CLOCK = 0, + PERF_COUNT_SW_TASK_CLOCK = 1, + PERF_COUNT_SW_PAGE_FAULTS = 2, + PERF_COUNT_SW_CONTEXT_SWITCHES = 3, + PERF_COUNT_SW_CPU_MIGRATIONS = 4, + PERF_COUNT_SW_PAGE_FAULTS_MIN = 5, + PERF_COUNT_SW_PAGE_FAULTS_MAJ = 6, + PERF_COUNT_SW_ALIGNMENT_FAULTS = 7, + PERF_COUNT_SW_EMULATION_FAULTS = 8, + PERF_COUNT_SW_DUMMY = 9, + PERF_COUNT_SW_BPF_OUTPUT = 10, + PERF_COUNT_SW_CGROUP_SWITCHES = 11, + + PERF_COUNT_SW_MAX, /* non-ABI */ +}; + +/* + * Bits that can be set in attr.sample_type to request information + * in the overflow packets. + */ +enum perf_event_sample_format { + PERF_SAMPLE_IP = 1U << 0, + PERF_SAMPLE_TID = 1U << 1, + PERF_SAMPLE_TIME = 1U << 2, + PERF_SAMPLE_ADDR = 1U << 3, + PERF_SAMPLE_READ = 1U << 4, + PERF_SAMPLE_CALLCHAIN = 1U << 5, + PERF_SAMPLE_ID = 1U << 6, + PERF_SAMPLE_CPU = 1U << 7, + PERF_SAMPLE_PERIOD = 1U << 8, + PERF_SAMPLE_STREAM_ID = 1U << 9, + PERF_SAMPLE_RAW = 1U << 10, + PERF_SAMPLE_BRANCH_STACK = 1U << 11, + PERF_SAMPLE_REGS_USER = 1U << 12, + PERF_SAMPLE_STACK_USER = 1U << 13, + PERF_SAMPLE_WEIGHT = 1U << 14, + PERF_SAMPLE_DATA_SRC = 1U << 15, + PERF_SAMPLE_IDENTIFIER = 1U << 16, + PERF_SAMPLE_TRANSACTION = 1U << 17, + PERF_SAMPLE_REGS_INTR = 1U << 18, + PERF_SAMPLE_PHYS_ADDR = 1U << 19, + PERF_SAMPLE_AUX = 1U << 20, + PERF_SAMPLE_CGROUP = 1U << 21, + PERF_SAMPLE_DATA_PAGE_SIZE = 1U << 22, + PERF_SAMPLE_CODE_PAGE_SIZE = 1U << 23, + PERF_SAMPLE_WEIGHT_STRUCT = 1U << 24, + + PERF_SAMPLE_MAX = 1U << 25, /* non-ABI */ +}; + +#define PERF_SAMPLE_WEIGHT_TYPE (PERF_SAMPLE_WEIGHT | PERF_SAMPLE_WEIGHT_STRUCT) +/* + * values to program into branch_sample_type when PERF_SAMPLE_BRANCH is set + * + * If the user does not pass priv level information via branch_sample_type, + * the kernel uses the event's priv level. Branch and event priv levels do + * not have to match. Branch priv level is checked for permissions. + * + * The branch types can be combined, however BRANCH_ANY covers all types + * of branches and therefore it supersedes all the other types. + */ +enum perf_branch_sample_type_shift { + PERF_SAMPLE_BRANCH_USER_SHIFT = 0, /* user branches */ + PERF_SAMPLE_BRANCH_KERNEL_SHIFT = 1, /* kernel branches */ + PERF_SAMPLE_BRANCH_HV_SHIFT = 2, /* hypervisor branches */ + + PERF_SAMPLE_BRANCH_ANY_SHIFT = 3, /* any branch types */ + PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT = 4, /* any call branch */ + PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT = 5, /* any return branch */ + PERF_SAMPLE_BRANCH_IND_CALL_SHIFT = 6, /* indirect calls */ + PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT = 7, /* transaction aborts */ + PERF_SAMPLE_BRANCH_IN_TX_SHIFT = 8, /* in transaction */ + PERF_SAMPLE_BRANCH_NO_TX_SHIFT = 9, /* not in transaction */ + PERF_SAMPLE_BRANCH_COND_SHIFT = 10, /* conditional branches */ + + PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT = 11, /* call/ret stack */ + PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT = 12, /* indirect jumps */ + PERF_SAMPLE_BRANCH_CALL_SHIFT = 13, /* direct call */ + + PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT = 14, /* no flags */ + PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT = 15, /* no cycles */ + + PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT = 16, /* save branch type */ + + PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT = 17, /* save low level index of raw branch records */ + + PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT = 18, /* save privilege mode */ + + PERF_SAMPLE_BRANCH_COUNTERS_SHIFT = 19, /* save occurrences of events on a branch */ + + PERF_SAMPLE_BRANCH_MAX_SHIFT /* non-ABI */ +}; + +enum perf_branch_sample_type { + PERF_SAMPLE_BRANCH_USER = 1U << PERF_SAMPLE_BRANCH_USER_SHIFT, + PERF_SAMPLE_BRANCH_KERNEL = 1U << PERF_SAMPLE_BRANCH_KERNEL_SHIFT, + PERF_SAMPLE_BRANCH_HV = 1U << PERF_SAMPLE_BRANCH_HV_SHIFT, + + PERF_SAMPLE_BRANCH_ANY = 1U << PERF_SAMPLE_BRANCH_ANY_SHIFT, + PERF_SAMPLE_BRANCH_ANY_CALL = 1U << PERF_SAMPLE_BRANCH_ANY_CALL_SHIFT, + PERF_SAMPLE_BRANCH_ANY_RETURN = 1U << PERF_SAMPLE_BRANCH_ANY_RETURN_SHIFT, + PERF_SAMPLE_BRANCH_IND_CALL = 1U << PERF_SAMPLE_BRANCH_IND_CALL_SHIFT, + PERF_SAMPLE_BRANCH_ABORT_TX = 1U << PERF_SAMPLE_BRANCH_ABORT_TX_SHIFT, + PERF_SAMPLE_BRANCH_IN_TX = 1U << PERF_SAMPLE_BRANCH_IN_TX_SHIFT, + PERF_SAMPLE_BRANCH_NO_TX = 1U << PERF_SAMPLE_BRANCH_NO_TX_SHIFT, + PERF_SAMPLE_BRANCH_COND = 1U << PERF_SAMPLE_BRANCH_COND_SHIFT, + + PERF_SAMPLE_BRANCH_CALL_STACK = 1U << PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT, + PERF_SAMPLE_BRANCH_IND_JUMP = 1U << PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT, + PERF_SAMPLE_BRANCH_CALL = 1U << PERF_SAMPLE_BRANCH_CALL_SHIFT, + + PERF_SAMPLE_BRANCH_NO_FLAGS = 1U << PERF_SAMPLE_BRANCH_NO_FLAGS_SHIFT, + PERF_SAMPLE_BRANCH_NO_CYCLES = 1U << PERF_SAMPLE_BRANCH_NO_CYCLES_SHIFT, + + PERF_SAMPLE_BRANCH_TYPE_SAVE = + 1U << PERF_SAMPLE_BRANCH_TYPE_SAVE_SHIFT, + + PERF_SAMPLE_BRANCH_HW_INDEX = 1U << PERF_SAMPLE_BRANCH_HW_INDEX_SHIFT, + + PERF_SAMPLE_BRANCH_PRIV_SAVE = 1U << PERF_SAMPLE_BRANCH_PRIV_SAVE_SHIFT, + + PERF_SAMPLE_BRANCH_COUNTERS = 1U << PERF_SAMPLE_BRANCH_COUNTERS_SHIFT, + + PERF_SAMPLE_BRANCH_MAX = 1U << PERF_SAMPLE_BRANCH_MAX_SHIFT, +}; + +/* + * Common flow change classification + */ +enum { + PERF_BR_UNKNOWN = 0, /* unknown */ + PERF_BR_COND = 1, /* conditional */ + PERF_BR_UNCOND = 2, /* unconditional */ + PERF_BR_IND = 3, /* indirect */ + PERF_BR_CALL = 4, /* function call */ + PERF_BR_IND_CALL = 5, /* indirect function call */ + PERF_BR_RET = 6, /* function return */ + PERF_BR_SYSCALL = 7, /* syscall */ + PERF_BR_SYSRET = 8, /* syscall return */ + PERF_BR_COND_CALL = 9, /* conditional function call */ + PERF_BR_COND_RET = 10, /* conditional function return */ + PERF_BR_ERET = 11, /* exception return */ + PERF_BR_IRQ = 12, /* irq */ + PERF_BR_SERROR = 13, /* system error */ + PERF_BR_NO_TX = 14, /* not in transaction */ + PERF_BR_EXTEND_ABI = 15, /* extend ABI */ + PERF_BR_MAX, +}; + +/* + * Common branch speculation outcome classification + */ +enum { + PERF_BR_SPEC_NA = 0, /* Not available */ + PERF_BR_SPEC_WRONG_PATH = 1, /* Speculative but on wrong path */ + PERF_BR_NON_SPEC_CORRECT_PATH = 2, /* Non-speculative but on correct path */ + PERF_BR_SPEC_CORRECT_PATH = 3, /* Speculative and on correct path */ + PERF_BR_SPEC_MAX, +}; + +enum { + PERF_BR_NEW_FAULT_ALGN = 0, /* Alignment fault */ + PERF_BR_NEW_FAULT_DATA = 1, /* Data fault */ + PERF_BR_NEW_FAULT_INST = 2, /* Inst fault */ + PERF_BR_NEW_ARCH_1 = 3, /* Architecture specific */ + PERF_BR_NEW_ARCH_2 = 4, /* Architecture specific */ + PERF_BR_NEW_ARCH_3 = 5, /* Architecture specific */ + PERF_BR_NEW_ARCH_4 = 6, /* Architecture specific */ + PERF_BR_NEW_ARCH_5 = 7, /* Architecture specific */ + PERF_BR_NEW_MAX, +}; + +enum { + PERF_BR_PRIV_UNKNOWN = 0, + PERF_BR_PRIV_USER = 1, + PERF_BR_PRIV_KERNEL = 2, + PERF_BR_PRIV_HV = 3, +}; + +#define PERF_BR_ARM64_FIQ PERF_BR_NEW_ARCH_1 +#define PERF_BR_ARM64_DEBUG_HALT PERF_BR_NEW_ARCH_2 +#define PERF_BR_ARM64_DEBUG_EXIT PERF_BR_NEW_ARCH_3 +#define PERF_BR_ARM64_DEBUG_INST PERF_BR_NEW_ARCH_4 +#define PERF_BR_ARM64_DEBUG_DATA PERF_BR_NEW_ARCH_5 + +#define PERF_SAMPLE_BRANCH_PLM_ALL \ + (PERF_SAMPLE_BRANCH_USER|\ + PERF_SAMPLE_BRANCH_KERNEL|\ + PERF_SAMPLE_BRANCH_HV) + +/* + * Values to determine ABI of the registers dump. + */ +enum perf_sample_regs_abi { + PERF_SAMPLE_REGS_ABI_NONE = 0, + PERF_SAMPLE_REGS_ABI_32 = 1, + PERF_SAMPLE_REGS_ABI_64 = 2, +}; + +/* + * Values for the memory transaction event qualifier, mostly for + * abort events. Multiple bits can be set. + */ +enum { + PERF_TXN_ELISION = (1 << 0), /* From elision */ + PERF_TXN_TRANSACTION = (1 << 1), /* From transaction */ + PERF_TXN_SYNC = (1 << 2), /* Instruction is related */ + PERF_TXN_ASYNC = (1 << 3), /* Instruction not related */ + PERF_TXN_RETRY = (1 << 4), /* Retry possible */ + PERF_TXN_CONFLICT = (1 << 5), /* Conflict abort */ + PERF_TXN_CAPACITY_WRITE = (1 << 6), /* Capacity write abort */ + PERF_TXN_CAPACITY_READ = (1 << 7), /* Capacity read abort */ + + PERF_TXN_MAX = (1 << 8), /* non-ABI */ + + /* bits 32..63 are reserved for the abort code */ + + PERF_TXN_ABORT_MASK = (0xffffffffULL << 32), + PERF_TXN_ABORT_SHIFT = 32, +}; + +/* + * The format of the data returned by read() on a perf event fd, + * as specified by attr.read_format: + * + * struct read_format { + * { u64 value; + * { u64 time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED + * { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING + * { u64 id; } && PERF_FORMAT_ID + * { u64 lost; } && PERF_FORMAT_LOST + * } && !PERF_FORMAT_GROUP + * + * { u64 nr; + * { u64 time_enabled; } && PERF_FORMAT_TOTAL_TIME_ENABLED + * { u64 time_running; } && PERF_FORMAT_TOTAL_TIME_RUNNING + * { u64 value; + * { u64 id; } && PERF_FORMAT_ID + * { u64 lost; } && PERF_FORMAT_LOST + * } cntr[nr]; + * } && PERF_FORMAT_GROUP + * }; + */ +enum perf_event_read_format { + PERF_FORMAT_TOTAL_TIME_ENABLED = 1U << 0, + PERF_FORMAT_TOTAL_TIME_RUNNING = 1U << 1, + PERF_FORMAT_ID = 1U << 2, + PERF_FORMAT_GROUP = 1U << 3, + PERF_FORMAT_LOST = 1U << 4, + + PERF_FORMAT_MAX = 1U << 5, /* non-ABI */ +}; + +#define PERF_ATTR_SIZE_VER0 64 /* sizeof first published struct */ +#define PERF_ATTR_SIZE_VER1 72 /* add: config2 */ +#define PERF_ATTR_SIZE_VER2 80 /* add: branch_sample_type */ +#define PERF_ATTR_SIZE_VER3 96 /* add: sample_regs_user */ + /* add: sample_stack_user */ +#define PERF_ATTR_SIZE_VER4 104 /* add: sample_regs_intr */ +#define PERF_ATTR_SIZE_VER5 112 /* add: aux_watermark */ +#define PERF_ATTR_SIZE_VER6 120 /* add: aux_sample_size */ +#define PERF_ATTR_SIZE_VER7 128 /* add: sig_data */ +#define PERF_ATTR_SIZE_VER8 136 /* add: config3 */ + +/* + * Hardware event_id to monitor via a performance monitoring event: + * + * @sample_max_stack: Max number of frame pointers in a callchain, + * should be < /proc/sys/kernel/perf_event_max_stack + */ +struct perf_event_attr { + + /* + * Major type: hardware/software/tracepoint/etc. + */ + __u32 type; + + /* + * Size of the attr structure, for fwd/bwd compat. + */ + __u32 size; + + /* + * Type specific configuration information. + */ + __u64 config; + + union { + __u64 sample_period; + __u64 sample_freq; + }; + + __u64 sample_type; + __u64 read_format; + + __u64 disabled : 1, /* off by default */ + inherit : 1, /* children inherit it */ + pinned : 1, /* must always be on PMU */ + exclusive : 1, /* only group on PMU */ + exclude_user : 1, /* don't count user */ + exclude_kernel : 1, /* ditto kernel */ + exclude_hv : 1, /* ditto hypervisor */ + exclude_idle : 1, /* don't count when idle */ + mmap : 1, /* include mmap data */ + comm : 1, /* include comm data */ + freq : 1, /* use freq, not period */ + inherit_stat : 1, /* per task counts */ + enable_on_exec : 1, /* next exec enables */ + task : 1, /* trace fork/exit */ + watermark : 1, /* wakeup_watermark */ + /* + * precise_ip: + * + * 0 - SAMPLE_IP can have arbitrary skid + * 1 - SAMPLE_IP must have constant skid + * 2 - SAMPLE_IP requested to have 0 skid + * 3 - SAMPLE_IP must have 0 skid + * + * See also PERF_RECORD_MISC_EXACT_IP + */ + precise_ip : 2, /* skid constraint */ + mmap_data : 1, /* non-exec mmap data */ + sample_id_all : 1, /* sample_type all events */ + + exclude_host : 1, /* don't count in host */ + exclude_guest : 1, /* don't count in guest */ + + exclude_callchain_kernel : 1, /* exclude kernel callchains */ + exclude_callchain_user : 1, /* exclude user callchains */ + mmap2 : 1, /* include mmap with inode data */ + comm_exec : 1, /* flag comm events that are due to an exec */ + use_clockid : 1, /* use @clockid for time fields */ + context_switch : 1, /* context switch data */ + write_backward : 1, /* Write ring buffer from end to beginning */ + namespaces : 1, /* include namespaces data */ + ksymbol : 1, /* include ksymbol events */ + bpf_event : 1, /* include bpf events */ + aux_output : 1, /* generate AUX records instead of events */ + cgroup : 1, /* include cgroup events */ + text_poke : 1, /* include text poke events */ + build_id : 1, /* use build id in mmap2 events */ + inherit_thread : 1, /* children only inherit if cloned with CLONE_THREAD */ + remove_on_exec : 1, /* event is removed from task on exec */ + sigtrap : 1, /* send synchronous SIGTRAP on event */ + __reserved_1 : 26; + + union { + __u32 wakeup_events; /* wakeup every n events */ + __u32 wakeup_watermark; /* bytes before wakeup */ + }; + + __u32 bp_type; + union { + __u64 bp_addr; + __u64 kprobe_func; /* for perf_kprobe */ + __u64 uprobe_path; /* for perf_uprobe */ + __u64 config1; /* extension of config */ + }; + union { + __u64 bp_len; + __u64 kprobe_addr; /* when kprobe_func == NULL */ + __u64 probe_offset; /* for perf_[k,u]probe */ + __u64 config2; /* extension of config1 */ + }; + __u64 branch_sample_type; /* enum perf_branch_sample_type */ + + /* + * Defines set of user regs to dump on samples. + * See asm/perf_regs.h for details. + */ + __u64 sample_regs_user; + + /* + * Defines size of the user stack to dump on samples. + */ + __u32 sample_stack_user; + + __s32 clockid; + /* + * Defines set of regs to dump for each sample + * state captured on: + * - precise = 0: PMU interrupt + * - precise > 0: sampled instruction + * + * See asm/perf_regs.h for details. + */ + __u64 sample_regs_intr; + + /* + * Wakeup watermark for AUX area + */ + __u32 aux_watermark; + __u16 sample_max_stack; + __u16 __reserved_2; + __u32 aux_sample_size; + __u32 __reserved_3; + + /* + * User provided data if sigtrap=1, passed back to user via + * siginfo_t::si_perf_data, e.g. to permit user to identify the event. + * Note, siginfo_t::si_perf_data is long-sized, and sig_data will be + * truncated accordingly on 32 bit architectures. + */ + __u64 sig_data; + + __u64 config3; /* extension of config2 */ +}; + +/* + * Structure used by below PERF_EVENT_IOC_QUERY_BPF command + * to query bpf programs attached to the same perf tracepoint + * as the given perf event. + */ +struct perf_event_query_bpf { + /* + * The below ids array length + */ + __u32 ids_len; + /* + * Set by the kernel to indicate the number of + * available programs + */ + __u32 prog_cnt; + /* + * User provided buffer to store program ids + */ + __u32 ids[]; +}; + +/* + * Ioctls that can be done on a perf event fd: + */ +#define PERF_EVENT_IOC_ENABLE _IO ('$', 0) +#define PERF_EVENT_IOC_DISABLE _IO ('$', 1) +#define PERF_EVENT_IOC_REFRESH _IO ('$', 2) +#define PERF_EVENT_IOC_RESET _IO ('$', 3) +#define PERF_EVENT_IOC_PERIOD _IOW('$', 4, __u64) +#define PERF_EVENT_IOC_SET_OUTPUT _IO ('$', 5) +#define PERF_EVENT_IOC_SET_FILTER _IOW('$', 6, char *) +#define PERF_EVENT_IOC_ID _IOR('$', 7, __u64 *) +#define PERF_EVENT_IOC_SET_BPF _IOW('$', 8, __u32) +#define PERF_EVENT_IOC_PAUSE_OUTPUT _IOW('$', 9, __u32) +#define PERF_EVENT_IOC_QUERY_BPF _IOWR('$', 10, struct perf_event_query_bpf *) +#define PERF_EVENT_IOC_MODIFY_ATTRIBUTES _IOW('$', 11, struct perf_event_attr *) + +enum perf_event_ioc_flags { + PERF_IOC_FLAG_GROUP = 1U << 0, +}; + +/* + * Structure of the page that can be mapped via mmap + */ +struct perf_event_mmap_page { + __u32 version; /* version number of this structure */ + __u32 compat_version; /* lowest version this is compat with */ + + /* + * Bits needed to read the hw events in user-space. + * + * u32 seq, time_mult, time_shift, index, width; + * u64 count, enabled, running; + * u64 cyc, time_offset; + * s64 pmc = 0; + * + * do { + * seq = pc->lock; + * barrier() + * + * enabled = pc->time_enabled; + * running = pc->time_running; + * + * if (pc->cap_usr_time && enabled != running) { + * cyc = rdtsc(); + * time_offset = pc->time_offset; + * time_mult = pc->time_mult; + * time_shift = pc->time_shift; + * } + * + * index = pc->index; + * count = pc->offset; + * if (pc->cap_user_rdpmc && index) { + * width = pc->pmc_width; + * pmc = rdpmc(index - 1); + * } + * + * barrier(); + * } while (pc->lock != seq); + * + * NOTE: for obvious reason this only works on self-monitoring + * processes. + */ + __u32 lock; /* seqlock for synchronization */ + __u32 index; /* hardware event identifier */ + __s64 offset; /* add to hardware event value */ + __u64 time_enabled; /* time event active */ + __u64 time_running; /* time event on cpu */ + union { + __u64 capabilities; + struct { + __u64 cap_bit0 : 1, /* Always 0, deprecated, see commit 860f085b74e9 */ + cap_bit0_is_deprecated : 1, /* Always 1, signals that bit 0 is zero */ + + cap_user_rdpmc : 1, /* The RDPMC instruction can be used to read counts */ + cap_user_time : 1, /* The time_{shift,mult,offset} fields are used */ + cap_user_time_zero : 1, /* The time_zero field is used */ + cap_user_time_short : 1, /* the time_{cycle,mask} fields are used */ + cap_____res : 58; + }; + }; + + /* + * If cap_user_rdpmc this field provides the bit-width of the value + * read using the rdpmc() or equivalent instruction. This can be used + * to sign extend the result like: + * + * pmc <<= 64 - width; + * pmc >>= 64 - width; // signed shift right + * count += pmc; + */ + __u16 pmc_width; + + /* + * If cap_usr_time the below fields can be used to compute the time + * delta since time_enabled (in ns) using rdtsc or similar. + * + * u64 quot, rem; + * u64 delta; + * + * quot = (cyc >> time_shift); + * rem = cyc & (((u64)1 << time_shift) - 1); + * delta = time_offset + quot * time_mult + + * ((rem * time_mult) >> time_shift); + * + * Where time_offset,time_mult,time_shift and cyc are read in the + * seqcount loop described above. This delta can then be added to + * enabled and possible running (if index), improving the scaling: + * + * enabled += delta; + * if (index) + * running += delta; + * + * quot = count / running; + * rem = count % running; + * count = quot * enabled + (rem * enabled) / running; + */ + __u16 time_shift; + __u32 time_mult; + __u64 time_offset; + /* + * If cap_usr_time_zero, the hardware clock (e.g. TSC) can be calculated + * from sample timestamps. + * + * time = timestamp - time_zero; + * quot = time / time_mult; + * rem = time % time_mult; + * cyc = (quot << time_shift) + (rem << time_shift) / time_mult; + * + * And vice versa: + * + * quot = cyc >> time_shift; + * rem = cyc & (((u64)1 << time_shift) - 1); + * timestamp = time_zero + quot * time_mult + + * ((rem * time_mult) >> time_shift); + */ + __u64 time_zero; + + __u32 size; /* Header size up to __reserved[] fields. */ + __u32 __reserved_1; + + /* + * If cap_usr_time_short, the hardware clock is less than 64bit wide + * and we must compute the 'cyc' value, as used by cap_usr_time, as: + * + * cyc = time_cycles + ((cyc - time_cycles) & time_mask) + * + * NOTE: this form is explicitly chosen such that cap_usr_time_short + * is a correction on top of cap_usr_time, and code that doesn't + * know about cap_usr_time_short still works under the assumption + * the counter doesn't wrap. + */ + __u64 time_cycles; + __u64 time_mask; + + /* + * Hole for extension of the self monitor capabilities + */ + + __u8 __reserved[116*8]; /* align to 1k. */ + + /* + * Control data for the mmap() data buffer. + * + * User-space reading the @data_head value should issue an smp_rmb(), + * after reading this value. + * + * When the mapping is PROT_WRITE the @data_tail value should be + * written by userspace to reflect the last read data, after issueing + * an smp_mb() to separate the data read from the ->data_tail store. + * In this case the kernel will not over-write unread data. + * + * See perf_output_put_handle() for the data ordering. + * + * data_{offset,size} indicate the location and size of the perf record + * buffer within the mmapped area. + */ + __u64 data_head; /* head in the data section */ + __u64 data_tail; /* user-space written tail */ + __u64 data_offset; /* where the buffer starts */ + __u64 data_size; /* data buffer size */ + + /* + * AUX area is defined by aux_{offset,size} fields that should be set + * by the userspace, so that + * + * aux_offset >= data_offset + data_size + * + * prior to mmap()ing it. Size of the mmap()ed area should be aux_size. + * + * Ring buffer pointers aux_{head,tail} have the same semantics as + * data_{head,tail} and same ordering rules apply. + */ + __u64 aux_head; + __u64 aux_tail; + __u64 aux_offset; + __u64 aux_size; +}; + +/* + * The current state of perf_event_header::misc bits usage: + * ('|' used bit, '-' unused bit) + * + * 012 CDEF + * |||---------|||| + * + * Where: + * 0-2 CPUMODE_MASK + * + * C PROC_MAP_PARSE_TIMEOUT + * D MMAP_DATA / COMM_EXEC / FORK_EXEC / SWITCH_OUT + * E MMAP_BUILD_ID / EXACT_IP / SCHED_OUT_PREEMPT + * F (reserved) + */ + +#define PERF_RECORD_MISC_CPUMODE_MASK (7 << 0) +#define PERF_RECORD_MISC_CPUMODE_UNKNOWN (0 << 0) +#define PERF_RECORD_MISC_KERNEL (1 << 0) +#define PERF_RECORD_MISC_USER (2 << 0) +#define PERF_RECORD_MISC_HYPERVISOR (3 << 0) +#define PERF_RECORD_MISC_GUEST_KERNEL (4 << 0) +#define PERF_RECORD_MISC_GUEST_USER (5 << 0) + +/* + * Indicates that /proc/PID/maps parsing are truncated by time out. + */ +#define PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT (1 << 12) +/* + * Following PERF_RECORD_MISC_* are used on different + * events, so can reuse the same bit position: + * + * PERF_RECORD_MISC_MMAP_DATA - PERF_RECORD_MMAP* events + * PERF_RECORD_MISC_COMM_EXEC - PERF_RECORD_COMM event + * PERF_RECORD_MISC_FORK_EXEC - PERF_RECORD_FORK event (perf internal) + * PERF_RECORD_MISC_SWITCH_OUT - PERF_RECORD_SWITCH* events + */ +#define PERF_RECORD_MISC_MMAP_DATA (1 << 13) +#define PERF_RECORD_MISC_COMM_EXEC (1 << 13) +#define PERF_RECORD_MISC_FORK_EXEC (1 << 13) +#define PERF_RECORD_MISC_SWITCH_OUT (1 << 13) +/* + * These PERF_RECORD_MISC_* flags below are safely reused + * for the following events: + * + * PERF_RECORD_MISC_EXACT_IP - PERF_RECORD_SAMPLE of precise events + * PERF_RECORD_MISC_SWITCH_OUT_PREEMPT - PERF_RECORD_SWITCH* events + * PERF_RECORD_MISC_MMAP_BUILD_ID - PERF_RECORD_MMAP2 event + * + * + * PERF_RECORD_MISC_EXACT_IP: + * Indicates that the content of PERF_SAMPLE_IP points to + * the actual instruction that triggered the event. See also + * perf_event_attr::precise_ip. + * + * PERF_RECORD_MISC_SWITCH_OUT_PREEMPT: + * Indicates that thread was preempted in TASK_RUNNING state. + * + * PERF_RECORD_MISC_MMAP_BUILD_ID: + * Indicates that mmap2 event carries build id data. + */ +#define PERF_RECORD_MISC_EXACT_IP (1 << 14) +#define PERF_RECORD_MISC_SWITCH_OUT_PREEMPT (1 << 14) +#define PERF_RECORD_MISC_MMAP_BUILD_ID (1 << 14) +/* + * Reserve the last bit to indicate some extended misc field + */ +#define PERF_RECORD_MISC_EXT_RESERVED (1 << 15) + +struct perf_event_header { + __u32 type; + __u16 misc; + __u16 size; +}; + +struct perf_ns_link_info { + __u64 dev; + __u64 ino; +}; + +enum { + NET_NS_INDEX = 0, + UTS_NS_INDEX = 1, + IPC_NS_INDEX = 2, + PID_NS_INDEX = 3, + USER_NS_INDEX = 4, + MNT_NS_INDEX = 5, + CGROUP_NS_INDEX = 6, + + NR_NAMESPACES, /* number of available namespaces */ +}; + +enum perf_event_type { + + /* + * If perf_event_attr.sample_id_all is set then all event types will + * have the sample_type selected fields related to where/when + * (identity) an event took place (TID, TIME, ID, STREAM_ID, CPU, + * IDENTIFIER) described in PERF_RECORD_SAMPLE below, it will be stashed + * just after the perf_event_header and the fields already present for + * the existing fields, i.e. at the end of the payload. That way a newer + * perf.data file will be supported by older perf tools, with these new + * optional fields being ignored. + * + * struct sample_id { + * { u32 pid, tid; } && PERF_SAMPLE_TID + * { u64 time; } && PERF_SAMPLE_TIME + * { u64 id; } && PERF_SAMPLE_ID + * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID + * { u32 cpu, res; } && PERF_SAMPLE_CPU + * { u64 id; } && PERF_SAMPLE_IDENTIFIER + * } && perf_event_attr::sample_id_all + * + * Note that PERF_SAMPLE_IDENTIFIER duplicates PERF_SAMPLE_ID. The + * advantage of PERF_SAMPLE_IDENTIFIER is that its position is fixed + * relative to header.size. + */ + + /* + * The MMAP events record the PROT_EXEC mappings so that we can + * correlate userspace IPs to code. They have the following structure: + * + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * u64 addr; + * u64 len; + * u64 pgoff; + * char filename[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_MMAP = 1, + + /* + * struct { + * struct perf_event_header header; + * u64 id; + * u64 lost; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_LOST = 2, + + /* + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * char comm[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_COMM = 3, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * u32 tid, ptid; + * u64 time; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_EXIT = 4, + + /* + * struct { + * struct perf_event_header header; + * u64 time; + * u64 id; + * u64 stream_id; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_THROTTLE = 5, + PERF_RECORD_UNTHROTTLE = 6, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, ppid; + * u32 tid, ptid; + * u64 time; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_FORK = 7, + + /* + * struct { + * struct perf_event_header header; + * u32 pid, tid; + * + * struct read_format values; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_READ = 8, + + /* + * struct { + * struct perf_event_header header; + * + * # + * # Note that PERF_SAMPLE_IDENTIFIER duplicates PERF_SAMPLE_ID. + * # The advantage of PERF_SAMPLE_IDENTIFIER is that its position + * # is fixed relative to header. + * # + * + * { u64 id; } && PERF_SAMPLE_IDENTIFIER + * { u64 ip; } && PERF_SAMPLE_IP + * { u32 pid, tid; } && PERF_SAMPLE_TID + * { u64 time; } && PERF_SAMPLE_TIME + * { u64 addr; } && PERF_SAMPLE_ADDR + * { u64 id; } && PERF_SAMPLE_ID + * { u64 stream_id;} && PERF_SAMPLE_STREAM_ID + * { u32 cpu, res; } && PERF_SAMPLE_CPU + * { u64 period; } && PERF_SAMPLE_PERIOD + * + * { struct read_format values; } && PERF_SAMPLE_READ + * + * { u64 nr, + * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN + * + * # + * # The RAW record below is opaque data wrt the ABI + * # + * # That is, the ABI doesn't make any promises wrt to + * # the stability of its content, it may vary depending + * # on event, hardware, kernel version and phase of + * # the moon. + * # + * # In other words, PERF_SAMPLE_RAW contents are not an ABI. + * # + * + * { u32 size; + * char data[size];}&& PERF_SAMPLE_RAW + * + * { u64 nr; + * { u64 hw_idx; } && PERF_SAMPLE_BRANCH_HW_INDEX + * { u64 from, to, flags } lbr[nr]; + * # + * # The format of the counters is decided by the + * # "branch_counter_nr" and "branch_counter_width", + * # which are defined in the ABI. + * # + * { u64 counters; } cntr[nr] && PERF_SAMPLE_BRANCH_COUNTERS + * } && PERF_SAMPLE_BRANCH_STACK + * + * { u64 abi; # enum perf_sample_regs_abi + * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_USER + * + * { u64 size; + * char data[size]; + * u64 dyn_size; } && PERF_SAMPLE_STACK_USER + * + * { union perf_sample_weight + * { + * u64 full; && PERF_SAMPLE_WEIGHT + * #if defined(__LITTLE_ENDIAN_BITFIELD) + * struct { + * u32 var1_dw; + * u16 var2_w; + * u16 var3_w; + * } && PERF_SAMPLE_WEIGHT_STRUCT + * #elif defined(__BIG_ENDIAN_BITFIELD) + * struct { + * u16 var3_w; + * u16 var2_w; + * u32 var1_dw; + * } && PERF_SAMPLE_WEIGHT_STRUCT + * #endif + * } + * } + * { u64 data_src; } && PERF_SAMPLE_DATA_SRC + * { u64 transaction; } && PERF_SAMPLE_TRANSACTION + * { u64 abi; # enum perf_sample_regs_abi + * u64 regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR + * { u64 phys_addr;} && PERF_SAMPLE_PHYS_ADDR + * { u64 size; + * char data[size]; } && PERF_SAMPLE_AUX + * { u64 data_page_size;} && PERF_SAMPLE_DATA_PAGE_SIZE + * { u64 code_page_size;} && PERF_SAMPLE_CODE_PAGE_SIZE + * }; + */ + PERF_RECORD_SAMPLE = 9, + + /* + * The MMAP2 records are an augmented version of MMAP, they add + * maj, min, ino numbers to be used to uniquely identify each mapping + * + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * u64 addr; + * u64 len; + * u64 pgoff; + * union { + * struct { + * u32 maj; + * u32 min; + * u64 ino; + * u64 ino_generation; + * }; + * struct { + * u8 build_id_size; + * u8 __reserved_1; + * u16 __reserved_2; + * u8 build_id[20]; + * }; + * }; + * u32 prot, flags; + * char filename[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_MMAP2 = 10, + + /* + * Records that new data landed in the AUX buffer part. + * + * struct { + * struct perf_event_header header; + * + * u64 aux_offset; + * u64 aux_size; + * u64 flags; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_AUX = 11, + + /* + * Indicates that instruction trace has started + * + * struct { + * struct perf_event_header header; + * u32 pid; + * u32 tid; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_ITRACE_START = 12, + + /* + * Records the dropped/lost sample number. + * + * struct { + * struct perf_event_header header; + * + * u64 lost; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_LOST_SAMPLES = 13, + + /* + * Records a context switch in or out (flagged by + * PERF_RECORD_MISC_SWITCH_OUT). See also + * PERF_RECORD_SWITCH_CPU_WIDE. + * + * struct { + * struct perf_event_header header; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_SWITCH = 14, + + /* + * CPU-wide version of PERF_RECORD_SWITCH with next_prev_pid and + * next_prev_tid that are the next (switching out) or previous + * (switching in) pid/tid. + * + * struct { + * struct perf_event_header header; + * u32 next_prev_pid; + * u32 next_prev_tid; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_SWITCH_CPU_WIDE = 15, + + /* + * struct { + * struct perf_event_header header; + * u32 pid; + * u32 tid; + * u64 nr_namespaces; + * { u64 dev, inode; } [nr_namespaces]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_NAMESPACES = 16, + + /* + * Record ksymbol register/unregister events: + * + * struct { + * struct perf_event_header header; + * u64 addr; + * u32 len; + * u16 ksym_type; + * u16 flags; + * char name[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_KSYMBOL = 17, + + /* + * Record bpf events: + * enum perf_bpf_event_type { + * PERF_BPF_EVENT_UNKNOWN = 0, + * PERF_BPF_EVENT_PROG_LOAD = 1, + * PERF_BPF_EVENT_PROG_UNLOAD = 2, + * }; + * + * struct { + * struct perf_event_header header; + * u16 type; + * u16 flags; + * u32 id; + * u8 tag[BPF_TAG_SIZE]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_BPF_EVENT = 18, + + /* + * struct { + * struct perf_event_header header; + * u64 id; + * char path[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_CGROUP = 19, + + /* + * Records changes to kernel text i.e. self-modified code. 'old_len' is + * the number of old bytes, 'new_len' is the number of new bytes. Either + * 'old_len' or 'new_len' may be zero to indicate, for example, the + * addition or removal of a trampoline. 'bytes' contains the old bytes + * followed immediately by the new bytes. + * + * struct { + * struct perf_event_header header; + * u64 addr; + * u16 old_len; + * u16 new_len; + * u8 bytes[]; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_TEXT_POKE = 20, + + /* + * Data written to the AUX area by hardware due to aux_output, may need + * to be matched to the event by an architecture-specific hardware ID. + * This records the hardware ID, but requires sample_id to provide the + * event ID. e.g. Intel PT uses this record to disambiguate PEBS-via-PT + * records from multiple events. + * + * struct { + * struct perf_event_header header; + * u64 hw_id; + * struct sample_id sample_id; + * }; + */ + PERF_RECORD_AUX_OUTPUT_HW_ID = 21, + + PERF_RECORD_MAX, /* non-ABI */ +}; + +enum perf_record_ksymbol_type { + PERF_RECORD_KSYMBOL_TYPE_UNKNOWN = 0, + PERF_RECORD_KSYMBOL_TYPE_BPF = 1, + /* + * Out of line code such as kprobe-replaced instructions or optimized + * kprobes or ftrace trampolines. + */ + PERF_RECORD_KSYMBOL_TYPE_OOL = 2, + PERF_RECORD_KSYMBOL_TYPE_MAX /* non-ABI */ +}; + +#define PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER (1 << 0) + +enum perf_bpf_event_type { + PERF_BPF_EVENT_UNKNOWN = 0, + PERF_BPF_EVENT_PROG_LOAD = 1, + PERF_BPF_EVENT_PROG_UNLOAD = 2, + PERF_BPF_EVENT_MAX, /* non-ABI */ +}; + +#define PERF_MAX_STACK_DEPTH 127 +#define PERF_MAX_CONTEXTS_PER_STACK 8 + +enum perf_callchain_context { + PERF_CONTEXT_HV = (__u64)-32, + PERF_CONTEXT_KERNEL = (__u64)-128, + PERF_CONTEXT_USER = (__u64)-512, + + PERF_CONTEXT_GUEST = (__u64)-2048, + PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, + PERF_CONTEXT_GUEST_USER = (__u64)-2560, + + PERF_CONTEXT_MAX = (__u64)-4095, +}; + +/** + * PERF_RECORD_AUX::flags bits + */ +#define PERF_AUX_FLAG_TRUNCATED 0x01 /* record was truncated to fit */ +#define PERF_AUX_FLAG_OVERWRITE 0x02 /* snapshot from overwrite mode */ +#define PERF_AUX_FLAG_PARTIAL 0x04 /* record contains gaps */ +#define PERF_AUX_FLAG_COLLISION 0x08 /* sample collided with another */ +#define PERF_AUX_FLAG_PMU_FORMAT_TYPE_MASK 0xff00 /* PMU specific trace format type */ + +/* CoreSight PMU AUX buffer formats */ +#define PERF_AUX_FLAG_CORESIGHT_FORMAT_CORESIGHT 0x0000 /* Default for backward compatibility */ +#define PERF_AUX_FLAG_CORESIGHT_FORMAT_RAW 0x0100 /* Raw format of the source */ + +#define PERF_FLAG_FD_NO_GROUP (1UL << 0) +#define PERF_FLAG_FD_OUTPUT (1UL << 1) +#define PERF_FLAG_PID_CGROUP (1UL << 2) /* pid=cgroup id, per-cpu mode only */ +#define PERF_FLAG_FD_CLOEXEC (1UL << 3) /* O_CLOEXEC */ + +#if defined(__LITTLE_ENDIAN_BITFIELD) +union perf_mem_data_src { + __u64 val; + struct { + __u64 mem_op:5, /* type of opcode */ + mem_lvl:14, /* memory hierarchy level */ + mem_snoop:5, /* snoop mode */ + mem_lock:2, /* lock instr */ + mem_dtlb:7, /* tlb access */ + mem_lvl_num:4, /* memory hierarchy level number */ + mem_remote:1, /* remote */ + mem_snoopx:2, /* snoop mode, ext */ + mem_blk:3, /* access blocked */ + mem_hops:3, /* hop level */ + mem_rsvd:18; + }; +}; +#elif defined(__BIG_ENDIAN_BITFIELD) +union perf_mem_data_src { + __u64 val; + struct { + __u64 mem_rsvd:18, + mem_hops:3, /* hop level */ + mem_blk:3, /* access blocked */ + mem_snoopx:2, /* snoop mode, ext */ + mem_remote:1, /* remote */ + mem_lvl_num:4, /* memory hierarchy level number */ + mem_dtlb:7, /* tlb access */ + mem_lock:2, /* lock instr */ + mem_snoop:5, /* snoop mode */ + mem_lvl:14, /* memory hierarchy level */ + mem_op:5; /* type of opcode */ + }; +}; +#else +#error "Unknown endianness" +#endif + +/* type of opcode (load/store/prefetch,code) */ +#define PERF_MEM_OP_NA 0x01 /* not available */ +#define PERF_MEM_OP_LOAD 0x02 /* load instruction */ +#define PERF_MEM_OP_STORE 0x04 /* store instruction */ +#define PERF_MEM_OP_PFETCH 0x08 /* prefetch */ +#define PERF_MEM_OP_EXEC 0x10 /* code (execution) */ +#define PERF_MEM_OP_SHIFT 0 + +/* + * PERF_MEM_LVL_* namespace being depricated to some extent in the + * favour of newer composite PERF_MEM_{LVLNUM_,REMOTE_,SNOOPX_} fields. + * Supporting this namespace inorder to not break defined ABIs. + * + * memory hierarchy (memory level, hit or miss) + */ +#define PERF_MEM_LVL_NA 0x01 /* not available */ +#define PERF_MEM_LVL_HIT 0x02 /* hit level */ +#define PERF_MEM_LVL_MISS 0x04 /* miss level */ +#define PERF_MEM_LVL_L1 0x08 /* L1 */ +#define PERF_MEM_LVL_LFB 0x10 /* Line Fill Buffer */ +#define PERF_MEM_LVL_L2 0x20 /* L2 */ +#define PERF_MEM_LVL_L3 0x40 /* L3 */ +#define PERF_MEM_LVL_LOC_RAM 0x80 /* Local DRAM */ +#define PERF_MEM_LVL_REM_RAM1 0x100 /* Remote DRAM (1 hop) */ +#define PERF_MEM_LVL_REM_RAM2 0x200 /* Remote DRAM (2 hops) */ +#define PERF_MEM_LVL_REM_CCE1 0x400 /* Remote Cache (1 hop) */ +#define PERF_MEM_LVL_REM_CCE2 0x800 /* Remote Cache (2 hops) */ +#define PERF_MEM_LVL_IO 0x1000 /* I/O memory */ +#define PERF_MEM_LVL_UNC 0x2000 /* Uncached memory */ +#define PERF_MEM_LVL_SHIFT 5 + +#define PERF_MEM_REMOTE_REMOTE 0x01 /* Remote */ +#define PERF_MEM_REMOTE_SHIFT 37 + +#define PERF_MEM_LVLNUM_L1 0x01 /* L1 */ +#define PERF_MEM_LVLNUM_L2 0x02 /* L2 */ +#define PERF_MEM_LVLNUM_L3 0x03 /* L3 */ +#define PERF_MEM_LVLNUM_L4 0x04 /* L4 */ +/* 5-0x7 available */ +#define PERF_MEM_LVLNUM_UNC 0x08 /* Uncached */ +#define PERF_MEM_LVLNUM_CXL 0x09 /* CXL */ +#define PERF_MEM_LVLNUM_IO 0x0a /* I/O */ +#define PERF_MEM_LVLNUM_ANY_CACHE 0x0b /* Any cache */ +#define PERF_MEM_LVLNUM_LFB 0x0c /* LFB */ +#define PERF_MEM_LVLNUM_RAM 0x0d /* RAM */ +#define PERF_MEM_LVLNUM_PMEM 0x0e /* PMEM */ +#define PERF_MEM_LVLNUM_NA 0x0f /* N/A */ + +#define PERF_MEM_LVLNUM_SHIFT 33 + +/* snoop mode */ +#define PERF_MEM_SNOOP_NA 0x01 /* not available */ +#define PERF_MEM_SNOOP_NONE 0x02 /* no snoop */ +#define PERF_MEM_SNOOP_HIT 0x04 /* snoop hit */ +#define PERF_MEM_SNOOP_MISS 0x08 /* snoop miss */ +#define PERF_MEM_SNOOP_HITM 0x10 /* snoop hit modified */ +#define PERF_MEM_SNOOP_SHIFT 19 + +#define PERF_MEM_SNOOPX_FWD 0x01 /* forward */ +#define PERF_MEM_SNOOPX_PEER 0x02 /* xfer from peer */ +#define PERF_MEM_SNOOPX_SHIFT 38 + +/* locked instruction */ +#define PERF_MEM_LOCK_NA 0x01 /* not available */ +#define PERF_MEM_LOCK_LOCKED 0x02 /* locked transaction */ +#define PERF_MEM_LOCK_SHIFT 24 + +/* TLB access */ +#define PERF_MEM_TLB_NA 0x01 /* not available */ +#define PERF_MEM_TLB_HIT 0x02 /* hit level */ +#define PERF_MEM_TLB_MISS 0x04 /* miss level */ +#define PERF_MEM_TLB_L1 0x08 /* L1 */ +#define PERF_MEM_TLB_L2 0x10 /* L2 */ +#define PERF_MEM_TLB_WK 0x20 /* Hardware Walker*/ +#define PERF_MEM_TLB_OS 0x40 /* OS fault handler */ +#define PERF_MEM_TLB_SHIFT 26 + +/* Access blocked */ +#define PERF_MEM_BLK_NA 0x01 /* not available */ +#define PERF_MEM_BLK_DATA 0x02 /* data could not be forwarded */ +#define PERF_MEM_BLK_ADDR 0x04 /* address conflict */ +#define PERF_MEM_BLK_SHIFT 40 + +/* hop level */ +#define PERF_MEM_HOPS_0 0x01 /* remote core, same node */ +#define PERF_MEM_HOPS_1 0x02 /* remote node, same socket */ +#define PERF_MEM_HOPS_2 0x03 /* remote socket, same board */ +#define PERF_MEM_HOPS_3 0x04 /* remote board */ +/* 5-7 available */ +#define PERF_MEM_HOPS_SHIFT 43 + +#define PERF_MEM_S(a, s) \ + (((__u64)PERF_MEM_##a##_##s) << PERF_MEM_##a##_SHIFT) + +/* + * single taken branch record layout: + * + * from: source instruction (may not always be a branch insn) + * to: branch target + * mispred: branch target was mispredicted + * predicted: branch target was predicted + * + * support for mispred, predicted is optional. In case it + * is not supported mispred = predicted = 0. + * + * in_tx: running in a hardware transaction + * abort: aborting a hardware transaction + * cycles: cycles from last branch (or 0 if not supported) + * type: branch type + * spec: branch speculation info (or 0 if not supported) + */ +struct perf_branch_entry { + __u64 from; + __u64 to; + __u64 mispred:1, /* target mispredicted */ + predicted:1,/* target predicted */ + in_tx:1, /* in transaction */ + abort:1, /* transaction abort */ + cycles:16, /* cycle count to last branch */ + type:4, /* branch type */ + spec:2, /* branch speculation info */ + new_type:4, /* additional branch type */ + priv:3, /* privilege level */ + reserved:31; +}; + +/* Size of used info bits in struct perf_branch_entry */ +#define PERF_BRANCH_ENTRY_INFO_BITS_MAX 33 + +union perf_sample_weight { + __u64 full; +#if defined(__LITTLE_ENDIAN_BITFIELD) + struct { + __u32 var1_dw; + __u16 var2_w; + __u16 var3_w; + }; +#elif defined(__BIG_ENDIAN_BITFIELD) + struct { + __u16 var3_w; + __u16 var2_w; + __u32 var1_dw; + }; +#else +#error "Unknown endianness" +#endif +}; + +#endif /* _UAPI_LINUX_PERF_EVENT_H */ diff --git a/pkg/plugin/lib/common/libbpf/_include/uapi/linux/pkt_cls.h b/pkg/plugin/lib/common/libbpf/_include/uapi/linux/pkt_cls.h new file mode 100644 index 0000000000..bd4b227ab4 --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_include/uapi/linux/pkt_cls.h @@ -0,0 +1,565 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __LINUX_PKT_CLS_H +#define __LINUX_PKT_CLS_H + +#include +#include + +#define TC_COOKIE_MAX_SIZE 16 + +/* Action attributes */ +enum { + TCA_ACT_UNSPEC, + TCA_ACT_KIND, + TCA_ACT_OPTIONS, + TCA_ACT_INDEX, + TCA_ACT_STATS, + TCA_ACT_PAD, + TCA_ACT_COOKIE, + __TCA_ACT_MAX +}; + +#define TCA_ACT_MAX __TCA_ACT_MAX +#define TCA_OLD_COMPAT (TCA_ACT_MAX+1) +#define TCA_ACT_MAX_PRIO 32 +#define TCA_ACT_BIND 1 +#define TCA_ACT_NOBIND 0 +#define TCA_ACT_UNBIND 1 +#define TCA_ACT_NOUNBIND 0 +#define TCA_ACT_REPLACE 1 +#define TCA_ACT_NOREPLACE 0 + +#define TC_ACT_UNSPEC (-1) +#define TC_ACT_OK 0 +#define TC_ACT_RECLASSIFY 1 +#define TC_ACT_SHOT 2 +#define TC_ACT_PIPE 3 +#define TC_ACT_STOLEN 4 +#define TC_ACT_QUEUED 5 +#define TC_ACT_REPEAT 6 +#define TC_ACT_REDIRECT 7 +#define TC_ACT_TRAP 8 /* For hw path, this means "trap to cpu" + * and don't further process the frame + * in hardware. For sw path, this is + * equivalent of TC_ACT_STOLEN - drop + * the skb and act like everything + * is alright. + */ +#define TC_ACT_VALUE_MAX TC_ACT_TRAP + +/* There is a special kind of actions called "extended actions", + * which need a value parameter. These have a local opcode located in + * the highest nibble, starting from 1. The rest of the bits + * are used to carry the value. These two parts together make + * a combined opcode. + */ +#define __TC_ACT_EXT_SHIFT 28 +#define __TC_ACT_EXT(local) ((local) << __TC_ACT_EXT_SHIFT) +#define TC_ACT_EXT_VAL_MASK ((1 << __TC_ACT_EXT_SHIFT) - 1) +#define TC_ACT_EXT_OPCODE(combined) ((combined) & (~TC_ACT_EXT_VAL_MASK)) +#define TC_ACT_EXT_CMP(combined, opcode) (TC_ACT_EXT_OPCODE(combined) == opcode) + +#define TC_ACT_JUMP __TC_ACT_EXT(1) +#define TC_ACT_GOTO_CHAIN __TC_ACT_EXT(2) +#define TC_ACT_EXT_OPCODE_MAX TC_ACT_GOTO_CHAIN + +/* Action type identifiers*/ +enum { + TCA_ID_UNSPEC=0, + TCA_ID_POLICE=1, + /* other actions go here */ + __TCA_ID_MAX=255 +}; + +#define TCA_ID_MAX __TCA_ID_MAX + +struct tc_police { + __u32 index; + int action; +#define TC_POLICE_UNSPEC TC_ACT_UNSPEC +#define TC_POLICE_OK TC_ACT_OK +#define TC_POLICE_RECLASSIFY TC_ACT_RECLASSIFY +#define TC_POLICE_SHOT TC_ACT_SHOT +#define TC_POLICE_PIPE TC_ACT_PIPE + + __u32 limit; + __u32 burst; + __u32 mtu; + struct tc_ratespec rate; + struct tc_ratespec peakrate; + int refcnt; + int bindcnt; + __u32 capab; +}; + +struct tcf_t { + __u64 install; + __u64 lastuse; + __u64 expires; + __u64 firstuse; +}; + +struct tc_cnt { + int refcnt; + int bindcnt; +}; + +#define tc_gen \ + __u32 index; \ + __u32 capab; \ + int action; \ + int refcnt; \ + int bindcnt + +enum { + TCA_POLICE_UNSPEC, + TCA_POLICE_TBF, + TCA_POLICE_RATE, + TCA_POLICE_PEAKRATE, + TCA_POLICE_AVRATE, + TCA_POLICE_RESULT, + TCA_POLICE_TM, + TCA_POLICE_PAD, + __TCA_POLICE_MAX +#define TCA_POLICE_RESULT TCA_POLICE_RESULT +}; + +#define TCA_POLICE_MAX (__TCA_POLICE_MAX - 1) + +/* tca flags definitions */ +#define TCA_CLS_FLAGS_SKIP_HW (1 << 0) /* don't offload filter to HW */ +#define TCA_CLS_FLAGS_SKIP_SW (1 << 1) /* don't use filter in SW */ +#define TCA_CLS_FLAGS_IN_HW (1 << 2) /* filter is offloaded to HW */ +#define TCA_CLS_FLAGS_NOT_IN_HW (1 << 3) /* filter isn't offloaded to HW */ +#define TCA_CLS_FLAGS_VERBOSE (1 << 4) /* verbose logging */ + +/* U32 filters */ + +#define TC_U32_HTID(h) ((h)&0xFFF00000) +#define TC_U32_USERHTID(h) (TC_U32_HTID(h)>>20) +#define TC_U32_HASH(h) (((h)>>12)&0xFF) +#define TC_U32_NODE(h) ((h)&0xFFF) +#define TC_U32_KEY(h) ((h)&0xFFFFF) +#define TC_U32_UNSPEC 0 +#define TC_U32_ROOT (0xFFF00000) + +enum { + TCA_U32_UNSPEC, + TCA_U32_CLASSID, + TCA_U32_HASH, + TCA_U32_LINK, + TCA_U32_DIVISOR, + TCA_U32_SEL, + TCA_U32_POLICE, + TCA_U32_ACT, + TCA_U32_INDEV, + TCA_U32_PCNT, + TCA_U32_MARK, + TCA_U32_FLAGS, + TCA_U32_PAD, + __TCA_U32_MAX +}; + +#define TCA_U32_MAX (__TCA_U32_MAX - 1) + +struct tc_u32_key { + __be32 mask; + __be32 val; + int off; + int offmask; +}; + +struct tc_u32_sel { + unsigned char flags; + unsigned char offshift; + unsigned char nkeys; + + __be16 offmask; + __u16 off; + short offoff; + + short hoff; + __be32 hmask; + struct tc_u32_key keys[]; +}; + +struct tc_u32_mark { + __u32 val; + __u32 mask; + __u32 success; +}; + +struct tc_u32_pcnt { + __u64 rcnt; + __u64 rhit; + __u64 kcnts[]; +}; + +/* Flags */ + +#define TC_U32_TERMINAL 1 +#define TC_U32_OFFSET 2 +#define TC_U32_VAROFFSET 4 +#define TC_U32_EAT 8 + +#define TC_U32_MAXDEPTH 8 + +/* ROUTE filter */ + +enum { + TCA_ROUTE4_UNSPEC, + TCA_ROUTE4_CLASSID, + TCA_ROUTE4_TO, + TCA_ROUTE4_FROM, + TCA_ROUTE4_IIF, + TCA_ROUTE4_POLICE, + TCA_ROUTE4_ACT, + __TCA_ROUTE4_MAX +}; + +#define TCA_ROUTE4_MAX (__TCA_ROUTE4_MAX - 1) + + +/* FW filter */ + +enum { + TCA_FW_UNSPEC, + TCA_FW_CLASSID, + TCA_FW_POLICE, + TCA_FW_INDEV, + TCA_FW_ACT, /* used by CONFIG_NET_CLS_ACT */ + TCA_FW_MASK, + __TCA_FW_MAX +}; + +#define TCA_FW_MAX (__TCA_FW_MAX - 1) + +/* Flow filter */ + +enum { + FLOW_KEY_SRC, + FLOW_KEY_DST, + FLOW_KEY_PROTO, + FLOW_KEY_PROTO_SRC, + FLOW_KEY_PROTO_DST, + FLOW_KEY_IIF, + FLOW_KEY_PRIORITY, + FLOW_KEY_MARK, + FLOW_KEY_NFCT, + FLOW_KEY_NFCT_SRC, + FLOW_KEY_NFCT_DST, + FLOW_KEY_NFCT_PROTO_SRC, + FLOW_KEY_NFCT_PROTO_DST, + FLOW_KEY_RTCLASSID, + FLOW_KEY_SKUID, + FLOW_KEY_SKGID, + FLOW_KEY_VLAN_TAG, + FLOW_KEY_RXHASH, + __FLOW_KEY_MAX, +}; + +#define FLOW_KEY_MAX (__FLOW_KEY_MAX - 1) + +enum { + FLOW_MODE_MAP, + FLOW_MODE_HASH, +}; + +enum { + TCA_FLOW_UNSPEC, + TCA_FLOW_KEYS, + TCA_FLOW_MODE, + TCA_FLOW_BASECLASS, + TCA_FLOW_RSHIFT, + TCA_FLOW_ADDEND, + TCA_FLOW_MASK, + TCA_FLOW_XOR, + TCA_FLOW_DIVISOR, + TCA_FLOW_ACT, + TCA_FLOW_POLICE, + TCA_FLOW_EMATCHES, + TCA_FLOW_PERTURB, + __TCA_FLOW_MAX +}; + +#define TCA_FLOW_MAX (__TCA_FLOW_MAX - 1) + +/* Basic filter */ + +enum { + TCA_BASIC_UNSPEC, + TCA_BASIC_CLASSID, + TCA_BASIC_EMATCHES, + TCA_BASIC_ACT, + TCA_BASIC_POLICE, + __TCA_BASIC_MAX +}; + +#define TCA_BASIC_MAX (__TCA_BASIC_MAX - 1) + + +/* Cgroup classifier */ + +enum { + TCA_CGROUP_UNSPEC, + TCA_CGROUP_ACT, + TCA_CGROUP_POLICE, + TCA_CGROUP_EMATCHES, + __TCA_CGROUP_MAX, +}; + +#define TCA_CGROUP_MAX (__TCA_CGROUP_MAX - 1) + +/* BPF classifier */ + +#define TCA_BPF_FLAG_ACT_DIRECT (1 << 0) + +enum { + TCA_BPF_UNSPEC, + TCA_BPF_ACT, + TCA_BPF_POLICE, + TCA_BPF_CLASSID, + TCA_BPF_OPS_LEN, + TCA_BPF_OPS, + TCA_BPF_FD, + TCA_BPF_NAME, + TCA_BPF_FLAGS, + TCA_BPF_FLAGS_GEN, + TCA_BPF_TAG, + TCA_BPF_ID, + __TCA_BPF_MAX, +}; + +#define TCA_BPF_MAX (__TCA_BPF_MAX - 1) + +/* Flower classifier */ + +enum { + TCA_FLOWER_UNSPEC, + TCA_FLOWER_CLASSID, + TCA_FLOWER_INDEV, + TCA_FLOWER_ACT, + TCA_FLOWER_KEY_ETH_DST, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_DST_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_SRC, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_SRC_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ETH_TYPE, /* be16 */ + TCA_FLOWER_KEY_IP_PROTO, /* u8 */ + TCA_FLOWER_KEY_IPV4_SRC, /* be32 */ + TCA_FLOWER_KEY_IPV4_SRC_MASK, /* be32 */ + TCA_FLOWER_KEY_IPV4_DST, /* be32 */ + TCA_FLOWER_KEY_IPV4_DST_MASK, /* be32 */ + TCA_FLOWER_KEY_IPV6_SRC, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_SRC_MASK, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_DST, /* struct in6_addr */ + TCA_FLOWER_KEY_IPV6_DST_MASK, /* struct in6_addr */ + TCA_FLOWER_KEY_TCP_SRC, /* be16 */ + TCA_FLOWER_KEY_TCP_DST, /* be16 */ + TCA_FLOWER_KEY_UDP_SRC, /* be16 */ + TCA_FLOWER_KEY_UDP_DST, /* be16 */ + + TCA_FLOWER_FLAGS, + TCA_FLOWER_KEY_VLAN_ID, /* be16 */ + TCA_FLOWER_KEY_VLAN_PRIO, /* u8 */ + TCA_FLOWER_KEY_VLAN_ETH_TYPE, /* be16 */ + + TCA_FLOWER_KEY_ENC_KEY_ID, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_SRC, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_SRC_MASK,/* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_DST, /* be32 */ + TCA_FLOWER_KEY_ENC_IPV4_DST_MASK,/* be32 */ + TCA_FLOWER_KEY_ENC_IPV6_SRC, /* struct in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK,/* struct in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_DST, /* struct in6_addr */ + TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,/* struct in6_addr */ + + TCA_FLOWER_KEY_TCP_SRC_MASK, /* be16 */ + TCA_FLOWER_KEY_TCP_DST_MASK, /* be16 */ + TCA_FLOWER_KEY_UDP_SRC_MASK, /* be16 */ + TCA_FLOWER_KEY_UDP_DST_MASK, /* be16 */ + TCA_FLOWER_KEY_SCTP_SRC_MASK, /* be16 */ + TCA_FLOWER_KEY_SCTP_DST_MASK, /* be16 */ + + TCA_FLOWER_KEY_SCTP_SRC, /* be16 */ + TCA_FLOWER_KEY_SCTP_DST, /* be16 */ + + TCA_FLOWER_KEY_ENC_UDP_SRC_PORT, /* be16 */ + TCA_FLOWER_KEY_ENC_UDP_SRC_PORT_MASK, /* be16 */ + TCA_FLOWER_KEY_ENC_UDP_DST_PORT, /* be16 */ + TCA_FLOWER_KEY_ENC_UDP_DST_PORT_MASK, /* be16 */ + + TCA_FLOWER_KEY_FLAGS, /* be32 */ + TCA_FLOWER_KEY_FLAGS_MASK, /* be32 */ + + TCA_FLOWER_KEY_ICMPV4_CODE, /* u8 */ + TCA_FLOWER_KEY_ICMPV4_CODE_MASK,/* u8 */ + TCA_FLOWER_KEY_ICMPV4_TYPE, /* u8 */ + TCA_FLOWER_KEY_ICMPV4_TYPE_MASK,/* u8 */ + TCA_FLOWER_KEY_ICMPV6_CODE, /* u8 */ + TCA_FLOWER_KEY_ICMPV6_CODE_MASK,/* u8 */ + TCA_FLOWER_KEY_ICMPV6_TYPE, /* u8 */ + TCA_FLOWER_KEY_ICMPV6_TYPE_MASK,/* u8 */ + + TCA_FLOWER_KEY_ARP_SIP, /* be32 */ + TCA_FLOWER_KEY_ARP_SIP_MASK, /* be32 */ + TCA_FLOWER_KEY_ARP_TIP, /* be32 */ + TCA_FLOWER_KEY_ARP_TIP_MASK, /* be32 */ + TCA_FLOWER_KEY_ARP_OP, /* u8 */ + TCA_FLOWER_KEY_ARP_OP_MASK, /* u8 */ + TCA_FLOWER_KEY_ARP_SHA, /* ETH_ALEN */ + TCA_FLOWER_KEY_ARP_SHA_MASK, /* ETH_ALEN */ + TCA_FLOWER_KEY_ARP_THA, /* ETH_ALEN */ + TCA_FLOWER_KEY_ARP_THA_MASK, /* ETH_ALEN */ + + TCA_FLOWER_KEY_MPLS_TTL, /* u8 - 8 bits */ + TCA_FLOWER_KEY_MPLS_BOS, /* u8 - 1 bit */ + TCA_FLOWER_KEY_MPLS_TC, /* u8 - 3 bits */ + TCA_FLOWER_KEY_MPLS_LABEL, /* be32 - 20 bits */ + + TCA_FLOWER_KEY_TCP_FLAGS, /* be16 */ + TCA_FLOWER_KEY_TCP_FLAGS_MASK, /* be16 */ + + TCA_FLOWER_KEY_IP_TOS, /* u8 */ + TCA_FLOWER_KEY_IP_TOS_MASK, /* u8 */ + TCA_FLOWER_KEY_IP_TTL, /* u8 */ + TCA_FLOWER_KEY_IP_TTL_MASK, /* u8 */ + + TCA_FLOWER_KEY_CVLAN_ID, /* be16 */ + TCA_FLOWER_KEY_CVLAN_PRIO, /* u8 */ + TCA_FLOWER_KEY_CVLAN_ETH_TYPE, /* be16 */ + + TCA_FLOWER_KEY_ENC_IP_TOS, /* u8 */ + TCA_FLOWER_KEY_ENC_IP_TOS_MASK, /* u8 */ + TCA_FLOWER_KEY_ENC_IP_TTL, /* u8 */ + TCA_FLOWER_KEY_ENC_IP_TTL_MASK, /* u8 */ + + TCA_FLOWER_KEY_ENC_OPTS, + TCA_FLOWER_KEY_ENC_OPTS_MASK, + + TCA_FLOWER_IN_HW_COUNT, + + __TCA_FLOWER_MAX, +}; + +#define TCA_FLOWER_MAX (__TCA_FLOWER_MAX - 1) + +enum { + TCA_FLOWER_KEY_ENC_OPTS_UNSPEC, + TCA_FLOWER_KEY_ENC_OPTS_GENEVE, /* Nested + * TCA_FLOWER_KEY_ENC_OPT_GENEVE_ + * attributes + */ + __TCA_FLOWER_KEY_ENC_OPTS_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPTS_MAX (__TCA_FLOWER_KEY_ENC_OPTS_MAX - 1) + +enum { + TCA_FLOWER_KEY_ENC_OPT_GENEVE_UNSPEC, + TCA_FLOWER_KEY_ENC_OPT_GENEVE_CLASS, /* u16 */ + TCA_FLOWER_KEY_ENC_OPT_GENEVE_TYPE, /* u8 */ + TCA_FLOWER_KEY_ENC_OPT_GENEVE_DATA, /* 4 to 128 bytes */ + + __TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX, +}; + +#define TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX \ + (__TCA_FLOWER_KEY_ENC_OPT_GENEVE_MAX - 1) + +enum { + TCA_FLOWER_KEY_FLAGS_IS_FRAGMENT = (1 << 0), + TCA_FLOWER_KEY_FLAGS_FRAG_IS_FIRST = (1 << 1), +}; + +/* Match-all classifier */ + +enum { + TCA_MATCHALL_UNSPEC, + TCA_MATCHALL_CLASSID, + TCA_MATCHALL_ACT, + TCA_MATCHALL_FLAGS, + __TCA_MATCHALL_MAX, +}; + +#define TCA_MATCHALL_MAX (__TCA_MATCHALL_MAX - 1) + +/* Extended Matches */ + +struct tcf_ematch_tree_hdr { + __u16 nmatches; + __u16 progid; +}; + +enum { + TCA_EMATCH_TREE_UNSPEC, + TCA_EMATCH_TREE_HDR, + TCA_EMATCH_TREE_LIST, + __TCA_EMATCH_TREE_MAX +}; +#define TCA_EMATCH_TREE_MAX (__TCA_EMATCH_TREE_MAX - 1) + +struct tcf_ematch_hdr { + __u16 matchid; + __u16 kind; + __u16 flags; + __u16 pad; /* currently unused */ +}; + +/* 0 1 + * 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 + * +-----------------------+-+-+---+ + * | Unused |S|I| R | + * +-----------------------+-+-+---+ + * + * R(2) ::= relation to next ematch + * where: 0 0 END (last ematch) + * 0 1 AND + * 1 0 OR + * 1 1 Unused (invalid) + * I(1) ::= invert result + * S(1) ::= simple payload + */ +#define TCF_EM_REL_END 0 +#define TCF_EM_REL_AND (1<<0) +#define TCF_EM_REL_OR (1<<1) +#define TCF_EM_INVERT (1<<2) +#define TCF_EM_SIMPLE (1<<3) + +#define TCF_EM_REL_MASK 3 +#define TCF_EM_REL_VALID(v) (((v) & TCF_EM_REL_MASK) != TCF_EM_REL_MASK) + +enum { + TCF_LAYER_LINK, + TCF_LAYER_NETWORK, + TCF_LAYER_TRANSPORT, + __TCF_LAYER_MAX +}; +#define TCF_LAYER_MAX (__TCF_LAYER_MAX - 1) + +/* Ematch type assignments + * 1..32767 Reserved for ematches inside kernel tree + * 32768..65535 Free to use, not reliable + */ +#define TCF_EM_CONTAINER 0 +#define TCF_EM_CMP 1 +#define TCF_EM_NBYTE 2 +#define TCF_EM_U32 3 +#define TCF_EM_META 4 +#define TCF_EM_TEXT 5 +#define TCF_EM_VLAN 6 +#define TCF_EM_CANID 7 +#define TCF_EM_IPSET 8 +#define TCF_EM_IPT 9 +#define TCF_EM_MAX 9 + +enum { + TCF_EM_PROG_TC +}; + +enum { + TCF_EM_OPND_EQ, + TCF_EM_OPND_GT, + TCF_EM_OPND_LT +}; + +#endif diff --git a/pkg/plugin/lib/common/libbpf/_include/uapi/linux/pkt_sched.h b/pkg/plugin/lib/common/libbpf/_include/uapi/linux/pkt_sched.h new file mode 100644 index 0000000000..587481a194 --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_include/uapi/linux/pkt_sched.h @@ -0,0 +1,1055 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef __LINUX_PKT_SCHED_H +#define __LINUX_PKT_SCHED_H + +#include + +/* Logical priority bands not depending on specific packet scheduler. + Every scheduler will map them to real traffic classes, if it has + no more precise mechanism to classify packets. + + These numbers have no special meaning, though their coincidence + with obsolete IPv6 values is not occasional :-). New IPv6 drafts + preferred full anarchy inspired by diffserv group. + + Note: TC_PRIO_BESTEFFORT does not mean that it is the most unhappy + class, actually, as rule it will be handled with more care than + filler or even bulk. + */ + +#define TC_PRIO_BESTEFFORT 0 +#define TC_PRIO_FILLER 1 +#define TC_PRIO_BULK 2 +#define TC_PRIO_INTERACTIVE_BULK 4 +#define TC_PRIO_INTERACTIVE 6 +#define TC_PRIO_CONTROL 7 + +#define TC_PRIO_MAX 15 + +/* Generic queue statistics, available for all the elements. + Particular schedulers may have also their private records. + */ + +struct tc_stats { + __u64 bytes; /* Number of enqueued bytes */ + __u32 packets; /* Number of enqueued packets */ + __u32 drops; /* Packets dropped because of lack of resources */ + __u32 overlimits; /* Number of throttle events when this + * flow goes out of allocated bandwidth */ + __u32 bps; /* Current flow byte rate */ + __u32 pps; /* Current flow packet rate */ + __u32 qlen; + __u32 backlog; +}; + +struct tc_estimator { + signed char interval; + unsigned char ewma_log; +}; + +/* "Handles" + --------- + + All the traffic control objects have 32bit identifiers, or "handles". + + They can be considered as opaque numbers from user API viewpoint, + but actually they always consist of two fields: major and + minor numbers, which are interpreted by kernel specially, + that may be used by applications, though not recommended. + + F.e. qdisc handles always have minor number equal to zero, + classes (or flows) have major equal to parent qdisc major, and + minor uniquely identifying class inside qdisc. + + Macros to manipulate handles: + */ + +#define TC_H_MAJ_MASK (0xFFFF0000U) +#define TC_H_MIN_MASK (0x0000FFFFU) +#define TC_H_MAJ(h) ((h)&TC_H_MAJ_MASK) +#define TC_H_MIN(h) ((h)&TC_H_MIN_MASK) +#define TC_H_MAKE(maj,min) (((maj)&TC_H_MAJ_MASK)|((min)&TC_H_MIN_MASK)) + +#define TC_H_UNSPEC (0U) +#define TC_H_ROOT (0xFFFFFFFFU) +#define TC_H_INGRESS (0xFFFFFFF1U) +#define TC_H_CLSACT TC_H_INGRESS + +#define TC_H_MIN_PRIORITY 0xFFE0U +#define TC_H_MIN_INGRESS 0xFFF2U +#define TC_H_MIN_EGRESS 0xFFF3U + +/* Need to corrospond to iproute2 tc/tc_core.h "enum link_layer" */ +enum tc_link_layer { + TC_LINKLAYER_UNAWARE, /* Indicate unaware old iproute2 util */ + TC_LINKLAYER_ETHERNET, + TC_LINKLAYER_ATM, +}; +#define TC_LINKLAYER_MASK 0x0F /* limit use to lower 4 bits */ + +struct tc_ratespec { + unsigned char cell_log; + __u8 linklayer; /* lower 4 bits */ + unsigned short overhead; + short cell_align; + unsigned short mpu; + __u32 rate; +}; + +#define TC_RTAB_SIZE 1024 + +struct tc_sizespec { + unsigned char cell_log; + unsigned char size_log; + short cell_align; + int overhead; + unsigned int linklayer; + unsigned int mpu; + unsigned int mtu; + unsigned int tsize; +}; + +enum { + TCA_STAB_UNSPEC, + TCA_STAB_BASE, + TCA_STAB_DATA, + __TCA_STAB_MAX +}; + +#define TCA_STAB_MAX (__TCA_STAB_MAX - 1) + +/* FIFO section */ + +struct tc_fifo_qopt { + __u32 limit; /* Queue length: bytes for bfifo, packets for pfifo */ +}; + +/* SKBPRIO section */ + +/* + * Priorities go from zero to (SKBPRIO_MAX_PRIORITY - 1). + * SKBPRIO_MAX_PRIORITY should be at least 64 in order for skbprio to be able + * to map one to one the DS field of IPV4 and IPV6 headers. + * Memory allocation grows linearly with SKBPRIO_MAX_PRIORITY. + */ + +#define SKBPRIO_MAX_PRIORITY 64 + +struct tc_skbprio_qopt { + __u32 limit; /* Queue length in packets. */ +}; + +/* PRIO section */ + +#define TCQ_PRIO_BANDS 16 +#define TCQ_MIN_PRIO_BANDS 2 + +struct tc_prio_qopt { + int bands; /* Number of bands */ + __u8 priomap[TC_PRIO_MAX+1]; /* Map: logical priority -> PRIO band */ +}; + +/* MULTIQ section */ + +struct tc_multiq_qopt { + __u16 bands; /* Number of bands */ + __u16 max_bands; /* Maximum number of queues */ +}; + +/* PLUG section */ + +#define TCQ_PLUG_BUFFER 0 +#define TCQ_PLUG_RELEASE_ONE 1 +#define TCQ_PLUG_RELEASE_INDEFINITE 2 +#define TCQ_PLUG_LIMIT 3 + +struct tc_plug_qopt { + /* TCQ_PLUG_BUFFER: Inset a plug into the queue and + * buffer any incoming packets + * TCQ_PLUG_RELEASE_ONE: Dequeue packets from queue head + * to beginning of the next plug. + * TCQ_PLUG_RELEASE_INDEFINITE: Dequeue all packets from queue. + * Stop buffering packets until the next TCQ_PLUG_BUFFER + * command is received (just act as a pass-thru queue). + * TCQ_PLUG_LIMIT: Increase/decrease queue size + */ + int action; + __u32 limit; +}; + +/* TBF section */ + +struct tc_tbf_qopt { + struct tc_ratespec rate; + struct tc_ratespec peakrate; + __u32 limit; + __u32 buffer; + __u32 mtu; +}; + +enum { + TCA_TBF_UNSPEC, + TCA_TBF_PARMS, + TCA_TBF_RTAB, + TCA_TBF_PTAB, + TCA_TBF_RATE64, + TCA_TBF_PRATE64, + TCA_TBF_BURST, + TCA_TBF_PBURST, + TCA_TBF_PAD, + __TCA_TBF_MAX, +}; + +#define TCA_TBF_MAX (__TCA_TBF_MAX - 1) + + +/* TEQL section */ + +/* TEQL does not require any parameters */ + +/* SFQ section */ + +struct tc_sfq_qopt { + unsigned quantum; /* Bytes per round allocated to flow */ + int perturb_period; /* Period of hash perturbation */ + __u32 limit; /* Maximal packets in queue */ + unsigned divisor; /* Hash divisor */ + unsigned flows; /* Maximal number of flows */ +}; + +struct tc_sfqred_stats { + __u32 prob_drop; /* Early drops, below max threshold */ + __u32 forced_drop; /* Early drops, after max threshold */ + __u32 prob_mark; /* Marked packets, below max threshold */ + __u32 forced_mark; /* Marked packets, after max threshold */ + __u32 prob_mark_head; /* Marked packets, below max threshold */ + __u32 forced_mark_head;/* Marked packets, after max threshold */ +}; + +struct tc_sfq_qopt_v1 { + struct tc_sfq_qopt v0; + unsigned int depth; /* max number of packets per flow */ + unsigned int headdrop; +/* SFQRED parameters */ + __u32 limit; /* HARD maximal flow queue length (bytes) */ + __u32 qth_min; /* Min average length threshold (bytes) */ + __u32 qth_max; /* Max average length threshold (bytes) */ + unsigned char Wlog; /* log(W) */ + unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ + unsigned char Scell_log; /* cell size for idle damping */ + unsigned char flags; + __u32 max_P; /* probability, high resolution */ +/* SFQRED stats */ + struct tc_sfqred_stats stats; +}; + + +struct tc_sfq_xstats { + __s32 allot; +}; + +/* RED section */ + +enum { + TCA_RED_UNSPEC, + TCA_RED_PARMS, + TCA_RED_STAB, + TCA_RED_MAX_P, + __TCA_RED_MAX, +}; + +#define TCA_RED_MAX (__TCA_RED_MAX - 1) + +struct tc_red_qopt { + __u32 limit; /* HARD maximal queue length (bytes) */ + __u32 qth_min; /* Min average length threshold (bytes) */ + __u32 qth_max; /* Max average length threshold (bytes) */ + unsigned char Wlog; /* log(W) */ + unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ + unsigned char Scell_log; /* cell size for idle damping */ + unsigned char flags; +#define TC_RED_ECN 1 +#define TC_RED_HARDDROP 2 +#define TC_RED_ADAPTATIVE 4 +}; + +struct tc_red_xstats { + __u32 early; /* Early drops */ + __u32 pdrop; /* Drops due to queue limits */ + __u32 other; /* Drops due to drop() calls */ + __u32 marked; /* Marked packets */ +}; + +/* GRED section */ + +#define MAX_DPs 16 + +enum { + TCA_GRED_UNSPEC, + TCA_GRED_PARMS, + TCA_GRED_STAB, + TCA_GRED_DPS, + TCA_GRED_MAX_P, + TCA_GRED_LIMIT, + TCA_GRED_VQ_LIST, /* nested TCA_GRED_VQ_ENTRY */ + __TCA_GRED_MAX, +}; + +#define TCA_GRED_MAX (__TCA_GRED_MAX - 1) + +enum { + TCA_GRED_VQ_ENTRY_UNSPEC, + TCA_GRED_VQ_ENTRY, /* nested TCA_GRED_VQ_* */ + __TCA_GRED_VQ_ENTRY_MAX, +}; +#define TCA_GRED_VQ_ENTRY_MAX (__TCA_GRED_VQ_ENTRY_MAX - 1) + +enum { + TCA_GRED_VQ_UNSPEC, + TCA_GRED_VQ_PAD, + TCA_GRED_VQ_DP, /* u32 */ + TCA_GRED_VQ_STAT_BYTES, /* u64 */ + TCA_GRED_VQ_STAT_PACKETS, /* u32 */ + TCA_GRED_VQ_STAT_BACKLOG, /* u32 */ + TCA_GRED_VQ_STAT_PROB_DROP, /* u32 */ + TCA_GRED_VQ_STAT_PROB_MARK, /* u32 */ + TCA_GRED_VQ_STAT_FORCED_DROP, /* u32 */ + TCA_GRED_VQ_STAT_FORCED_MARK, /* u32 */ + TCA_GRED_VQ_STAT_PDROP, /* u32 */ + TCA_GRED_VQ_STAT_OTHER, /* u32 */ + TCA_GRED_VQ_FLAGS, /* u32 */ + __TCA_GRED_VQ_MAX +}; + +#define TCA_GRED_VQ_MAX (__TCA_GRED_VQ_MAX - 1) + +struct tc_gred_qopt { + __u32 limit; /* HARD maximal queue length (bytes) */ + __u32 qth_min; /* Min average length threshold (bytes) */ + __u32 qth_max; /* Max average length threshold (bytes) */ + __u32 DP; /* up to 2^32 DPs */ + __u32 backlog; + __u32 qave; + __u32 forced; + __u32 early; + __u32 other; + __u32 pdrop; + __u8 Wlog; /* log(W) */ + __u8 Plog; /* log(P_max/(qth_max-qth_min)) */ + __u8 Scell_log; /* cell size for idle damping */ + __u8 prio; /* prio of this VQ */ + __u32 packets; + __u32 bytesin; +}; + +/* gred setup */ +struct tc_gred_sopt { + __u32 DPs; + __u32 def_DP; + __u8 grio; + __u8 flags; + __u16 pad1; +}; + +/* CHOKe section */ + +enum { + TCA_CHOKE_UNSPEC, + TCA_CHOKE_PARMS, + TCA_CHOKE_STAB, + TCA_CHOKE_MAX_P, + __TCA_CHOKE_MAX, +}; + +#define TCA_CHOKE_MAX (__TCA_CHOKE_MAX - 1) + +struct tc_choke_qopt { + __u32 limit; /* Hard queue length (packets) */ + __u32 qth_min; /* Min average threshold (packets) */ + __u32 qth_max; /* Max average threshold (packets) */ + unsigned char Wlog; /* log(W) */ + unsigned char Plog; /* log(P_max/(qth_max-qth_min)) */ + unsigned char Scell_log; /* cell size for idle damping */ + unsigned char flags; /* see RED flags */ +}; + +struct tc_choke_xstats { + __u32 early; /* Early drops */ + __u32 pdrop; /* Drops due to queue limits */ + __u32 other; /* Drops due to drop() calls */ + __u32 marked; /* Marked packets */ + __u32 matched; /* Drops due to flow match */ +}; + +/* HTB section */ +#define TC_HTB_NUMPRIO 8 +#define TC_HTB_MAXDEPTH 8 +#define TC_HTB_PROTOVER 3 /* the same as HTB and TC's major */ + +struct tc_htb_opt { + struct tc_ratespec rate; + struct tc_ratespec ceil; + __u32 buffer; + __u32 cbuffer; + __u32 quantum; + __u32 level; /* out only */ + __u32 prio; +}; +struct tc_htb_glob { + __u32 version; /* to match HTB/TC */ + __u32 rate2quantum; /* bps->quantum divisor */ + __u32 defcls; /* default class number */ + __u32 debug; /* debug flags */ + + /* stats */ + __u32 direct_pkts; /* count of non shaped packets */ +}; +enum { + TCA_HTB_UNSPEC, + TCA_HTB_PARMS, + TCA_HTB_INIT, + TCA_HTB_CTAB, + TCA_HTB_RTAB, + TCA_HTB_DIRECT_QLEN, + TCA_HTB_RATE64, + TCA_HTB_CEIL64, + TCA_HTB_PAD, + TCA_HTB_OFFLOAD, + __TCA_HTB_MAX, +}; + +#define TCA_HTB_MAX (__TCA_HTB_MAX - 1) + +struct tc_htb_xstats { + __u32 lends; + __u32 borrows; + __u32 giants; /* unused since 'Make HTB scheduler work with TSO.' */ + __s32 tokens; + __s32 ctokens; +}; + +/* HFSC section */ + +struct tc_hfsc_qopt { + __u16 defcls; /* default class */ +}; + +struct tc_service_curve { + __u32 m1; /* slope of the first segment in bps */ + __u32 d; /* x-projection of the first segment in us */ + __u32 m2; /* slope of the second segment in bps */ +}; + +struct tc_hfsc_stats { + __u64 work; /* total work done */ + __u64 rtwork; /* work done by real-time criteria */ + __u32 period; /* current period */ + __u32 level; /* class level in hierarchy */ +}; + +enum { + TCA_HFSC_UNSPEC, + TCA_HFSC_RSC, + TCA_HFSC_FSC, + TCA_HFSC_USC, + __TCA_HFSC_MAX, +}; + +#define TCA_HFSC_MAX (__TCA_HFSC_MAX - 1) + +/* Network emulator */ + +enum { + TCA_NETEM_UNSPEC, + TCA_NETEM_CORR, + TCA_NETEM_DELAY_DIST, + TCA_NETEM_REORDER, + TCA_NETEM_CORRUPT, + TCA_NETEM_LOSS, + TCA_NETEM_RATE, + TCA_NETEM_ECN, + TCA_NETEM_RATE64, + TCA_NETEM_PAD, + TCA_NETEM_LATENCY64, + TCA_NETEM_JITTER64, + TCA_NETEM_SLOT, + TCA_NETEM_SLOT_DIST, + __TCA_NETEM_MAX, +}; + +#define TCA_NETEM_MAX (__TCA_NETEM_MAX - 1) + +struct tc_netem_qopt { + __u32 latency; /* added delay (us) */ + __u32 limit; /* fifo limit (packets) */ + __u32 loss; /* random packet loss (0=none ~0=100%) */ + __u32 gap; /* re-ordering gap (0 for none) */ + __u32 duplicate; /* random packet dup (0=none ~0=100%) */ + __u32 jitter; /* random jitter in latency (us) */ +}; + +struct tc_netem_corr { + __u32 delay_corr; /* delay correlation */ + __u32 loss_corr; /* packet loss correlation */ + __u32 dup_corr; /* duplicate correlation */ +}; + +struct tc_netem_reorder { + __u32 probability; + __u32 correlation; +}; + +struct tc_netem_corrupt { + __u32 probability; + __u32 correlation; +}; + +struct tc_netem_rate { + __u32 rate; /* byte/s */ + __s32 packet_overhead; + __u32 cell_size; + __s32 cell_overhead; +}; + +struct tc_netem_slot { + __s64 min_delay; /* nsec */ + __s64 max_delay; + __s32 max_packets; + __s32 max_bytes; + __s64 dist_delay; /* nsec */ + __s64 dist_jitter; /* nsec */ +}; + +enum { + NETEM_LOSS_UNSPEC, + NETEM_LOSS_GI, /* General Intuitive - 4 state model */ + NETEM_LOSS_GE, /* Gilbert Elliot models */ + __NETEM_LOSS_MAX +}; +#define NETEM_LOSS_MAX (__NETEM_LOSS_MAX - 1) + +/* State transition probabilities for 4 state model */ +struct tc_netem_gimodel { + __u32 p13; + __u32 p31; + __u32 p32; + __u32 p14; + __u32 p23; +}; + +/* Gilbert-Elliot models */ +struct tc_netem_gemodel { + __u32 p; + __u32 r; + __u32 h; + __u32 k1; +}; + +#define NETEM_DIST_SCALE 8192 +#define NETEM_DIST_MAX 16384 + +/* DRR */ + +enum { + TCA_DRR_UNSPEC, + TCA_DRR_QUANTUM, + __TCA_DRR_MAX +}; + +#define TCA_DRR_MAX (__TCA_DRR_MAX - 1) + +struct tc_drr_stats { + __u32 deficit; +}; + +/* MQPRIO */ +#define TC_QOPT_BITMASK 15 +#define TC_QOPT_MAX_QUEUE 16 + +enum { + TC_MQPRIO_HW_OFFLOAD_NONE, /* no offload requested */ + TC_MQPRIO_HW_OFFLOAD_TCS, /* offload TCs, no queue counts */ + __TC_MQPRIO_HW_OFFLOAD_MAX +}; + +#define TC_MQPRIO_HW_OFFLOAD_MAX (__TC_MQPRIO_HW_OFFLOAD_MAX - 1) + +enum { + TC_MQPRIO_MODE_DCB, + TC_MQPRIO_MODE_CHANNEL, + __TC_MQPRIO_MODE_MAX +}; + +#define __TC_MQPRIO_MODE_MAX (__TC_MQPRIO_MODE_MAX - 1) + +enum { + TC_MQPRIO_SHAPER_DCB, + TC_MQPRIO_SHAPER_BW_RATE, /* Add new shapers below */ + __TC_MQPRIO_SHAPER_MAX +}; + +#define __TC_MQPRIO_SHAPER_MAX (__TC_MQPRIO_SHAPER_MAX - 1) + +struct tc_mqprio_qopt { + __u8 num_tc; + __u8 prio_tc_map[TC_QOPT_BITMASK + 1]; + __u8 hw; + __u16 count[TC_QOPT_MAX_QUEUE]; + __u16 offset[TC_QOPT_MAX_QUEUE]; +}; + +#define TC_MQPRIO_F_MODE 0x1 +#define TC_MQPRIO_F_SHAPER 0x2 +#define TC_MQPRIO_F_MIN_RATE 0x4 +#define TC_MQPRIO_F_MAX_RATE 0x8 + +enum { + TCA_MQPRIO_UNSPEC, + TCA_MQPRIO_MODE, + TCA_MQPRIO_SHAPER, + TCA_MQPRIO_MIN_RATE64, + TCA_MQPRIO_MAX_RATE64, + __TCA_MQPRIO_MAX, +}; + +#define TCA_MQPRIO_MAX (__TCA_MQPRIO_MAX - 1) + +/* SFB */ + +enum { + TCA_SFB_UNSPEC, + TCA_SFB_PARMS, + __TCA_SFB_MAX, +}; + +#define TCA_SFB_MAX (__TCA_SFB_MAX - 1) + +/* + * Note: increment, decrement are Q0.16 fixed-point values. + */ +struct tc_sfb_qopt { + __u32 rehash_interval; /* delay between hash move, in ms */ + __u32 warmup_time; /* double buffering warmup time in ms (warmup_time < rehash_interval) */ + __u32 max; /* max len of qlen_min */ + __u32 bin_size; /* maximum queue length per bin */ + __u32 increment; /* probability increment, (d1 in Blue) */ + __u32 decrement; /* probability decrement, (d2 in Blue) */ + __u32 limit; /* max SFB queue length */ + __u32 penalty_rate; /* inelastic flows are rate limited to 'rate' pps */ + __u32 penalty_burst; +}; + +struct tc_sfb_xstats { + __u32 earlydrop; + __u32 penaltydrop; + __u32 bucketdrop; + __u32 queuedrop; + __u32 childdrop; /* drops in child qdisc */ + __u32 marked; + __u32 maxqlen; + __u32 maxprob; + __u32 avgprob; +}; + +#define SFB_MAX_PROB 0xFFFF + +/* QFQ */ +enum { + TCA_QFQ_UNSPEC, + TCA_QFQ_WEIGHT, + TCA_QFQ_LMAX, + __TCA_QFQ_MAX +}; + +#define TCA_QFQ_MAX (__TCA_QFQ_MAX - 1) + +struct tc_qfq_stats { + __u32 weight; + __u32 lmax; +}; + +/* CODEL */ + +enum { + TCA_CODEL_UNSPEC, + TCA_CODEL_TARGET, + TCA_CODEL_LIMIT, + TCA_CODEL_INTERVAL, + TCA_CODEL_ECN, + TCA_CODEL_CE_THRESHOLD, + __TCA_CODEL_MAX +}; + +#define TCA_CODEL_MAX (__TCA_CODEL_MAX - 1) + +struct tc_codel_xstats { + __u32 maxpacket; /* largest packet we've seen so far */ + __u32 count; /* how many drops we've done since the last time we + * entered dropping state + */ + __u32 lastcount; /* count at entry to dropping state */ + __u32 ldelay; /* in-queue delay seen by most recently dequeued packet */ + __s32 drop_next; /* time to drop next packet */ + __u32 drop_overlimit; /* number of time max qdisc packet limit was hit */ + __u32 ecn_mark; /* number of packets we ECN marked instead of dropped */ + __u32 dropping; /* are we in dropping state ? */ + __u32 ce_mark; /* number of CE marked packets because of ce_threshold */ +}; + +/* FQ_CODEL */ + +enum { + TCA_FQ_CODEL_UNSPEC, + TCA_FQ_CODEL_TARGET, + TCA_FQ_CODEL_LIMIT, + TCA_FQ_CODEL_INTERVAL, + TCA_FQ_CODEL_ECN, + TCA_FQ_CODEL_FLOWS, + TCA_FQ_CODEL_QUANTUM, + TCA_FQ_CODEL_CE_THRESHOLD, + TCA_FQ_CODEL_DROP_BATCH_SIZE, + TCA_FQ_CODEL_MEMORY_LIMIT, + __TCA_FQ_CODEL_MAX +}; + +#define TCA_FQ_CODEL_MAX (__TCA_FQ_CODEL_MAX - 1) + +enum { + TCA_FQ_CODEL_XSTATS_QDISC, + TCA_FQ_CODEL_XSTATS_CLASS, +}; + +struct tc_fq_codel_qd_stats { + __u32 maxpacket; /* largest packet we've seen so far */ + __u32 drop_overlimit; /* number of time max qdisc + * packet limit was hit + */ + __u32 ecn_mark; /* number of packets we ECN marked + * instead of being dropped + */ + __u32 new_flow_count; /* number of time packets + * created a 'new flow' + */ + __u32 new_flows_len; /* count of flows in new list */ + __u32 old_flows_len; /* count of flows in old list */ + __u32 ce_mark; /* packets above ce_threshold */ + __u32 memory_usage; /* in bytes */ + __u32 drop_overmemory; +}; + +struct tc_fq_codel_cl_stats { + __s32 deficit; + __u32 ldelay; /* in-queue delay seen by most recently + * dequeued packet + */ + __u32 count; + __u32 lastcount; + __u32 dropping; + __s32 drop_next; +}; + +struct tc_fq_codel_xstats { + __u32 type; + union { + struct tc_fq_codel_qd_stats qdisc_stats; + struct tc_fq_codel_cl_stats class_stats; + }; +}; + +/* FQ */ + +enum { + TCA_FQ_UNSPEC, + + TCA_FQ_PLIMIT, /* limit of total number of packets in queue */ + + TCA_FQ_FLOW_PLIMIT, /* limit of packets per flow */ + + TCA_FQ_QUANTUM, /* RR quantum */ + + TCA_FQ_INITIAL_QUANTUM, /* RR quantum for new flow */ + + TCA_FQ_RATE_ENABLE, /* enable/disable rate limiting */ + + TCA_FQ_FLOW_DEFAULT_RATE,/* obsolete, do not use */ + + TCA_FQ_FLOW_MAX_RATE, /* per flow max rate */ + + TCA_FQ_BUCKETS_LOG, /* log2(number of buckets) */ + + TCA_FQ_FLOW_REFILL_DELAY, /* flow credit refill delay in usec */ + + TCA_FQ_ORPHAN_MASK, /* mask applied to orphaned skb hashes */ + + TCA_FQ_LOW_RATE_THRESHOLD, /* per packet delay under this rate */ + + TCA_FQ_CE_THRESHOLD, /* DCTCP-like CE-marking threshold */ + + __TCA_FQ_MAX +}; + +#define TCA_FQ_MAX (__TCA_FQ_MAX - 1) + +struct tc_fq_qd_stats { + __u64 gc_flows; + __u64 highprio_packets; + __u64 tcp_retrans; + __u64 throttled; + __u64 flows_plimit; + __u64 pkts_too_long; + __u64 allocation_errors; + __s64 time_next_delayed_flow; + __u32 flows; + __u32 inactive_flows; + __u32 throttled_flows; + __u32 unthrottle_latency_ns; + __u64 ce_mark; /* packets above ce_threshold */ +}; + +/* Heavy-Hitter Filter */ + +enum { + TCA_HHF_UNSPEC, + TCA_HHF_BACKLOG_LIMIT, + TCA_HHF_QUANTUM, + TCA_HHF_HH_FLOWS_LIMIT, + TCA_HHF_RESET_TIMEOUT, + TCA_HHF_ADMIT_BYTES, + TCA_HHF_EVICT_TIMEOUT, + TCA_HHF_NON_HH_WEIGHT, + __TCA_HHF_MAX +}; + +#define TCA_HHF_MAX (__TCA_HHF_MAX - 1) + +struct tc_hhf_xstats { + __u32 drop_overlimit; /* number of times max qdisc packet limit + * was hit + */ + __u32 hh_overlimit; /* number of times max heavy-hitters was hit */ + __u32 hh_tot_count; /* number of captured heavy-hitters so far */ + __u32 hh_cur_count; /* number of current heavy-hitters */ +}; + +/* PIE */ +enum { + TCA_PIE_UNSPEC, + TCA_PIE_TARGET, + TCA_PIE_LIMIT, + TCA_PIE_TUPDATE, + TCA_PIE_ALPHA, + TCA_PIE_BETA, + TCA_PIE_ECN, + TCA_PIE_BYTEMODE, + __TCA_PIE_MAX +}; +#define TCA_PIE_MAX (__TCA_PIE_MAX - 1) + +struct tc_pie_xstats { + __u32 prob; /* current probability */ + __u32 delay; /* current delay in ms */ + __u32 avg_dq_rate; /* current average dq_rate in bits/pie_time */ + __u32 packets_in; /* total number of packets enqueued */ + __u32 dropped; /* packets dropped due to pie_action */ + __u32 overlimit; /* dropped due to lack of space in queue */ + __u32 maxq; /* maximum queue size */ + __u32 ecn_mark; /* packets marked with ecn*/ +}; + +/* CBS */ +struct tc_cbs_qopt { + __u8 offload; + __u8 _pad[3]; + __s32 hicredit; + __s32 locredit; + __s32 idleslope; + __s32 sendslope; +}; + +enum { + TCA_CBS_UNSPEC, + TCA_CBS_PARMS, + __TCA_CBS_MAX, +}; + +#define TCA_CBS_MAX (__TCA_CBS_MAX - 1) + + +/* ETF */ +struct tc_etf_qopt { + __s32 delta; + __s32 clockid; + __u32 flags; +#define TC_ETF_DEADLINE_MODE_ON BIT(0) +#define TC_ETF_OFFLOAD_ON BIT(1) +}; + +enum { + TCA_ETF_UNSPEC, + TCA_ETF_PARMS, + __TCA_ETF_MAX, +}; + +#define TCA_ETF_MAX (__TCA_ETF_MAX - 1) + + +/* CAKE */ +enum { + TCA_CAKE_UNSPEC, + TCA_CAKE_PAD, + TCA_CAKE_BASE_RATE64, + TCA_CAKE_DIFFSERV_MODE, + TCA_CAKE_ATM, + TCA_CAKE_FLOW_MODE, + TCA_CAKE_OVERHEAD, + TCA_CAKE_RTT, + TCA_CAKE_TARGET, + TCA_CAKE_AUTORATE, + TCA_CAKE_MEMORY, + TCA_CAKE_NAT, + TCA_CAKE_RAW, + TCA_CAKE_WASH, + TCA_CAKE_MPU, + TCA_CAKE_INGRESS, + TCA_CAKE_ACK_FILTER, + TCA_CAKE_SPLIT_GSO, + __TCA_CAKE_MAX +}; +#define TCA_CAKE_MAX (__TCA_CAKE_MAX - 1) + +enum { + __TCA_CAKE_STATS_INVALID, + TCA_CAKE_STATS_PAD, + TCA_CAKE_STATS_CAPACITY_ESTIMATE64, + TCA_CAKE_STATS_MEMORY_LIMIT, + TCA_CAKE_STATS_MEMORY_USED, + TCA_CAKE_STATS_AVG_NETOFF, + TCA_CAKE_STATS_MIN_NETLEN, + TCA_CAKE_STATS_MAX_NETLEN, + TCA_CAKE_STATS_MIN_ADJLEN, + TCA_CAKE_STATS_MAX_ADJLEN, + TCA_CAKE_STATS_TIN_STATS, + TCA_CAKE_STATS_DEFICIT, + TCA_CAKE_STATS_COBALT_COUNT, + TCA_CAKE_STATS_DROPPING, + TCA_CAKE_STATS_DROP_NEXT_US, + TCA_CAKE_STATS_P_DROP, + TCA_CAKE_STATS_BLUE_TIMER_US, + __TCA_CAKE_STATS_MAX +}; +#define TCA_CAKE_STATS_MAX (__TCA_CAKE_STATS_MAX - 1) + +enum { + __TCA_CAKE_TIN_STATS_INVALID, + TCA_CAKE_TIN_STATS_PAD, + TCA_CAKE_TIN_STATS_SENT_PACKETS, + TCA_CAKE_TIN_STATS_SENT_BYTES64, + TCA_CAKE_TIN_STATS_DROPPED_PACKETS, + TCA_CAKE_TIN_STATS_DROPPED_BYTES64, + TCA_CAKE_TIN_STATS_ACKS_DROPPED_PACKETS, + TCA_CAKE_TIN_STATS_ACKS_DROPPED_BYTES64, + TCA_CAKE_TIN_STATS_ECN_MARKED_PACKETS, + TCA_CAKE_TIN_STATS_ECN_MARKED_BYTES64, + TCA_CAKE_TIN_STATS_BACKLOG_PACKETS, + TCA_CAKE_TIN_STATS_BACKLOG_BYTES, + TCA_CAKE_TIN_STATS_THRESHOLD_RATE64, + TCA_CAKE_TIN_STATS_TARGET_US, + TCA_CAKE_TIN_STATS_INTERVAL_US, + TCA_CAKE_TIN_STATS_WAY_INDIRECT_HITS, + TCA_CAKE_TIN_STATS_WAY_MISSES, + TCA_CAKE_TIN_STATS_WAY_COLLISIONS, + TCA_CAKE_TIN_STATS_PEAK_DELAY_US, + TCA_CAKE_TIN_STATS_AVG_DELAY_US, + TCA_CAKE_TIN_STATS_BASE_DELAY_US, + TCA_CAKE_TIN_STATS_SPARSE_FLOWS, + TCA_CAKE_TIN_STATS_BULK_FLOWS, + TCA_CAKE_TIN_STATS_UNRESPONSIVE_FLOWS, + TCA_CAKE_TIN_STATS_MAX_SKBLEN, + TCA_CAKE_TIN_STATS_FLOW_QUANTUM, + __TCA_CAKE_TIN_STATS_MAX +}; +#define TCA_CAKE_TIN_STATS_MAX (__TCA_CAKE_TIN_STATS_MAX - 1) +#define TC_CAKE_MAX_TINS (8) + +enum { + CAKE_FLOW_NONE = 0, + CAKE_FLOW_SRC_IP, + CAKE_FLOW_DST_IP, + CAKE_FLOW_HOSTS, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_DST_IP */ + CAKE_FLOW_FLOWS, + CAKE_FLOW_DUAL_SRC, /* = CAKE_FLOW_SRC_IP | CAKE_FLOW_FLOWS */ + CAKE_FLOW_DUAL_DST, /* = CAKE_FLOW_DST_IP | CAKE_FLOW_FLOWS */ + CAKE_FLOW_TRIPLE, /* = CAKE_FLOW_HOSTS | CAKE_FLOW_FLOWS */ + CAKE_FLOW_MAX, +}; + +enum { + CAKE_DIFFSERV_DIFFSERV3 = 0, + CAKE_DIFFSERV_DIFFSERV4, + CAKE_DIFFSERV_DIFFSERV8, + CAKE_DIFFSERV_BESTEFFORT, + CAKE_DIFFSERV_PRECEDENCE, + CAKE_DIFFSERV_MAX +}; + +enum { + CAKE_ACK_NONE = 0, + CAKE_ACK_FILTER, + CAKE_ACK_AGGRESSIVE, + CAKE_ACK_MAX +}; + +enum { + CAKE_ATM_NONE = 0, + CAKE_ATM_ATM, + CAKE_ATM_PTM, + CAKE_ATM_MAX +}; + + +/* TAPRIO */ +enum { + TC_TAPRIO_CMD_SET_GATES = 0x00, + TC_TAPRIO_CMD_SET_AND_HOLD = 0x01, + TC_TAPRIO_CMD_SET_AND_RELEASE = 0x02, +}; + +enum { + TCA_TAPRIO_SCHED_ENTRY_UNSPEC, + TCA_TAPRIO_SCHED_ENTRY_INDEX, /* u32 */ + TCA_TAPRIO_SCHED_ENTRY_CMD, /* u8 */ + TCA_TAPRIO_SCHED_ENTRY_GATE_MASK, /* u32 */ + TCA_TAPRIO_SCHED_ENTRY_INTERVAL, /* u32 */ + __TCA_TAPRIO_SCHED_ENTRY_MAX, +}; +#define TCA_TAPRIO_SCHED_ENTRY_MAX (__TCA_TAPRIO_SCHED_ENTRY_MAX - 1) + +/* The format for schedule entry list is: + * [TCA_TAPRIO_SCHED_ENTRY_LIST] + * [TCA_TAPRIO_SCHED_ENTRY] + * [TCA_TAPRIO_SCHED_ENTRY_CMD] + * [TCA_TAPRIO_SCHED_ENTRY_GATES] + * [TCA_TAPRIO_SCHED_ENTRY_INTERVAL] + */ +enum { + TCA_TAPRIO_SCHED_UNSPEC, + TCA_TAPRIO_SCHED_ENTRY, + __TCA_TAPRIO_SCHED_MAX, +}; + +#define TCA_TAPRIO_SCHED_MAX (__TCA_TAPRIO_SCHED_MAX - 1) + +enum { + TCA_TAPRIO_ATTR_UNSPEC, + TCA_TAPRIO_ATTR_PRIOMAP, /* struct tc_mqprio_qopt */ + TCA_TAPRIO_ATTR_SCHED_ENTRY_LIST, /* nested of entry */ + TCA_TAPRIO_ATTR_SCHED_BASE_TIME, /* s64 */ + TCA_TAPRIO_ATTR_SCHED_SINGLE_ENTRY, /* single entry */ + TCA_TAPRIO_ATTR_SCHED_CLOCKID, /* s32 */ + TCA_TAPRIO_PAD, + __TCA_TAPRIO_ATTR_MAX, +}; + +#define TCA_TAPRIO_ATTR_MAX (__TCA_TAPRIO_ATTR_MAX - 1) + +#endif diff --git a/pkg/plugin/lib/common/libbpf/_include/uapi/linux/placeholder.go b/pkg/plugin/lib/common/libbpf/_include/uapi/linux/placeholder.go new file mode 100644 index 0000000000..237a542235 --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_include/uapi/linux/placeholder.go @@ -0,0 +1,3 @@ +package uapi //nolint:all + +// This file is a placeholder to make Go include this directory when vendoring. diff --git a/pkg/plugin/lib/common/libbpf/_src/bpf.c b/pkg/plugin/lib/common/libbpf/_src/bpf.c index a9d292c106..2a4c71501a 100644 --- a/pkg/plugin/lib/common/libbpf/_src/bpf.c +++ b/pkg/plugin/lib/common/libbpf/_src/bpf.c @@ -84,9 +84,7 @@ static inline int sys_bpf_fd(enum bpf_cmd cmd, union bpf_attr *attr, return ensure_good_fd(fd); } -#define PROG_LOAD_ATTEMPTS 5 - -static inline int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int attempts) +int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int attempts) { int fd; @@ -105,9 +103,9 @@ static inline int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int * [0] https://lore.kernel.org/bpf/20201201215900.3569844-1-guro@fb.com/ * [1] d05512618056 ("bpf: Add bpf_ktime_get_coarse_ns helper") */ -int probe_memcg_account(void) +int probe_memcg_account(int token_fd) { - const size_t prog_load_attr_sz = offsetofend(union bpf_attr, attach_btf_obj_fd); + const size_t attr_sz = offsetofend(union bpf_attr, prog_token_fd); struct bpf_insn insns[] = { BPF_EMIT_CALL(BPF_FUNC_ktime_get_coarse_ns), BPF_EXIT_INSN(), @@ -117,13 +115,16 @@ int probe_memcg_account(void) int prog_fd; /* attempt loading freplace trying to use custom BTF */ - memset(&attr, 0, prog_load_attr_sz); + memset(&attr, 0, attr_sz); attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; attr.insns = ptr_to_u64(insns); attr.insn_cnt = insn_cnt; attr.license = ptr_to_u64("GPL"); + attr.prog_token_fd = token_fd; + if (token_fd) + attr.prog_flags |= BPF_F_TOKEN_FD; - prog_fd = sys_bpf_fd(BPF_PROG_LOAD, &attr, prog_load_attr_sz); + prog_fd = sys_bpf_fd(BPF_PROG_LOAD, &attr, attr_sz); if (prog_fd >= 0) { close(prog_fd); return 1; @@ -147,12 +148,8 @@ int bump_rlimit_memlock(void) { struct rlimit rlim; - /* this the default in libbpf 1.0, but for now user has to opt-in explicitly */ - if (!(libbpf_mode & LIBBPF_STRICT_AUTO_RLIMIT_MEMLOCK)) - return 0; - /* if kernel supports memcg-based accounting, skip bumping RLIMIT_MEMLOCK */ - if (memlock_bumped || kernel_supports(NULL, FEAT_MEMCG_ACCOUNT)) + if (memlock_bumped || feat_supported(NULL, FEAT_MEMCG_ACCOUNT)) return 0; memlock_bumped = true; @@ -175,7 +172,7 @@ int bpf_map_create(enum bpf_map_type map_type, __u32 max_entries, const struct bpf_map_create_opts *opts) { - const size_t attr_sz = offsetofend(union bpf_attr, map_extra); + const size_t attr_sz = offsetofend(union bpf_attr, map_token_fd); union bpf_attr attr; int fd; @@ -187,7 +184,7 @@ int bpf_map_create(enum bpf_map_type map_type, return libbpf_err(-EINVAL); attr.map_type = map_type; - if (map_name) + if (map_name && feat_supported(NULL, FEAT_PROG_NAME)) libbpf_strlcpy(attr.map_name, map_name, sizeof(attr.map_name)); attr.key_size = key_size; attr.value_size = value_size; @@ -197,6 +194,7 @@ int bpf_map_create(enum bpf_map_type map_type, attr.btf_key_type_id = OPTS_GET(opts, btf_key_type_id, 0); attr.btf_value_type_id = OPTS_GET(opts, btf_value_type_id, 0); attr.btf_vmlinux_value_type_id = OPTS_GET(opts, btf_vmlinux_value_type_id, 0); + attr.value_type_btf_obj_fd = OPTS_GET(opts, value_type_btf_obj_fd, 0); attr.inner_map_fd = OPTS_GET(opts, inner_map_fd, 0); attr.map_flags = OPTS_GET(opts, map_flags, 0); @@ -204,90 +202,12 @@ int bpf_map_create(enum bpf_map_type map_type, attr.numa_node = OPTS_GET(opts, numa_node, 0); attr.map_ifindex = OPTS_GET(opts, map_ifindex, 0); + attr.map_token_fd = OPTS_GET(opts, token_fd, 0); + fd = sys_bpf_fd(BPF_MAP_CREATE, &attr, attr_sz); return libbpf_err_errno(fd); } -int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr) -{ - LIBBPF_OPTS(bpf_map_create_opts, p); - - p.map_flags = create_attr->map_flags; - p.numa_node = create_attr->numa_node; - p.btf_fd = create_attr->btf_fd; - p.btf_key_type_id = create_attr->btf_key_type_id; - p.btf_value_type_id = create_attr->btf_value_type_id; - p.map_ifindex = create_attr->map_ifindex; - if (create_attr->map_type == BPF_MAP_TYPE_STRUCT_OPS) - p.btf_vmlinux_value_type_id = create_attr->btf_vmlinux_value_type_id; - else - p.inner_map_fd = create_attr->inner_map_fd; - - return bpf_map_create(create_attr->map_type, create_attr->name, - create_attr->key_size, create_attr->value_size, - create_attr->max_entries, &p); -} - -int bpf_create_map_node(enum bpf_map_type map_type, const char *name, - int key_size, int value_size, int max_entries, - __u32 map_flags, int node) -{ - LIBBPF_OPTS(bpf_map_create_opts, opts); - - opts.map_flags = map_flags; - if (node >= 0) { - opts.numa_node = node; - opts.map_flags |= BPF_F_NUMA_NODE; - } - - return bpf_map_create(map_type, name, key_size, value_size, max_entries, &opts); -} - -int bpf_create_map(enum bpf_map_type map_type, int key_size, - int value_size, int max_entries, __u32 map_flags) -{ - LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = map_flags); - - return bpf_map_create(map_type, NULL, key_size, value_size, max_entries, &opts); -} - -int bpf_create_map_name(enum bpf_map_type map_type, const char *name, - int key_size, int value_size, int max_entries, - __u32 map_flags) -{ - LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = map_flags); - - return bpf_map_create(map_type, name, key_size, value_size, max_entries, &opts); -} - -int bpf_create_map_in_map_node(enum bpf_map_type map_type, const char *name, - int key_size, int inner_map_fd, int max_entries, - __u32 map_flags, int node) -{ - LIBBPF_OPTS(bpf_map_create_opts, opts); - - opts.inner_map_fd = inner_map_fd; - opts.map_flags = map_flags; - if (node >= 0) { - opts.map_flags |= BPF_F_NUMA_NODE; - opts.numa_node = node; - } - - return bpf_map_create(map_type, name, key_size, 4, max_entries, &opts); -} - -int bpf_create_map_in_map(enum bpf_map_type map_type, const char *name, - int key_size, int inner_map_fd, int max_entries, - __u32 map_flags) -{ - LIBBPF_OPTS(bpf_map_create_opts, opts, - .inner_map_fd = inner_map_fd, - .map_flags = map_flags, - ); - - return bpf_map_create(map_type, name, key_size, 4, max_entries, &opts); -} - static void * alloc_zero_tailing_info(const void *orecord, __u32 cnt, __u32 actual_rec_size, __u32 expected_rec_size) @@ -313,12 +233,12 @@ alloc_zero_tailing_info(const void *orecord, __u32 cnt, return info; } -DEFAULT_VERSION(bpf_prog_load_v0_6_0, bpf_prog_load, LIBBPF_0.6.0) -int bpf_prog_load_v0_6_0(enum bpf_prog_type prog_type, - const char *prog_name, const char *license, - const struct bpf_insn *insns, size_t insn_cnt, - const struct bpf_prog_load_opts *opts) +int bpf_prog_load(enum bpf_prog_type prog_type, + const char *prog_name, const char *license, + const struct bpf_insn *insns, size_t insn_cnt, + struct bpf_prog_load_opts *opts) { + const size_t attr_sz = offsetofend(union bpf_attr, prog_token_fd); void *finfo = NULL, *linfo = NULL; const char *func_info, *line_info; __u32 log_size, log_level, attach_prog_fd, attach_btf_obj_fd; @@ -338,7 +258,7 @@ int bpf_prog_load_v0_6_0(enum bpf_prog_type prog_type, if (attempts == 0) attempts = PROG_LOAD_ATTEMPTS; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.prog_type = prog_type; attr.expected_attach_type = OPTS_GET(opts, expected_attach_type, 0); @@ -347,8 +267,9 @@ int bpf_prog_load_v0_6_0(enum bpf_prog_type prog_type, attr.prog_flags = OPTS_GET(opts, prog_flags, 0); attr.prog_ifindex = OPTS_GET(opts, prog_ifindex, 0); attr.kern_version = OPTS_GET(opts, kern_version, 0); + attr.prog_token_fd = OPTS_GET(opts, token_fd, 0); - if (prog_name) + if (prog_name && feat_supported(NULL, FEAT_PROG_NAME)) libbpf_strlcpy(attr.prog_name, prog_name, sizeof(attr.prog_name)); attr.license = ptr_to_u64(license); @@ -376,10 +297,6 @@ int bpf_prog_load_v0_6_0(enum bpf_prog_type prog_type, if (!!log_buf != !!log_size) return libbpf_err(-EINVAL); - if (log_level > (4 | 2 | 1)) - return libbpf_err(-EINVAL); - if (log_level && !log_buf) - return libbpf_err(-EINVAL); func_info_rec_size = OPTS_GET(opts, func_info_rec_size, 0); func_info = OPTS_GET(opts, func_info, NULL); @@ -401,7 +318,8 @@ int bpf_prog_load_v0_6_0(enum bpf_prog_type prog_type, attr.log_level = log_level; } - fd = sys_bpf_prog_load(&attr, sizeof(attr), attempts); + fd = sys_bpf_prog_load(&attr, attr_sz, attempts); + OPTS_SET(opts, log_true_size, attr.log_true_size); if (fd >= 0) return fd; @@ -441,7 +359,8 @@ int bpf_prog_load_v0_6_0(enum bpf_prog_type prog_type, break; } - fd = sys_bpf_prog_load(&attr, sizeof(attr), attempts); + fd = sys_bpf_prog_load(&attr, attr_sz, attempts); + OPTS_SET(opts, log_true_size, attr.log_true_size); if (fd >= 0) goto done; } @@ -455,7 +374,8 @@ int bpf_prog_load_v0_6_0(enum bpf_prog_type prog_type, attr.log_size = log_size; attr.log_level = 1; - fd = sys_bpf_prog_load(&attr, sizeof(attr), attempts); + fd = sys_bpf_prog_load(&attr, attr_sz, attempts); + OPTS_SET(opts, log_true_size, attr.log_true_size); } done: /* free() doesn't affect errno, so we don't need to restore it */ @@ -464,204 +384,139 @@ int bpf_prog_load_v0_6_0(enum bpf_prog_type prog_type, return libbpf_err_errno(fd); } -__attribute__((alias("bpf_load_program_xattr2"))) -int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, - char *log_buf, size_t log_buf_sz); - -static int bpf_load_program_xattr2(const struct bpf_load_program_attr *load_attr, - char *log_buf, size_t log_buf_sz) -{ - LIBBPF_OPTS(bpf_prog_load_opts, p); - - if (!load_attr || !log_buf != !log_buf_sz) - return libbpf_err(-EINVAL); - - p.expected_attach_type = load_attr->expected_attach_type; - switch (load_attr->prog_type) { - case BPF_PROG_TYPE_STRUCT_OPS: - case BPF_PROG_TYPE_LSM: - p.attach_btf_id = load_attr->attach_btf_id; - break; - case BPF_PROG_TYPE_TRACING: - case BPF_PROG_TYPE_EXT: - p.attach_btf_id = load_attr->attach_btf_id; - p.attach_prog_fd = load_attr->attach_prog_fd; - break; - default: - p.prog_ifindex = load_attr->prog_ifindex; - p.kern_version = load_attr->kern_version; - } - p.log_level = load_attr->log_level; - p.log_buf = log_buf; - p.log_size = log_buf_sz; - p.prog_btf_fd = load_attr->prog_btf_fd; - p.func_info_rec_size = load_attr->func_info_rec_size; - p.func_info_cnt = load_attr->func_info_cnt; - p.func_info = load_attr->func_info; - p.line_info_rec_size = load_attr->line_info_rec_size; - p.line_info_cnt = load_attr->line_info_cnt; - p.line_info = load_attr->line_info; - p.prog_flags = load_attr->prog_flags; - - return bpf_prog_load(load_attr->prog_type, load_attr->name, load_attr->license, - load_attr->insns, load_attr->insns_cnt, &p); -} - -int bpf_load_program(enum bpf_prog_type type, const struct bpf_insn *insns, - size_t insns_cnt, const char *license, - __u32 kern_version, char *log_buf, - size_t log_buf_sz) -{ - struct bpf_load_program_attr load_attr; - - memset(&load_attr, 0, sizeof(struct bpf_load_program_attr)); - load_attr.prog_type = type; - load_attr.expected_attach_type = 0; - load_attr.name = NULL; - load_attr.insns = insns; - load_attr.insns_cnt = insns_cnt; - load_attr.license = license; - load_attr.kern_version = kern_version; - - return bpf_load_program_xattr2(&load_attr, log_buf, log_buf_sz); -} - -int bpf_verify_program(enum bpf_prog_type type, const struct bpf_insn *insns, - size_t insns_cnt, __u32 prog_flags, const char *license, - __u32 kern_version, char *log_buf, size_t log_buf_sz, - int log_level) -{ - union bpf_attr attr; - int fd; - - bump_rlimit_memlock(); - - memset(&attr, 0, sizeof(attr)); - attr.prog_type = type; - attr.insn_cnt = (__u32)insns_cnt; - attr.insns = ptr_to_u64(insns); - attr.license = ptr_to_u64(license); - attr.log_buf = ptr_to_u64(log_buf); - attr.log_size = log_buf_sz; - attr.log_level = log_level; - log_buf[0] = 0; - attr.kern_version = kern_version; - attr.prog_flags = prog_flags; - - fd = sys_bpf_prog_load(&attr, sizeof(attr), PROG_LOAD_ATTEMPTS); - return libbpf_err_errno(fd); -} - int bpf_map_update_elem(int fd, const void *key, const void *value, __u64 flags) { + const size_t attr_sz = offsetofend(union bpf_attr, flags); union bpf_attr attr; int ret; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.map_fd = fd; attr.key = ptr_to_u64(key); attr.value = ptr_to_u64(value); attr.flags = flags; - ret = sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)); + ret = sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, attr_sz); return libbpf_err_errno(ret); } int bpf_map_lookup_elem(int fd, const void *key, void *value) { + const size_t attr_sz = offsetofend(union bpf_attr, flags); union bpf_attr attr; int ret; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.map_fd = fd; attr.key = ptr_to_u64(key); attr.value = ptr_to_u64(value); - ret = sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); + ret = sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, attr_sz); return libbpf_err_errno(ret); } int bpf_map_lookup_elem_flags(int fd, const void *key, void *value, __u64 flags) { + const size_t attr_sz = offsetofend(union bpf_attr, flags); union bpf_attr attr; int ret; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.map_fd = fd; attr.key = ptr_to_u64(key); attr.value = ptr_to_u64(value); attr.flags = flags; - ret = sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)); + ret = sys_bpf(BPF_MAP_LOOKUP_ELEM, &attr, attr_sz); return libbpf_err_errno(ret); } int bpf_map_lookup_and_delete_elem(int fd, const void *key, void *value) { + const size_t attr_sz = offsetofend(union bpf_attr, flags); union bpf_attr attr; int ret; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.map_fd = fd; attr.key = ptr_to_u64(key); attr.value = ptr_to_u64(value); - ret = sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, sizeof(attr)); + ret = sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, attr_sz); return libbpf_err_errno(ret); } int bpf_map_lookup_and_delete_elem_flags(int fd, const void *key, void *value, __u64 flags) { + const size_t attr_sz = offsetofend(union bpf_attr, flags); union bpf_attr attr; int ret; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.map_fd = fd; attr.key = ptr_to_u64(key); attr.value = ptr_to_u64(value); attr.flags = flags; - ret = sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, sizeof(attr)); + ret = sys_bpf(BPF_MAP_LOOKUP_AND_DELETE_ELEM, &attr, attr_sz); return libbpf_err_errno(ret); } int bpf_map_delete_elem(int fd, const void *key) { + const size_t attr_sz = offsetofend(union bpf_attr, flags); union bpf_attr attr; int ret; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = ptr_to_u64(key); + + ret = sys_bpf(BPF_MAP_DELETE_ELEM, &attr, attr_sz); + return libbpf_err_errno(ret); +} + +int bpf_map_delete_elem_flags(int fd, const void *key, __u64 flags) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); attr.map_fd = fd; attr.key = ptr_to_u64(key); + attr.flags = flags; - ret = sys_bpf(BPF_MAP_DELETE_ELEM, &attr, sizeof(attr)); + ret = sys_bpf(BPF_MAP_DELETE_ELEM, &attr, attr_sz); return libbpf_err_errno(ret); } int bpf_map_get_next_key(int fd, const void *key, void *next_key) { + const size_t attr_sz = offsetofend(union bpf_attr, next_key); union bpf_attr attr; int ret; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.map_fd = fd; attr.key = ptr_to_u64(key); attr.next_key = ptr_to_u64(next_key); - ret = sys_bpf(BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr)); + ret = sys_bpf(BPF_MAP_GET_NEXT_KEY, &attr, attr_sz); return libbpf_err_errno(ret); } int bpf_map_freeze(int fd) { + const size_t attr_sz = offsetofend(union bpf_attr, map_fd); union bpf_attr attr; int ret; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.map_fd = fd; - ret = sys_bpf(BPF_MAP_FREEZE, &attr, sizeof(attr)); + ret = sys_bpf(BPF_MAP_FREEZE, &attr, attr_sz); return libbpf_err_errno(ret); } @@ -670,13 +525,14 @@ static int bpf_map_batch_common(int cmd, int fd, void *in_batch, __u32 *count, const struct bpf_map_batch_opts *opts) { + const size_t attr_sz = offsetofend(union bpf_attr, batch); union bpf_attr attr; int ret; if (!OPTS_VALID(opts, bpf_map_batch_opts)) return libbpf_err(-EINVAL); - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.batch.map_fd = fd; attr.batch.in_batch = ptr_to_u64(in_batch); attr.batch.out_batch = ptr_to_u64(out_batch); @@ -686,7 +542,7 @@ static int bpf_map_batch_common(int cmd, int fd, void *in_batch, attr.batch.elem_flags = OPTS_GET(opts, elem_flags, 0); attr.batch.flags = OPTS_GET(opts, flags, 0); - ret = sys_bpf(cmd, &attr, sizeof(attr)); + ret = sys_bpf(cmd, &attr, attr_sz); *count = attr.batch.count; return libbpf_err_errno(ret); @@ -723,28 +579,50 @@ int bpf_map_update_batch(int fd, const void *keys, const void *values, __u32 *co (void *)keys, (void *)values, count, opts); } -int bpf_obj_pin(int fd, const char *pathname) +int bpf_obj_pin_opts(int fd, const char *pathname, const struct bpf_obj_pin_opts *opts) { + const size_t attr_sz = offsetofend(union bpf_attr, path_fd); union bpf_attr attr; int ret; - memset(&attr, 0, sizeof(attr)); + if (!OPTS_VALID(opts, bpf_obj_pin_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.path_fd = OPTS_GET(opts, path_fd, 0); attr.pathname = ptr_to_u64((void *)pathname); + attr.file_flags = OPTS_GET(opts, file_flags, 0); attr.bpf_fd = fd; - ret = sys_bpf(BPF_OBJ_PIN, &attr, sizeof(attr)); + ret = sys_bpf(BPF_OBJ_PIN, &attr, attr_sz); return libbpf_err_errno(ret); } +int bpf_obj_pin(int fd, const char *pathname) +{ + return bpf_obj_pin_opts(fd, pathname, NULL); +} + int bpf_obj_get(const char *pathname) { + return bpf_obj_get_opts(pathname, NULL); +} + +int bpf_obj_get_opts(const char *pathname, const struct bpf_obj_get_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, path_fd); union bpf_attr attr; int fd; - memset(&attr, 0, sizeof(attr)); + if (!OPTS_VALID(opts, bpf_obj_get_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.path_fd = OPTS_GET(opts, path_fd, 0); attr.pathname = ptr_to_u64((void *)pathname); + attr.file_flags = OPTS_GET(opts, file_flags, 0); - fd = sys_bpf_fd(BPF_OBJ_GET, &attr, sizeof(attr)); + fd = sys_bpf_fd(BPF_OBJ_GET, &attr, attr_sz); return libbpf_err_errno(fd); } @@ -758,66 +636,99 @@ int bpf_prog_attach(int prog_fd, int target_fd, enum bpf_attach_type type, return bpf_prog_attach_opts(prog_fd, target_fd, type, &opts); } -int bpf_prog_attach_opts(int prog_fd, int target_fd, - enum bpf_attach_type type, - const struct bpf_prog_attach_opts *opts) +int bpf_prog_attach_opts(int prog_fd, int target, enum bpf_attach_type type, + const struct bpf_prog_attach_opts *opts) { + const size_t attr_sz = offsetofend(union bpf_attr, expected_revision); + __u32 relative_id, flags; + int ret, relative_fd; union bpf_attr attr; - int ret; if (!OPTS_VALID(opts, bpf_prog_attach_opts)) return libbpf_err(-EINVAL); - memset(&attr, 0, sizeof(attr)); - attr.target_fd = target_fd; - attr.attach_bpf_fd = prog_fd; - attr.attach_type = type; - attr.attach_flags = OPTS_GET(opts, flags, 0); - attr.replace_bpf_fd = OPTS_GET(opts, replace_prog_fd, 0); + relative_id = OPTS_GET(opts, relative_id, 0); + relative_fd = OPTS_GET(opts, relative_fd, 0); + flags = OPTS_GET(opts, flags, 0); - ret = sys_bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)); + /* validate we don't have unexpected combinations of non-zero fields */ + if (relative_fd && relative_id) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.target_fd = target; + attr.attach_bpf_fd = prog_fd; + attr.attach_type = type; + attr.replace_bpf_fd = OPTS_GET(opts, replace_fd, 0); + attr.expected_revision = OPTS_GET(opts, expected_revision, 0); + + if (relative_id) { + attr.attach_flags = flags | BPF_F_ID; + attr.relative_id = relative_id; + } else { + attr.attach_flags = flags; + attr.relative_fd = relative_fd; + } + + ret = sys_bpf(BPF_PROG_ATTACH, &attr, attr_sz); return libbpf_err_errno(ret); } -__attribute__((alias("bpf_prog_attach_opts"))) -int bpf_prog_attach_xattr(int prog_fd, int target_fd, - enum bpf_attach_type type, - const struct bpf_prog_attach_opts *opts); - -int bpf_prog_detach(int target_fd, enum bpf_attach_type type) +int bpf_prog_detach_opts(int prog_fd, int target, enum bpf_attach_type type, + const struct bpf_prog_detach_opts *opts) { + const size_t attr_sz = offsetofend(union bpf_attr, expected_revision); + __u32 relative_id, flags; + int ret, relative_fd; union bpf_attr attr; - int ret; - memset(&attr, 0, sizeof(attr)); - attr.target_fd = target_fd; - attr.attach_type = type; + if (!OPTS_VALID(opts, bpf_prog_detach_opts)) + return libbpf_err(-EINVAL); + + relative_id = OPTS_GET(opts, relative_id, 0); + relative_fd = OPTS_GET(opts, relative_fd, 0); + flags = OPTS_GET(opts, flags, 0); + + /* validate we don't have unexpected combinations of non-zero fields */ + if (relative_fd && relative_id) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.target_fd = target; + attr.attach_bpf_fd = prog_fd; + attr.attach_type = type; + attr.expected_revision = OPTS_GET(opts, expected_revision, 0); + + if (relative_id) { + attr.attach_flags = flags | BPF_F_ID; + attr.relative_id = relative_id; + } else { + attr.attach_flags = flags; + attr.relative_fd = relative_fd; + } - ret = sys_bpf(BPF_PROG_DETACH, &attr, sizeof(attr)); + ret = sys_bpf(BPF_PROG_DETACH, &attr, attr_sz); return libbpf_err_errno(ret); } -int bpf_prog_detach2(int prog_fd, int target_fd, enum bpf_attach_type type) +int bpf_prog_detach(int target_fd, enum bpf_attach_type type) { - union bpf_attr attr; - int ret; - - memset(&attr, 0, sizeof(attr)); - attr.target_fd = target_fd; - attr.attach_bpf_fd = prog_fd; - attr.attach_type = type; + return bpf_prog_detach_opts(0, target_fd, type, NULL); +} - ret = sys_bpf(BPF_PROG_DETACH, &attr, sizeof(attr)); - return libbpf_err_errno(ret); +int bpf_prog_detach2(int prog_fd, int target_fd, enum bpf_attach_type type) +{ + return bpf_prog_detach_opts(prog_fd, target_fd, type, NULL); } int bpf_link_create(int prog_fd, int target_fd, enum bpf_attach_type attach_type, const struct bpf_link_create_opts *opts) { - __u32 target_btf_id, iter_info_len; + const size_t attr_sz = offsetofend(union bpf_attr, link_create); + __u32 target_btf_id, iter_info_len, relative_id; + int fd, err, relative_fd; union bpf_attr attr; - int fd, err; if (!OPTS_VALID(opts, bpf_link_create_opts)) return libbpf_err(-EINVAL); @@ -833,7 +744,7 @@ int bpf_link_create(int prog_fd, int target_fd, return libbpf_err(-EINVAL); } - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.link_create.prog_fd = prog_fd; attr.link_create.target_fd = target_fd; attr.link_create.attach_type = attach_type; @@ -855,6 +766,7 @@ int bpf_link_create(int prog_fd, int target_fd, return libbpf_err(-EINVAL); break; case BPF_TRACE_KPROBE_MULTI: + case BPF_TRACE_KPROBE_SESSION: attr.link_create.kprobe_multi.flags = OPTS_GET(opts, kprobe_multi.flags, 0); attr.link_create.kprobe_multi.cnt = OPTS_GET(opts, kprobe_multi.cnt, 0); attr.link_create.kprobe_multi.syms = ptr_to_u64(OPTS_GET(opts, kprobe_multi.syms, 0)); @@ -863,13 +775,73 @@ int bpf_link_create(int prog_fd, int target_fd, if (!OPTS_ZEROED(opts, kprobe_multi)) return libbpf_err(-EINVAL); break; + case BPF_TRACE_UPROBE_MULTI: + attr.link_create.uprobe_multi.flags = OPTS_GET(opts, uprobe_multi.flags, 0); + attr.link_create.uprobe_multi.cnt = OPTS_GET(opts, uprobe_multi.cnt, 0); + attr.link_create.uprobe_multi.path = ptr_to_u64(OPTS_GET(opts, uprobe_multi.path, 0)); + attr.link_create.uprobe_multi.offsets = ptr_to_u64(OPTS_GET(opts, uprobe_multi.offsets, 0)); + attr.link_create.uprobe_multi.ref_ctr_offsets = ptr_to_u64(OPTS_GET(opts, uprobe_multi.ref_ctr_offsets, 0)); + attr.link_create.uprobe_multi.cookies = ptr_to_u64(OPTS_GET(opts, uprobe_multi.cookies, 0)); + attr.link_create.uprobe_multi.pid = OPTS_GET(opts, uprobe_multi.pid, 0); + if (!OPTS_ZEROED(opts, uprobe_multi)) + return libbpf_err(-EINVAL); + break; + case BPF_TRACE_RAW_TP: + case BPF_TRACE_FENTRY: + case BPF_TRACE_FEXIT: + case BPF_MODIFY_RETURN: + case BPF_LSM_MAC: + attr.link_create.tracing.cookie = OPTS_GET(opts, tracing.cookie, 0); + if (!OPTS_ZEROED(opts, tracing)) + return libbpf_err(-EINVAL); + break; + case BPF_NETFILTER: + attr.link_create.netfilter.pf = OPTS_GET(opts, netfilter.pf, 0); + attr.link_create.netfilter.hooknum = OPTS_GET(opts, netfilter.hooknum, 0); + attr.link_create.netfilter.priority = OPTS_GET(opts, netfilter.priority, 0); + attr.link_create.netfilter.flags = OPTS_GET(opts, netfilter.flags, 0); + if (!OPTS_ZEROED(opts, netfilter)) + return libbpf_err(-EINVAL); + break; + case BPF_TCX_INGRESS: + case BPF_TCX_EGRESS: + relative_fd = OPTS_GET(opts, tcx.relative_fd, 0); + relative_id = OPTS_GET(opts, tcx.relative_id, 0); + if (relative_fd && relative_id) + return libbpf_err(-EINVAL); + if (relative_id) { + attr.link_create.tcx.relative_id = relative_id; + attr.link_create.flags |= BPF_F_ID; + } else { + attr.link_create.tcx.relative_fd = relative_fd; + } + attr.link_create.tcx.expected_revision = OPTS_GET(opts, tcx.expected_revision, 0); + if (!OPTS_ZEROED(opts, tcx)) + return libbpf_err(-EINVAL); + break; + case BPF_NETKIT_PRIMARY: + case BPF_NETKIT_PEER: + relative_fd = OPTS_GET(opts, netkit.relative_fd, 0); + relative_id = OPTS_GET(opts, netkit.relative_id, 0); + if (relative_fd && relative_id) + return libbpf_err(-EINVAL); + if (relative_id) { + attr.link_create.netkit.relative_id = relative_id; + attr.link_create.flags |= BPF_F_ID; + } else { + attr.link_create.netkit.relative_fd = relative_fd; + } + attr.link_create.netkit.expected_revision = OPTS_GET(opts, netkit.expected_revision, 0); + if (!OPTS_ZEROED(opts, netkit)) + return libbpf_err(-EINVAL); + break; default: if (!OPTS_ZEROED(opts, flags)) return libbpf_err(-EINVAL); break; } proceed: - fd = sys_bpf_fd(BPF_LINK_CREATE, &attr, sizeof(attr)); + fd = sys_bpf_fd(BPF_LINK_CREATE, &attr, attr_sz); if (fd >= 0) return fd; /* we'll get EINVAL if LINK_CREATE doesn't support attaching fentry @@ -905,134 +877,114 @@ int bpf_link_create(int prog_fd, int target_fd, int bpf_link_detach(int link_fd) { + const size_t attr_sz = offsetofend(union bpf_attr, link_detach); union bpf_attr attr; int ret; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.link_detach.link_fd = link_fd; - ret = sys_bpf(BPF_LINK_DETACH, &attr, sizeof(attr)); + ret = sys_bpf(BPF_LINK_DETACH, &attr, attr_sz); return libbpf_err_errno(ret); } int bpf_link_update(int link_fd, int new_prog_fd, const struct bpf_link_update_opts *opts) { + const size_t attr_sz = offsetofend(union bpf_attr, link_update); union bpf_attr attr; int ret; if (!OPTS_VALID(opts, bpf_link_update_opts)) return libbpf_err(-EINVAL); - memset(&attr, 0, sizeof(attr)); + if (OPTS_GET(opts, old_prog_fd, 0) && OPTS_GET(opts, old_map_fd, 0)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); attr.link_update.link_fd = link_fd; attr.link_update.new_prog_fd = new_prog_fd; attr.link_update.flags = OPTS_GET(opts, flags, 0); - attr.link_update.old_prog_fd = OPTS_GET(opts, old_prog_fd, 0); + if (OPTS_GET(opts, old_prog_fd, 0)) + attr.link_update.old_prog_fd = OPTS_GET(opts, old_prog_fd, 0); + else if (OPTS_GET(opts, old_map_fd, 0)) + attr.link_update.old_map_fd = OPTS_GET(opts, old_map_fd, 0); - ret = sys_bpf(BPF_LINK_UPDATE, &attr, sizeof(attr)); + ret = sys_bpf(BPF_LINK_UPDATE, &attr, attr_sz); return libbpf_err_errno(ret); } int bpf_iter_create(int link_fd) { + const size_t attr_sz = offsetofend(union bpf_attr, iter_create); union bpf_attr attr; int fd; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.iter_create.link_fd = link_fd; - fd = sys_bpf_fd(BPF_ITER_CREATE, &attr, sizeof(attr)); + fd = sys_bpf_fd(BPF_ITER_CREATE, &attr, attr_sz); return libbpf_err_errno(fd); } -int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, - __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt) +int bpf_prog_query_opts(int target, enum bpf_attach_type type, + struct bpf_prog_query_opts *opts) { + const size_t attr_sz = offsetofend(union bpf_attr, query); union bpf_attr attr; int ret; - memset(&attr, 0, sizeof(attr)); - attr.query.target_fd = target_fd; - attr.query.attach_type = type; - attr.query.query_flags = query_flags; - attr.query.prog_cnt = *prog_cnt; - attr.query.prog_ids = ptr_to_u64(prog_ids); - - ret = sys_bpf(BPF_PROG_QUERY, &attr, sizeof(attr)); - - if (attach_flags) - *attach_flags = attr.query.attach_flags; - *prog_cnt = attr.query.prog_cnt; - - return libbpf_err_errno(ret); -} - -int bpf_prog_test_run(int prog_fd, int repeat, void *data, __u32 size, - void *data_out, __u32 *size_out, __u32 *retval, - __u32 *duration) -{ - union bpf_attr attr; - int ret; + if (!OPTS_VALID(opts, bpf_prog_query_opts)) + return libbpf_err(-EINVAL); - memset(&attr, 0, sizeof(attr)); - attr.test.prog_fd = prog_fd; - attr.test.data_in = ptr_to_u64(data); - attr.test.data_out = ptr_to_u64(data_out); - attr.test.data_size_in = size; - attr.test.repeat = repeat; + memset(&attr, 0, attr_sz); + attr.query.target_fd = target; + attr.query.attach_type = type; + attr.query.query_flags = OPTS_GET(opts, query_flags, 0); + attr.query.count = OPTS_GET(opts, count, 0); + attr.query.prog_ids = ptr_to_u64(OPTS_GET(opts, prog_ids, NULL)); + attr.query.link_ids = ptr_to_u64(OPTS_GET(opts, link_ids, NULL)); + attr.query.prog_attach_flags = ptr_to_u64(OPTS_GET(opts, prog_attach_flags, NULL)); + attr.query.link_attach_flags = ptr_to_u64(OPTS_GET(opts, link_attach_flags, NULL)); - ret = sys_bpf(BPF_PROG_TEST_RUN, &attr, sizeof(attr)); + ret = sys_bpf(BPF_PROG_QUERY, &attr, attr_sz); - if (size_out) - *size_out = attr.test.data_size_out; - if (retval) - *retval = attr.test.retval; - if (duration) - *duration = attr.test.duration; + OPTS_SET(opts, attach_flags, attr.query.attach_flags); + OPTS_SET(opts, revision, attr.query.revision); + OPTS_SET(opts, count, attr.query.count); return libbpf_err_errno(ret); } -int bpf_prog_test_run_xattr(struct bpf_prog_test_run_attr *test_attr) +int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, + __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt) { - union bpf_attr attr; + LIBBPF_OPTS(bpf_prog_query_opts, opts); int ret; - if (!test_attr->data_out && test_attr->data_size_out > 0) - return libbpf_err(-EINVAL); + opts.query_flags = query_flags; + opts.prog_ids = prog_ids; + opts.prog_cnt = *prog_cnt; - memset(&attr, 0, sizeof(attr)); - attr.test.prog_fd = test_attr->prog_fd; - attr.test.data_in = ptr_to_u64(test_attr->data_in); - attr.test.data_out = ptr_to_u64(test_attr->data_out); - attr.test.data_size_in = test_attr->data_size_in; - attr.test.data_size_out = test_attr->data_size_out; - attr.test.ctx_in = ptr_to_u64(test_attr->ctx_in); - attr.test.ctx_out = ptr_to_u64(test_attr->ctx_out); - attr.test.ctx_size_in = test_attr->ctx_size_in; - attr.test.ctx_size_out = test_attr->ctx_size_out; - attr.test.repeat = test_attr->repeat; - - ret = sys_bpf(BPF_PROG_TEST_RUN, &attr, sizeof(attr)); - - test_attr->data_size_out = attr.test.data_size_out; - test_attr->ctx_size_out = attr.test.ctx_size_out; - test_attr->retval = attr.test.retval; - test_attr->duration = attr.test.duration; + ret = bpf_prog_query_opts(target_fd, type, &opts); + + if (attach_flags) + *attach_flags = opts.attach_flags; + *prog_cnt = opts.prog_cnt; return libbpf_err_errno(ret); } int bpf_prog_test_run_opts(int prog_fd, struct bpf_test_run_opts *opts) { + const size_t attr_sz = offsetofend(union bpf_attr, test); union bpf_attr attr; int ret; if (!OPTS_VALID(opts, bpf_test_run_opts)) return libbpf_err(-EINVAL); - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.test.prog_fd = prog_fd; attr.test.batch_size = OPTS_GET(opts, batch_size, 0); attr.test.cpu = OPTS_GET(opts, cpu, 0); @@ -1048,7 +1000,7 @@ int bpf_prog_test_run_opts(int prog_fd, struct bpf_test_run_opts *opts) attr.test.data_in = ptr_to_u64(OPTS_GET(opts, data_in, NULL)); attr.test.data_out = ptr_to_u64(OPTS_GET(opts, data_out, NULL)); - ret = sys_bpf(BPF_PROG_TEST_RUN, &attr, sizeof(attr)); + ret = sys_bpf(BPF_PROG_TEST_RUN, &attr, attr_sz); OPTS_SET(opts, data_size_out, attr.test.data_size_out); OPTS_SET(opts, ctx_size_out, attr.test.ctx_size_out); @@ -1060,13 +1012,14 @@ int bpf_prog_test_run_opts(int prog_fd, struct bpf_test_run_opts *opts) static int bpf_obj_get_next_id(__u32 start_id, __u32 *next_id, int cmd) { + const size_t attr_sz = offsetofend(union bpf_attr, open_flags); union bpf_attr attr; int err; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.start_id = start_id; - err = sys_bpf(cmd, &attr, sizeof(attr)); + err = sys_bpf(cmd, &attr, attr_sz); if (!err) *next_id = attr.next_id; @@ -1093,88 +1046,163 @@ int bpf_link_get_next_id(__u32 start_id, __u32 *next_id) return bpf_obj_get_next_id(start_id, next_id, BPF_LINK_GET_NEXT_ID); } -int bpf_prog_get_fd_by_id(__u32 id) +int bpf_prog_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts) { + const size_t attr_sz = offsetofend(union bpf_attr, open_flags); union bpf_attr attr; int fd; - memset(&attr, 0, sizeof(attr)); + if (!OPTS_VALID(opts, bpf_get_fd_by_id_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); attr.prog_id = id; + attr.open_flags = OPTS_GET(opts, open_flags, 0); - fd = sys_bpf_fd(BPF_PROG_GET_FD_BY_ID, &attr, sizeof(attr)); + fd = sys_bpf_fd(BPF_PROG_GET_FD_BY_ID, &attr, attr_sz); return libbpf_err_errno(fd); } -int bpf_map_get_fd_by_id(__u32 id) +int bpf_prog_get_fd_by_id(__u32 id) +{ + return bpf_prog_get_fd_by_id_opts(id, NULL); +} + +int bpf_map_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts) { + const size_t attr_sz = offsetofend(union bpf_attr, open_flags); union bpf_attr attr; int fd; - memset(&attr, 0, sizeof(attr)); + if (!OPTS_VALID(opts, bpf_get_fd_by_id_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); attr.map_id = id; + attr.open_flags = OPTS_GET(opts, open_flags, 0); - fd = sys_bpf_fd(BPF_MAP_GET_FD_BY_ID, &attr, sizeof(attr)); + fd = sys_bpf_fd(BPF_MAP_GET_FD_BY_ID, &attr, attr_sz); return libbpf_err_errno(fd); } -int bpf_btf_get_fd_by_id(__u32 id) +int bpf_map_get_fd_by_id(__u32 id) { + return bpf_map_get_fd_by_id_opts(id, NULL); +} + +int bpf_btf_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, open_flags); union bpf_attr attr; int fd; - memset(&attr, 0, sizeof(attr)); + if (!OPTS_VALID(opts, bpf_get_fd_by_id_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); attr.btf_id = id; + attr.open_flags = OPTS_GET(opts, open_flags, 0); - fd = sys_bpf_fd(BPF_BTF_GET_FD_BY_ID, &attr, sizeof(attr)); + fd = sys_bpf_fd(BPF_BTF_GET_FD_BY_ID, &attr, attr_sz); return libbpf_err_errno(fd); } -int bpf_link_get_fd_by_id(__u32 id) +int bpf_btf_get_fd_by_id(__u32 id) +{ + return bpf_btf_get_fd_by_id_opts(id, NULL); +} + +int bpf_link_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts) { + const size_t attr_sz = offsetofend(union bpf_attr, open_flags); union bpf_attr attr; int fd; - memset(&attr, 0, sizeof(attr)); + if (!OPTS_VALID(opts, bpf_get_fd_by_id_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); attr.link_id = id; + attr.open_flags = OPTS_GET(opts, open_flags, 0); - fd = sys_bpf_fd(BPF_LINK_GET_FD_BY_ID, &attr, sizeof(attr)); + fd = sys_bpf_fd(BPF_LINK_GET_FD_BY_ID, &attr, attr_sz); return libbpf_err_errno(fd); } +int bpf_link_get_fd_by_id(__u32 id) +{ + return bpf_link_get_fd_by_id_opts(id, NULL); +} + int bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len) { + const size_t attr_sz = offsetofend(union bpf_attr, info); union bpf_attr attr; int err; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.info.bpf_fd = bpf_fd; attr.info.info_len = *info_len; attr.info.info = ptr_to_u64(info); - err = sys_bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, sizeof(attr)); - + err = sys_bpf(BPF_OBJ_GET_INFO_BY_FD, &attr, attr_sz); if (!err) *info_len = attr.info.info_len; - return libbpf_err_errno(err); } -int bpf_raw_tracepoint_open(const char *name, int prog_fd) +int bpf_prog_get_info_by_fd(int prog_fd, struct bpf_prog_info *info, __u32 *info_len) +{ + return bpf_obj_get_info_by_fd(prog_fd, info, info_len); +} + +int bpf_map_get_info_by_fd(int map_fd, struct bpf_map_info *info, __u32 *info_len) +{ + return bpf_obj_get_info_by_fd(map_fd, info, info_len); +} + +int bpf_btf_get_info_by_fd(int btf_fd, struct bpf_btf_info *info, __u32 *info_len) +{ + return bpf_obj_get_info_by_fd(btf_fd, info, info_len); +} + +int bpf_link_get_info_by_fd(int link_fd, struct bpf_link_info *info, __u32 *info_len) +{ + return bpf_obj_get_info_by_fd(link_fd, info, info_len); +} + +int bpf_raw_tracepoint_open_opts(int prog_fd, struct bpf_raw_tp_opts *opts) { + const size_t attr_sz = offsetofend(union bpf_attr, raw_tracepoint); union bpf_attr attr; int fd; - memset(&attr, 0, sizeof(attr)); - attr.raw_tracepoint.name = ptr_to_u64(name); + if (!OPTS_VALID(opts, bpf_raw_tp_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); attr.raw_tracepoint.prog_fd = prog_fd; + attr.raw_tracepoint.name = ptr_to_u64(OPTS_GET(opts, tp_name, NULL)); + attr.raw_tracepoint.cookie = OPTS_GET(opts, cookie, 0); - fd = sys_bpf_fd(BPF_RAW_TRACEPOINT_OPEN, &attr, sizeof(attr)); + fd = sys_bpf_fd(BPF_RAW_TRACEPOINT_OPEN, &attr, attr_sz); return libbpf_err_errno(fd); } -int bpf_btf_load(const void *btf_data, size_t btf_size, const struct bpf_btf_load_opts *opts) +int bpf_raw_tracepoint_open(const char *name, int prog_fd) +{ + LIBBPF_OPTS(bpf_raw_tp_opts, opts, .tp_name = name); + + return bpf_raw_tracepoint_open_opts(prog_fd, &opts); +} + +int bpf_btf_load(const void *btf_data, size_t btf_size, struct bpf_btf_load_opts *opts) { - const size_t attr_sz = offsetofend(union bpf_attr, btf_log_level); + const size_t attr_sz = offsetofend(union bpf_attr, btf_token_fd); union bpf_attr attr; char *log_buf; size_t log_size; @@ -1199,6 +1227,10 @@ int bpf_btf_load(const void *btf_data, size_t btf_size, const struct bpf_btf_loa attr.btf = ptr_to_u64(btf_data); attr.btf_size = btf_size; + + attr.btf_flags = OPTS_GET(opts, btf_flags, 0); + attr.btf_token_fd = OPTS_GET(opts, token_fd, 0); + /* log_level == 0 and log_buf != NULL means "try loading without * log_buf, but retry with log_buf and log_level=1 on error", which is * consistent across low-level and high-level BTF and program loading @@ -1217,27 +1249,8 @@ int bpf_btf_load(const void *btf_data, size_t btf_size, const struct bpf_btf_loa attr.btf_log_level = 1; fd = sys_bpf_fd(BPF_BTF_LOAD, &attr, attr_sz); } - return libbpf_err_errno(fd); -} - -int bpf_load_btf(const void *btf, __u32 btf_size, char *log_buf, __u32 log_buf_size, bool do_log) -{ - LIBBPF_OPTS(bpf_btf_load_opts, opts); - int fd; - -retry: - if (do_log && log_buf && log_buf_size) { - opts.log_buf = log_buf; - opts.log_size = log_buf_size; - opts.log_level = 1; - } - - fd = bpf_btf_load(btf, btf_size, &opts); - if (fd < 0 && !do_log && log_buf && log_buf_size) { - do_log = true; - goto retry; - } + OPTS_SET(opts, log_true_size, attr.btf_log_true_size); return libbpf_err_errno(fd); } @@ -1245,16 +1258,18 @@ int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset, __u64 *probe_addr) { - union bpf_attr attr = {}; + const size_t attr_sz = offsetofend(union bpf_attr, task_fd_query); + union bpf_attr attr; int err; + memset(&attr, 0, attr_sz); attr.task_fd_query.pid = pid; attr.task_fd_query.fd = fd; attr.task_fd_query.flags = flags; attr.task_fd_query.buf = ptr_to_u64(buf); attr.task_fd_query.buf_len = *buf_len; - err = sys_bpf(BPF_TASK_FD_QUERY, &attr, sizeof(attr)); + err = sys_bpf(BPF_TASK_FD_QUERY, &attr, attr_sz); *buf_len = attr.task_fd_query.buf_len; *prog_id = attr.task_fd_query.prog_id; @@ -1267,30 +1282,49 @@ int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, int bpf_enable_stats(enum bpf_stats_type type) { + const size_t attr_sz = offsetofend(union bpf_attr, enable_stats); union bpf_attr attr; int fd; - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.enable_stats.type = type; - fd = sys_bpf_fd(BPF_ENABLE_STATS, &attr, sizeof(attr)); + fd = sys_bpf_fd(BPF_ENABLE_STATS, &attr, attr_sz); return libbpf_err_errno(fd); } int bpf_prog_bind_map(int prog_fd, int map_fd, const struct bpf_prog_bind_opts *opts) { + const size_t attr_sz = offsetofend(union bpf_attr, prog_bind_map); union bpf_attr attr; int ret; if (!OPTS_VALID(opts, bpf_prog_bind_opts)) return libbpf_err(-EINVAL); - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, attr_sz); attr.prog_bind_map.prog_fd = prog_fd; attr.prog_bind_map.map_fd = map_fd; attr.prog_bind_map.flags = OPTS_GET(opts, flags, 0); - ret = sys_bpf(BPF_PROG_BIND_MAP, &attr, sizeof(attr)); + ret = sys_bpf(BPF_PROG_BIND_MAP, &attr, attr_sz); return libbpf_err_errno(ret); } + +int bpf_token_create(int bpffs_fd, struct bpf_token_create_opts *opts) +{ + const size_t attr_sz = offsetofend(union bpf_attr, token_create); + union bpf_attr attr; + int fd; + + if (!OPTS_VALID(opts, bpf_token_create_opts)) + return libbpf_err(-EINVAL); + + memset(&attr, 0, attr_sz); + attr.token_create.bpffs_fd = bpffs_fd; + attr.token_create.flags = OPTS_GET(opts, flags, 0); + + fd = sys_bpf_fd(BPF_TOKEN_CREATE, &attr, attr_sz); + return libbpf_err_errno(fd); +} diff --git a/pkg/plugin/lib/common/libbpf/_src/bpf.h b/pkg/plugin/lib/common/libbpf/_src/bpf.h index f4b4afb6d4..972e17ec0c 100644 --- a/pkg/plugin/lib/common/libbpf/_src/bpf.h +++ b/pkg/plugin/lib/common/libbpf/_src/bpf.h @@ -1,7 +1,7 @@ /* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ /* - * common eBPF ELF operations. + * Common BPF ELF operations. * * Copyright (C) 2013-2015 Alexei Starovoitov * Copyright (C) 2015 Wang Nan @@ -35,7 +35,7 @@ extern "C" { #endif -int libbpf_set_memlock_rlim(size_t memlock_bytes); +LIBBPF_API int libbpf_set_memlock_rlim(size_t memlock_bytes); struct bpf_map_create_opts { size_t sz; /* size of this struct for forward/backward compatibility */ @@ -51,8 +51,12 @@ struct bpf_map_create_opts { __u32 numa_node; __u32 map_ifindex; + __s32 value_type_btf_obj_fd; + + __u32 token_fd; + size_t :0; }; -#define bpf_map_create_opts__last_field map_ifindex +#define bpf_map_create_opts__last_field token_fd LIBBPF_API int bpf_map_create(enum bpf_map_type map_type, const char *map_name, @@ -61,48 +65,6 @@ LIBBPF_API int bpf_map_create(enum bpf_map_type map_type, __u32 max_entries, const struct bpf_map_create_opts *opts); -struct bpf_create_map_attr { - const char *name; - enum bpf_map_type map_type; - __u32 map_flags; - __u32 key_size; - __u32 value_size; - __u32 max_entries; - __u32 numa_node; - __u32 btf_fd; - __u32 btf_key_type_id; - __u32 btf_value_type_id; - __u32 map_ifindex; - union { - __u32 inner_map_fd; - __u32 btf_vmlinux_value_type_id; - }; -}; - -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_map_create() instead") -LIBBPF_API int bpf_create_map_xattr(const struct bpf_create_map_attr *create_attr); -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_map_create() instead") -LIBBPF_API int bpf_create_map_node(enum bpf_map_type map_type, const char *name, - int key_size, int value_size, - int max_entries, __u32 map_flags, int node); -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_map_create() instead") -LIBBPF_API int bpf_create_map_name(enum bpf_map_type map_type, const char *name, - int key_size, int value_size, - int max_entries, __u32 map_flags); -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_map_create() instead") -LIBBPF_API int bpf_create_map(enum bpf_map_type map_type, int key_size, - int value_size, int max_entries, __u32 map_flags); -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_map_create() instead") -LIBBPF_API int bpf_create_map_in_map_node(enum bpf_map_type map_type, - const char *name, int key_size, - int inner_map_fd, int max_entries, - __u32 map_flags, int node); -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_map_create() instead") -LIBBPF_API int bpf_create_map_in_map(enum bpf_map_type map_type, - const char *name, int key_size, - int inner_map_fd, int max_entries, - __u32 map_flags); - struct bpf_prog_load_opts { size_t sz; /* size of this struct for forward/backward compatibility */ @@ -138,61 +100,21 @@ struct bpf_prog_load_opts { __u32 log_level; __u32 log_size; char *log_buf; + /* output: actual total log contents size (including termintaing zero). + * It could be both larger than original log_size (if log was + * truncated), or smaller (if log buffer wasn't filled completely). + * If kernel doesn't support this feature, log_size is left unchanged. + */ + __u32 log_true_size; + __u32 token_fd; + size_t :0; }; -#define bpf_prog_load_opts__last_field log_buf +#define bpf_prog_load_opts__last_field token_fd LIBBPF_API int bpf_prog_load(enum bpf_prog_type prog_type, const char *prog_name, const char *license, const struct bpf_insn *insns, size_t insn_cnt, - const struct bpf_prog_load_opts *opts); -/* this "specialization" should go away in libbpf 1.0 */ -LIBBPF_API int bpf_prog_load_v0_6_0(enum bpf_prog_type prog_type, - const char *prog_name, const char *license, - const struct bpf_insn *insns, size_t insn_cnt, - const struct bpf_prog_load_opts *opts); - -/* This is an elaborate way to not conflict with deprecated bpf_prog_load() - * API, defined in libbpf.h. Once we hit libbpf 1.0, all this will be gone. - * With this approach, if someone is calling bpf_prog_load() with - * 4 arguments, they will use the deprecated API, which keeps backwards - * compatibility (both source code and binary). If bpf_prog_load() is called - * with 6 arguments, though, it gets redirected to __bpf_prog_load. - * So looking forward to libbpf 1.0 when this hack will be gone and - * __bpf_prog_load() will be called just bpf_prog_load(). - */ -#ifndef bpf_prog_load -#define bpf_prog_load(...) ___libbpf_overload(___bpf_prog_load, __VA_ARGS__) -#define ___bpf_prog_load4(file, type, pobj, prog_fd) \ - bpf_prog_load_deprecated(file, type, pobj, prog_fd) -#define ___bpf_prog_load6(prog_type, prog_name, license, insns, insn_cnt, opts) \ - bpf_prog_load(prog_type, prog_name, license, insns, insn_cnt, opts) -#endif /* bpf_prog_load */ - -struct bpf_load_program_attr { - enum bpf_prog_type prog_type; - enum bpf_attach_type expected_attach_type; - const char *name; - const struct bpf_insn *insns; - size_t insns_cnt; - const char *license; - union { - __u32 kern_version; - __u32 attach_prog_fd; - }; - union { - __u32 prog_ifindex; - __u32 attach_btf_id; - }; - __u32 prog_btf_fd; - __u32 func_info_rec_size; - const void *func_info; - __u32 func_info_cnt; - __u32 line_info_rec_size; - const void *line_info; - __u32 line_info_cnt; - __u32 log_level; - __u32 prog_flags; -}; + struct bpf_prog_load_opts *opts); /* Flags to direct loading requirements */ #define MAPS_RELAX_COMPAT 0x01 @@ -200,22 +122,6 @@ struct bpf_load_program_attr { /* Recommended log buffer size */ #define BPF_LOG_BUF_SIZE (UINT32_MAX >> 8) /* verifier maximum in kernels <= 5.1 */ -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_prog_load() instead") -LIBBPF_API int bpf_load_program_xattr(const struct bpf_load_program_attr *load_attr, - char *log_buf, size_t log_buf_sz); -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_prog_load() instead") -LIBBPF_API int bpf_load_program(enum bpf_prog_type type, - const struct bpf_insn *insns, size_t insns_cnt, - const char *license, __u32 kern_version, - char *log_buf, size_t log_buf_sz); -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_prog_load() instead") -LIBBPF_API int bpf_verify_program(enum bpf_prog_type type, - const struct bpf_insn *insns, - size_t insns_cnt, __u32 prog_flags, - const char *license, __u32 kern_version, - char *log_buf, size_t log_buf_sz, - int log_level); - struct bpf_btf_load_opts { size_t sz; /* size of this struct for forward/backward compatibility */ @@ -223,15 +129,21 @@ struct bpf_btf_load_opts { char *log_buf; __u32 log_level; __u32 log_size; + /* output: actual total log contents size (including termintaing zero). + * It could be both larger than original log_size (if log was + * truncated), or smaller (if log buffer wasn't filled completely). + * If kernel doesn't support this feature, log_size is left unchanged. + */ + __u32 log_true_size; + + __u32 btf_flags; + __u32 token_fd; + size_t :0; }; -#define bpf_btf_load_opts__last_field log_size +#define bpf_btf_load_opts__last_field token_fd LIBBPF_API int bpf_btf_load(const void *btf_data, size_t btf_size, - const struct bpf_btf_load_opts *opts); - -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_btf_load() instead") -LIBBPF_API int bpf_load_btf(const void *btf, __u32 btf_size, char *log_buf, - __u32 log_buf_size, bool do_log); + struct bpf_btf_load_opts *opts); LIBBPF_API int bpf_map_update_elem(int fd, const void *key, const void *value, __u64 flags); @@ -244,6 +156,7 @@ LIBBPF_API int bpf_map_lookup_and_delete_elem(int fd, const void *key, LIBBPF_API int bpf_map_lookup_and_delete_elem_flags(int fd, const void *key, void *value, __u64 flags); LIBBPF_API int bpf_map_delete_elem(int fd, const void *key); +LIBBPF_API int bpf_map_delete_elem_flags(int fd, const void *key, __u64 flags); LIBBPF_API int bpf_map_get_next_key(int fd, const void *key, void *next_key); LIBBPF_API int bpf_map_freeze(int fd); @@ -277,10 +190,14 @@ LIBBPF_API int bpf_map_delete_batch(int fd, const void *keys, /** * @brief **bpf_map_lookup_batch()** allows for batch lookup of BPF map elements. * - * The parameter *in_batch* is the address of the first element in the batch to read. - * *out_batch* is an output parameter that should be passed as *in_batch* to subsequent - * calls to **bpf_map_lookup_batch()**. NULL can be passed for *in_batch* to indicate - * that the batched lookup starts from the beginning of the map. + * The parameter *in_batch* is the address of the first element in the batch to + * read. *out_batch* is an output parameter that should be passed as *in_batch* + * to subsequent calls to **bpf_map_lookup_batch()**. NULL can be passed for + * *in_batch* to indicate that the batched lookup starts from the beginning of + * the map. Both *in_batch* and *out_batch* must point to memory large enough to + * hold a single key, except for maps of type **BPF_MAP_TYPE_{HASH, PERCPU_HASH, + * LRU_HASH, LRU_PERCPU_HASH}**, for which the memory size must be at + * least 4 bytes wide regardless of key size. * * The *keys* and *values* are output parameters which must point to memory large enough to * hold *count* items based on the key and value size of the map *map_fd*. The *keys* @@ -313,7 +230,10 @@ LIBBPF_API int bpf_map_lookup_batch(int fd, void *in_batch, void *out_batch, * * @param fd BPF map file descriptor * @param in_batch address of the first element in batch to read, can pass NULL to - * get address of the first element in *out_batch* + * get address of the first element in *out_batch*. If not NULL, must be large + * enough to hold a key. For **BPF_MAP_TYPE_{HASH, PERCPU_HASH, LRU_HASH, + * LRU_PERCPU_HASH}**, the memory size must be at least 4 bytes wide regardless + * of key size. * @param out_batch output parameter that should be passed to next call as *in_batch* * @param keys pointer to an array of *count* keys * @param values pointer to an array large enough for *count* values @@ -379,29 +299,96 @@ LIBBPF_API int bpf_map_update_batch(int fd, const void *keys, const void *values __u32 *count, const struct bpf_map_batch_opts *opts); +struct bpf_obj_pin_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + + __u32 file_flags; + int path_fd; + + size_t :0; +}; +#define bpf_obj_pin_opts__last_field path_fd + LIBBPF_API int bpf_obj_pin(int fd, const char *pathname); -LIBBPF_API int bpf_obj_get(const char *pathname); +LIBBPF_API int bpf_obj_pin_opts(int fd, const char *pathname, + const struct bpf_obj_pin_opts *opts); -struct bpf_prog_attach_opts { +struct bpf_obj_get_opts { size_t sz; /* size of this struct for forward/backward compatibility */ - unsigned int flags; - int replace_prog_fd; + + __u32 file_flags; + int path_fd; + + size_t :0; }; -#define bpf_prog_attach_opts__last_field replace_prog_fd +#define bpf_obj_get_opts__last_field path_fd + +LIBBPF_API int bpf_obj_get(const char *pathname); +LIBBPF_API int bpf_obj_get_opts(const char *pathname, + const struct bpf_obj_get_opts *opts); LIBBPF_API int bpf_prog_attach(int prog_fd, int attachable_fd, enum bpf_attach_type type, unsigned int flags); -LIBBPF_API int bpf_prog_attach_opts(int prog_fd, int attachable_fd, - enum bpf_attach_type type, - const struct bpf_prog_attach_opts *opts); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_prog_attach_opts() instead") -LIBBPF_API int bpf_prog_attach_xattr(int prog_fd, int attachable_fd, - enum bpf_attach_type type, - const struct bpf_prog_attach_opts *opts); LIBBPF_API int bpf_prog_detach(int attachable_fd, enum bpf_attach_type type); LIBBPF_API int bpf_prog_detach2(int prog_fd, int attachable_fd, enum bpf_attach_type type); +struct bpf_prog_attach_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 flags; + union { + int replace_prog_fd; + int replace_fd; + }; + int relative_fd; + __u32 relative_id; + __u64 expected_revision; + size_t :0; +}; +#define bpf_prog_attach_opts__last_field expected_revision + +struct bpf_prog_detach_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 flags; + int relative_fd; + __u32 relative_id; + __u64 expected_revision; + size_t :0; +}; +#define bpf_prog_detach_opts__last_field expected_revision + +/** + * @brief **bpf_prog_attach_opts()** attaches the BPF program corresponding to + * *prog_fd* to a *target* which can represent a file descriptor or netdevice + * ifindex. + * + * @param prog_fd BPF program file descriptor + * @param target attach location file descriptor or ifindex + * @param type attach type for the BPF program + * @param opts options for configuring the attachment + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_prog_attach_opts(int prog_fd, int target, + enum bpf_attach_type type, + const struct bpf_prog_attach_opts *opts); + +/** + * @brief **bpf_prog_detach_opts()** detaches the BPF program corresponding to + * *prog_fd* from a *target* which can represent a file descriptor or netdevice + * ifindex. + * + * @param prog_fd BPF program file descriptor + * @param target detach location file descriptor or ifindex + * @param type detach type for the BPF program + * @param opts options for configuring the detachment + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_prog_detach_opts(int prog_fd, int target, + enum bpf_attach_type type, + const struct bpf_prog_detach_opts *opts); + union bpf_iter_link_info; /* defined in up-to-date linux/bpf.h */ struct bpf_link_create_opts { size_t sz; /* size of this struct for forward/backward compatibility */ @@ -420,10 +407,38 @@ struct bpf_link_create_opts { const unsigned long *addrs; const __u64 *cookies; } kprobe_multi; + struct { + __u32 flags; + __u32 cnt; + const char *path; + const unsigned long *offsets; + const unsigned long *ref_ctr_offsets; + const __u64 *cookies; + __u32 pid; + } uprobe_multi; + struct { + __u64 cookie; + } tracing; + struct { + __u32 pf; + __u32 hooknum; + __s32 priority; + __u32 flags; + } netfilter; + struct { + __u32 relative_fd; + __u32 relative_id; + __u64 expected_revision; + } tcx; + struct { + __u32 relative_fd; + __u32 relative_id; + __u64 expected_revision; + } netkit; }; size_t :0; }; -#define bpf_link_create_opts__last_field kprobe_multi.cookies +#define bpf_link_create_opts__last_field uprobe_multi.pid LIBBPF_API int bpf_link_create(int prog_fd, int target_fd, enum bpf_attach_type attach_type, @@ -435,8 +450,9 @@ struct bpf_link_update_opts { size_t sz; /* size of this struct for forward/backward compatibility */ __u32 flags; /* extra flags */ __u32 old_prog_fd; /* expected old program FD */ + __u32 old_map_fd; /* expected old map FD */ }; -#define bpf_link_update_opts__last_field old_prog_fd +#define bpf_link_update_opts__last_field old_map_fd LIBBPF_API int bpf_link_update(int link_fd, int new_prog_fd, const struct bpf_link_update_opts *opts); @@ -460,36 +476,170 @@ struct bpf_prog_test_run_attr { * out: length of cxt_out */ }; -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_prog_test_run_opts() instead") -LIBBPF_API int bpf_prog_test_run_xattr(struct bpf_prog_test_run_attr *test_attr); - -/* - * bpf_prog_test_run does not check that data_out is large enough. Consider - * using bpf_prog_test_run_opts instead. - */ -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_prog_test_run_opts() instead") -LIBBPF_API int bpf_prog_test_run(int prog_fd, int repeat, void *data, - __u32 size, void *data_out, __u32 *size_out, - __u32 *retval, __u32 *duration); LIBBPF_API int bpf_prog_get_next_id(__u32 start_id, __u32 *next_id); LIBBPF_API int bpf_map_get_next_id(__u32 start_id, __u32 *next_id); LIBBPF_API int bpf_btf_get_next_id(__u32 start_id, __u32 *next_id); LIBBPF_API int bpf_link_get_next_id(__u32 start_id, __u32 *next_id); + +struct bpf_get_fd_by_id_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 open_flags; /* permissions requested for the operation on fd */ + size_t :0; +}; +#define bpf_get_fd_by_id_opts__last_field open_flags + LIBBPF_API int bpf_prog_get_fd_by_id(__u32 id); +LIBBPF_API int bpf_prog_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts); LIBBPF_API int bpf_map_get_fd_by_id(__u32 id); +LIBBPF_API int bpf_map_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts); LIBBPF_API int bpf_btf_get_fd_by_id(__u32 id); +LIBBPF_API int bpf_btf_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts); LIBBPF_API int bpf_link_get_fd_by_id(__u32 id); +LIBBPF_API int bpf_link_get_fd_by_id_opts(__u32 id, + const struct bpf_get_fd_by_id_opts *opts); LIBBPF_API int bpf_obj_get_info_by_fd(int bpf_fd, void *info, __u32 *info_len); + +/** + * @brief **bpf_prog_get_info_by_fd()** obtains information about the BPF + * program corresponding to *prog_fd*. + * + * Populates up to *info_len* bytes of *info* and updates *info_len* with the + * actual number of bytes written to *info*. Note that *info* should be + * zero-initialized or initialized as expected by the requested *info* + * type. Failing to (zero-)initialize *info* under certain circumstances can + * result in this helper returning an error. + * + * @param prog_fd BPF program file descriptor + * @param info pointer to **struct bpf_prog_info** that will be populated with + * BPF program information + * @param info_len pointer to the size of *info*; on success updated with the + * number of bytes written to *info* + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_prog_get_info_by_fd(int prog_fd, struct bpf_prog_info *info, __u32 *info_len); + +/** + * @brief **bpf_map_get_info_by_fd()** obtains information about the BPF + * map corresponding to *map_fd*. + * + * Populates up to *info_len* bytes of *info* and updates *info_len* with the + * actual number of bytes written to *info*. Note that *info* should be + * zero-initialized or initialized as expected by the requested *info* + * type. Failing to (zero-)initialize *info* under certain circumstances can + * result in this helper returning an error. + * + * @param map_fd BPF map file descriptor + * @param info pointer to **struct bpf_map_info** that will be populated with + * BPF map information + * @param info_len pointer to the size of *info*; on success updated with the + * number of bytes written to *info* + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_map_get_info_by_fd(int map_fd, struct bpf_map_info *info, __u32 *info_len); + +/** + * @brief **bpf_btf_get_info_by_fd()** obtains information about the + * BTF object corresponding to *btf_fd*. + * + * Populates up to *info_len* bytes of *info* and updates *info_len* with the + * actual number of bytes written to *info*. Note that *info* should be + * zero-initialized or initialized as expected by the requested *info* + * type. Failing to (zero-)initialize *info* under certain circumstances can + * result in this helper returning an error. + * + * @param btf_fd BTF object file descriptor + * @param info pointer to **struct bpf_btf_info** that will be populated with + * BTF object information + * @param info_len pointer to the size of *info*; on success updated with the + * number of bytes written to *info* + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_btf_get_info_by_fd(int btf_fd, struct bpf_btf_info *info, __u32 *info_len); + +/** + * @brief **bpf_btf_get_info_by_fd()** obtains information about the BPF + * link corresponding to *link_fd*. + * + * Populates up to *info_len* bytes of *info* and updates *info_len* with the + * actual number of bytes written to *info*. Note that *info* should be + * zero-initialized or initialized as expected by the requested *info* + * type. Failing to (zero-)initialize *info* under certain circumstances can + * result in this helper returning an error. + * + * @param link_fd BPF link file descriptor + * @param info pointer to **struct bpf_link_info** that will be populated with + * BPF link information + * @param info_len pointer to the size of *info*; on success updated with the + * number of bytes written to *info* + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_link_get_info_by_fd(int link_fd, struct bpf_link_info *info, __u32 *info_len); + +struct bpf_prog_query_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 query_flags; + __u32 attach_flags; /* output argument */ + __u32 *prog_ids; + union { + /* input+output argument */ + __u32 prog_cnt; + __u32 count; + }; + __u32 *prog_attach_flags; + __u32 *link_ids; + __u32 *link_attach_flags; + __u64 revision; + size_t :0; +}; +#define bpf_prog_query_opts__last_field revision + +/** + * @brief **bpf_prog_query_opts()** queries the BPF programs and BPF links + * which are attached to *target* which can represent a file descriptor or + * netdevice ifindex. + * + * @param target query location file descriptor or ifindex + * @param type attach type for the BPF program + * @param opts options for configuring the query + * @return 0, on success; negative error code, otherwise (errno is also set to + * the error code) + */ +LIBBPF_API int bpf_prog_query_opts(int target, enum bpf_attach_type type, + struct bpf_prog_query_opts *opts); LIBBPF_API int bpf_prog_query(int target_fd, enum bpf_attach_type type, __u32 query_flags, __u32 *attach_flags, __u32 *prog_ids, __u32 *prog_cnt); + +struct bpf_raw_tp_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + const char *tp_name; + __u64 cookie; + size_t :0; +}; +#define bpf_raw_tp_opts__last_field cookie + +LIBBPF_API int bpf_raw_tracepoint_open_opts(int prog_fd, struct bpf_raw_tp_opts *opts); LIBBPF_API int bpf_raw_tracepoint_open(const char *name, int prog_fd); LIBBPF_API int bpf_task_fd_query(int pid, int fd, __u32 flags, char *buf, __u32 *buf_len, __u32 *prog_id, __u32 *fd_type, __u64 *probe_offset, __u64 *probe_addr); +#ifdef __cplusplus +/* forward-declaring enums in C++ isn't compatible with pure C enums, so + * instead define bpf_enable_stats() as accepting int as an input + */ +LIBBPF_API int bpf_enable_stats(int type); +#else enum bpf_stats_type; /* defined in up-to-date linux/bpf.h */ LIBBPF_API int bpf_enable_stats(enum bpf_stats_type type); +#endif struct bpf_prog_bind_opts { size_t sz; /* size of this struct for forward/backward compatibility */ @@ -526,6 +676,30 @@ struct bpf_test_run_opts { LIBBPF_API int bpf_prog_test_run_opts(int prog_fd, struct bpf_test_run_opts *opts); +struct bpf_token_create_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u32 flags; + size_t :0; +}; +#define bpf_token_create_opts__last_field flags + +/** + * @brief **bpf_token_create()** creates a new instance of BPF token derived + * from specified BPF FS mount point. + * + * BPF token created with this API can be passed to bpf() syscall for + * commands like BPF_PROG_LOAD, BPF_MAP_CREATE, etc. + * + * @param bpffs_fd FD for BPF FS instance from which to derive a BPF token + * instance. + * @param opts optional BPF token creation options, can be NULL + * + * @return BPF token FD > 0, on success; negative error code, otherwise (errno + * is also set to the error code) + */ +LIBBPF_API int bpf_token_create(int bpffs_fd, + struct bpf_token_create_opts *opts); + #ifdef __cplusplus } /* extern "C" */ #endif diff --git a/pkg/plugin/lib/common/libbpf/_src/bpf_core_read.h b/pkg/plugin/lib/common/libbpf/_src/bpf_core_read.h index e4aa9996a5..c0e13cdf96 100644 --- a/pkg/plugin/lib/common/libbpf/_src/bpf_core_read.h +++ b/pkg/plugin/lib/common/libbpf/_src/bpf_core_read.h @@ -2,6 +2,8 @@ #ifndef __BPF_CORE_READ_H__ #define __BPF_CORE_READ_H__ +#include "bpf_helpers.h" + /* * enum bpf_field_info_kind is passed as a second argument into * __builtin_preserve_field_info() built-in to get a specific aspect of @@ -29,6 +31,7 @@ enum bpf_type_id_kind { enum bpf_type_info_kind { BPF_TYPE_EXISTS = 0, /* type existence in target kernel */ BPF_TYPE_SIZE = 1, /* type size in target kernel */ + BPF_TYPE_MATCHES = 2, /* type match in target kernel */ }; /* second argument to __builtin_preserve_enum_value() built-in */ @@ -43,7 +46,7 @@ enum bpf_enum_value_kind { #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ #define __CORE_BITFIELD_PROBE_READ(dst, src, fld) \ bpf_probe_read_kernel( \ - (void *)dst, \ + (void *)dst, \ __CORE_RELO(src, fld, BYTE_SIZE), \ (const void *)src + __CORE_RELO(src, fld, BYTE_OFFSET)) #else @@ -101,6 +104,7 @@ enum bpf_enum_value_kind { case 2: val = *(const unsigned short *)p; break; \ case 4: val = *(const unsigned int *)p; break; \ case 8: val = *(const unsigned long long *)p; break; \ + default: val = 0; break; \ } \ val <<= __CORE_RELO(s, field, LSHIFT_U64); \ if (__CORE_RELO(s, field, SIGNED)) \ @@ -110,21 +114,103 @@ enum bpf_enum_value_kind { val; \ }) +/* + * Write to a bitfield, identified by s->field. + * This is the inverse of BPF_CORE_WRITE_BITFIELD(). + */ +#define BPF_CORE_WRITE_BITFIELD(s, field, new_val) ({ \ + void *p = (void *)s + __CORE_RELO(s, field, BYTE_OFFSET); \ + unsigned int byte_size = __CORE_RELO(s, field, BYTE_SIZE); \ + unsigned int lshift = __CORE_RELO(s, field, LSHIFT_U64); \ + unsigned int rshift = __CORE_RELO(s, field, RSHIFT_U64); \ + unsigned long long mask, val, nval = new_val; \ + unsigned int rpad = rshift - lshift; \ + \ + asm volatile("" : "+r"(p)); \ + \ + switch (byte_size) { \ + case 1: val = *(unsigned char *)p; break; \ + case 2: val = *(unsigned short *)p; break; \ + case 4: val = *(unsigned int *)p; break; \ + case 8: val = *(unsigned long long *)p; break; \ + } \ + \ + mask = (~0ULL << rshift) >> lshift; \ + val = (val & ~mask) | ((nval << rpad) & mask); \ + \ + switch (byte_size) { \ + case 1: *(unsigned char *)p = val; break; \ + case 2: *(unsigned short *)p = val; break; \ + case 4: *(unsigned int *)p = val; break; \ + case 8: *(unsigned long long *)p = val; break; \ + } \ +}) + +/* Differentiator between compilers builtin implementations. This is a + * requirement due to the compiler parsing differences where GCC optimizes + * early in parsing those constructs of type pointers to the builtin specific + * type, resulting in not being possible to collect the required type + * information in the builtin expansion. + */ +#ifdef __clang__ +#define ___bpf_typeof(type) ((typeof(type) *) 0) +#else +#define ___bpf_typeof1(type, NR) ({ \ + extern typeof(type) *___concat(bpf_type_tmp_, NR); \ + ___concat(bpf_type_tmp_, NR); \ +}) +#define ___bpf_typeof(type) ___bpf_typeof1(type, __COUNTER__) +#endif + +#ifdef __clang__ +#define ___bpf_field_ref1(field) (field) +#define ___bpf_field_ref2(type, field) (___bpf_typeof(type)->field) +#else +#define ___bpf_field_ref1(field) (&(field)) +#define ___bpf_field_ref2(type, field) (&(___bpf_typeof(type)->field)) +#endif +#define ___bpf_field_ref(args...) \ + ___bpf_apply(___bpf_field_ref, ___bpf_narg(args))(args) + /* * Convenience macro to check that field actually exists in target kernel's. * Returns: * 1, if matching field is present in target kernel; * 0, if no matching field found. + * + * Supports two forms: + * - field reference through variable access: + * bpf_core_field_exists(p->my_field); + * - field reference through type and field names: + * bpf_core_field_exists(struct my_type, my_field). */ -#define bpf_core_field_exists(field) \ - __builtin_preserve_field_info(field, BPF_FIELD_EXISTS) +#define bpf_core_field_exists(field...) \ + __builtin_preserve_field_info(___bpf_field_ref(field), BPF_FIELD_EXISTS) /* * Convenience macro to get the byte size of a field. Works for integers, * struct/unions, pointers, arrays, and enums. + * + * Supports two forms: + * - field reference through variable access: + * bpf_core_field_size(p->my_field); + * - field reference through type and field names: + * bpf_core_field_size(struct my_type, my_field). + */ +#define bpf_core_field_size(field...) \ + __builtin_preserve_field_info(___bpf_field_ref(field), BPF_FIELD_BYTE_SIZE) + +/* + * Convenience macro to get field's byte offset. + * + * Supports two forms: + * - field reference through variable access: + * bpf_core_field_offset(p->my_field); + * - field reference through type and field names: + * bpf_core_field_offset(struct my_type, my_field). */ -#define bpf_core_field_size(field) \ - __builtin_preserve_field_info(field, BPF_FIELD_BYTE_SIZE) +#define bpf_core_field_offset(field...) \ + __builtin_preserve_field_info(___bpf_field_ref(field), BPF_FIELD_BYTE_OFFSET) /* * Convenience macro to get BTF type ID of a specified type, using a local BTF @@ -132,7 +218,7 @@ enum bpf_enum_value_kind { * BTF. Always succeeds. */ #define bpf_core_type_id_local(type) \ - __builtin_btf_type_id(*(typeof(type) *)0, BPF_TYPE_ID_LOCAL) + __builtin_btf_type_id(*___bpf_typeof(type), BPF_TYPE_ID_LOCAL) /* * Convenience macro to get BTF type ID of a target kernel's type that matches @@ -142,7 +228,7 @@ enum bpf_enum_value_kind { * - 0, if no matching type was found in a target kernel BTF. */ #define bpf_core_type_id_kernel(type) \ - __builtin_btf_type_id(*(typeof(type) *)0, BPF_TYPE_ID_TARGET) + __builtin_btf_type_id(*___bpf_typeof(type), BPF_TYPE_ID_TARGET) /* * Convenience macro to check that provided named type @@ -152,7 +238,17 @@ enum bpf_enum_value_kind { * 0, if no matching type is found. */ #define bpf_core_type_exists(type) \ - __builtin_preserve_type_info(*(typeof(type) *)0, BPF_TYPE_EXISTS) + __builtin_preserve_type_info(*___bpf_typeof(type), BPF_TYPE_EXISTS) + +/* + * Convenience macro to check that provided named type + * (struct/union/enum/typedef) "matches" that in a target kernel. + * Returns: + * 1, if the type matches in the target kernel's BTF; + * 0, if the type does not match any in the target kernel + */ +#define bpf_core_type_matches(type) \ + __builtin_preserve_type_info(*___bpf_typeof(type), BPF_TYPE_MATCHES) /* * Convenience macro to get the byte size of a provided named type @@ -162,7 +258,7 @@ enum bpf_enum_value_kind { * 0, if no matching type is found. */ #define bpf_core_type_size(type) \ - __builtin_preserve_type_info(*(typeof(type) *)0, BPF_TYPE_SIZE) + __builtin_preserve_type_info(*___bpf_typeof(type), BPF_TYPE_SIZE) /* * Convenience macro to check that provided enumerator value is defined in @@ -172,8 +268,13 @@ enum bpf_enum_value_kind { * kernel's BTF; * 0, if no matching enum and/or enum value within that enum is found. */ +#ifdef __clang__ #define bpf_core_enum_value_exists(enum_type, enum_value) \ __builtin_preserve_enum_value(*(typeof(enum_type) *)enum_value, BPF_ENUMVAL_EXISTS) +#else +#define bpf_core_enum_value_exists(enum_type, enum_value) \ + __builtin_preserve_enum_value(___bpf_typeof(enum_type), enum_value, BPF_ENUMVAL_EXISTS) +#endif /* * Convenience macro to get the integer value of an enumerator value in @@ -183,8 +284,13 @@ enum bpf_enum_value_kind { * present in target kernel's BTF; * 0, if no matching enum and/or enum value within that enum is found. */ +#ifdef __clang__ #define bpf_core_enum_value(enum_type, enum_value) \ __builtin_preserve_enum_value(*(typeof(enum_type) *)enum_value, BPF_ENUMVAL_VALUE) +#else +#define bpf_core_enum_value(enum_type, enum_value) \ + __builtin_preserve_enum_value(___bpf_typeof(enum_type), enum_value, BPF_ENUMVAL_VALUE) +#endif /* * bpf_core_read() abstracts away bpf_probe_read_kernel() call and captures @@ -196,7 +302,7 @@ enum bpf_enum_value_kind { * a relocation, which records BTF type ID describing root struct/union and an * accessor string which describes exact embedded field that was used to take * an address. See detailed description of this relocation format and - * semantics in comments to struct bpf_field_reloc in libbpf_internal.h. + * semantics in comments to struct bpf_core_relo in include/uapi/linux/bpf.h. * * This relocation allows libbpf to adjust BPF instruction to use correct * actual field offset, based on target kernel BTF type that matches original @@ -220,6 +326,17 @@ enum bpf_enum_value_kind { #define bpf_core_read_user_str(dst, sz, src) \ bpf_probe_read_user_str(dst, sz, (const void *)__builtin_preserve_access_index(src)) +extern void *bpf_rdonly_cast(const void *obj, __u32 btf_id) __ksym __weak; + +/* + * Cast provided pointer *ptr* into a pointer to a specified *type* in such + * a way that BPF verifier will become aware of associated kernel-side BTF + * type. This allows to access members of kernel types directly without the + * need to use BPF_CORE_READ() macros. + */ +#define bpf_core_cast(ptr, type) \ + ((typeof(type) *)bpf_rdonly_cast((ptr), bpf_core_type_id_kernel(type))) + #define ___concat(a, b) a ## b #define ___apply(fn, n) ___concat(fn, n) #define ___nth(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, __11, N, ...) N @@ -324,7 +441,7 @@ enum bpf_enum_value_kind { /* Non-CO-RE variant of BPF_CORE_READ_INTO() */ #define BPF_PROBE_READ_INTO(dst, src, a, ...) ({ \ - ___core_read(bpf_probe_read, bpf_probe_read, \ + ___core_read(bpf_probe_read_kernel, bpf_probe_read_kernel, \ dst, (src), a, ##__VA_ARGS__) \ }) @@ -360,7 +477,7 @@ enum bpf_enum_value_kind { /* Non-CO-RE variant of BPF_CORE_READ_STR_INTO() */ #define BPF_PROBE_READ_STR_INTO(dst, src, a, ...) ({ \ - ___core_read(bpf_probe_read_str, bpf_probe_read, \ + ___core_read(bpf_probe_read_kernel_str, bpf_probe_read_kernel, \ dst, (src), a, ##__VA_ARGS__) \ }) diff --git a/pkg/plugin/lib/common/libbpf/_src/bpf_gen_internal.h b/pkg/plugin/lib/common/libbpf/_src/bpf_gen_internal.h index 223308931d..fdf44403ff 100644 --- a/pkg/plugin/lib/common/libbpf/_src/bpf_gen_internal.h +++ b/pkg/plugin/lib/common/libbpf/_src/bpf_gen_internal.h @@ -11,6 +11,7 @@ struct ksym_relo_desc { int insn_idx; bool is_weak; bool is_typeless; + bool is_ld64; }; struct ksym_desc { @@ -24,6 +25,7 @@ struct ksym_desc { bool typeless; }; int insn; + bool is_ld64; }; struct bpf_gen { @@ -65,7 +67,7 @@ void bpf_gen__map_update_elem(struct bpf_gen *gen, int map_idx, void *value, __u void bpf_gen__map_freeze(struct bpf_gen *gen, int map_idx); void bpf_gen__record_attach_target(struct bpf_gen *gen, const char *name, enum bpf_attach_type type); void bpf_gen__record_extern(struct bpf_gen *gen, const char *name, bool is_weak, - bool is_typeless, int kind, int insn_idx); + bool is_typeless, bool is_ld64, int kind, int insn_idx); void bpf_gen__record_relo_core(struct bpf_gen *gen, const struct bpf_core_relo *core_relo); void bpf_gen__populate_outer_map(struct bpf_gen *gen, int outer_map_idx, int key, int inner_map_idx); diff --git a/pkg/plugin/lib/common/libbpf/_src/bpf_helper_defs.h b/pkg/plugin/lib/common/libbpf/_src/bpf_helper_defs.h index 5d690a54c1..fecfcf3a95 100644 --- a/pkg/plugin/lib/common/libbpf/_src/bpf_helper_defs.h +++ b/pkg/plugin/lib/common/libbpf/_src/bpf_helper_defs.h @@ -29,6 +29,7 @@ struct tcp_request_sock; struct udp6_sock; struct unix_sock; struct task_struct; +struct cgroup; struct __sk_buff; struct sk_msg_md; struct xdp_md; @@ -38,6 +39,10 @@ struct inode; struct socket; struct file; struct bpf_timer; +struct mptcp_sock; +struct bpf_dynptr; +struct iphdr; +struct ipv6hdr; /* * bpf_map_lookup_elem @@ -48,7 +53,7 @@ struct bpf_timer; * Map value associated to *key*, or **NULL** if no entry was * found. */ -static void *(*bpf_map_lookup_elem)(void *map, const void *key) = (void *) 1; +static void *(* const bpf_map_lookup_elem)(void *map, const void *key) = (void *) 1; /* * bpf_map_update_elem @@ -70,7 +75,7 @@ static void *(*bpf_map_lookup_elem)(void *map, const void *key) = (void *) 1; * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_map_update_elem)(void *map, const void *key, const void *value, __u64 flags) = (void *) 2; +static long (* const bpf_map_update_elem)(void *map, const void *key, const void *value, __u64 flags) = (void *) 2; /* * bpf_map_delete_elem @@ -80,7 +85,7 @@ static long (*bpf_map_update_elem)(void *map, const void *key, const void *value * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_map_delete_elem)(void *map, const void *key) = (void *) 3; +static long (* const bpf_map_delete_elem)(void *map, const void *key) = (void *) 3; /* * bpf_probe_read @@ -94,7 +99,7 @@ static long (*bpf_map_delete_elem)(void *map, const void *key) = (void *) 3; * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_probe_read)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 4; +static long (* const bpf_probe_read)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 4; /* * bpf_ktime_get_ns @@ -106,24 +111,24 @@ static long (*bpf_probe_read)(void *dst, __u32 size, const void *unsafe_ptr) = ( * Returns * Current *ktime*. */ -static __u64 (*bpf_ktime_get_ns)(void) = (void *) 5; +static __u64 (* const bpf_ktime_get_ns)(void) = (void *) 5; /* * bpf_trace_printk * * This helper is a "printk()-like" facility for debugging. It * prints a message defined by format *fmt* (of size *fmt_size*) - * to file *\/sys/kernel/debug/tracing/trace* from DebugFS, if + * to file *\/sys/kernel/tracing/trace* from TraceFS, if * available. It can take up to three additional **u64** * arguments (as an eBPF helpers, the total number of arguments is * limited to five). * * Each time the helper is called, it appends a line to the trace. - * Lines are discarded while *\/sys/kernel/debug/tracing/trace* is - * open, use *\/sys/kernel/debug/tracing/trace_pipe* to avoid this. + * Lines are discarded while *\/sys/kernel/tracing/trace* is + * open, use *\/sys/kernel/tracing/trace_pipe* to avoid this. * The format of the trace is customizable, and the exact output * one will get depends on the options set in - * *\/sys/kernel/debug/tracing/trace_options* (see also the + * *\/sys/kernel/tracing/trace_options* (see also the * *README* file under the same directory). However, it usually * defaults to something like: * @@ -169,7 +174,7 @@ static __u64 (*bpf_ktime_get_ns)(void) = (void *) 5; * The number of bytes written to the buffer, or a negative error * in case of failure. */ -static long (*bpf_trace_printk)(const char *fmt, __u32 fmt_size, ...) = (void *) 6; +static long (* const bpf_trace_printk)(const char *fmt, __u32 fmt_size, ...) = (void *) 6; /* * bpf_get_prandom_u32 @@ -185,7 +190,7 @@ static long (*bpf_trace_printk)(const char *fmt, __u32 fmt_size, ...) = (void *) * Returns * A random 32-bit unsigned value. */ -static __u32 (*bpf_get_prandom_u32)(void) = (void *) 7; +static __u32 (* const bpf_get_prandom_u32)(void) = (void *) 7; /* * bpf_get_smp_processor_id @@ -198,7 +203,7 @@ static __u32 (*bpf_get_prandom_u32)(void) = (void *) 7; * Returns * The SMP id of the processor running the program. */ -static __u32 (*bpf_get_smp_processor_id)(void) = (void *) 8; +static __u32 (* const bpf_get_smp_processor_id)(void) = (void *) 8; /* * bpf_skb_store_bytes @@ -219,7 +224,7 @@ static __u32 (*bpf_get_smp_processor_id)(void) = (void *) 8; * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_skb_store_bytes)(struct __sk_buff *skb, __u32 offset, const void *from, __u32 len, __u64 flags) = (void *) 9; +static long (* const bpf_skb_store_bytes)(struct __sk_buff *skb, __u32 offset, const void *from, __u32 len, __u64 flags) = (void *) 9; /* * bpf_l3_csum_replace @@ -248,7 +253,7 @@ static long (*bpf_skb_store_bytes)(struct __sk_buff *skb, __u32 offset, const vo * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_l3_csum_replace)(struct __sk_buff *skb, __u32 offset, __u64 from, __u64 to, __u64 size) = (void *) 10; +static long (* const bpf_l3_csum_replace)(struct __sk_buff *skb, __u32 offset, __u64 from, __u64 to, __u64 size) = (void *) 10; /* * bpf_l4_csum_replace @@ -284,7 +289,7 @@ static long (*bpf_l3_csum_replace)(struct __sk_buff *skb, __u32 offset, __u64 fr * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_l4_csum_replace)(struct __sk_buff *skb, __u32 offset, __u64 from, __u64 to, __u64 flags) = (void *) 11; +static long (* const bpf_l4_csum_replace)(struct __sk_buff *skb, __u32 offset, __u64 from, __u64 to, __u64 flags) = (void *) 11; /* * bpf_tail_call @@ -319,7 +324,7 @@ static long (*bpf_l4_csum_replace)(struct __sk_buff *skb, __u32 offset, __u64 fr * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_tail_call)(void *ctx, void *prog_array_map, __u32 index) = (void *) 12; +static long (* const bpf_tail_call)(void *ctx, void *prog_array_map, __u32 index) = (void *) 12; /* * bpf_clone_redirect @@ -345,9 +350,11 @@ static long (*bpf_tail_call)(void *ctx, void *prog_array_map, __u32 index) = (vo * direct packet access. * * Returns - * 0 on success, or a negative error in case of failure. + * 0 on success, or a negative error in case of failure. Positive + * error indicates a potential drop or congestion in the target + * device. The particular positive error codes are not defined. */ -static long (*bpf_clone_redirect)(struct __sk_buff *skb, __u32 ifindex, __u64 flags) = (void *) 13; +static long (* const bpf_clone_redirect)(struct __sk_buff *skb, __u32 ifindex, __u64 flags) = (void *) 13; /* * bpf_get_current_pid_tgid @@ -360,7 +367,7 @@ static long (*bpf_clone_redirect)(struct __sk_buff *skb, __u32 ifindex, __u64 fl * *current_task*\ **->tgid << 32 \|** * *current_task*\ **->pid**. */ -static __u64 (*bpf_get_current_pid_tgid)(void) = (void *) 14; +static __u64 (* const bpf_get_current_pid_tgid)(void) = (void *) 14; /* * bpf_get_current_uid_gid @@ -371,7 +378,7 @@ static __u64 (*bpf_get_current_pid_tgid)(void) = (void *) 14; * A 64-bit integer containing the current GID and UID, and * created as such: *current_gid* **<< 32 \|** *current_uid*. */ -static __u64 (*bpf_get_current_uid_gid)(void) = (void *) 15; +static __u64 (* const bpf_get_current_uid_gid)(void) = (void *) 15; /* * bpf_get_current_comm @@ -386,7 +393,7 @@ static __u64 (*bpf_get_current_uid_gid)(void) = (void *) 15; * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_get_current_comm)(void *buf, __u32 size_of_buf) = (void *) 16; +static long (* const bpf_get_current_comm)(void *buf, __u32 size_of_buf) = (void *) 16; /* * bpf_get_cgroup_classid @@ -416,7 +423,7 @@ static long (*bpf_get_current_comm)(void *buf, __u32 size_of_buf) = (void *) 16; * Returns * The classid, or 0 for the default unconfigured classid. */ -static __u32 (*bpf_get_cgroup_classid)(struct __sk_buff *skb) = (void *) 17; +static __u32 (* const bpf_get_cgroup_classid)(struct __sk_buff *skb) = (void *) 17; /* * bpf_skb_vlan_push @@ -436,7 +443,7 @@ static __u32 (*bpf_get_cgroup_classid)(struct __sk_buff *skb) = (void *) 17; * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_skb_vlan_push)(struct __sk_buff *skb, __be16 vlan_proto, __u16 vlan_tci) = (void *) 18; +static long (* const bpf_skb_vlan_push)(struct __sk_buff *skb, __be16 vlan_proto, __u16 vlan_tci) = (void *) 18; /* * bpf_skb_vlan_pop @@ -452,7 +459,7 @@ static long (*bpf_skb_vlan_push)(struct __sk_buff *skb, __be16 vlan_proto, __u16 * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_skb_vlan_pop)(struct __sk_buff *skb) = (void *) 19; +static long (* const bpf_skb_vlan_pop)(struct __sk_buff *skb) = (void *) 19; /* * bpf_skb_get_tunnel_key @@ -507,7 +514,7 @@ static long (*bpf_skb_vlan_pop)(struct __sk_buff *skb) = (void *) 19; * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_skb_get_tunnel_key)(struct __sk_buff *skb, struct bpf_tunnel_key *key, __u32 size, __u64 flags) = (void *) 20; +static long (* const bpf_skb_get_tunnel_key)(struct __sk_buff *skb, struct bpf_tunnel_key *key, __u32 size, __u64 flags) = (void *) 20; /* * bpf_skb_set_tunnel_key @@ -532,6 +539,9 @@ static long (*bpf_skb_get_tunnel_key)(struct __sk_buff *skb, struct bpf_tunnel_k * sending the packet. This flag was added for GRE * encapsulation, but might be used with other protocols * as well in the future. + * **BPF_F_NO_TUNNEL_KEY** + * Add a flag to tunnel metadata indicating that no tunnel + * key should be set in the resulting tunnel header. * * Here is a typical usage on the transmit path: * @@ -548,7 +558,7 @@ static long (*bpf_skb_get_tunnel_key)(struct __sk_buff *skb, struct bpf_tunnel_k * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_skb_set_tunnel_key)(struct __sk_buff *skb, struct bpf_tunnel_key *key, __u32 size, __u64 flags) = (void *) 21; +static long (* const bpf_skb_set_tunnel_key)(struct __sk_buff *skb, struct bpf_tunnel_key *key, __u32 size, __u64 flags) = (void *) 21; /* * bpf_perf_event_read @@ -581,7 +591,7 @@ static long (*bpf_skb_set_tunnel_key)(struct __sk_buff *skb, struct bpf_tunnel_k * The value of the perf event counter read from the map, or a * negative error code in case of failure. */ -static __u64 (*bpf_perf_event_read)(void *map, __u64 flags) = (void *) 22; +static __u64 (* const bpf_perf_event_read)(void *map, __u64 flags) = (void *) 22; /* * bpf_redirect @@ -608,7 +618,7 @@ static __u64 (*bpf_perf_event_read)(void *map, __u64 flags) = (void *) 22; * are **TC_ACT_REDIRECT** on success or **TC_ACT_SHOT** on * error. */ -static long (*bpf_redirect)(__u32 ifindex, __u64 flags) = (void *) 23; +static long (* const bpf_redirect)(__u32 ifindex, __u64 flags) = (void *) 23; /* * bpf_get_route_realm @@ -636,7 +646,7 @@ static long (*bpf_redirect)(__u32 ifindex, __u64 flags) = (void *) 23; * The realm of the route for the packet associated to *skb*, or 0 * if none was found. */ -static __u32 (*bpf_get_route_realm)(struct __sk_buff *skb) = (void *) 24; +static __u32 (* const bpf_get_route_realm)(struct __sk_buff *skb) = (void *) 24; /* * bpf_perf_event_output @@ -685,7 +695,7 @@ static __u32 (*bpf_get_route_realm)(struct __sk_buff *skb) = (void *) 24; * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_perf_event_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 25; +static long (* const bpf_perf_event_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 25; /* * bpf_skb_load_bytes @@ -706,7 +716,7 @@ static long (*bpf_perf_event_output)(void *ctx, void *map, __u64 flags, void *da * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_skb_load_bytes)(const void *skb, __u32 offset, void *to, __u32 len) = (void *) 26; +static long (* const bpf_skb_load_bytes)(const void *skb, __u32 offset, void *to, __u32 len) = (void *) 26; /* * bpf_get_stackid @@ -752,7 +762,7 @@ static long (*bpf_skb_load_bytes)(const void *skb, __u32 offset, void *to, __u32 * The positive or null stack id on success, or a negative error * in case of failure. */ -static long (*bpf_get_stackid)(void *ctx, void *map, __u64 flags) = (void *) 27; +static long (* const bpf_get_stackid)(void *ctx, void *map, __u64 flags) = (void *) 27; /* * bpf_csum_diff @@ -783,7 +793,7 @@ static long (*bpf_get_stackid)(void *ctx, void *map, __u64 flags) = (void *) 27; * The checksum result, or a negative error code in case of * failure. */ -static __s64 (*bpf_csum_diff)(__be32 *from, __u32 from_size, __be32 *to, __u32 to_size, __wsum seed) = (void *) 28; +static __s64 (* const bpf_csum_diff)(__be32 *from, __u32 from_size, __be32 *to, __u32 to_size, __wsum seed) = (void *) 28; /* * bpf_skb_get_tunnel_opt @@ -805,7 +815,7 @@ static __s64 (*bpf_csum_diff)(__be32 *from, __u32 from_size, __be32 *to, __u32 t * Returns * The size of the option data retrieved. */ -static long (*bpf_skb_get_tunnel_opt)(struct __sk_buff *skb, void *opt, __u32 size) = (void *) 29; +static long (* const bpf_skb_get_tunnel_opt)(struct __sk_buff *skb, void *opt, __u32 size) = (void *) 29; /* * bpf_skb_set_tunnel_opt @@ -819,7 +829,7 @@ static long (*bpf_skb_get_tunnel_opt)(struct __sk_buff *skb, void *opt, __u32 si * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_skb_set_tunnel_opt)(struct __sk_buff *skb, void *opt, __u32 size) = (void *) 30; +static long (* const bpf_skb_set_tunnel_opt)(struct __sk_buff *skb, void *opt, __u32 size) = (void *) 30; /* * bpf_skb_change_proto @@ -850,7 +860,7 @@ static long (*bpf_skb_set_tunnel_opt)(struct __sk_buff *skb, void *opt, __u32 si * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_skb_change_proto)(struct __sk_buff *skb, __be16 proto, __u64 flags) = (void *) 31; +static long (* const bpf_skb_change_proto)(struct __sk_buff *skb, __be16 proto, __u64 flags) = (void *) 31; /* * bpf_skb_change_type @@ -881,7 +891,7 @@ static long (*bpf_skb_change_proto)(struct __sk_buff *skb, __be16 proto, __u64 f * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_skb_change_type)(struct __sk_buff *skb, __u32 type) = (void *) 32; +static long (* const bpf_skb_change_type)(struct __sk_buff *skb, __u32 type) = (void *) 32; /* * bpf_skb_under_cgroup @@ -896,7 +906,7 @@ static long (*bpf_skb_change_type)(struct __sk_buff *skb, __u32 type) = (void *) * * 1, if the *skb* succeeded the cgroup2 descendant test. * * A negative error code, if an error occurred. */ -static long (*bpf_skb_under_cgroup)(struct __sk_buff *skb, void *map, __u32 index) = (void *) 33; +static long (* const bpf_skb_under_cgroup)(struct __sk_buff *skb, void *map, __u32 index) = (void *) 33; /* * bpf_get_hash_recalc @@ -916,7 +926,7 @@ static long (*bpf_skb_under_cgroup)(struct __sk_buff *skb, void *map, __u32 inde * Returns * The 32-bit hash. */ -static __u32 (*bpf_get_hash_recalc)(struct __sk_buff *skb) = (void *) 34; +static __u32 (* const bpf_get_hash_recalc)(struct __sk_buff *skb) = (void *) 34; /* * bpf_get_current_task @@ -926,7 +936,7 @@ static __u32 (*bpf_get_hash_recalc)(struct __sk_buff *skb) = (void *) 34; * Returns * A pointer to the current task struct. */ -static __u64 (*bpf_get_current_task)(void) = (void *) 35; +static __u64 (* const bpf_get_current_task)(void) = (void *) 35; /* * bpf_probe_write_user @@ -949,7 +959,7 @@ static __u64 (*bpf_get_current_task)(void) = (void *) 35; * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_probe_write_user)(void *dst, const void *src, __u32 len) = (void *) 36; +static long (* const bpf_probe_write_user)(void *dst, const void *src, __u32 len) = (void *) 36; /* * bpf_current_task_under_cgroup @@ -965,7 +975,7 @@ static long (*bpf_probe_write_user)(void *dst, const void *src, __u32 len) = (vo * * 0, if current task does not belong to the cgroup2. * * A negative error code, if an error occurred. */ -static long (*bpf_current_task_under_cgroup)(void *map, __u32 index) = (void *) 37; +static long (* const bpf_current_task_under_cgroup)(void *map, __u32 index) = (void *) 37; /* * bpf_skb_change_tail @@ -993,7 +1003,7 @@ static long (*bpf_current_task_under_cgroup)(void *map, __u32 index) = (void *) * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_skb_change_tail)(struct __sk_buff *skb, __u32 len, __u64 flags) = (void *) 38; +static long (* const bpf_skb_change_tail)(struct __sk_buff *skb, __u32 len, __u64 flags) = (void *) 38; /* * bpf_skb_pull_data @@ -1001,7 +1011,8 @@ static long (*bpf_skb_change_tail)(struct __sk_buff *skb, __u32 len, __u64 flags * Pull in non-linear data in case the *skb* is non-linear and not * all of *len* are part of the linear section. Make *len* bytes * from *skb* readable and writable. If a zero value is passed for - * *len*, then the whole length of the *skb* is pulled. + * *len*, then all bytes in the linear part of *skb* will be made + * readable and writable. * * This helper is only needed for reading and writing with direct * packet access. @@ -1033,7 +1044,7 @@ static long (*bpf_skb_change_tail)(struct __sk_buff *skb, __u32 len, __u64 flags * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_skb_pull_data)(struct __sk_buff *skb, __u32 len) = (void *) 39; +static long (* const bpf_skb_pull_data)(struct __sk_buff *skb, __u32 len) = (void *) 39; /* * bpf_csum_update @@ -1049,7 +1060,7 @@ static long (*bpf_skb_pull_data)(struct __sk_buff *skb, __u32 len) = (void *) 39 * The checksum on success, or a negative error code in case of * failure. */ -static __s64 (*bpf_csum_update)(struct __sk_buff *skb, __wsum csum) = (void *) 40; +static __s64 (* const bpf_csum_update)(struct __sk_buff *skb, __wsum csum) = (void *) 40; /* * bpf_set_hash_invalid @@ -1063,7 +1074,7 @@ static __s64 (*bpf_csum_update)(struct __sk_buff *skb, __wsum csum) = (void *) 4 * Returns * void. */ -static void (*bpf_set_hash_invalid)(struct __sk_buff *skb) = (void *) 41; +static void (* const bpf_set_hash_invalid)(struct __sk_buff *skb) = (void *) 41; /* * bpf_get_numa_node_id @@ -1078,7 +1089,7 @@ static void (*bpf_set_hash_invalid)(struct __sk_buff *skb) = (void *) 41; * Returns * The id of current NUMA node. */ -static long (*bpf_get_numa_node_id)(void) = (void *) 42; +static long (* const bpf_get_numa_node_id)(void) = (void *) 42; /* * bpf_skb_change_head @@ -1103,7 +1114,7 @@ static long (*bpf_get_numa_node_id)(void) = (void *) 42; * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_skb_change_head)(struct __sk_buff *skb, __u32 len, __u64 flags) = (void *) 43; +static long (* const bpf_skb_change_head)(struct __sk_buff *skb, __u32 len, __u64 flags) = (void *) 43; /* * bpf_xdp_adjust_head @@ -1122,7 +1133,7 @@ static long (*bpf_skb_change_head)(struct __sk_buff *skb, __u32 len, __u64 flags * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_xdp_adjust_head)(struct xdp_md *xdp_md, int delta) = (void *) 44; +static long (* const bpf_xdp_adjust_head)(struct xdp_md *xdp_md, int delta) = (void *) 44; /* * bpf_probe_read_str @@ -1139,7 +1150,7 @@ static long (*bpf_xdp_adjust_head)(struct xdp_md *xdp_md, int delta) = (void *) * including the trailing NUL character. On error, a negative * value. */ -static long (*bpf_probe_read_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 45; +static long (* const bpf_probe_read_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 45; /* * bpf_get_socket_cookie @@ -1156,7 +1167,7 @@ static long (*bpf_probe_read_str)(void *dst, __u32 size, const void *unsafe_ptr) * A 8-byte long unique number on success, or 0 if the socket * field is missing inside *skb*. */ -static __u64 (*bpf_get_socket_cookie)(void *ctx) = (void *) 46; +static __u64 (* const bpf_get_socket_cookie)(void *ctx) = (void *) 46; /* * bpf_get_socket_uid @@ -1170,7 +1181,7 @@ static __u64 (*bpf_get_socket_cookie)(void *ctx) = (void *) 46; * is returned (note that **overflowuid** might also be the actual * UID value for the socket). */ -static __u32 (*bpf_get_socket_uid)(struct __sk_buff *skb) = (void *) 47; +static __u32 (* const bpf_get_socket_uid)(struct __sk_buff *skb) = (void *) 47; /* * bpf_set_hash @@ -1181,7 +1192,7 @@ static __u32 (*bpf_get_socket_uid)(struct __sk_buff *skb) = (void *) 47; * Returns * 0 */ -static long (*bpf_set_hash)(struct __sk_buff *skb, __u32 hash) = (void *) 48; +static long (* const bpf_set_hash)(struct __sk_buff *skb, __u32 hash) = (void *) 48; /* * bpf_setsockopt @@ -1195,8 +1206,8 @@ static long (*bpf_set_hash)(struct __sk_buff *skb, __u32 hash) = (void *) 48; * *bpf_socket* should be one of the following: * * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. - * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** - * and **BPF_CGROUP_INET6_CONNECT**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**, + * **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**. * * This helper actually implements a subset of **setsockopt()**. * It supports the following *level*\ s: @@ -1204,19 +1215,24 @@ static long (*bpf_set_hash)(struct __sk_buff *skb, __u32 hash) = (void *) 48; * * **SOL_SOCKET**, which supports the following *optname*\ s: * **SO_RCVBUF**, **SO_SNDBUF**, **SO_MAX_PACING_RATE**, * **SO_PRIORITY**, **SO_RCVLOWAT**, **SO_MARK**, - * **SO_BINDTODEVICE**, **SO_KEEPALIVE**. + * **SO_BINDTODEVICE**, **SO_KEEPALIVE**, **SO_REUSEADDR**, + * **SO_REUSEPORT**, **SO_BINDTOIFINDEX**, **SO_TXREHASH**. * * **IPPROTO_TCP**, which supports the following *optname*\ s: * **TCP_CONGESTION**, **TCP_BPF_IW**, * **TCP_BPF_SNDCWND_CLAMP**, **TCP_SAVE_SYN**, * **TCP_KEEPIDLE**, **TCP_KEEPINTVL**, **TCP_KEEPCNT**, - * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**. + * **TCP_SYNCNT**, **TCP_USER_TIMEOUT**, **TCP_NOTSENT_LOWAT**, + * **TCP_NODELAY**, **TCP_MAXSEG**, **TCP_WINDOW_CLAMP**, + * **TCP_THIN_LINEAR_TIMEOUTS**, **TCP_BPF_DELACK_MAX**, + * **TCP_BPF_RTO_MIN**. * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. - * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. + * * **IPPROTO_IPV6**, which supports the following *optname*\ s: + * **IPV6_TCLASS**, **IPV6_AUTOFLOWLABEL**. * * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_setsockopt)(void *bpf_socket, int level, int optname, void *optval, int optlen) = (void *) 49; +static long (* const bpf_setsockopt)(void *bpf_socket, int level, int optname, void *optval, int optlen) = (void *) 49; /* * bpf_skb_adjust_room @@ -1234,10 +1250,12 @@ static long (*bpf_setsockopt)(void *bpf_socket, int level, int optname, void *op * There are two supported modes at this time: * * * **BPF_ADJ_ROOM_MAC**: Adjust room at the mac layer - * (room space is added or removed below the layer 2 header). + * (room space is added or removed between the layer 2 and + * layer 3 headers). * * * **BPF_ADJ_ROOM_NET**: Adjust room at the network layer - * (room space is added or removed below the layer 3 header). + * (room space is added or removed between the layer 3 and + * layer 4 headers). * * The following flags are supported at this time: * @@ -1261,6 +1279,11 @@ static long (*bpf_setsockopt)(void *bpf_socket, int level, int optname, void *op * Use with BPF_F_ADJ_ROOM_ENCAP_L2 flag to further specify the * L2 type as Ethernet. * + * * **BPF_F_ADJ_ROOM_DECAP_L3_IPV4**, + * **BPF_F_ADJ_ROOM_DECAP_L3_IPV6**: + * Indicate the new IP header version after decapsulating the outer + * IP header. Used when the inner and outer IP versions are different. + * * A call to this helper is susceptible to change the underlying * packet buffer. Therefore, at load time, all checks on pointers * previously done by the verifier are invalidated and must be @@ -1270,7 +1293,7 @@ static long (*bpf_setsockopt)(void *bpf_socket, int level, int optname, void *op * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_skb_adjust_room)(struct __sk_buff *skb, __s32 len_diff, __u32 mode, __u64 flags) = (void *) 50; +static long (* const bpf_skb_adjust_room)(struct __sk_buff *skb, __s32 len_diff, __u32 mode, __u64 flags) = (void *) 50; /* * bpf_redirect_map @@ -1299,7 +1322,7 @@ static long (*bpf_skb_adjust_room)(struct __sk_buff *skb, __s32 len_diff, __u32 * **XDP_REDIRECT** on success, or the value of the two lower bits * of the *flags* argument on error. */ -static long (*bpf_redirect_map)(void *map, __u32 key, __u64 flags) = (void *) 51; +static long (* const bpf_redirect_map)(void *map, __u64 key, __u64 flags) = (void *) 51; /* * bpf_sk_redirect_map @@ -1314,7 +1337,7 @@ static long (*bpf_redirect_map)(void *map, __u32 key, __u64 flags) = (void *) 51 * Returns * **SK_PASS** on success, or **SK_DROP** on error. */ -static long (*bpf_sk_redirect_map)(struct __sk_buff *skb, void *map, __u32 key, __u64 flags) = (void *) 52; +static long (* const bpf_sk_redirect_map)(struct __sk_buff *skb, void *map, __u32 key, __u64 flags) = (void *) 52; /* * bpf_sock_map_update @@ -1337,7 +1360,7 @@ static long (*bpf_sk_redirect_map)(struct __sk_buff *skb, void *map, __u32 key, * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_sock_map_update)(struct bpf_sock_ops *skops, void *map, void *key, __u64 flags) = (void *) 53; +static long (* const bpf_sock_map_update)(struct bpf_sock_ops *skops, void *map, void *key, __u64 flags) = (void *) 53; /* * bpf_xdp_adjust_meta @@ -1370,7 +1393,7 @@ static long (*bpf_sock_map_update)(struct bpf_sock_ops *skops, void *map, void * * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_xdp_adjust_meta)(struct xdp_md *xdp_md, int delta) = (void *) 54; +static long (* const bpf_xdp_adjust_meta)(struct xdp_md *xdp_md, int delta) = (void *) 54; /* * bpf_perf_event_read_value @@ -1424,12 +1447,12 @@ static long (*bpf_xdp_adjust_meta)(struct xdp_md *xdp_md, int delta) = (void *) * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_perf_event_read_value)(void *map, __u64 flags, struct bpf_perf_event_value *buf, __u32 buf_size) = (void *) 55; +static long (* const bpf_perf_event_read_value)(void *map, __u64 flags, struct bpf_perf_event_value *buf, __u32 buf_size) = (void *) 55; /* * bpf_perf_prog_read_value * - * For en eBPF program attached to a perf event, retrieve the + * For an eBPF program attached to a perf event, retrieve the * value of the event counter associated to *ctx* and store it in * the structure pointed by *buf* and of size *buf_size*. Enabled * and running times are also stored in the structure (see @@ -1439,7 +1462,7 @@ static long (*bpf_perf_event_read_value)(void *map, __u64 flags, struct bpf_perf * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_perf_prog_read_value)(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, __u32 buf_size) = (void *) 56; +static long (* const bpf_perf_prog_read_value)(struct bpf_perf_event_data *ctx, struct bpf_perf_event_value *buf, __u32 buf_size) = (void *) 56; /* * bpf_getsockopt @@ -1454,21 +1477,19 @@ static long (*bpf_perf_prog_read_value)(struct bpf_perf_event_data *ctx, struct * *bpf_socket* should be one of the following: * * * **struct bpf_sock_ops** for **BPF_PROG_TYPE_SOCK_OPS**. - * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT** - * and **BPF_CGROUP_INET6_CONNECT**. + * * **struct bpf_sock_addr** for **BPF_CGROUP_INET4_CONNECT**, + * **BPF_CGROUP_INET6_CONNECT** and **BPF_CGROUP_UNIX_CONNECT**. * * This helper actually implements a subset of **getsockopt()**. - * It supports the following *level*\ s: - * - * * **IPPROTO_TCP**, which supports *optname* - * **TCP_CONGESTION**. - * * **IPPROTO_IP**, which supports *optname* **IP_TOS**. - * * **IPPROTO_IPV6**, which supports *optname* **IPV6_TCLASS**. + * It supports the same set of *optname*\ s that is supported by + * the **bpf_setsockopt**\ () helper. The exceptions are + * **TCP_BPF_*** is **bpf_setsockopt**\ () only and + * **TCP_SAVED_SYN** is **bpf_getsockopt**\ () only. * * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_getsockopt)(void *bpf_socket, int level, int optname, void *optval, int optlen) = (void *) 57; +static long (* const bpf_getsockopt)(void *bpf_socket, int level, int optname, void *optval, int optlen) = (void *) 57; /* * bpf_override_return @@ -1497,7 +1518,7 @@ static long (*bpf_getsockopt)(void *bpf_socket, int level, int optname, void *op * Returns * 0 */ -static long (*bpf_override_return)(struct pt_regs *regs, __u64 rc) = (void *) 58; +static long (* const bpf_override_return)(struct pt_regs *regs, __u64 rc) = (void *) 58; /* * bpf_sock_ops_cb_flags_set @@ -1545,7 +1566,7 @@ static long (*bpf_override_return)(struct pt_regs *regs, __u64 rc) = (void *) 58 * be set is returned (which comes down to 0 if all bits were set * as required). */ -static long (*bpf_sock_ops_cb_flags_set)(struct bpf_sock_ops *bpf_sock, int argval) = (void *) 59; +static long (* const bpf_sock_ops_cb_flags_set)(struct bpf_sock_ops *bpf_sock, int argval) = (void *) 59; /* * bpf_msg_redirect_map @@ -1563,7 +1584,7 @@ static long (*bpf_sock_ops_cb_flags_set)(struct bpf_sock_ops *bpf_sock, int argv * Returns * **SK_PASS** on success, or **SK_DROP** on error. */ -static long (*bpf_msg_redirect_map)(struct sk_msg_md *msg, void *map, __u32 key, __u64 flags) = (void *) 60; +static long (* const bpf_msg_redirect_map)(struct sk_msg_md *msg, void *map, __u32 key, __u64 flags) = (void *) 60; /* * bpf_msg_apply_bytes @@ -1601,7 +1622,7 @@ static long (*bpf_msg_redirect_map)(struct sk_msg_md *msg, void *map, __u32 key, * Returns * 0 */ -static long (*bpf_msg_apply_bytes)(struct sk_msg_md *msg, __u32 bytes) = (void *) 61; +static long (* const bpf_msg_apply_bytes)(struct sk_msg_md *msg, __u32 bytes) = (void *) 61; /* * bpf_msg_cork_bytes @@ -1623,7 +1644,7 @@ static long (*bpf_msg_apply_bytes)(struct sk_msg_md *msg, __u32 bytes) = (void * * Returns * 0 */ -static long (*bpf_msg_cork_bytes)(struct sk_msg_md *msg, __u32 bytes) = (void *) 62; +static long (* const bpf_msg_cork_bytes)(struct sk_msg_md *msg, __u32 bytes) = (void *) 62; /* * bpf_msg_pull_data @@ -1658,7 +1679,7 @@ static long (*bpf_msg_cork_bytes)(struct sk_msg_md *msg, __u32 bytes) = (void *) * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_msg_pull_data)(struct sk_msg_md *msg, __u32 start, __u32 end, __u64 flags) = (void *) 63; +static long (* const bpf_msg_pull_data)(struct sk_msg_md *msg, __u32 start, __u32 end, __u64 flags) = (void *) 63; /* * bpf_bind @@ -1680,7 +1701,7 @@ static long (*bpf_msg_pull_data)(struct sk_msg_md *msg, __u32 start, __u32 end, * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_bind)(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) = (void *) 64; +static long (* const bpf_bind)(struct bpf_sock_addr *ctx, struct sockaddr *addr, int addr_len) = (void *) 64; /* * bpf_xdp_adjust_tail @@ -1698,7 +1719,7 @@ static long (*bpf_bind)(struct bpf_sock_addr *ctx, struct sockaddr *addr, int ad * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_xdp_adjust_tail)(struct xdp_md *xdp_md, int delta) = (void *) 65; +static long (* const bpf_xdp_adjust_tail)(struct xdp_md *xdp_md, int delta) = (void *) 65; /* * bpf_skb_get_xfrm_state @@ -1718,7 +1739,7 @@ static long (*bpf_xdp_adjust_tail)(struct xdp_md *xdp_md, int delta) = (void *) * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_skb_get_xfrm_state)(struct __sk_buff *skb, __u32 index, struct bpf_xfrm_state *xfrm_state, __u32 size, __u64 flags) = (void *) 66; +static long (* const bpf_skb_get_xfrm_state)(struct __sk_buff *skb, __u32 index, struct bpf_xfrm_state *xfrm_state, __u32 size, __u64 flags) = (void *) 66; /* * bpf_get_stack @@ -1737,8 +1758,18 @@ static long (*bpf_skb_get_xfrm_state)(struct __sk_buff *skb, __u32 index, struct * **BPF_F_USER_STACK** * Collect a user space stack instead of a kernel stack. * **BPF_F_USER_BUILD_ID** - * Collect buildid+offset instead of ips for user stack, - * only valid if **BPF_F_USER_STACK** is also specified. + * Collect (build_id, file_offset) instead of ips for user + * stack, only valid if **BPF_F_USER_STACK** is also + * specified. + * + * *file_offset* is an offset relative to the beginning + * of the executable or shared object file backing the vma + * which the *ip* falls in. It is *not* an offset relative + * to that object's base address. Accordingly, it must be + * adjusted by adding (sh_addr - sh_offset), where + * sh_{addr,offset} correspond to the executable section + * containing *file_offset* in the object, for comparisons + * to symbols' st_value to be valid. * * **bpf_get_stack**\ () can collect up to * **PERF_MAX_STACK_DEPTH** both kernel and user frames, subject @@ -1755,7 +1786,7 @@ static long (*bpf_skb_get_xfrm_state)(struct __sk_buff *skb, __u32 index, struct * The non-negative copied *buf* length equal to or less than * *size* on success, or a negative error in case of failure. */ -static long (*bpf_get_stack)(void *ctx, void *buf, __u32 size, __u64 flags) = (void *) 67; +static long (* const bpf_get_stack)(void *ctx, void *buf, __u32 size, __u64 flags) = (void *) 67; /* * bpf_skb_load_bytes_relative @@ -1781,7 +1812,7 @@ static long (*bpf_get_stack)(void *ctx, void *buf, __u32 size, __u64 flags) = (v * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_skb_load_bytes_relative)(const void *skb, __u32 offset, void *to, __u32 len, __u32 start_header) = (void *) 68; +static long (* const bpf_skb_load_bytes_relative)(const void *skb, __u32 offset, void *to, __u32 len, __u32 start_header) = (void *) 68; /* * bpf_fib_lookup @@ -1803,9 +1834,27 @@ static long (*bpf_skb_load_bytes_relative)(const void *skb, __u32 offset, void * * **BPF_FIB_LOOKUP_DIRECT** * Do a direct table lookup vs full lookup using FIB * rules. + * **BPF_FIB_LOOKUP_TBID** + * Used with BPF_FIB_LOOKUP_DIRECT. + * Use the routing table ID present in *params*->tbid + * for the fib lookup. * **BPF_FIB_LOOKUP_OUTPUT** * Perform lookup from an egress perspective (default is * ingress). + * **BPF_FIB_LOOKUP_SKIP_NEIGH** + * Skip the neighbour table lookup. *params*->dmac + * and *params*->smac will not be set as output. A common + * use case is to call **bpf_redirect_neigh**\ () after + * doing **bpf_fib_lookup**\ (). + * **BPF_FIB_LOOKUP_SRC** + * Derive and set source IP addr in *params*->ipv{4,6}_src + * for the nexthop. If the src addr cannot be derived, + * **BPF_FIB_LKUP_RET_NO_SRC_ADDR** is returned. In this + * case, *params*->dmac and *params*->smac are not set either. + * **BPF_FIB_LOOKUP_MARK** + * Use the mark present in *params*->mark for the fib lookup. + * This option should not be used with BPF_FIB_LOOKUP_DIRECT, + * as it only has meaning for full lookups. * * *ctx* is either **struct xdp_md** for XDP programs or * **struct sk_buff** tc cls_act programs. @@ -1819,7 +1868,7 @@ static long (*bpf_skb_load_bytes_relative)(const void *skb, __u32 offset, void * * If lookup fails with BPF_FIB_LKUP_RET_FRAG_NEEDED, then the MTU * was exceeded and output params->mtu_result contains the MTU. */ -static long (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params, int plen, __u32 flags) = (void *) 69; +static long (* const bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params, int plen, __u32 flags) = (void *) 69; /* * bpf_sock_hash_update @@ -1842,7 +1891,7 @@ static long (*bpf_fib_lookup)(void *ctx, struct bpf_fib_lookup *params, int plen * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_sock_hash_update)(struct bpf_sock_ops *skops, void *map, void *key, __u64 flags) = (void *) 70; +static long (* const bpf_sock_hash_update)(struct bpf_sock_ops *skops, void *map, void *key, __u64 flags) = (void *) 70; /* * bpf_msg_redirect_hash @@ -1860,7 +1909,7 @@ static long (*bpf_sock_hash_update)(struct bpf_sock_ops *skops, void *map, void * Returns * **SK_PASS** on success, or **SK_DROP** on error. */ -static long (*bpf_msg_redirect_hash)(struct sk_msg_md *msg, void *map, void *key, __u64 flags) = (void *) 71; +static long (* const bpf_msg_redirect_hash)(struct sk_msg_md *msg, void *map, void *key, __u64 flags) = (void *) 71; /* * bpf_sk_redirect_hash @@ -1878,7 +1927,7 @@ static long (*bpf_msg_redirect_hash)(struct sk_msg_md *msg, void *map, void *key * Returns * **SK_PASS** on success, or **SK_DROP** on error. */ -static long (*bpf_sk_redirect_hash)(struct __sk_buff *skb, void *map, void *key, __u64 flags) = (void *) 72; +static long (* const bpf_sk_redirect_hash)(struct __sk_buff *skb, void *map, void *key, __u64 flags) = (void *) 72; /* * bpf_lwt_push_encap @@ -1919,7 +1968,7 @@ static long (*bpf_sk_redirect_hash)(struct __sk_buff *skb, void *map, void *key, * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_lwt_push_encap)(struct __sk_buff *skb, __u32 type, void *hdr, __u32 len) = (void *) 73; +static long (* const bpf_lwt_push_encap)(struct __sk_buff *skb, __u32 type, void *hdr, __u32 len) = (void *) 73; /* * bpf_lwt_seg6_store_bytes @@ -1938,7 +1987,7 @@ static long (*bpf_lwt_push_encap)(struct __sk_buff *skb, __u32 type, void *hdr, * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_lwt_seg6_store_bytes)(struct __sk_buff *skb, __u32 offset, const void *from, __u32 len) = (void *) 74; +static long (* const bpf_lwt_seg6_store_bytes)(struct __sk_buff *skb, __u32 offset, const void *from, __u32 len) = (void *) 74; /* * bpf_lwt_seg6_adjust_srh @@ -1958,7 +2007,7 @@ static long (*bpf_lwt_seg6_store_bytes)(struct __sk_buff *skb, __u32 offset, con * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_lwt_seg6_adjust_srh)(struct __sk_buff *skb, __u32 offset, __s32 delta) = (void *) 75; +static long (* const bpf_lwt_seg6_adjust_srh)(struct __sk_buff *skb, __u32 offset, __s32 delta) = (void *) 75; /* * bpf_lwt_seg6_action @@ -1991,7 +2040,7 @@ static long (*bpf_lwt_seg6_adjust_srh)(struct __sk_buff *skb, __u32 offset, __s3 * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_lwt_seg6_action)(struct __sk_buff *skb, __u32 action, void *param, __u32 param_len) = (void *) 76; +static long (* const bpf_lwt_seg6_action)(struct __sk_buff *skb, __u32 action, void *param, __u32 param_len) = (void *) 76; /* * bpf_rc_repeat @@ -2014,7 +2063,7 @@ static long (*bpf_lwt_seg6_action)(struct __sk_buff *skb, __u32 action, void *pa * Returns * 0 */ -static long (*bpf_rc_repeat)(void *ctx) = (void *) 77; +static long (* const bpf_rc_repeat)(void *ctx) = (void *) 77; /* * bpf_rc_keydown @@ -2044,7 +2093,7 @@ static long (*bpf_rc_repeat)(void *ctx) = (void *) 77; * Returns * 0 */ -static long (*bpf_rc_keydown)(void *ctx, __u32 protocol, __u64 scancode, __u32 toggle) = (void *) 78; +static long (* const bpf_rc_keydown)(void *ctx, __u32 protocol, __u64 scancode, __u32 toggle) = (void *) 78; /* * bpf_skb_cgroup_id @@ -2064,7 +2113,7 @@ static long (*bpf_rc_keydown)(void *ctx, __u32 protocol, __u64 scancode, __u32 t * Returns * The id is returned or 0 in case the id could not be retrieved. */ -static __u64 (*bpf_skb_cgroup_id)(struct __sk_buff *skb) = (void *) 79; +static __u64 (* const bpf_skb_cgroup_id)(struct __sk_buff *skb) = (void *) 79; /* * bpf_get_current_cgroup_id @@ -2076,7 +2125,7 @@ static __u64 (*bpf_skb_cgroup_id)(struct __sk_buff *skb) = (void *) 79; * A 64-bit integer containing the current cgroup id based * on the cgroup within which the current task is running. */ -static __u64 (*bpf_get_current_cgroup_id)(void) = (void *) 80; +static __u64 (* const bpf_get_current_cgroup_id)(void) = (void *) 80; /* * bpf_get_local_storage @@ -2098,7 +2147,7 @@ static __u64 (*bpf_get_current_cgroup_id)(void) = (void *) 80; * Returns * A pointer to the local storage area. */ -static void *(*bpf_get_local_storage)(void *map, __u64 flags) = (void *) 81; +static void *(* const bpf_get_local_storage)(void *map, __u64 flags) = (void *) 81; /* * bpf_sk_select_reuseport @@ -2111,7 +2160,7 @@ static void *(*bpf_get_local_storage)(void *map, __u64 flags) = (void *) 81; * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_sk_select_reuseport)(struct sk_reuseport_md *reuse, void *map, void *key, __u64 flags) = (void *) 82; +static long (* const bpf_sk_select_reuseport)(struct sk_reuseport_md *reuse, void *map, void *key, __u64 flags) = (void *) 82; /* * bpf_skb_ancestor_cgroup_id @@ -2133,7 +2182,7 @@ static long (*bpf_sk_select_reuseport)(struct sk_reuseport_md *reuse, void *map, * Returns * The id is returned or 0 in case the id could not be retrieved. */ -static __u64 (*bpf_skb_ancestor_cgroup_id)(struct __sk_buff *skb, int ancestor_level) = (void *) 83; +static __u64 (* const bpf_skb_ancestor_cgroup_id)(struct __sk_buff *skb, int ancestor_level) = (void *) 83; /* * bpf_sk_lookup_tcp @@ -2174,7 +2223,7 @@ static __u64 (*bpf_skb_ancestor_cgroup_id)(struct __sk_buff *skb, int ancestor_l * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. */ -static struct bpf_sock *(*bpf_sk_lookup_tcp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 84; +static struct bpf_sock *(* const bpf_sk_lookup_tcp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 84; /* * bpf_sk_lookup_udp @@ -2215,7 +2264,7 @@ static struct bpf_sock *(*bpf_sk_lookup_tcp)(void *ctx, struct bpf_sock_tuple *t * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. */ -static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 85; +static struct bpf_sock *(* const bpf_sk_lookup_udp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 85; /* * bpf_sk_release @@ -2227,7 +2276,7 @@ static struct bpf_sock *(*bpf_sk_lookup_udp)(void *ctx, struct bpf_sock_tuple *t * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_sk_release)(void *sock) = (void *) 86; +static long (* const bpf_sk_release)(void *sock) = (void *) 86; /* * bpf_map_push_elem @@ -2241,7 +2290,7 @@ static long (*bpf_sk_release)(void *sock) = (void *) 86; * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_map_push_elem)(void *map, const void *value, __u64 flags) = (void *) 87; +static long (* const bpf_map_push_elem)(void *map, const void *value, __u64 flags) = (void *) 87; /* * bpf_map_pop_elem @@ -2251,7 +2300,7 @@ static long (*bpf_map_push_elem)(void *map, const void *value, __u64 flags) = (v * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_map_pop_elem)(void *map, void *value) = (void *) 88; +static long (* const bpf_map_pop_elem)(void *map, void *value) = (void *) 88; /* * bpf_map_peek_elem @@ -2261,7 +2310,7 @@ static long (*bpf_map_pop_elem)(void *map, void *value) = (void *) 88; * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_map_peek_elem)(void *map, void *value) = (void *) 89; +static long (* const bpf_map_peek_elem)(void *map, void *value) = (void *) 89; /* * bpf_msg_push_data @@ -2281,7 +2330,7 @@ static long (*bpf_map_peek_elem)(void *map, void *value) = (void *) 89; * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_msg_push_data)(struct sk_msg_md *msg, __u32 start, __u32 len, __u64 flags) = (void *) 90; +static long (* const bpf_msg_push_data)(struct sk_msg_md *msg, __u32 start, __u32 len, __u64 flags) = (void *) 90; /* * bpf_msg_pop_data @@ -2297,7 +2346,7 @@ static long (*bpf_msg_push_data)(struct sk_msg_md *msg, __u32 start, __u32 len, * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_msg_pop_data)(struct sk_msg_md *msg, __u32 start, __u32 len, __u64 flags) = (void *) 91; +static long (* const bpf_msg_pop_data)(struct sk_msg_md *msg, __u32 start, __u32 len, __u64 flags) = (void *) 91; /* * bpf_rc_pointer_rel @@ -2315,7 +2364,7 @@ static long (*bpf_msg_pop_data)(struct sk_msg_md *msg, __u32 start, __u32 len, _ * Returns * 0 */ -static long (*bpf_rc_pointer_rel)(void *ctx, __s32 rel_x, __s32 rel_y) = (void *) 92; +static long (* const bpf_rc_pointer_rel)(void *ctx, __s32 rel_x, __s32 rel_y) = (void *) 92; /* * bpf_spin_lock @@ -2367,7 +2416,7 @@ static long (*bpf_rc_pointer_rel)(void *ctx, __s32 rel_x, __s32 rel_y) = (void * * Returns * 0 */ -static long (*bpf_spin_lock)(struct bpf_spin_lock *lock) = (void *) 93; +static long (* const bpf_spin_lock)(struct bpf_spin_lock *lock) = (void *) 93; /* * bpf_spin_unlock @@ -2378,7 +2427,7 @@ static long (*bpf_spin_lock)(struct bpf_spin_lock *lock) = (void *) 93; * Returns * 0 */ -static long (*bpf_spin_unlock)(struct bpf_spin_lock *lock) = (void *) 94; +static long (* const bpf_spin_unlock)(struct bpf_spin_lock *lock) = (void *) 94; /* * bpf_sk_fullsock @@ -2390,7 +2439,7 @@ static long (*bpf_spin_unlock)(struct bpf_spin_lock *lock) = (void *) 94; * A **struct bpf_sock** pointer on success, or **NULL** in * case of failure. */ -static struct bpf_sock *(*bpf_sk_fullsock)(struct bpf_sock *sk) = (void *) 95; +static struct bpf_sock *(* const bpf_sk_fullsock)(struct bpf_sock *sk) = (void *) 95; /* * bpf_tcp_sock @@ -2402,7 +2451,7 @@ static struct bpf_sock *(*bpf_sk_fullsock)(struct bpf_sock *sk) = (void *) 95; * A **struct bpf_tcp_sock** pointer on success, or **NULL** in * case of failure. */ -static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) = (void *) 96; +static struct bpf_tcp_sock *(* const bpf_tcp_sock)(struct bpf_sock *sk) = (void *) 96; /* * bpf_skb_ecn_set_ce @@ -2416,7 +2465,7 @@ static struct bpf_tcp_sock *(*bpf_tcp_sock)(struct bpf_sock *sk) = (void *) 96; * 1 if the **CE** flag is set (either by the current helper call * or because it was already present), 0 if it is not set. */ -static long (*bpf_skb_ecn_set_ce)(struct __sk_buff *skb) = (void *) 97; +static long (* const bpf_skb_ecn_set_ce)(struct __sk_buff *skb) = (void *) 97; /* * bpf_get_listener_sock @@ -2428,7 +2477,7 @@ static long (*bpf_skb_ecn_set_ce)(struct __sk_buff *skb) = (void *) 97; * A **struct bpf_sock** pointer on success, or **NULL** in * case of failure. */ -static struct bpf_sock *(*bpf_get_listener_sock)(struct bpf_sock *sk) = (void *) 98; +static struct bpf_sock *(* const bpf_get_listener_sock)(struct bpf_sock *sk) = (void *) 98; /* * bpf_skc_lookup_tcp @@ -2451,7 +2500,7 @@ static struct bpf_sock *(*bpf_get_listener_sock)(struct bpf_sock *sk) = (void *) * result is from *reuse*\ **->socks**\ [] using the hash of the * tuple. */ -static struct bpf_sock *(*bpf_skc_lookup_tcp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 99; +static struct bpf_sock *(* const bpf_skc_lookup_tcp)(void *ctx, struct bpf_sock_tuple *tuple, __u32 tuple_size, __u64 netns, __u64 flags) = (void *) 99; /* * bpf_tcp_check_syncookie @@ -2461,16 +2510,17 @@ static struct bpf_sock *(*bpf_skc_lookup_tcp)(void *ctx, struct bpf_sock_tuple * * * *iph* points to the start of the IPv4 or IPv6 header, while * *iph_len* contains **sizeof**\ (**struct iphdr**) or - * **sizeof**\ (**struct ip6hdr**). + * **sizeof**\ (**struct ipv6hdr**). * * *th* points to the start of the TCP header, while *th_len* - * contains **sizeof**\ (**struct tcphdr**). + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). * * Returns * 0 if *iph* and *th* are a valid SYN cookie ACK, or a negative * error otherwise. */ -static long (*bpf_tcp_check_syncookie)(void *sk, void *iph, __u32 iph_len, struct tcphdr *th, __u32 th_len) = (void *) 100; +static long (* const bpf_tcp_check_syncookie)(void *sk, void *iph, __u32 iph_len, struct tcphdr *th, __u32 th_len) = (void *) 100; /* * bpf_sysctl_get_name @@ -2490,7 +2540,7 @@ static long (*bpf_tcp_check_syncookie)(void *sk, void *iph, __u32 iph_len, struc * **-E2BIG** if the buffer wasn't big enough (*buf* will contain * truncated name in this case). */ -static long (*bpf_sysctl_get_name)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len, __u64 flags) = (void *) 101; +static long (* const bpf_sysctl_get_name)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len, __u64 flags) = (void *) 101; /* * bpf_sysctl_get_current_value @@ -2513,7 +2563,7 @@ static long (*bpf_sysctl_get_name)(struct bpf_sysctl *ctx, char *buf, unsigned l * **-EINVAL** if current value was unavailable, e.g. because * sysctl is uninitialized and read returns -EIO for it. */ -static long (*bpf_sysctl_get_current_value)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len) = (void *) 102; +static long (* const bpf_sysctl_get_current_value)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len) = (void *) 102; /* * bpf_sysctl_get_new_value @@ -2534,7 +2584,7 @@ static long (*bpf_sysctl_get_current_value)(struct bpf_sysctl *ctx, char *buf, u * * **-EINVAL** if sysctl is being read. */ -static long (*bpf_sysctl_get_new_value)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len) = (void *) 103; +static long (* const bpf_sysctl_get_new_value)(struct bpf_sysctl *ctx, char *buf, unsigned long buf_len) = (void *) 103; /* * bpf_sysctl_set_new_value @@ -2555,7 +2605,7 @@ static long (*bpf_sysctl_get_new_value)(struct bpf_sysctl *ctx, char *buf, unsig * * **-EINVAL** if sysctl is being read. */ -static long (*bpf_sysctl_set_new_value)(struct bpf_sysctl *ctx, const char *buf, unsigned long buf_len) = (void *) 104; +static long (* const bpf_sysctl_set_new_value)(struct bpf_sysctl *ctx, const char *buf, unsigned long buf_len) = (void *) 104; /* * bpf_strtol @@ -2583,7 +2633,7 @@ static long (*bpf_sysctl_set_new_value)(struct bpf_sysctl *ctx, const char *buf, * * **-ERANGE** if resulting value was out of range. */ -static long (*bpf_strtol)(const char *buf, unsigned long buf_len, __u64 flags, long *res) = (void *) 105; +static long (* const bpf_strtol)(const char *buf, unsigned long buf_len, __u64 flags, long *res) = (void *) 105; /* * bpf_strtoul @@ -2610,7 +2660,7 @@ static long (*bpf_strtol)(const char *buf, unsigned long buf_len, __u64 flags, l * * **-ERANGE** if resulting value was out of range. */ -static long (*bpf_strtoul)(const char *buf, unsigned long buf_len, __u64 flags, unsigned long *res) = (void *) 106; +static long (* const bpf_strtoul)(const char *buf, unsigned long buf_len, __u64 flags, unsigned long *res) = (void *) 106; /* * bpf_sk_storage_get @@ -2645,7 +2695,7 @@ static long (*bpf_strtoul)(const char *buf, unsigned long buf_len, __u64 flags, * **NULL** if not found or there was an error in adding * a new bpf-local-storage. */ -static void *(*bpf_sk_storage_get)(void *map, void *sk, void *value, __u64 flags) = (void *) 107; +static void *(* const bpf_sk_storage_get)(void *map, void *sk, void *value, __u64 flags) = (void *) 107; /* * bpf_sk_storage_delete @@ -2658,7 +2708,7 @@ static void *(*bpf_sk_storage_get)(void *map, void *sk, void *value, __u64 flags * **-ENOENT** if the bpf-local-storage cannot be found. * **-EINVAL** if sk is not a fullsock (e.g. a request_sock). */ -static long (*bpf_sk_storage_delete)(void *map, void *sk) = (void *) 108; +static long (* const bpf_sk_storage_delete)(void *map, void *sk) = (void *) 108; /* * bpf_send_signal @@ -2677,7 +2727,7 @@ static long (*bpf_sk_storage_delete)(void *map, void *sk) = (void *) 108; * * **-EAGAIN** if bpf program can try again. */ -static long (*bpf_send_signal)(__u32 sig) = (void *) 109; +static long (* const bpf_send_signal)(__u32 sig) = (void *) 109; /* * bpf_tcp_gen_syncookie @@ -2687,10 +2737,11 @@ static long (*bpf_send_signal)(__u32 sig) = (void *) 109; * * *iph* points to the start of the IPv4 or IPv6 header, while * *iph_len* contains **sizeof**\ (**struct iphdr**) or - * **sizeof**\ (**struct ip6hdr**). + * **sizeof**\ (**struct ipv6hdr**). * * *th* points to the start of the TCP header, while *th_len* - * contains the length of the TCP header. + * contains the length of the TCP header with options (at least + * **sizeof**\ (**struct tcphdr**)). * * Returns * On success, lower 32 bits hold the generated SYN cookie in @@ -2707,7 +2758,7 @@ static long (*bpf_send_signal)(__u32 sig) = (void *) 109; * * **-EPROTONOSUPPORT** IP packet version is not 4 or 6 */ -static __s64 (*bpf_tcp_gen_syncookie)(void *sk, void *iph, __u32 iph_len, struct tcphdr *th, __u32 th_len) = (void *) 110; +static __s64 (* const bpf_tcp_gen_syncookie)(void *sk, void *iph, __u32 iph_len, struct tcphdr *th, __u32 th_len) = (void *) 110; /* * bpf_skb_output @@ -2735,7 +2786,7 @@ static __s64 (*bpf_tcp_gen_syncookie)(void *sk, void *iph, __u32 iph_len, struct * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_skb_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 111; +static long (* const bpf_skb_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 111; /* * bpf_probe_read_user @@ -2746,7 +2797,7 @@ static long (*bpf_skb_output)(void *ctx, void *map, __u64 flags, void *data, __u * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_probe_read_user)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 112; +static long (* const bpf_probe_read_user)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 112; /* * bpf_probe_read_kernel @@ -2757,7 +2808,7 @@ static long (*bpf_probe_read_user)(void *dst, __u32 size, const void *unsafe_ptr * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_probe_read_kernel)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 113; +static long (* const bpf_probe_read_kernel)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 113; /* * bpf_probe_read_user_str @@ -2805,7 +2856,7 @@ static long (*bpf_probe_read_kernel)(void *dst, __u32 size, const void *unsafe_p * including the trailing NUL character. On error, a negative * value. */ -static long (*bpf_probe_read_user_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 114; +static long (* const bpf_probe_read_user_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 114; /* * bpf_probe_read_kernel_str @@ -2817,7 +2868,7 @@ static long (*bpf_probe_read_user_str)(void *dst, __u32 size, const void *unsafe * On success, the strictly positive length of the string, including * the trailing NUL character. On error, a negative value. */ -static long (*bpf_probe_read_kernel_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 115; +static long (* const bpf_probe_read_kernel_str)(void *dst, __u32 size, const void *unsafe_ptr) = (void *) 115; /* * bpf_tcp_send_ack @@ -2828,7 +2879,7 @@ static long (*bpf_probe_read_kernel_str)(void *dst, __u32 size, const void *unsa * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_tcp_send_ack)(void *tp, __u32 rcv_nxt) = (void *) 116; +static long (* const bpf_tcp_send_ack)(void *tp, __u32 rcv_nxt) = (void *) 116; /* * bpf_send_signal_thread @@ -2846,7 +2897,7 @@ static long (*bpf_tcp_send_ack)(void *tp, __u32 rcv_nxt) = (void *) 116; * * **-EAGAIN** if bpf program can try again. */ -static long (*bpf_send_signal_thread)(__u32 sig) = (void *) 117; +static long (* const bpf_send_signal_thread)(__u32 sig) = (void *) 117; /* * bpf_jiffies64 @@ -2856,7 +2907,7 @@ static long (*bpf_send_signal_thread)(__u32 sig) = (void *) 117; * Returns * The 64 bit jiffies */ -static __u64 (*bpf_jiffies64)(void) = (void *) 118; +static __u64 (* const bpf_jiffies64)(void) = (void *) 118; /* * bpf_read_branch_records @@ -2879,7 +2930,7 @@ static __u64 (*bpf_jiffies64)(void) = (void *) 118; * * **-ENOENT** if architecture does not support branch records. */ -static long (*bpf_read_branch_records)(struct bpf_perf_event_data *ctx, void *buf, __u32 size, __u64 flags) = (void *) 119; +static long (* const bpf_read_branch_records)(struct bpf_perf_event_data *ctx, void *buf, __u32 size, __u64 flags) = (void *) 119; /* * bpf_get_ns_current_pid_tgid @@ -2895,7 +2946,7 @@ static long (*bpf_read_branch_records)(struct bpf_perf_event_data *ctx, void *bu * * **-ENOENT** if pidns does not exists for the current task. */ -static long (*bpf_get_ns_current_pid_tgid)(__u64 dev, __u64 ino, struct bpf_pidns_info *nsdata, __u32 size) = (void *) 120; +static long (* const bpf_get_ns_current_pid_tgid)(__u64 dev, __u64 ino, struct bpf_pidns_info *nsdata, __u32 size) = (void *) 120; /* * bpf_xdp_output @@ -2923,7 +2974,7 @@ static long (*bpf_get_ns_current_pid_tgid)(__u64 dev, __u64 ino, struct bpf_pidn * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_xdp_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 121; +static long (* const bpf_xdp_output)(void *ctx, void *map, __u64 flags, void *data, __u64 size) = (void *) 121; /* * bpf_get_netns_cookie @@ -2940,7 +2991,7 @@ static long (*bpf_xdp_output)(void *ctx, void *map, __u64 flags, void *data, __u * Returns * A 8-byte long opaque number. */ -static __u64 (*bpf_get_netns_cookie)(void *ctx) = (void *) 122; +static __u64 (* const bpf_get_netns_cookie)(void *ctx) = (void *) 122; /* * bpf_get_current_ancestor_cgroup_id @@ -2962,7 +3013,7 @@ static __u64 (*bpf_get_netns_cookie)(void *ctx) = (void *) 122; * Returns * The id is returned or 0 in case the id could not be retrieved. */ -static __u64 (*bpf_get_current_ancestor_cgroup_id)(int ancestor_level) = (void *) 123; +static __u64 (* const bpf_get_current_ancestor_cgroup_id)(int ancestor_level) = (void *) 123; /* * bpf_sk_assign @@ -2993,11 +3044,8 @@ static __u64 (*bpf_get_current_ancestor_cgroup_id)(int ancestor_level) = (void * * * **-EOPNOTSUPP** if the operation is not supported, for example * a call from outside of TC ingress. - * - * **-ESOCKTNOSUPPORT** if the socket type is not supported - * (reuseport). */ -static long (*bpf_sk_assign)(void *ctx, void *sk, __u64 flags) = (void *) 124; +static long (* const bpf_sk_assign)(void *ctx, void *sk, __u64 flags) = (void *) 124; /* * bpf_ktime_get_boot_ns @@ -3009,7 +3057,7 @@ static long (*bpf_sk_assign)(void *ctx, void *sk, __u64 flags) = (void *) 124; * Returns * Current *ktime*. */ -static __u64 (*bpf_ktime_get_boot_ns)(void) = (void *) 125; +static __u64 (* const bpf_ktime_get_boot_ns)(void) = (void *) 125; /* * bpf_seq_printf @@ -3042,7 +3090,7 @@ static __u64 (*bpf_ktime_get_boot_ns)(void) = (void *) 125; * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. */ -static long (*bpf_seq_printf)(struct seq_file *m, const char *fmt, __u32 fmt_size, const void *data, __u32 data_len) = (void *) 126; +static long (* const bpf_seq_printf)(struct seq_file *m, const char *fmt, __u32 fmt_size, const void *data, __u32 data_len) = (void *) 126; /* * bpf_seq_write @@ -3056,7 +3104,7 @@ static long (*bpf_seq_printf)(struct seq_file *m, const char *fmt, __u32 fmt_siz * * **-EOVERFLOW** if an overflow happened: The same object will be tried again. */ -static long (*bpf_seq_write)(struct seq_file *m, const void *data, __u32 len) = (void *) 127; +static long (* const bpf_seq_write)(struct seq_file *m, const void *data, __u32 len) = (void *) 127; /* * bpf_sk_cgroup_id @@ -3074,7 +3122,7 @@ static long (*bpf_seq_write)(struct seq_file *m, const void *data, __u32 len) = * Returns * The id is returned or 0 in case the id could not be retrieved. */ -static __u64 (*bpf_sk_cgroup_id)(void *sk) = (void *) 128; +static __u64 (* const bpf_sk_cgroup_id)(void *sk) = (void *) 128; /* * bpf_sk_ancestor_cgroup_id @@ -3096,7 +3144,7 @@ static __u64 (*bpf_sk_cgroup_id)(void *sk) = (void *) 128; * Returns * The id is returned or 0 in case the id could not be retrieved. */ -static __u64 (*bpf_sk_ancestor_cgroup_id)(void *sk, int ancestor_level) = (void *) 129; +static __u64 (* const bpf_sk_ancestor_cgroup_id)(void *sk, int ancestor_level) = (void *) 129; /* * bpf_ringbuf_output @@ -3117,7 +3165,7 @@ static __u64 (*bpf_sk_ancestor_cgroup_id)(void *sk, int ancestor_level) = (void * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_ringbuf_output)(void *ringbuf, void *data, __u64 size, __u64 flags) = (void *) 130; +static long (* const bpf_ringbuf_output)(void *ringbuf, void *data, __u64 size, __u64 flags) = (void *) 130; /* * bpf_ringbuf_reserve @@ -3129,7 +3177,7 @@ static long (*bpf_ringbuf_output)(void *ringbuf, void *data, __u64 size, __u64 f * Valid pointer with *size* bytes of memory available; NULL, * otherwise. */ -static void *(*bpf_ringbuf_reserve)(void *ringbuf, __u64 size, __u64 flags) = (void *) 131; +static void *(* const bpf_ringbuf_reserve)(void *ringbuf, __u64 size, __u64 flags) = (void *) 131; /* * bpf_ringbuf_submit @@ -3147,7 +3195,7 @@ static void *(*bpf_ringbuf_reserve)(void *ringbuf, __u64 size, __u64 flags) = (v * Returns * Nothing. Always succeeds. */ -static void (*bpf_ringbuf_submit)(void *data, __u64 flags) = (void *) 132; +static void (* const bpf_ringbuf_submit)(void *data, __u64 flags) = (void *) 132; /* * bpf_ringbuf_discard @@ -3165,7 +3213,7 @@ static void (*bpf_ringbuf_submit)(void *data, __u64 flags) = (void *) 132; * Returns * Nothing. Always succeeds. */ -static void (*bpf_ringbuf_discard)(void *data, __u64 flags) = (void *) 133; +static void (* const bpf_ringbuf_discard)(void *data, __u64 flags) = (void *) 133; /* * bpf_ringbuf_query @@ -3186,7 +3234,7 @@ static void (*bpf_ringbuf_discard)(void *data, __u64 flags) = (void *) 133; * Returns * Requested value, or 0, if *flags* are not recognized. */ -static __u64 (*bpf_ringbuf_query)(void *ringbuf, __u64 flags) = (void *) 134; +static __u64 (* const bpf_ringbuf_query)(void *ringbuf, __u64 flags) = (void *) 134; /* * bpf_csum_level @@ -3222,7 +3270,7 @@ static __u64 (*bpf_ringbuf_query)(void *ringbuf, __u64 flags) = (void *) 134; * is returned or the error code -EACCES in case the skb is not * subject to CHECKSUM_UNNECESSARY. */ -static long (*bpf_csum_level)(struct __sk_buff *skb, __u64 level) = (void *) 135; +static long (* const bpf_csum_level)(struct __sk_buff *skb, __u64 level) = (void *) 135; /* * bpf_skc_to_tcp6_sock @@ -3232,7 +3280,7 @@ static long (*bpf_csum_level)(struct __sk_buff *skb, __u64 level) = (void *) 135 * Returns * *sk* if casting is valid, or **NULL** otherwise. */ -static struct tcp6_sock *(*bpf_skc_to_tcp6_sock)(void *sk) = (void *) 136; +static struct tcp6_sock *(* const bpf_skc_to_tcp6_sock)(void *sk) = (void *) 136; /* * bpf_skc_to_tcp_sock @@ -3242,7 +3290,7 @@ static struct tcp6_sock *(*bpf_skc_to_tcp6_sock)(void *sk) = (void *) 136; * Returns * *sk* if casting is valid, or **NULL** otherwise. */ -static struct tcp_sock *(*bpf_skc_to_tcp_sock)(void *sk) = (void *) 137; +static struct tcp_sock *(* const bpf_skc_to_tcp_sock)(void *sk) = (void *) 137; /* * bpf_skc_to_tcp_timewait_sock @@ -3252,7 +3300,7 @@ static struct tcp_sock *(*bpf_skc_to_tcp_sock)(void *sk) = (void *) 137; * Returns * *sk* if casting is valid, or **NULL** otherwise. */ -static struct tcp_timewait_sock *(*bpf_skc_to_tcp_timewait_sock)(void *sk) = (void *) 138; +static struct tcp_timewait_sock *(* const bpf_skc_to_tcp_timewait_sock)(void *sk) = (void *) 138; /* * bpf_skc_to_tcp_request_sock @@ -3262,7 +3310,7 @@ static struct tcp_timewait_sock *(*bpf_skc_to_tcp_timewait_sock)(void *sk) = (vo * Returns * *sk* if casting is valid, or **NULL** otherwise. */ -static struct tcp_request_sock *(*bpf_skc_to_tcp_request_sock)(void *sk) = (void *) 139; +static struct tcp_request_sock *(* const bpf_skc_to_tcp_request_sock)(void *sk) = (void *) 139; /* * bpf_skc_to_udp6_sock @@ -3272,12 +3320,14 @@ static struct tcp_request_sock *(*bpf_skc_to_tcp_request_sock)(void *sk) = (void * Returns * *sk* if casting is valid, or **NULL** otherwise. */ -static struct udp6_sock *(*bpf_skc_to_udp6_sock)(void *sk) = (void *) 140; +static struct udp6_sock *(* const bpf_skc_to_udp6_sock)(void *sk) = (void *) 140; /* * bpf_get_task_stack * * Return a user or a kernel stack in bpf program provided buffer. + * Note: the user stack will only be populated if the *task* is + * the current task; all other tasks will return -EOPNOTSUPP. * To achieve this, the helper needs *task*, which is a valid * pointer to **struct task_struct**. To store the stacktrace, the * bpf program provides *buf* with a nonnegative *size*. @@ -3289,6 +3339,7 @@ static struct udp6_sock *(*bpf_skc_to_udp6_sock)(void *sk) = (void *) 140; * * **BPF_F_USER_STACK** * Collect a user space stack instead of a kernel stack. + * The *task* must be the current task. * **BPF_F_USER_BUILD_ID** * Collect buildid+offset instead of ips for user stack, * only valid if **BPF_F_USER_STACK** is also specified. @@ -3308,7 +3359,7 @@ static struct udp6_sock *(*bpf_skc_to_udp6_sock)(void *sk) = (void *) 140; * The non-negative copied *buf* length equal to or less than * *size* on success, or a negative error in case of failure. */ -static long (*bpf_get_task_stack)(struct task_struct *task, void *buf, __u32 size, __u64 flags) = (void *) 141; +static long (* const bpf_get_task_stack)(struct task_struct *task, void *buf, __u32 size, __u64 flags) = (void *) 141; /* * bpf_load_hdr_opt @@ -3375,7 +3426,7 @@ static long (*bpf_get_task_stack)(struct task_struct *task, void *buf, __u32 siz * **-EPERM** if the helper cannot be used under the current * *skops*\ **->op**. */ -static long (*bpf_load_hdr_opt)(struct bpf_sock_ops *skops, void *searchby_res, __u32 len, __u64 flags) = (void *) 142; +static long (* const bpf_load_hdr_opt)(struct bpf_sock_ops *skops, void *searchby_res, __u32 len, __u64 flags) = (void *) 142; /* * bpf_store_hdr_opt @@ -3407,12 +3458,12 @@ static long (*bpf_load_hdr_opt)(struct bpf_sock_ops *skops, void *searchby_res, * * **-EEXIST** if the option already exists. * - * **-EFAULT** on failrue to parse the existing header options. + * **-EFAULT** on failure to parse the existing header options. * * **-EPERM** if the helper cannot be used under the current * *skops*\ **->op**. */ -static long (*bpf_store_hdr_opt)(struct bpf_sock_ops *skops, const void *from, __u32 len, __u64 flags) = (void *) 143; +static long (* const bpf_store_hdr_opt)(struct bpf_sock_ops *skops, const void *from, __u32 len, __u64 flags) = (void *) 143; /* * bpf_reserve_hdr_opt @@ -3438,7 +3489,7 @@ static long (*bpf_store_hdr_opt)(struct bpf_sock_ops *skops, const void *from, _ * **-EPERM** if the helper cannot be used under the current * *skops*\ **->op**. */ -static long (*bpf_reserve_hdr_opt)(struct bpf_sock_ops *skops, __u32 len, __u64 flags) = (void *) 144; +static long (* const bpf_reserve_hdr_opt)(struct bpf_sock_ops *skops, __u32 len, __u64 flags) = (void *) 144; /* * bpf_inode_storage_get @@ -3470,7 +3521,7 @@ static long (*bpf_reserve_hdr_opt)(struct bpf_sock_ops *skops, __u32 len, __u64 * **NULL** if not found or there was an error in adding * a new bpf_local_storage. */ -static void *(*bpf_inode_storage_get)(void *map, void *inode, void *value, __u64 flags) = (void *) 145; +static void *(* const bpf_inode_storage_get)(void *map, void *inode, void *value, __u64 flags) = (void *) 145; /* * bpf_inode_storage_delete @@ -3482,7 +3533,7 @@ static void *(*bpf_inode_storage_get)(void *map, void *inode, void *value, __u64 * * **-ENOENT** if the bpf_local_storage cannot be found. */ -static int (*bpf_inode_storage_delete)(void *map, void *inode) = (void *) 146; +static int (* const bpf_inode_storage_delete)(void *map, void *inode) = (void *) 146; /* * bpf_d_path @@ -3498,7 +3549,7 @@ static int (*bpf_inode_storage_delete)(void *map, void *inode) = (void *) 146; * including the trailing NUL character. On error, a negative * value. */ -static long (*bpf_d_path)(struct path *path, char *buf, __u32 sz) = (void *) 147; +static long (* const bpf_d_path)(struct path *path, char *buf, __u32 sz) = (void *) 147; /* * bpf_copy_from_user @@ -3509,7 +3560,7 @@ static long (*bpf_d_path)(struct path *path, char *buf, __u32 sz) = (void *) 147 * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_copy_from_user)(void *dst, __u32 size, const void *user_ptr) = (void *) 148; +static long (* const bpf_copy_from_user)(void *dst, __u32 size, const void *user_ptr) = (void *) 148; /* * bpf_snprintf_btf @@ -3549,7 +3600,7 @@ static long (*bpf_copy_from_user)(void *dst, __u32 size, const void *user_ptr) = * written if output had to be truncated due to string size), * or a negative error in cases of failure. */ -static long (*bpf_snprintf_btf)(char *str, __u32 str_size, struct btf_ptr *ptr, __u32 btf_ptr_size, __u64 flags) = (void *) 149; +static long (* const bpf_snprintf_btf)(char *str, __u32 str_size, struct btf_ptr *ptr, __u32 btf_ptr_size, __u64 flags) = (void *) 149; /* * bpf_seq_printf_btf @@ -3561,7 +3612,7 @@ static long (*bpf_snprintf_btf)(char *str, __u32 str_size, struct btf_ptr *ptr, * Returns * 0 on success or a negative error in case of failure. */ -static long (*bpf_seq_printf_btf)(struct seq_file *m, struct btf_ptr *ptr, __u32 ptr_size, __u64 flags) = (void *) 150; +static long (* const bpf_seq_printf_btf)(struct seq_file *m, struct btf_ptr *ptr, __u32 ptr_size, __u64 flags) = (void *) 150; /* * bpf_skb_cgroup_classid @@ -3574,7 +3625,7 @@ static long (*bpf_seq_printf_btf)(struct seq_file *m, struct btf_ptr *ptr, __u32 * Returns * The id is returned or 0 in case the id could not be retrieved. */ -static __u64 (*bpf_skb_cgroup_classid)(struct __sk_buff *skb) = (void *) 151; +static __u64 (* const bpf_skb_cgroup_classid)(struct __sk_buff *skb) = (void *) 151; /* * bpf_redirect_neigh @@ -3599,7 +3650,7 @@ static __u64 (*bpf_skb_cgroup_classid)(struct __sk_buff *skb) = (void *) 151; * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. */ -static long (*bpf_redirect_neigh)(__u32 ifindex, struct bpf_redir_neigh *params, int plen, __u64 flags) = (void *) 152; +static long (* const bpf_redirect_neigh)(__u32 ifindex, struct bpf_redir_neigh *params, int plen, __u64 flags) = (void *) 152; /* * bpf_per_cpu_ptr @@ -3620,7 +3671,7 @@ static long (*bpf_redirect_neigh)(__u32 ifindex, struct bpf_redir_neigh *params, * A pointer pointing to the kernel percpu variable on *cpu*, or * NULL, if *cpu* is invalid. */ -static void *(*bpf_per_cpu_ptr)(const void *percpu_ptr, __u32 cpu) = (void *) 153; +static void *(* const bpf_per_cpu_ptr)(const void *percpu_ptr, __u32 cpu) = (void *) 153; /* * bpf_this_cpu_ptr @@ -3636,7 +3687,7 @@ static void *(*bpf_per_cpu_ptr)(const void *percpu_ptr, __u32 cpu) = (void *) 15 * Returns * A pointer pointing to the kernel percpu variable on this cpu. */ -static void *(*bpf_this_cpu_ptr)(const void *percpu_ptr) = (void *) 154; +static void *(* const bpf_this_cpu_ptr)(const void *percpu_ptr) = (void *) 154; /* * bpf_redirect_peer @@ -3648,15 +3699,15 @@ static void *(*bpf_this_cpu_ptr)(const void *percpu_ptr) = (void *) 154; * going through the CPU's backlog queue. * * The *flags* argument is reserved and must be 0. The helper is - * currently only supported for tc BPF program types at the ingress - * hook and for veth device types. The peer device must reside in a - * different network namespace. + * currently only supported for tc BPF program types at the + * ingress hook and for veth and netkit target device types. The + * peer device must reside in a different network namespace. * * Returns * The helper returns **TC_ACT_REDIRECT** on success or * **TC_ACT_SHOT** on error. */ -static long (*bpf_redirect_peer)(__u32 ifindex, __u64 flags) = (void *) 155; +static long (* const bpf_redirect_peer)(__u32 ifindex, __u64 flags) = (void *) 155; /* * bpf_task_storage_get @@ -3667,7 +3718,7 @@ static long (*bpf_redirect_peer)(__u32 ifindex, __u64 flags) = (void *) 155; * a *map* with *task* as the **key**. From this * perspective, the usage is not much different from * **bpf_map_lookup_elem**\ (*map*, **&**\ *task*) except this - * helper enforces the key must be an task_struct and the map must also + * helper enforces the key must be a task_struct and the map must also * be a **BPF_MAP_TYPE_TASK_STORAGE**. * * Underneath, the value is stored locally at *task* instead of @@ -3688,7 +3739,7 @@ static long (*bpf_redirect_peer)(__u32 ifindex, __u64 flags) = (void *) 155; * **NULL** if not found or there was an error in adding * a new bpf_local_storage. */ -static void *(*bpf_task_storage_get)(void *map, struct task_struct *task, void *value, __u64 flags) = (void *) 156; +static void *(* const bpf_task_storage_get)(void *map, struct task_struct *task, void *value, __u64 flags) = (void *) 156; /* * bpf_task_storage_delete @@ -3700,7 +3751,7 @@ static void *(*bpf_task_storage_get)(void *map, struct task_struct *task, void * * * **-ENOENT** if the bpf_local_storage cannot be found. */ -static long (*bpf_task_storage_delete)(void *map, struct task_struct *task) = (void *) 157; +static long (* const bpf_task_storage_delete)(void *map, struct task_struct *task) = (void *) 157; /* * bpf_get_current_task_btf @@ -3712,7 +3763,7 @@ static long (*bpf_task_storage_delete)(void *map, struct task_struct *task) = (v * Returns * Pointer to the current task. */ -static struct task_struct *(*bpf_get_current_task_btf)(void) = (void *) 158; +static struct task_struct *(* const bpf_get_current_task_btf)(void) = (void *) 158; /* * bpf_bprm_opts_set @@ -3726,7 +3777,7 @@ static struct task_struct *(*bpf_get_current_task_btf)(void) = (void *) 158; * Returns * **-EINVAL** if invalid *flags* are passed, zero otherwise. */ -static long (*bpf_bprm_opts_set)(struct linux_binprm *bprm, __u64 flags) = (void *) 159; +static long (* const bpf_bprm_opts_set)(struct linux_binprm *bprm, __u64 flags) = (void *) 159; /* * bpf_ktime_get_coarse_ns @@ -3740,21 +3791,21 @@ static long (*bpf_bprm_opts_set)(struct linux_binprm *bprm, __u64 flags) = (void * Returns * Current *ktime*. */ -static __u64 (*bpf_ktime_get_coarse_ns)(void) = (void *) 160; +static __u64 (* const bpf_ktime_get_coarse_ns)(void) = (void *) 160; /* * bpf_ima_inode_hash * - * Returns the stored IMA hash of the *inode* (if it's avaialable). + * Returns the stored IMA hash of the *inode* (if it's available). * If the hash is larger than *size*, then only *size* * bytes will be copied to *dst* * * Returns * The **hash_algo** is returned on success, - * **-EOPNOTSUP** if IMA is disabled or **-EINVAL** if + * **-EOPNOTSUPP** if IMA is disabled or **-EINVAL** if * invalid arguments are passed. */ -static long (*bpf_ima_inode_hash)(struct inode *inode, void *dst, __u32 size) = (void *) 161; +static long (* const bpf_ima_inode_hash)(struct inode *inode, void *dst, __u32 size) = (void *) 161; /* * bpf_sock_from_file @@ -3766,7 +3817,7 @@ static long (*bpf_ima_inode_hash)(struct inode *inode, void *dst, __u32 size) = * A pointer to a struct socket on success or NULL if the file is * not a socket. */ -static struct socket *(*bpf_sock_from_file)(struct file *file) = (void *) 162; +static struct socket *(* const bpf_sock_from_file)(struct file *file) = (void *) 162; /* * bpf_check_mtu @@ -3777,12 +3828,12 @@ static struct socket *(*bpf_sock_from_file)(struct file *file) = (void *) 162; * * The argument *len_diff* can be used for querying with a planned * size change. This allows to check MTU prior to changing packet - * ctx. Providing an *len_diff* adjustment that is larger than the + * ctx. Providing a *len_diff* adjustment that is larger than the * actual packet size (resulting in negative packet size) will in - * principle not exceed the MTU, why it is not considered a - * failure. Other BPF-helpers are needed for performing the - * planned size change, why the responsability for catch a negative - * packet size belong in those helpers. + * principle not exceed the MTU, which is why it is not considered + * a failure. Other BPF helpers are needed for performing the + * planned size change; therefore the responsibility for catching + * a negative packet size belongs in those helpers. * * Specifying *ifindex* zero means the MTU check is performed * against the current net device. This is practical if this isn't @@ -3837,7 +3888,7 @@ static struct socket *(*bpf_sock_from_file)(struct file *file) = (void *) 162; * * **BPF_MTU_CHK_RET_FRAG_NEEDED** * * **BPF_MTU_CHK_RET_SEGS_TOOBIG** */ -static long (*bpf_check_mtu)(void *ctx, __u32 ifindex, __u32 *mtu_len, __s32 len_diff, __u64 flags) = (void *) 163; +static long (* const bpf_check_mtu)(void *ctx, __u32 ifindex, __u32 *mtu_len, __s32 len_diff, __u64 flags) = (void *) 163; /* * bpf_for_each_map_elem @@ -3870,7 +3921,7 @@ static long (*bpf_check_mtu)(void *ctx, __u32 ifindex, __u32 *mtu_len, __s32 len * The number of traversed map elements for success, **-EINVAL** for * invalid **flags**. */ -static long (*bpf_for_each_map_elem)(void *map, void *callback_fn, void *callback_ctx, __u64 flags) = (void *) 164; +static long (* const bpf_for_each_map_elem)(void *map, void *callback_fn, void *callback_ctx, __u64 flags) = (void *) 164; /* * bpf_snprintf @@ -3902,7 +3953,7 @@ static long (*bpf_for_each_map_elem)(void *map, void *callback_fn, void *callbac * * Or **-EBUSY** if the per-CPU memory copy buffer is busy. */ -static long (*bpf_snprintf)(char *str, __u32 str_size, const char *fmt, __u64 *data, __u32 data_len) = (void *) 165; +static long (* const bpf_snprintf)(char *str, __u32 str_size, const char *fmt, __u64 *data, __u32 data_len) = (void *) 165; /* * bpf_sys_bpf @@ -3912,7 +3963,7 @@ static long (*bpf_snprintf)(char *str, __u32 str_size, const char *fmt, __u64 *d * Returns * A syscall result. */ -static long (*bpf_sys_bpf)(__u32 cmd, void *attr, __u32 attr_size) = (void *) 166; +static long (* const bpf_sys_bpf)(__u32 cmd, void *attr, __u32 attr_size) = (void *) 166; /* * bpf_btf_find_by_name_kind @@ -3922,7 +3973,7 @@ static long (*bpf_sys_bpf)(__u32 cmd, void *attr, __u32 attr_size) = (void *) 16 * Returns * Returns btf_id and btf_obj_fd in lower and upper 32 bits. */ -static long (*bpf_btf_find_by_name_kind)(char *name, int name_sz, __u32 kind, int flags) = (void *) 167; +static long (* const bpf_btf_find_by_name_kind)(char *name, int name_sz, __u32 kind, int flags) = (void *) 167; /* * bpf_sys_close @@ -3932,7 +3983,7 @@ static long (*bpf_btf_find_by_name_kind)(char *name, int name_sz, __u32 kind, in * Returns * A syscall result. */ -static long (*bpf_sys_close)(__u32 fd) = (void *) 168; +static long (* const bpf_sys_close)(__u32 fd) = (void *) 168; /* * bpf_timer_init @@ -3953,7 +4004,7 @@ static long (*bpf_sys_close)(__u32 fd) = (void *) 168; * or pin such map in bpffs. When map is unpinned or file descriptor is * closed all timers in the map will be cancelled and freed. */ -static long (*bpf_timer_init)(struct bpf_timer *timer, void *map, __u64 flags) = (void *) 169; +static long (* const bpf_timer_init)(struct bpf_timer *timer, void *map, __u64 flags) = (void *) 169; /* * bpf_timer_set_callback @@ -3968,7 +4019,7 @@ static long (*bpf_timer_init)(struct bpf_timer *timer, void *map, __u64 flags) = * or pin such map in bpffs. When map is unpinned or file descriptor is * closed all timers in the map will be cancelled and freed. */ -static long (*bpf_timer_set_callback)(struct bpf_timer *timer, void *callback_fn) = (void *) 170; +static long (* const bpf_timer_set_callback)(struct bpf_timer *timer, void *callback_fn) = (void *) 170; /* * bpf_timer_start @@ -3992,13 +4043,21 @@ static long (*bpf_timer_set_callback)(struct bpf_timer *timer, void *callback_fn * different maps if key/value layout matches across maps. * Every bpf_timer_set_callback() can have different callback_fn. * + * *flags* can be one of: + * + * **BPF_F_TIMER_ABS** + * Start the timer in absolute expire value instead of the + * default relative one. + * **BPF_F_TIMER_CPU_PIN** + * Timer will be pinned to the CPU of the caller. + * * * Returns * 0 on success. * **-EINVAL** if *timer* was not initialized with bpf_timer_init() earlier * or invalid *flags* are passed. */ -static long (*bpf_timer_start)(struct bpf_timer *timer, __u64 nsecs, __u64 flags) = (void *) 171; +static long (* const bpf_timer_start)(struct bpf_timer *timer, __u64 nsecs, __u64 flags) = (void *) 171; /* * bpf_timer_cancel @@ -4012,17 +4071,23 @@ static long (*bpf_timer_start)(struct bpf_timer *timer, __u64 nsecs, __u64 flags * **-EDEADLK** if callback_fn tried to call bpf_timer_cancel() on its * own timer which would have led to a deadlock otherwise. */ -static long (*bpf_timer_cancel)(struct bpf_timer *timer) = (void *) 172; +static long (* const bpf_timer_cancel)(struct bpf_timer *timer) = (void *) 172; /* * bpf_get_func_ip * * Get address of the traced function (for tracing and kprobe programs). * + * When called for kprobe program attached as uprobe it returns + * probe address for both entry and return uprobe. + * + * * Returns - * Address of the traced function. + * Address of the traced function for kprobe. + * 0 for kprobes placed within the function (not at the entry). + * Address of the probe for uprobe and return uprobe. */ -static __u64 (*bpf_get_func_ip)(void *ctx) = (void *) 173; +static __u64 (* const bpf_get_func_ip)(void *ctx) = (void *) 173; /* * bpf_get_attach_cookie @@ -4041,7 +4106,7 @@ static __u64 (*bpf_get_func_ip)(void *ctx) = (void *) 173; * Value specified by user at BPF link creation/attachment time * or 0, if it was not specified. */ -static __u64 (*bpf_get_attach_cookie)(void *ctx) = (void *) 174; +static __u64 (* const bpf_get_attach_cookie)(void *ctx) = (void *) 174; /* * bpf_task_pt_regs @@ -4051,7 +4116,7 @@ static __u64 (*bpf_get_attach_cookie)(void *ctx) = (void *) 174; * Returns * A pointer to struct pt_regs. */ -static long (*bpf_task_pt_regs)(struct task_struct *task) = (void *) 175; +static long (* const bpf_task_pt_regs)(struct task_struct *task) = (void *) 175; /* * bpf_get_branch_snapshot @@ -4076,7 +4141,7 @@ static long (*bpf_task_pt_regs)(struct task_struct *task) = (void *) 175; * * **-ENOENT** if architecture does not support branch records. */ -static long (*bpf_get_branch_snapshot)(void *entries, __u32 size, __u64 flags) = (void *) 176; +static long (* const bpf_get_branch_snapshot)(void *entries, __u32 size, __u64 flags) = (void *) 176; /* * bpf_trace_vprintk @@ -4090,7 +4155,7 @@ static long (*bpf_get_branch_snapshot)(void *entries, __u32 size, __u64 flags) = * The number of bytes written to the buffer, or a negative error * in case of failure. */ -static long (*bpf_trace_vprintk)(const char *fmt, __u32 fmt_size, const void *data, __u32 data_len) = (void *) 177; +static long (* const bpf_trace_vprintk)(const char *fmt, __u32 fmt_size, const void *data, __u32 data_len) = (void *) 177; /* * bpf_skc_to_unix_sock @@ -4100,7 +4165,7 @@ static long (*bpf_trace_vprintk)(const char *fmt, __u32 fmt_size, const void *da * Returns * *sk* if casting is valid, or **NULL** otherwise. */ -static struct unix_sock *(*bpf_skc_to_unix_sock)(void *sk) = (void *) 178; +static struct unix_sock *(* const bpf_skc_to_unix_sock)(void *sk) = (void *) 178; /* * bpf_kallsyms_lookup_name @@ -4119,7 +4184,7 @@ static struct unix_sock *(*bpf_skc_to_unix_sock)(void *sk) = (void *) 178; * * **-EPERM** if caller does not have permission to obtain kernel address. */ -static long (*bpf_kallsyms_lookup_name)(const char *name, int name_sz, int flags, __u64 *res) = (void *) 179; +static long (* const bpf_kallsyms_lookup_name)(const char *name, int name_sz, int flags, __u64 *res) = (void *) 179; /* * bpf_find_vma @@ -4142,7 +4207,7 @@ static long (*bpf_kallsyms_lookup_name)(const char *name, int name_sz, int flags * **-EBUSY** if failed to try lock mmap_lock. * **-EINVAL** for invalid **flags**. */ -static long (*bpf_find_vma)(struct task_struct *task, __u64 addr, void *callback_fn, void *callback_ctx, __u64 flags) = (void *) 180; +static long (* const bpf_find_vma)(struct task_struct *task, __u64 addr, void *callback_fn, void *callback_ctx, __u64 flags) = (void *) 180; /* * bpf_loop @@ -4170,7 +4235,7 @@ static long (*bpf_find_vma)(struct task_struct *task, __u64 addr, void *callback * The number of loops performed, **-EINVAL** for invalid **flags**, * **-E2BIG** if **nr_loops** exceeds the maximum number of loops. */ -static long (*bpf_loop)(__u32 nr_loops, void *callback_fn, void *callback_ctx, __u64 flags) = (void *) 181; +static long (* const bpf_loop)(__u32 nr_loops, void *callback_fn, void *callback_ctx, __u64 flags) = (void *) 181; /* * bpf_strncmp @@ -4184,20 +4249,20 @@ static long (*bpf_loop)(__u32 nr_loops, void *callback_fn, void *callback_ctx, _ * if the first **s1_sz** bytes of **s1** is found to be * less than, to match, or be greater than **s2**. */ -static long (*bpf_strncmp)(const char *s1, __u32 s1_sz, const char *s2) = (void *) 182; +static long (* const bpf_strncmp)(const char *s1, __u32 s1_sz, const char *s2) = (void *) 182; /* * bpf_get_func_arg * - * Get **n**-th argument (zero based) of the traced function (for tracing programs) + * Get **n**-th argument register (zero based) of the traced function (for tracing programs) * returned in **value**. * * * Returns * 0 on success. - * **-EINVAL** if n >= arguments count of traced function. + * **-EINVAL** if n >= argument register count of traced function. */ -static long (*bpf_get_func_arg)(void *ctx, __u32 n, __u64 *value) = (void *) 183; +static long (* const bpf_get_func_arg)(void *ctx, __u32 n, __u64 *value) = (void *) 183; /* * bpf_get_func_ret @@ -4210,42 +4275,55 @@ static long (*bpf_get_func_arg)(void *ctx, __u32 n, __u64 *value) = (void *) 183 * 0 on success. * **-EOPNOTSUPP** for tracing programs other than BPF_TRACE_FEXIT or BPF_MODIFY_RETURN. */ -static long (*bpf_get_func_ret)(void *ctx, __u64 *value) = (void *) 184; +static long (* const bpf_get_func_ret)(void *ctx, __u64 *value) = (void *) 184; /* * bpf_get_func_arg_cnt * - * Get number of arguments of the traced function (for tracing programs). + * Get number of registers of the traced function (for tracing programs) where + * function arguments are stored in these registers. * * * Returns - * The number of arguments of the traced function. + * The number of argument registers of the traced function. */ -static long (*bpf_get_func_arg_cnt)(void *ctx) = (void *) 185; +static long (* const bpf_get_func_arg_cnt)(void *ctx) = (void *) 185; /* * bpf_get_retval * - * Get the syscall's return value that will be returned to userspace. + * Get the BPF program's return value that will be returned to the upper layers. * - * This helper is currently supported by cgroup programs only. + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. * * Returns - * The syscall's return value. + * The BPF program's return value. */ -static int (*bpf_get_retval)(void) = (void *) 186; +static int (* const bpf_get_retval)(void) = (void *) 186; /* * bpf_set_retval * - * Set the syscall's return value that will be returned to userspace. + * Set the BPF program's return value that will be returned to the upper layers. + * + * This helper is currently supported by cgroup programs and only by the hooks + * where BPF program's return value is returned to the userspace via errno. + * + * Note that there is the following corner case where the program exports an error + * via bpf_set_retval but signals success via 'return 1': + * + * bpf_set_retval(-EPERM); + * return 1; + * + * In this case, the BPF program's return value will use helper's -EPERM. This + * still holds true for cgroup/bind{4,6} which supports extra 'return 3' success case. * - * This helper is currently supported by cgroup programs only. * * Returns * 0 on success, or a negative error in case of failure. */ -static int (*bpf_set_retval)(int retval) = (void *) 187; +static int (* const bpf_set_retval)(int retval) = (void *) 187; /* * bpf_xdp_get_buff_len @@ -4255,7 +4333,7 @@ static int (*bpf_set_retval)(int retval) = (void *) 187; * Returns * The total size of a given xdp buffer. */ -static __u64 (*bpf_xdp_get_buff_len)(struct xdp_md *xdp_md) = (void *) 188; +static __u64 (* const bpf_xdp_get_buff_len)(struct xdp_md *xdp_md) = (void *) 188; /* * bpf_xdp_load_bytes @@ -4268,7 +4346,7 @@ static __u64 (*bpf_xdp_get_buff_len)(struct xdp_md *xdp_md) = (void *) 188; * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_xdp_load_bytes)(struct xdp_md *xdp_md, __u32 offset, void *buf, __u32 len) = (void *) 189; +static long (* const bpf_xdp_load_bytes)(struct xdp_md *xdp_md, __u32 offset, void *buf, __u32 len) = (void *) 189; /* * bpf_xdp_store_bytes @@ -4279,7 +4357,7 @@ static long (*bpf_xdp_load_bytes)(struct xdp_md *xdp_md, __u32 offset, void *buf * Returns * 0 on success, or a negative error in case of failure. */ -static long (*bpf_xdp_store_bytes)(struct xdp_md *xdp_md, __u32 offset, void *buf, __u32 len) = (void *) 190; +static long (* const bpf_xdp_store_bytes)(struct xdp_md *xdp_md, __u32 offset, void *buf, __u32 len) = (void *) 190; /* * bpf_copy_from_user_task @@ -4293,7 +4371,7 @@ static long (*bpf_xdp_store_bytes)(struct xdp_md *xdp_md, __u32 offset, void *bu * 0 on success, or a negative error in case of failure. On error * *dst* buffer is zeroed out. */ -static long (*bpf_copy_from_user_task)(void *dst, __u32 size, const void *user_ptr, struct task_struct *tsk, __u64 flags) = (void *) 191; +static long (* const bpf_copy_from_user_task)(void *dst, __u32 size, const void *user_ptr, struct task_struct *tsk, __u64 flags) = (void *) 191; /* * bpf_skb_set_tstamp @@ -4327,7 +4405,7 @@ static long (*bpf_copy_from_user_task)(void *dst, __u32 size, const void *user_p * **-EINVAL** for invalid input * **-EOPNOTSUPP** for unsupported protocol */ -static long (*bpf_skb_set_tstamp)(struct __sk_buff *skb, __u64 tstamp, __u32 tstamp_type) = (void *) 192; +static long (* const bpf_skb_set_tstamp)(struct __sk_buff *skb, __u64 tstamp, __u32 tstamp_type) = (void *) 192; /* * bpf_ima_file_hash @@ -4338,10 +4416,10 @@ static long (*bpf_skb_set_tstamp)(struct __sk_buff *skb, __u64 tstamp, __u32 tst * * Returns * The **hash_algo** is returned on success, - * **-EOPNOTSUP** if the hash calculation failed or **-EINVAL** if + * **-EOPNOTSUPP** if the hash calculation failed or **-EINVAL** if * invalid arguments are passed. */ -static long (*bpf_ima_file_hash)(struct file *file, void *dst, __u32 size) = (void *) 193; +static long (* const bpf_ima_file_hash)(struct file *file, void *dst, __u32 size) = (void *) 193; /* * bpf_kptr_xchg @@ -4356,6 +4434,337 @@ static long (*bpf_ima_file_hash)(struct file *file, void *dst, __u32 size) = (vo * corresponding release function, or moved into a BPF map before * program exit. */ -static void *(*bpf_kptr_xchg)(void *map_value, void *ptr) = (void *) 194; +static void *(* const bpf_kptr_xchg)(void *map_value, void *ptr) = (void *) 194; + +/* + * bpf_map_lookup_percpu_elem + * + * Perform a lookup in *percpu map* for an entry associated to + * *key* on *cpu*. + * + * Returns + * Map value associated to *key* on *cpu*, or **NULL** if no entry + * was found or *cpu* is invalid. + */ +static void *(* const bpf_map_lookup_percpu_elem)(void *map, const void *key, __u32 cpu) = (void *) 195; + +/* + * bpf_skc_to_mptcp_sock + * + * Dynamically cast a *sk* pointer to a *mptcp_sock* pointer. + * + * Returns + * *sk* if casting is valid, or **NULL** otherwise. + */ +static struct mptcp_sock *(* const bpf_skc_to_mptcp_sock)(void *sk) = (void *) 196; + +/* + * bpf_dynptr_from_mem + * + * Get a dynptr to local memory *data*. + * + * *data* must be a ptr to a map value. + * The maximum *size* supported is DYNPTR_MAX_SIZE. + * *flags* is currently unused. + * + * Returns + * 0 on success, -E2BIG if the size exceeds DYNPTR_MAX_SIZE, + * -EINVAL if flags is not 0. + */ +static long (* const bpf_dynptr_from_mem)(void *data, __u32 size, __u64 flags, struct bpf_dynptr *ptr) = (void *) 197; + +/* + * bpf_ringbuf_reserve_dynptr + * + * Reserve *size* bytes of payload in a ring buffer *ringbuf* + * through the dynptr interface. *flags* must be 0. + * + * Please note that a corresponding bpf_ringbuf_submit_dynptr or + * bpf_ringbuf_discard_dynptr must be called on *ptr*, even if the + * reservation fails. This is enforced by the verifier. + * + * Returns + * 0 on success, or a negative error in case of failure. + */ +static long (* const bpf_ringbuf_reserve_dynptr)(void *ringbuf, __u32 size, __u64 flags, struct bpf_dynptr *ptr) = (void *) 198; + +/* + * bpf_ringbuf_submit_dynptr + * + * Submit reserved ring buffer sample, pointed to by *data*, + * through the dynptr interface. This is a no-op if the dynptr is + * invalid/null. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_submit'. + * + * Returns + * Nothing. Always succeeds. + */ +static void (* const bpf_ringbuf_submit_dynptr)(struct bpf_dynptr *ptr, __u64 flags) = (void *) 199; + +/* + * bpf_ringbuf_discard_dynptr + * + * Discard reserved ring buffer sample through the dynptr + * interface. This is a no-op if the dynptr is invalid/null. + * + * For more information on *flags*, please see + * 'bpf_ringbuf_discard'. + * + * Returns + * Nothing. Always succeeds. + */ +static void (* const bpf_ringbuf_discard_dynptr)(struct bpf_dynptr *ptr, __u64 flags) = (void *) 200; + +/* + * bpf_dynptr_read + * + * Read *len* bytes from *src* into *dst*, starting from *offset* + * into *src*. + * *flags* is currently unused. + * + * Returns + * 0 on success, -E2BIG if *offset* + *len* exceeds the length + * of *src*'s data, -EINVAL if *src* is an invalid dynptr or if + * *flags* is not 0. + */ +static long (* const bpf_dynptr_read)(void *dst, __u32 len, const struct bpf_dynptr *src, __u32 offset, __u64 flags) = (void *) 201; + +/* + * bpf_dynptr_write + * + * Write *len* bytes from *src* into *dst*, starting from *offset* + * into *dst*. + * + * *flags* must be 0 except for skb-type dynptrs. + * + * For skb-type dynptrs: + * * All data slices of the dynptr are automatically + * invalidated after **bpf_dynptr_write**\ (). This is + * because writing may pull the skb and change the + * underlying packet buffer. + * + * * For *flags*, please see the flags accepted by + * **bpf_skb_store_bytes**\ (). + * + * Returns + * 0 on success, -E2BIG if *offset* + *len* exceeds the length + * of *dst*'s data, -EINVAL if *dst* is an invalid dynptr or if *dst* + * is a read-only dynptr or if *flags* is not correct. For skb-type dynptrs, + * other errors correspond to errors returned by **bpf_skb_store_bytes**\ (). + */ +static long (* const bpf_dynptr_write)(const struct bpf_dynptr *dst, __u32 offset, void *src, __u32 len, __u64 flags) = (void *) 202; + +/* + * bpf_dynptr_data + * + * Get a pointer to the underlying dynptr data. + * + * *len* must be a statically known value. The returned data slice + * is invalidated whenever the dynptr is invalidated. + * + * skb and xdp type dynptrs may not use bpf_dynptr_data. They should + * instead use bpf_dynptr_slice and bpf_dynptr_slice_rdwr. + * + * Returns + * Pointer to the underlying dynptr data, NULL if the dynptr is + * read-only, if the dynptr is invalid, or if the offset and length + * is out of bounds. + */ +static void *(* const bpf_dynptr_data)(const struct bpf_dynptr *ptr, __u32 offset, __u32 len) = (void *) 203; + +/* + * bpf_tcp_raw_gen_syncookie_ipv4 + * + * Try to issue a SYN cookie for the packet with corresponding + * IPv4/TCP headers, *iph* and *th*, without depending on a + * listening socket. + * + * *iph* points to the IPv4 header. + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * + * Returns + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if *th_len* is invalid. + */ +static __s64 (* const bpf_tcp_raw_gen_syncookie_ipv4)(struct iphdr *iph, struct tcphdr *th, __u32 th_len) = (void *) 204; + +/* + * bpf_tcp_raw_gen_syncookie_ipv6 + * + * Try to issue a SYN cookie for the packet with corresponding + * IPv6/TCP headers, *iph* and *th*, without depending on a + * listening socket. + * + * *iph* points to the IPv6 header. + * + * *th* points to the start of the TCP header, while *th_len* + * contains the length of the TCP header (at least + * **sizeof**\ (**struct tcphdr**)). + * + * Returns + * On success, lower 32 bits hold the generated SYN cookie in + * followed by 16 bits which hold the MSS value for that cookie, + * and the top 16 bits are unused. + * + * On failure, the returned value is one of the following: + * + * **-EINVAL** if *th_len* is invalid. + * + * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + */ +static __s64 (* const bpf_tcp_raw_gen_syncookie_ipv6)(struct ipv6hdr *iph, struct tcphdr *th, __u32 th_len) = (void *) 205; + +/* + * bpf_tcp_raw_check_syncookie_ipv4 + * + * Check whether *iph* and *th* contain a valid SYN cookie ACK + * without depending on a listening socket. + * + * *iph* points to the IPv4 header. + * + * *th* points to the TCP header. + * + * Returns + * 0 if *iph* and *th* are a valid SYN cookie ACK. + * + * On failure, the returned value is one of the following: + * + * **-EACCES** if the SYN cookie is not valid. + */ +static long (* const bpf_tcp_raw_check_syncookie_ipv4)(struct iphdr *iph, struct tcphdr *th) = (void *) 206; + +/* + * bpf_tcp_raw_check_syncookie_ipv6 + * + * Check whether *iph* and *th* contain a valid SYN cookie ACK + * without depending on a listening socket. + * + * *iph* points to the IPv6 header. + * + * *th* points to the TCP header. + * + * Returns + * 0 if *iph* and *th* are a valid SYN cookie ACK. + * + * On failure, the returned value is one of the following: + * + * **-EACCES** if the SYN cookie is not valid. + * + * **-EPROTONOSUPPORT** if CONFIG_IPV6 is not builtin. + */ +static long (* const bpf_tcp_raw_check_syncookie_ipv6)(struct ipv6hdr *iph, struct tcphdr *th) = (void *) 207; + +/* + * bpf_ktime_get_tai_ns + * + * A nonsettable system-wide clock derived from wall-clock time but + * ignoring leap seconds. This clock does not experience + * discontinuities and backwards jumps caused by NTP inserting leap + * seconds as CLOCK_REALTIME does. + * + * See: **clock_gettime**\ (**CLOCK_TAI**) + * + * Returns + * Current *ktime*. + */ +static __u64 (* const bpf_ktime_get_tai_ns)(void) = (void *) 208; + +/* + * bpf_user_ringbuf_drain + * + * Drain samples from the specified user ring buffer, and invoke + * the provided callback for each such sample: + * + * long (\*callback_fn)(const struct bpf_dynptr \*dynptr, void \*ctx); + * + * If **callback_fn** returns 0, the helper will continue to try + * and drain the next sample, up to a maximum of + * BPF_MAX_USER_RINGBUF_SAMPLES samples. If the return value is 1, + * the helper will skip the rest of the samples and return. Other + * return values are not used now, and will be rejected by the + * verifier. + * + * Returns + * The number of drained samples if no error was encountered while + * draining samples, or 0 if no samples were present in the ring + * buffer. If a user-space producer was epoll-waiting on this map, + * and at least one sample was drained, they will receive an event + * notification notifying them of available space in the ring + * buffer. If the BPF_RB_NO_WAKEUP flag is passed to this + * function, no wakeup notification will be sent. If the + * BPF_RB_FORCE_WAKEUP flag is passed, a wakeup notification will + * be sent even if no sample was drained. + * + * On failure, the returned value is one of the following: + * + * **-EBUSY** if the ring buffer is contended, and another calling + * context was concurrently draining the ring buffer. + * + * **-EINVAL** if user-space is not properly tracking the ring + * buffer due to the producer position not being aligned to 8 + * bytes, a sample not being aligned to 8 bytes, or the producer + * position not matching the advertised length of a sample. + * + * **-E2BIG** if user-space has tried to publish a sample which is + * larger than the size of the ring buffer, or which cannot fit + * within a struct bpf_dynptr. + */ +static long (* const bpf_user_ringbuf_drain)(void *map, void *callback_fn, void *ctx, __u64 flags) = (void *) 209; + +/* + * bpf_cgrp_storage_get + * + * Get a bpf_local_storage from the *cgroup*. + * + * Logically, it could be thought of as getting the value from + * a *map* with *cgroup* as the **key**. From this + * perspective, the usage is not much different from + * **bpf_map_lookup_elem**\ (*map*, **&**\ *cgroup*) except this + * helper enforces the key must be a cgroup struct and the map must also + * be a **BPF_MAP_TYPE_CGRP_STORAGE**. + * + * In reality, the local-storage value is embedded directly inside of the + * *cgroup* object itself, rather than being located in the + * **BPF_MAP_TYPE_CGRP_STORAGE** map. When the local-storage value is + * queried for some *map* on a *cgroup* object, the kernel will perform an + * O(n) iteration over all of the live local-storage values for that + * *cgroup* object until the local-storage value for the *map* is found. + * + * An optional *flags* (**BPF_LOCAL_STORAGE_GET_F_CREATE**) can be + * used such that a new bpf_local_storage will be + * created if one does not exist. *value* can be used + * together with **BPF_LOCAL_STORAGE_GET_F_CREATE** to specify + * the initial value of a bpf_local_storage. If *value* is + * **NULL**, the new bpf_local_storage will be zero initialized. + * + * Returns + * A bpf_local_storage pointer is returned on success. + * + * **NULL** if not found or there was an error in adding + * a new bpf_local_storage. + */ +static void *(* const bpf_cgrp_storage_get)(void *map, struct cgroup *cgroup, void *value, __u64 flags) = (void *) 210; + +/* + * bpf_cgrp_storage_delete + * + * Delete a bpf_local_storage from a *cgroup*. + * + * Returns + * 0 on success. + * + * **-ENOENT** if the bpf_local_storage cannot be found. + */ +static long (* const bpf_cgrp_storage_delete)(void *map, struct cgroup *cgroup) = (void *) 211; diff --git a/pkg/plugin/lib/common/libbpf/_src/bpf_helpers.h b/pkg/plugin/lib/common/libbpf/_src/bpf_helpers.h index 5de3eb2671..305c62817d 100644 --- a/pkg/plugin/lib/common/libbpf/_src/bpf_helpers.h +++ b/pkg/plugin/lib/common/libbpf/_src/bpf_helpers.h @@ -13,6 +13,7 @@ #define __uint(name, val) int (*name)[val] #define __type(name, val) typeof(val) *name #define __array(name, val) typeof(val) *name[] +#define __ulong(name, val) enum { ___bpf_concat(__unique_value, __COUNTER__) = val } name /* * Helper macro to place programs, maps, license in @@ -22,12 +23,25 @@ * To allow use of SEC() with externs (e.g., for extern .maps declarations), * make sure __attribute__((unused)) doesn't trigger compilation warning. */ +#if __GNUC__ && !__clang__ + +/* + * Pragma macros are broken on GCC + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=55578 + * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=90400 + */ +#define SEC(name) __attribute__((section(name), used)) + +#else + #define SEC(name) \ _Pragma("GCC diagnostic push") \ _Pragma("GCC diagnostic ignored \"-Wignored-attributes\"") \ __attribute__((section(name), used)) \ _Pragma("GCC diagnostic pop") \ +#endif + /* Avoid 'linux/stddef.h' definition of '__always_inline'. */ #undef __always_inline #define __always_inline inline __attribute__((always_inline)) @@ -64,15 +78,44 @@ /* * Helper macros to manipulate data structures */ -#ifndef offsetof -#define offsetof(TYPE, MEMBER) ((unsigned long)&((TYPE *)0)->MEMBER) -#endif -#ifndef container_of + +/* offsetof() definition that uses __builtin_offset() might not preserve field + * offset CO-RE relocation properly, so force-redefine offsetof() using + * old-school approach which works with CO-RE correctly + */ +#undef offsetof +#define offsetof(type, member) ((unsigned long)&((type *)0)->member) + +/* redefined container_of() to ensure we use the above offsetof() macro */ +#undef container_of #define container_of(ptr, type, member) \ ({ \ void *__mptr = (void *)(ptr); \ ((type *)(__mptr - offsetof(type, member))); \ }) + +/* + * Compiler (optimization) barrier. + */ +#ifndef barrier +#define barrier() asm volatile("" ::: "memory") +#endif + +/* Variable-specific compiler (optimization) barrier. It's a no-op which makes + * compiler believe that there is some black box modification of a given + * variable and thus prevents compiler from making extra assumption about its + * value and potential simplifications and optimizations on this variable. + * + * E.g., compiler might often delay or even omit 32-bit to 64-bit casting of + * a variable, making some code patterns unverifiable. Putting barrier_var() + * in place will ensure that cast is performed before the barrier_var() + * invocation, because compiler has to pessimistically assume that embedded + * asm section might perform some extra operations on that variable. + * + * This is a variable-specific variant of more global barrier(). + */ +#ifndef barrier_var +#define barrier_var(var) asm volatile("" : "+r"(var)) #endif /* @@ -94,7 +137,8 @@ /* * Helper function to perform a tail call with a constant/immediate map slot. */ -#if __clang_major__ >= 8 && defined(__bpf__) +#if (defined(__clang__) && __clang_major__ >= 8) || (!defined(__clang__) && __GNUC__ > 12) +#if defined(__bpf__) static __always_inline void bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) { @@ -122,18 +166,7 @@ bpf_tail_call_static(void *ctx, const void *map, const __u32 slot) : "r0", "r1", "r2", "r3", "r4", "r5"); } #endif - -/* - * Helper structure used by eBPF C program - * to describe BPF map attributes to libbpf loader - */ -struct bpf_map_def { - unsigned int type; - unsigned int key_size; - unsigned int value_size; - unsigned int max_entries; - unsigned int map_flags; -} __attribute__((deprecated("use BTF-defined maps in .maps section"))); +#endif enum libbpf_pin_type { LIBBPF_PIN_NONE, @@ -149,14 +182,32 @@ enum libbpf_tristate { #define __kconfig __attribute__((section(".kconfig"))) #define __ksym __attribute__((section(".ksyms"))) -#if __has_attribute(btf_type_tag) +#define __kptr_untrusted __attribute__((btf_type_tag("kptr_untrusted"))) #define __kptr __attribute__((btf_type_tag("kptr"))) -#define __kptr_ref __attribute__((btf_type_tag("kptr_ref"))) +#define __percpu_kptr __attribute__((btf_type_tag("percpu_kptr"))) + +#if defined (__clang__) +#define bpf_ksym_exists(sym) ({ \ + _Static_assert(!__builtin_constant_p(!!sym), \ + #sym " should be marked as __weak"); \ + !!sym; \ +}) +#elif __GNUC__ > 8 +#define bpf_ksym_exists(sym) ({ \ + _Static_assert(__builtin_has_attribute (*sym, __weak__), \ + #sym " should be marked as __weak"); \ + !!sym; \ +}) #else -#define __kptr -#define __kptr_ref +#define bpf_ksym_exists(sym) !!sym #endif +#define __arg_ctx __attribute__((btf_decl_tag("arg:ctx"))) +#define __arg_nonnull __attribute((btf_decl_tag("arg:nonnull"))) +#define __arg_nullable __attribute((btf_decl_tag("arg:nullable"))) +#define __arg_trusted __attribute((btf_decl_tag("arg:trusted"))) +#define __arg_arena __attribute((btf_decl_tag("arg:arena"))) + #ifndef ___bpf_concat #define ___bpf_concat(a, b) a ## b #endif @@ -266,4 +317,107 @@ enum libbpf_tristate { /* Helper macro to print out debug messages */ #define bpf_printk(fmt, args...) ___bpf_pick_printk(args)(fmt, ##args) +struct bpf_iter_num; + +extern int bpf_iter_num_new(struct bpf_iter_num *it, int start, int end) __weak __ksym; +extern int *bpf_iter_num_next(struct bpf_iter_num *it) __weak __ksym; +extern void bpf_iter_num_destroy(struct bpf_iter_num *it) __weak __ksym; + +#ifndef bpf_for_each +/* bpf_for_each(iter_type, cur_elem, args...) provides generic construct for + * using BPF open-coded iterators without having to write mundane explicit + * low-level loop logic. Instead, it provides for()-like generic construct + * that can be used pretty naturally. E.g., for some hypothetical cgroup + * iterator, you'd write: + * + * struct cgroup *cg, *parent_cg = <...>; + * + * bpf_for_each(cgroup, cg, parent_cg, CG_ITER_CHILDREN) { + * bpf_printk("Child cgroup id = %d", cg->cgroup_id); + * if (cg->cgroup_id == 123) + * break; + * } + * + * I.e., it looks almost like high-level for each loop in other languages, + * supports continue/break, and is verifiable by BPF verifier. + * + * For iterating integers, the difference betwen bpf_for_each(num, i, N, M) + * and bpf_for(i, N, M) is in that bpf_for() provides additional proof to + * verifier that i is in [N, M) range, and in bpf_for_each() case i is `int + * *`, not just `int`. So for integers bpf_for() is more convenient. + * + * Note: this macro relies on C99 feature of allowing to declare variables + * inside for() loop, bound to for() loop lifetime. It also utilizes GCC + * extension: __attribute__((cleanup())), supported by both GCC and + * Clang. + */ +#define bpf_for_each(type, cur, args...) for ( \ + /* initialize and define destructor */ \ + struct bpf_iter_##type ___it __attribute__((aligned(8), /* enforce, just in case */, \ + cleanup(bpf_iter_##type##_destroy))), \ + /* ___p pointer is just to call bpf_iter_##type##_new() *once* to init ___it */ \ + *___p __attribute__((unused)) = ( \ + bpf_iter_##type##_new(&___it, ##args), \ + /* this is a workaround for Clang bug: it currently doesn't emit BTF */ \ + /* for bpf_iter_##type##_destroy() when used from cleanup() attribute */ \ + (void)bpf_iter_##type##_destroy, (void *)0); \ + /* iteration and termination check */ \ + (((cur) = bpf_iter_##type##_next(&___it))); \ +) +#endif /* bpf_for_each */ + +#ifndef bpf_for +/* bpf_for(i, start, end) implements a for()-like looping construct that sets + * provided integer variable *i* to values starting from *start* through, + * but not including, *end*. It also proves to BPF verifier that *i* belongs + * to range [start, end), so this can be used for accessing arrays without + * extra checks. + * + * Note: *start* and *end* are assumed to be expressions with no side effects + * and whose values do not change throughout bpf_for() loop execution. They do + * not have to be statically known or constant, though. + * + * Note: similarly to bpf_for_each(), it relies on C99 feature of declaring for() + * loop bound variables and cleanup attribute, supported by GCC and Clang. + */ +#define bpf_for(i, start, end) for ( \ + /* initialize and define destructor */ \ + struct bpf_iter_num ___it __attribute__((aligned(8), /* enforce, just in case */ \ + cleanup(bpf_iter_num_destroy))), \ + /* ___p pointer is necessary to call bpf_iter_num_new() *once* to init ___it */ \ + *___p __attribute__((unused)) = ( \ + bpf_iter_num_new(&___it, (start), (end)), \ + /* this is a workaround for Clang bug: it currently doesn't emit BTF */ \ + /* for bpf_iter_num_destroy() when used from cleanup() attribute */ \ + (void)bpf_iter_num_destroy, (void *)0); \ + ({ \ + /* iteration step */ \ + int *___t = bpf_iter_num_next(&___it); \ + /* termination and bounds check */ \ + (___t && ((i) = *___t, (i) >= (start) && (i) < (end))); \ + }); \ +) +#endif /* bpf_for */ + +#ifndef bpf_repeat +/* bpf_repeat(N) performs N iterations without exposing iteration number + * + * Note: similarly to bpf_for_each(), it relies on C99 feature of declaring for() + * loop bound variables and cleanup attribute, supported by GCC and Clang. + */ +#define bpf_repeat(N) for ( \ + /* initialize and define destructor */ \ + struct bpf_iter_num ___it __attribute__((aligned(8), /* enforce, just in case */ \ + cleanup(bpf_iter_num_destroy))), \ + /* ___p pointer is necessary to call bpf_iter_num_new() *once* to init ___it */ \ + *___p __attribute__((unused)) = ( \ + bpf_iter_num_new(&___it, 0, (N)), \ + /* this is a workaround for Clang bug: it currently doesn't emit BTF */ \ + /* for bpf_iter_num_destroy() when used from cleanup() attribute */ \ + (void)bpf_iter_num_destroy, (void *)0); \ + bpf_iter_num_next(&___it); \ + /* nothing here */ \ +) +#endif /* bpf_repeat */ + #endif diff --git a/pkg/plugin/lib/common/libbpf/_src/bpf_tracing.h b/pkg/plugin/lib/common/libbpf/_src/bpf_tracing.h index 01ce121c30..9314fa95f0 100644 --- a/pkg/plugin/lib/common/libbpf/_src/bpf_tracing.h +++ b/pkg/plugin/lib/common/libbpf/_src/bpf_tracing.h @@ -2,6 +2,8 @@ #ifndef __BPF_TRACING_H__ #define __BPF_TRACING_H__ +#include "bpf_helpers.h" + /* Scan the ARCH passed in from ARCH env variable (see Makefile) */ #if defined(__TARGET_ARCH_x86) #define bpf_target_x86 @@ -30,6 +32,9 @@ #elif defined(__TARGET_ARCH_arc) #define bpf_target_arc #define bpf_target_defined +#elif defined(__TARGET_ARCH_loongarch) + #define bpf_target_loongarch + #define bpf_target_defined #else /* Fall back to what the compiler says */ @@ -60,6 +65,9 @@ #elif defined(__arc__) #define bpf_target_arc #define bpf_target_defined +#elif defined(__loongarch__) + #define bpf_target_loongarch + #define bpf_target_defined #endif /* no compiler target */ #endif @@ -70,6 +78,10 @@ #if defined(bpf_target_x86) +/* + * https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI + */ + #if defined(__KERNEL__) || defined(__VMLINUX_H__) #define __PT_PARM1_REG di @@ -77,25 +89,40 @@ #define __PT_PARM3_REG dx #define __PT_PARM4_REG cx #define __PT_PARM5_REG r8 +#define __PT_PARM6_REG r9 +/* + * Syscall uses r10 for PARM4. See arch/x86/entry/entry_64.S:entry_SYSCALL_64 + * comments in Linux sources. And refer to syscall(2) manpage. + */ +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG r10 +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG + #define __PT_RET_REG sp #define __PT_FP_REG bp #define __PT_RC_REG ax #define __PT_SP_REG sp #define __PT_IP_REG ip -/* syscall uses r10 for PARM4 */ -#define PT_REGS_PARM4_SYSCALL(x) ((x)->r10) -#define PT_REGS_PARM4_CORE_SYSCALL(x) BPF_CORE_READ(x, r10) #else #ifdef __i386__ +/* i386 kernel is built with -mregparm=3 */ #define __PT_PARM1_REG eax #define __PT_PARM2_REG edx #define __PT_PARM3_REG ecx -/* i386 kernel is built with -mregparm=3 */ -#define __PT_PARM4_REG __unsupported__ -#define __PT_PARM5_REG __unsupported__ +/* i386 syscall ABI is very different, refer to syscall(2) manpage */ +#define __PT_PARM1_SYSCALL_REG ebx +#define __PT_PARM2_SYSCALL_REG ecx +#define __PT_PARM3_SYSCALL_REG edx +#define __PT_PARM4_SYSCALL_REG esi +#define __PT_PARM5_SYSCALL_REG edi +#define __PT_PARM6_SYSCALL_REG ebp + #define __PT_RET_REG esp #define __PT_FP_REG ebp #define __PT_RC_REG eax @@ -109,14 +136,20 @@ #define __PT_PARM3_REG rdx #define __PT_PARM4_REG rcx #define __PT_PARM5_REG r8 +#define __PT_PARM6_REG r9 + +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG r10 +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG + #define __PT_RET_REG rsp #define __PT_FP_REG rbp #define __PT_RC_REG rax #define __PT_SP_REG rsp #define __PT_IP_REG rip -/* syscall uses r10 for PARM4 */ -#define PT_REGS_PARM4_SYSCALL(x) ((x)->r10) -#define PT_REGS_PARM4_CORE_SYSCALL(x) BPF_CORE_READ(x, r10) #endif /* __i386__ */ @@ -124,6 +157,10 @@ #elif defined(bpf_target_s390) +/* + * https://github.com/IBM/s390x-abi/releases/download/v1.6/lzsabi_s390x.pdf + */ + struct pt_regs___s390 { unsigned long orig_gpr2; }; @@ -135,21 +172,42 @@ struct pt_regs___s390 { #define __PT_PARM3_REG gprs[4] #define __PT_PARM4_REG gprs[5] #define __PT_PARM5_REG gprs[6] -#define __PT_RET_REG grps[14] + +#define __PT_PARM1_SYSCALL_REG orig_gpr2 +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG gprs[7] +#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1_CORE_SYSCALL(x) +#define PT_REGS_PARM1_CORE_SYSCALL(x) \ + BPF_CORE_READ((const struct pt_regs___s390 *)(x), __PT_PARM1_SYSCALL_REG) + +#define __PT_RET_REG gprs[14] #define __PT_FP_REG gprs[11] /* Works only with CONFIG_FRAME_POINTER */ #define __PT_RC_REG gprs[2] #define __PT_SP_REG gprs[15] #define __PT_IP_REG psw.addr -#define PT_REGS_PARM1_SYSCALL(x) ({ _Pragma("GCC error \"use PT_REGS_PARM1_CORE_SYSCALL() instead\""); 0l; }) -#define PT_REGS_PARM1_CORE_SYSCALL(x) BPF_CORE_READ((const struct pt_regs___s390 *)(x), orig_gpr2) #elif defined(bpf_target_arm) +/* + * https://github.com/ARM-software/abi-aa/blob/main/aapcs32/aapcs32.rst#machine-registers + */ + #define __PT_PARM1_REG uregs[0] #define __PT_PARM2_REG uregs[1] #define __PT_PARM3_REG uregs[2] #define __PT_PARM4_REG uregs[3] -#define __PT_PARM5_REG uregs[4] + +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG uregs[4] +#define __PT_PARM6_SYSCALL_REG uregs[5] +#define __PT_PARM7_SYSCALL_REG uregs[6] + #define __PT_RET_REG uregs[14] #define __PT_FP_REG uregs[11] /* Works only with CONFIG_FRAME_POINTER */ #define __PT_RC_REG uregs[0] @@ -158,6 +216,10 @@ struct pt_regs___s390 { #elif defined(bpf_target_arm64) +/* + * https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst#machine-registers + */ + struct pt_regs___arm64 { unsigned long orig_x0; }; @@ -169,21 +231,49 @@ struct pt_regs___arm64 { #define __PT_PARM3_REG regs[2] #define __PT_PARM4_REG regs[3] #define __PT_PARM5_REG regs[4] +#define __PT_PARM6_REG regs[5] +#define __PT_PARM7_REG regs[6] +#define __PT_PARM8_REG regs[7] + +#define __PT_PARM1_SYSCALL_REG orig_x0 +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG +#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1_CORE_SYSCALL(x) +#define PT_REGS_PARM1_CORE_SYSCALL(x) \ + BPF_CORE_READ((const struct pt_regs___arm64 *)(x), __PT_PARM1_SYSCALL_REG) + #define __PT_RET_REG regs[30] #define __PT_FP_REG regs[29] /* Works only with CONFIG_FRAME_POINTER */ #define __PT_RC_REG regs[0] #define __PT_SP_REG sp #define __PT_IP_REG pc -#define PT_REGS_PARM1_SYSCALL(x) ({ _Pragma("GCC error \"use PT_REGS_PARM1_CORE_SYSCALL() instead\""); 0l; }) -#define PT_REGS_PARM1_CORE_SYSCALL(x) BPF_CORE_READ((const struct pt_regs___arm64 *)(x), orig_x0) #elif defined(bpf_target_mips) +/* + * N64 ABI is assumed right now. + * https://en.wikipedia.org/wiki/MIPS_architecture#Calling_conventions + */ + #define __PT_PARM1_REG regs[4] #define __PT_PARM2_REG regs[5] #define __PT_PARM3_REG regs[6] #define __PT_PARM4_REG regs[7] #define __PT_PARM5_REG regs[8] +#define __PT_PARM6_REG regs[9] +#define __PT_PARM7_REG regs[10] +#define __PT_PARM8_REG regs[11] + +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG /* only N32/N64 */ +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG /* only N32/N64 */ + #define __PT_RET_REG regs[31] #define __PT_FP_REG regs[30] /* Works only with CONFIG_FRAME_POINTER */ #define __PT_RC_REG regs[2] @@ -192,26 +282,58 @@ struct pt_regs___arm64 { #elif defined(bpf_target_powerpc) +/* + * http://refspecs.linux-foundation.org/elf/elfspec_ppc.pdf (page 3-14, + * section "Function Calling Sequence") + */ + #define __PT_PARM1_REG gpr[3] #define __PT_PARM2_REG gpr[4] #define __PT_PARM3_REG gpr[5] #define __PT_PARM4_REG gpr[6] #define __PT_PARM5_REG gpr[7] +#define __PT_PARM6_REG gpr[8] +#define __PT_PARM7_REG gpr[9] +#define __PT_PARM8_REG gpr[10] + +/* powerpc does not select ARCH_HAS_SYSCALL_WRAPPER. */ +#define PT_REGS_SYSCALL_REGS(ctx) ctx +#define __PT_PARM1_SYSCALL_REG orig_gpr3 +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG +#if !defined(__arch64__) +#define __PT_PARM7_SYSCALL_REG __PT_PARM7_REG /* only powerpc (not powerpc64) */ +#endif + #define __PT_RET_REG regs[31] #define __PT_FP_REG __unsupported__ #define __PT_RC_REG gpr[3] #define __PT_SP_REG sp #define __PT_IP_REG nip -/* powerpc does not select ARCH_HAS_SYSCALL_WRAPPER. */ -#define PT_REGS_SYSCALL_REGS(ctx) ctx #elif defined(bpf_target_sparc) +/* + * https://en.wikipedia.org/wiki/Calling_convention#SPARC + */ + #define __PT_PARM1_REG u_regs[UREG_I0] #define __PT_PARM2_REG u_regs[UREG_I1] #define __PT_PARM3_REG u_regs[UREG_I2] #define __PT_PARM4_REG u_regs[UREG_I3] #define __PT_PARM5_REG u_regs[UREG_I4] +#define __PT_PARM6_REG u_regs[UREG_I5] + +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG + #define __PT_RET_REG u_regs[UREG_I7] #define __PT_FP_REG __unsupported__ #define __PT_RC_REG u_regs[UREG_I0] @@ -225,36 +347,99 @@ struct pt_regs___arm64 { #elif defined(bpf_target_riscv) +/* + * https://github.com/riscv-non-isa/riscv-elf-psabi-doc/blob/master/riscv-cc.adoc#risc-v-calling-conventions + */ + +/* riscv provides struct user_regs_struct instead of struct pt_regs to userspace */ #define __PT_REGS_CAST(x) ((const struct user_regs_struct *)(x)) #define __PT_PARM1_REG a0 #define __PT_PARM2_REG a1 #define __PT_PARM3_REG a2 #define __PT_PARM4_REG a3 #define __PT_PARM5_REG a4 +#define __PT_PARM6_REG a5 +#define __PT_PARM7_REG a6 +#define __PT_PARM8_REG a7 + +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG + #define __PT_RET_REG ra #define __PT_FP_REG s0 -#define __PT_RC_REG a5 +#define __PT_RC_REG a0 #define __PT_SP_REG sp #define __PT_IP_REG pc -/* riscv does not select ARCH_HAS_SYSCALL_WRAPPER. */ -#define PT_REGS_SYSCALL_REGS(ctx) ctx #elif defined(bpf_target_arc) -/* arc provides struct user_pt_regs instead of struct pt_regs to userspace */ +/* + * Section "Function Calling Sequence" (page 24): + * https://raw.githubusercontent.com/wiki/foss-for-synopsys-dwc-arc-processors/toolchain/files/ARCv2_ABI.pdf + */ + +/* arc provides struct user_regs_struct instead of struct pt_regs to userspace */ #define __PT_REGS_CAST(x) ((const struct user_regs_struct *)(x)) #define __PT_PARM1_REG scratch.r0 #define __PT_PARM2_REG scratch.r1 #define __PT_PARM3_REG scratch.r2 #define __PT_PARM4_REG scratch.r3 #define __PT_PARM5_REG scratch.r4 +#define __PT_PARM6_REG scratch.r5 +#define __PT_PARM7_REG scratch.r6 +#define __PT_PARM8_REG scratch.r7 + +/* arc does not select ARCH_HAS_SYSCALL_WRAPPER. */ +#define PT_REGS_SYSCALL_REGS(ctx) ctx +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG + #define __PT_RET_REG scratch.blink -#define __PT_FP_REG __unsupported__ +#define __PT_FP_REG scratch.fp #define __PT_RC_REG scratch.r0 #define __PT_SP_REG scratch.sp #define __PT_IP_REG scratch.ret -/* arc does not select ARCH_HAS_SYSCALL_WRAPPER. */ + +#elif defined(bpf_target_loongarch) + +/* + * https://docs.kernel.org/loongarch/introduction.html + * https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-EN.html + */ + +/* loongarch provides struct user_pt_regs instead of struct pt_regs to userspace */ +#define __PT_REGS_CAST(x) ((const struct user_pt_regs *)(x)) +#define __PT_PARM1_REG regs[4] +#define __PT_PARM2_REG regs[5] +#define __PT_PARM3_REG regs[6] +#define __PT_PARM4_REG regs[7] +#define __PT_PARM5_REG regs[8] +#define __PT_PARM6_REG regs[9] +#define __PT_PARM7_REG regs[10] +#define __PT_PARM8_REG regs[11] + +/* loongarch does not select ARCH_HAS_SYSCALL_WRAPPER. */ #define PT_REGS_SYSCALL_REGS(ctx) ctx +#define __PT_PARM1_SYSCALL_REG __PT_PARM1_REG +#define __PT_PARM2_SYSCALL_REG __PT_PARM2_REG +#define __PT_PARM3_SYSCALL_REG __PT_PARM3_REG +#define __PT_PARM4_SYSCALL_REG __PT_PARM4_REG +#define __PT_PARM5_SYSCALL_REG __PT_PARM5_REG +#define __PT_PARM6_SYSCALL_REG __PT_PARM6_REG + +#define __PT_RET_REG regs[1] +#define __PT_FP_REG regs[22] +#define __PT_RC_REG regs[4] +#define __PT_SP_REG regs[3] +#define __PT_IP_REG csr_era #endif @@ -262,16 +447,49 @@ struct pt_regs___arm64 { struct pt_regs; -/* allow some architecutres to override `struct pt_regs` */ +/* allow some architectures to override `struct pt_regs` */ #ifndef __PT_REGS_CAST #define __PT_REGS_CAST(x) (x) #endif +/* + * Different architectures support different number of arguments passed + * through registers. i386 supports just 3, some arches support up to 8. + */ +#ifndef __PT_PARM4_REG +#define __PT_PARM4_REG __unsupported__ +#endif +#ifndef __PT_PARM5_REG +#define __PT_PARM5_REG __unsupported__ +#endif +#ifndef __PT_PARM6_REG +#define __PT_PARM6_REG __unsupported__ +#endif +#ifndef __PT_PARM7_REG +#define __PT_PARM7_REG __unsupported__ +#endif +#ifndef __PT_PARM8_REG +#define __PT_PARM8_REG __unsupported__ +#endif +/* + * Similarly, syscall-specific conventions might differ between function call + * conventions within each architecutre. All supported architectures pass + * either 6 or 7 syscall arguments in registers. + * + * See syscall(2) manpage for succinct table with information on each arch. + */ +#ifndef __PT_PARM7_SYSCALL_REG +#define __PT_PARM7_SYSCALL_REG __unsupported__ +#endif + #define PT_REGS_PARM1(x) (__PT_REGS_CAST(x)->__PT_PARM1_REG) #define PT_REGS_PARM2(x) (__PT_REGS_CAST(x)->__PT_PARM2_REG) #define PT_REGS_PARM3(x) (__PT_REGS_CAST(x)->__PT_PARM3_REG) #define PT_REGS_PARM4(x) (__PT_REGS_CAST(x)->__PT_PARM4_REG) #define PT_REGS_PARM5(x) (__PT_REGS_CAST(x)->__PT_PARM5_REG) +#define PT_REGS_PARM6(x) (__PT_REGS_CAST(x)->__PT_PARM6_REG) +#define PT_REGS_PARM7(x) (__PT_REGS_CAST(x)->__PT_PARM7_REG) +#define PT_REGS_PARM8(x) (__PT_REGS_CAST(x)->__PT_PARM8_REG) #define PT_REGS_RET(x) (__PT_REGS_CAST(x)->__PT_RET_REG) #define PT_REGS_FP(x) (__PT_REGS_CAST(x)->__PT_FP_REG) #define PT_REGS_RC(x) (__PT_REGS_CAST(x)->__PT_RC_REG) @@ -283,6 +501,9 @@ struct pt_regs; #define PT_REGS_PARM3_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM3_REG) #define PT_REGS_PARM4_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM4_REG) #define PT_REGS_PARM5_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM5_REG) +#define PT_REGS_PARM6_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM6_REG) +#define PT_REGS_PARM7_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM7_REG) +#define PT_REGS_PARM8_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM8_REG) #define PT_REGS_RET_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_RET_REG) #define PT_REGS_FP_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_FP_REG) #define PT_REGS_RC_CORE(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_RC_REG) @@ -309,24 +530,33 @@ struct pt_regs; #endif #ifndef PT_REGS_PARM1_SYSCALL -#define PT_REGS_PARM1_SYSCALL(x) PT_REGS_PARM1(x) +#define PT_REGS_PARM1_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM1_SYSCALL_REG) +#define PT_REGS_PARM1_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM1_SYSCALL_REG) +#endif +#ifndef PT_REGS_PARM2_SYSCALL +#define PT_REGS_PARM2_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM2_SYSCALL_REG) +#define PT_REGS_PARM2_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM2_SYSCALL_REG) +#endif +#ifndef PT_REGS_PARM3_SYSCALL +#define PT_REGS_PARM3_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM3_SYSCALL_REG) +#define PT_REGS_PARM3_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM3_SYSCALL_REG) #endif -#define PT_REGS_PARM2_SYSCALL(x) PT_REGS_PARM2(x) -#define PT_REGS_PARM3_SYSCALL(x) PT_REGS_PARM3(x) #ifndef PT_REGS_PARM4_SYSCALL -#define PT_REGS_PARM4_SYSCALL(x) PT_REGS_PARM4(x) +#define PT_REGS_PARM4_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM4_SYSCALL_REG) +#define PT_REGS_PARM4_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM4_SYSCALL_REG) #endif -#define PT_REGS_PARM5_SYSCALL(x) PT_REGS_PARM5(x) - -#ifndef PT_REGS_PARM1_CORE_SYSCALL -#define PT_REGS_PARM1_CORE_SYSCALL(x) PT_REGS_PARM1_CORE(x) +#ifndef PT_REGS_PARM5_SYSCALL +#define PT_REGS_PARM5_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM5_SYSCALL_REG) +#define PT_REGS_PARM5_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM5_SYSCALL_REG) +#endif +#ifndef PT_REGS_PARM6_SYSCALL +#define PT_REGS_PARM6_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM6_SYSCALL_REG) +#define PT_REGS_PARM6_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM6_SYSCALL_REG) #endif -#define PT_REGS_PARM2_CORE_SYSCALL(x) PT_REGS_PARM2_CORE(x) -#define PT_REGS_PARM3_CORE_SYSCALL(x) PT_REGS_PARM3_CORE(x) -#ifndef PT_REGS_PARM4_CORE_SYSCALL -#define PT_REGS_PARM4_CORE_SYSCALL(x) PT_REGS_PARM4_CORE(x) +#ifndef PT_REGS_PARM7_SYSCALL +#define PT_REGS_PARM7_SYSCALL(x) (__PT_REGS_CAST(x)->__PT_PARM7_SYSCALL_REG) +#define PT_REGS_PARM7_CORE_SYSCALL(x) BPF_CORE_READ(__PT_REGS_CAST(x), __PT_PARM7_SYSCALL_REG) #endif -#define PT_REGS_PARM5_CORE_SYSCALL(x) PT_REGS_PARM5_CORE(x) #else /* defined(bpf_target_defined) */ @@ -335,6 +565,9 @@ struct pt_regs; #define PT_REGS_PARM3(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_PARM4(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_PARM5(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM6(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM7(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM8(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_RET(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_FP(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_RC(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) @@ -346,6 +579,9 @@ struct pt_regs; #define PT_REGS_PARM3_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_PARM4_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_PARM5_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM6_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM7_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM8_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_RET_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_FP_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_RC_CORE(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) @@ -360,12 +596,16 @@ struct pt_regs; #define PT_REGS_PARM3_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_PARM4_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_PARM5_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM6_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM7_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_PARM1_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_PARM2_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_PARM3_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_PARM4_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #define PT_REGS_PARM5_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM6_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) +#define PT_REGS_PARM7_CORE_SYSCALL(x) ({ _Pragma(__BPF_TARGET_MISSING); 0l; }) #endif /* defined(bpf_target_defined) */ @@ -393,18 +633,18 @@ struct pt_regs; #endif #define ___bpf_ctx_cast0() ctx -#define ___bpf_ctx_cast1(x) ___bpf_ctx_cast0(), (void *)ctx[0] -#define ___bpf_ctx_cast2(x, args...) ___bpf_ctx_cast1(args), (void *)ctx[1] -#define ___bpf_ctx_cast3(x, args...) ___bpf_ctx_cast2(args), (void *)ctx[2] -#define ___bpf_ctx_cast4(x, args...) ___bpf_ctx_cast3(args), (void *)ctx[3] -#define ___bpf_ctx_cast5(x, args...) ___bpf_ctx_cast4(args), (void *)ctx[4] -#define ___bpf_ctx_cast6(x, args...) ___bpf_ctx_cast5(args), (void *)ctx[5] -#define ___bpf_ctx_cast7(x, args...) ___bpf_ctx_cast6(args), (void *)ctx[6] -#define ___bpf_ctx_cast8(x, args...) ___bpf_ctx_cast7(args), (void *)ctx[7] -#define ___bpf_ctx_cast9(x, args...) ___bpf_ctx_cast8(args), (void *)ctx[8] -#define ___bpf_ctx_cast10(x, args...) ___bpf_ctx_cast9(args), (void *)ctx[9] -#define ___bpf_ctx_cast11(x, args...) ___bpf_ctx_cast10(args), (void *)ctx[10] -#define ___bpf_ctx_cast12(x, args...) ___bpf_ctx_cast11(args), (void *)ctx[11] +#define ___bpf_ctx_cast1(x) ___bpf_ctx_cast0(), ctx[0] +#define ___bpf_ctx_cast2(x, args...) ___bpf_ctx_cast1(args), ctx[1] +#define ___bpf_ctx_cast3(x, args...) ___bpf_ctx_cast2(args), ctx[2] +#define ___bpf_ctx_cast4(x, args...) ___bpf_ctx_cast3(args), ctx[3] +#define ___bpf_ctx_cast5(x, args...) ___bpf_ctx_cast4(args), ctx[4] +#define ___bpf_ctx_cast6(x, args...) ___bpf_ctx_cast5(args), ctx[5] +#define ___bpf_ctx_cast7(x, args...) ___bpf_ctx_cast6(args), ctx[6] +#define ___bpf_ctx_cast8(x, args...) ___bpf_ctx_cast7(args), ctx[7] +#define ___bpf_ctx_cast9(x, args...) ___bpf_ctx_cast8(args), ctx[8] +#define ___bpf_ctx_cast10(x, args...) ___bpf_ctx_cast9(args), ctx[9] +#define ___bpf_ctx_cast11(x, args...) ___bpf_ctx_cast10(args), ctx[10] +#define ___bpf_ctx_cast12(x, args...) ___bpf_ctx_cast11(args), ctx[11] #define ___bpf_ctx_cast(args...) ___bpf_apply(___bpf_ctx_cast, ___bpf_narg(args))(args) /* @@ -424,7 +664,7 @@ struct pt_regs; */ #define BPF_PROG(name, args...) \ name(unsigned long long *ctx); \ -static __attribute__((always_inline)) typeof(name(0)) \ +static __always_inline typeof(name(0)) \ ____##name(unsigned long long *ctx, ##args); \ typeof(name(0)) name(unsigned long long *ctx) \ { \ @@ -433,17 +673,127 @@ typeof(name(0)) name(unsigned long long *ctx) \ return ____##name(___bpf_ctx_cast(args)); \ _Pragma("GCC diagnostic pop") \ } \ -static __attribute__((always_inline)) typeof(name(0)) \ +static __always_inline typeof(name(0)) \ ____##name(unsigned long long *ctx, ##args) +#ifndef ___bpf_nth2 +#define ___bpf_nth2(_, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, \ + _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, N, ...) N +#endif +#ifndef ___bpf_narg2 +#define ___bpf_narg2(...) \ + ___bpf_nth2(_, ##__VA_ARGS__, 12, 12, 11, 11, 10, 10, 9, 9, 8, 8, 7, 7, \ + 6, 6, 5, 5, 4, 4, 3, 3, 2, 2, 1, 1, 0) +#endif + +#define ___bpf_treg_cnt(t) \ + __builtin_choose_expr(sizeof(t) == 1, 1, \ + __builtin_choose_expr(sizeof(t) == 2, 1, \ + __builtin_choose_expr(sizeof(t) == 4, 1, \ + __builtin_choose_expr(sizeof(t) == 8, 1, \ + __builtin_choose_expr(sizeof(t) == 16, 2, \ + (void)0))))) + +#define ___bpf_reg_cnt0() (0) +#define ___bpf_reg_cnt1(t, x) (___bpf_reg_cnt0() + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt2(t, x, args...) (___bpf_reg_cnt1(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt3(t, x, args...) (___bpf_reg_cnt2(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt4(t, x, args...) (___bpf_reg_cnt3(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt5(t, x, args...) (___bpf_reg_cnt4(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt6(t, x, args...) (___bpf_reg_cnt5(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt7(t, x, args...) (___bpf_reg_cnt6(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt8(t, x, args...) (___bpf_reg_cnt7(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt9(t, x, args...) (___bpf_reg_cnt8(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt10(t, x, args...) (___bpf_reg_cnt9(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt11(t, x, args...) (___bpf_reg_cnt10(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt12(t, x, args...) (___bpf_reg_cnt11(args) + ___bpf_treg_cnt(t)) +#define ___bpf_reg_cnt(args...) ___bpf_apply(___bpf_reg_cnt, ___bpf_narg2(args))(args) + +#define ___bpf_union_arg(t, x, n) \ + __builtin_choose_expr(sizeof(t) == 1, ({ union { __u8 z[1]; t x; } ___t = { .z = {ctx[n]}}; ___t.x; }), \ + __builtin_choose_expr(sizeof(t) == 2, ({ union { __u16 z[1]; t x; } ___t = { .z = {ctx[n]} }; ___t.x; }), \ + __builtin_choose_expr(sizeof(t) == 4, ({ union { __u32 z[1]; t x; } ___t = { .z = {ctx[n]} }; ___t.x; }), \ + __builtin_choose_expr(sizeof(t) == 8, ({ union { __u64 z[1]; t x; } ___t = {.z = {ctx[n]} }; ___t.x; }), \ + __builtin_choose_expr(sizeof(t) == 16, ({ union { __u64 z[2]; t x; } ___t = {.z = {ctx[n], ctx[n + 1]} }; ___t.x; }), \ + (void)0))))) + +#define ___bpf_ctx_arg0(n, args...) +#define ___bpf_ctx_arg1(n, t, x) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt1(t, x)) +#define ___bpf_ctx_arg2(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt2(t, x, args)) ___bpf_ctx_arg1(n, args) +#define ___bpf_ctx_arg3(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt3(t, x, args)) ___bpf_ctx_arg2(n, args) +#define ___bpf_ctx_arg4(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt4(t, x, args)) ___bpf_ctx_arg3(n, args) +#define ___bpf_ctx_arg5(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt5(t, x, args)) ___bpf_ctx_arg4(n, args) +#define ___bpf_ctx_arg6(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt6(t, x, args)) ___bpf_ctx_arg5(n, args) +#define ___bpf_ctx_arg7(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt7(t, x, args)) ___bpf_ctx_arg6(n, args) +#define ___bpf_ctx_arg8(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt8(t, x, args)) ___bpf_ctx_arg7(n, args) +#define ___bpf_ctx_arg9(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt9(t, x, args)) ___bpf_ctx_arg8(n, args) +#define ___bpf_ctx_arg10(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt10(t, x, args)) ___bpf_ctx_arg9(n, args) +#define ___bpf_ctx_arg11(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt11(t, x, args)) ___bpf_ctx_arg10(n, args) +#define ___bpf_ctx_arg12(n, t, x, args...) , ___bpf_union_arg(t, x, n - ___bpf_reg_cnt12(t, x, args)) ___bpf_ctx_arg11(n, args) +#define ___bpf_ctx_arg(args...) ___bpf_apply(___bpf_ctx_arg, ___bpf_narg2(args))(___bpf_reg_cnt(args), args) + +#define ___bpf_ctx_decl0() +#define ___bpf_ctx_decl1(t, x) , t x +#define ___bpf_ctx_decl2(t, x, args...) , t x ___bpf_ctx_decl1(args) +#define ___bpf_ctx_decl3(t, x, args...) , t x ___bpf_ctx_decl2(args) +#define ___bpf_ctx_decl4(t, x, args...) , t x ___bpf_ctx_decl3(args) +#define ___bpf_ctx_decl5(t, x, args...) , t x ___bpf_ctx_decl4(args) +#define ___bpf_ctx_decl6(t, x, args...) , t x ___bpf_ctx_decl5(args) +#define ___bpf_ctx_decl7(t, x, args...) , t x ___bpf_ctx_decl6(args) +#define ___bpf_ctx_decl8(t, x, args...) , t x ___bpf_ctx_decl7(args) +#define ___bpf_ctx_decl9(t, x, args...) , t x ___bpf_ctx_decl8(args) +#define ___bpf_ctx_decl10(t, x, args...) , t x ___bpf_ctx_decl9(args) +#define ___bpf_ctx_decl11(t, x, args...) , t x ___bpf_ctx_decl10(args) +#define ___bpf_ctx_decl12(t, x, args...) , t x ___bpf_ctx_decl11(args) +#define ___bpf_ctx_decl(args...) ___bpf_apply(___bpf_ctx_decl, ___bpf_narg2(args))(args) + +/* + * BPF_PROG2 is an enhanced version of BPF_PROG in order to handle struct + * arguments. Since each struct argument might take one or two u64 values + * in the trampoline stack, argument type size is needed to place proper number + * of u64 values for each argument. Therefore, BPF_PROG2 has different + * syntax from BPF_PROG. For example, for the following BPF_PROG syntax: + * + * int BPF_PROG(test2, int a, int b) { ... } + * + * the corresponding BPF_PROG2 syntax is: + * + * int BPF_PROG2(test2, int, a, int, b) { ... } + * + * where type and the corresponding argument name are separated by comma. + * + * Use BPF_PROG2 macro if one of the arguments might be a struct/union larger + * than 8 bytes: + * + * int BPF_PROG2(test_struct_arg, struct bpf_testmod_struct_arg_1, a, int, b, + * int, c, int, d, struct bpf_testmod_struct_arg_2, e, int, ret) + * { + * // access a, b, c, d, e, and ret directly + * ... + * } + */ +#define BPF_PROG2(name, args...) \ +name(unsigned long long *ctx); \ +static __always_inline typeof(name(0)) \ +____##name(unsigned long long *ctx ___bpf_ctx_decl(args)); \ +typeof(name(0)) name(unsigned long long *ctx) \ +{ \ + return ____##name(ctx ___bpf_ctx_arg(args)); \ +} \ +static __always_inline typeof(name(0)) \ +____##name(unsigned long long *ctx ___bpf_ctx_decl(args)) + struct pt_regs; #define ___bpf_kprobe_args0() ctx -#define ___bpf_kprobe_args1(x) ___bpf_kprobe_args0(), (void *)PT_REGS_PARM1(ctx) -#define ___bpf_kprobe_args2(x, args...) ___bpf_kprobe_args1(args), (void *)PT_REGS_PARM2(ctx) -#define ___bpf_kprobe_args3(x, args...) ___bpf_kprobe_args2(args), (void *)PT_REGS_PARM3(ctx) -#define ___bpf_kprobe_args4(x, args...) ___bpf_kprobe_args3(args), (void *)PT_REGS_PARM4(ctx) -#define ___bpf_kprobe_args5(x, args...) ___bpf_kprobe_args4(args), (void *)PT_REGS_PARM5(ctx) +#define ___bpf_kprobe_args1(x) ___bpf_kprobe_args0(), (unsigned long long)PT_REGS_PARM1(ctx) +#define ___bpf_kprobe_args2(x, args...) ___bpf_kprobe_args1(args), (unsigned long long)PT_REGS_PARM2(ctx) +#define ___bpf_kprobe_args3(x, args...) ___bpf_kprobe_args2(args), (unsigned long long)PT_REGS_PARM3(ctx) +#define ___bpf_kprobe_args4(x, args...) ___bpf_kprobe_args3(args), (unsigned long long)PT_REGS_PARM4(ctx) +#define ___bpf_kprobe_args5(x, args...) ___bpf_kprobe_args4(args), (unsigned long long)PT_REGS_PARM5(ctx) +#define ___bpf_kprobe_args6(x, args...) ___bpf_kprobe_args5(args), (unsigned long long)PT_REGS_PARM6(ctx) +#define ___bpf_kprobe_args7(x, args...) ___bpf_kprobe_args6(args), (unsigned long long)PT_REGS_PARM7(ctx) +#define ___bpf_kprobe_args8(x, args...) ___bpf_kprobe_args7(args), (unsigned long long)PT_REGS_PARM8(ctx) #define ___bpf_kprobe_args(args...) ___bpf_apply(___bpf_kprobe_args, ___bpf_narg(args))(args) /* @@ -458,7 +808,7 @@ struct pt_regs; */ #define BPF_KPROBE(name, args...) \ name(struct pt_regs *ctx); \ -static __attribute__((always_inline)) typeof(name(0)) \ +static __always_inline typeof(name(0)) \ ____##name(struct pt_regs *ctx, ##args); \ typeof(name(0)) name(struct pt_regs *ctx) \ { \ @@ -467,11 +817,11 @@ typeof(name(0)) name(struct pt_regs *ctx) \ return ____##name(___bpf_kprobe_args(args)); \ _Pragma("GCC diagnostic pop") \ } \ -static __attribute__((always_inline)) typeof(name(0)) \ +static __always_inline typeof(name(0)) \ ____##name(struct pt_regs *ctx, ##args) #define ___bpf_kretprobe_args0() ctx -#define ___bpf_kretprobe_args1(x) ___bpf_kretprobe_args0(), (void *)PT_REGS_RC(ctx) +#define ___bpf_kretprobe_args1(x) ___bpf_kretprobe_args0(), (unsigned long long)PT_REGS_RC(ctx) #define ___bpf_kretprobe_args(args...) ___bpf_apply(___bpf_kretprobe_args, ___bpf_narg(args))(args) /* @@ -482,7 +832,7 @@ ____##name(struct pt_regs *ctx, ##args) */ #define BPF_KRETPROBE(name, args...) \ name(struct pt_regs *ctx); \ -static __attribute__((always_inline)) typeof(name(0)) \ +static __always_inline typeof(name(0)) \ ____##name(struct pt_regs *ctx, ##args); \ typeof(name(0)) name(struct pt_regs *ctx) \ { \ @@ -493,39 +843,80 @@ typeof(name(0)) name(struct pt_regs *ctx) \ } \ static __always_inline typeof(name(0)) ____##name(struct pt_regs *ctx, ##args) +/* If kernel has CONFIG_ARCH_HAS_SYSCALL_WRAPPER, read pt_regs directly */ #define ___bpf_syscall_args0() ctx -#define ___bpf_syscall_args1(x) ___bpf_syscall_args0(), (void *)PT_REGS_PARM1_CORE_SYSCALL(regs) -#define ___bpf_syscall_args2(x, args...) ___bpf_syscall_args1(args), (void *)PT_REGS_PARM2_CORE_SYSCALL(regs) -#define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (void *)PT_REGS_PARM3_CORE_SYSCALL(regs) -#define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (void *)PT_REGS_PARM4_CORE_SYSCALL(regs) -#define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (void *)PT_REGS_PARM5_CORE_SYSCALL(regs) +#define ___bpf_syscall_args1(x) ___bpf_syscall_args0(), (unsigned long long)PT_REGS_PARM1_SYSCALL(regs) +#define ___bpf_syscall_args2(x, args...) ___bpf_syscall_args1(args), (unsigned long long)PT_REGS_PARM2_SYSCALL(regs) +#define ___bpf_syscall_args3(x, args...) ___bpf_syscall_args2(args), (unsigned long long)PT_REGS_PARM3_SYSCALL(regs) +#define ___bpf_syscall_args4(x, args...) ___bpf_syscall_args3(args), (unsigned long long)PT_REGS_PARM4_SYSCALL(regs) +#define ___bpf_syscall_args5(x, args...) ___bpf_syscall_args4(args), (unsigned long long)PT_REGS_PARM5_SYSCALL(regs) +#define ___bpf_syscall_args6(x, args...) ___bpf_syscall_args5(args), (unsigned long long)PT_REGS_PARM6_SYSCALL(regs) +#define ___bpf_syscall_args7(x, args...) ___bpf_syscall_args6(args), (unsigned long long)PT_REGS_PARM7_SYSCALL(regs) #define ___bpf_syscall_args(args...) ___bpf_apply(___bpf_syscall_args, ___bpf_narg(args))(args) +/* If kernel doesn't have CONFIG_ARCH_HAS_SYSCALL_WRAPPER, we have to BPF_CORE_READ from pt_regs */ +#define ___bpf_syswrap_args0() ctx +#define ___bpf_syswrap_args1(x) ___bpf_syswrap_args0(), (unsigned long long)PT_REGS_PARM1_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args2(x, args...) ___bpf_syswrap_args1(args), (unsigned long long)PT_REGS_PARM2_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args3(x, args...) ___bpf_syswrap_args2(args), (unsigned long long)PT_REGS_PARM3_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args4(x, args...) ___bpf_syswrap_args3(args), (unsigned long long)PT_REGS_PARM4_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args5(x, args...) ___bpf_syswrap_args4(args), (unsigned long long)PT_REGS_PARM5_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args6(x, args...) ___bpf_syswrap_args5(args), (unsigned long long)PT_REGS_PARM6_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args7(x, args...) ___bpf_syswrap_args6(args), (unsigned long long)PT_REGS_PARM7_CORE_SYSCALL(regs) +#define ___bpf_syswrap_args(args...) ___bpf_apply(___bpf_syswrap_args, ___bpf_narg(args))(args) + /* - * BPF_KPROBE_SYSCALL is a variant of BPF_KPROBE, which is intended for + * BPF_KSYSCALL is a variant of BPF_KPROBE, which is intended for * tracing syscall functions, like __x64_sys_close. It hides the underlying * platform-specific low-level way of getting syscall input arguments from * struct pt_regs, and provides a familiar typed and named function arguments * syntax and semantics of accessing syscall input parameters. * - * Original struct pt_regs* context is preserved as 'ctx' argument. This might + * Original struct pt_regs * context is preserved as 'ctx' argument. This might * be necessary when using BPF helpers like bpf_perf_event_output(). * - * This macro relies on BPF CO-RE support. + * At the moment BPF_KSYSCALL does not transparently handle all the calling + * convention quirks for the following syscalls: + * + * - mmap(): __ARCH_WANT_SYS_OLD_MMAP. + * - clone(): CONFIG_CLONE_BACKWARDS, CONFIG_CLONE_BACKWARDS2 and + * CONFIG_CLONE_BACKWARDS3. + * - socket-related syscalls: __ARCH_WANT_SYS_SOCKETCALL. + * - compat syscalls. + * + * This may or may not change in the future. User needs to take extra measures + * to handle such quirks explicitly, if necessary. + * + * This macro relies on BPF CO-RE support and virtual __kconfig externs. */ -#define BPF_KPROBE_SYSCALL(name, args...) \ +#define BPF_KSYSCALL(name, args...) \ name(struct pt_regs *ctx); \ -static __attribute__((always_inline)) typeof(name(0)) \ +extern _Bool LINUX_HAS_SYSCALL_WRAPPER __kconfig; \ +static __always_inline typeof(name(0)) \ ____##name(struct pt_regs *ctx, ##args); \ typeof(name(0)) name(struct pt_regs *ctx) \ { \ - struct pt_regs *regs = PT_REGS_SYSCALL_REGS(ctx); \ + struct pt_regs *regs = LINUX_HAS_SYSCALL_WRAPPER \ + ? (struct pt_regs *)PT_REGS_PARM1(ctx) \ + : ctx; \ _Pragma("GCC diagnostic push") \ _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ - return ____##name(___bpf_syscall_args(args)); \ + if (LINUX_HAS_SYSCALL_WRAPPER) \ + return ____##name(___bpf_syswrap_args(args)); \ + else \ + return ____##name(___bpf_syscall_args(args)); \ _Pragma("GCC diagnostic pop") \ } \ -static __attribute__((always_inline)) typeof(name(0)) \ +static __always_inline typeof(name(0)) \ ____##name(struct pt_regs *ctx, ##args) +#define BPF_KPROBE_SYSCALL BPF_KSYSCALL + +/* BPF_UPROBE and BPF_URETPROBE are identical to BPF_KPROBE and BPF_KRETPROBE, + * but are named way less confusingly for SEC("uprobe") and SEC("uretprobe") + * use cases. + */ +#define BPF_UPROBE(name, args...) BPF_KPROBE(name, ##args) +#define BPF_URETPROBE(name, args...) BPF_KRETPROBE(name, ##args) + #endif diff --git a/pkg/plugin/lib/common/libbpf/_src/btf.c b/pkg/plugin/lib/common/libbpf/_src/btf.c index bb1e06eb1e..32c00db3b9 100644 --- a/pkg/plugin/lib/common/libbpf/_src/btf.c +++ b/pkg/plugin/lib/common/libbpf/_src/btf.c @@ -116,6 +116,9 @@ struct btf { /* whether strings are already deduplicated */ bool strs_deduped; + /* whether base_btf should be freed in btf_free for this instance */ + bool owns_base; + /* BTF object FD, if loaded into kernel */ int fd; @@ -130,7 +133,7 @@ static inline __u64 ptr_to_u64(const void *ptr) /* Ensure given dynamically allocated memory region pointed to by *data* with * capacity of *cap_cnt* elements each taking *elem_sz* bytes has enough - * memory to accomodate *add_cnt* new elements, assuming *cur_cnt* elements + * memory to accommodate *add_cnt* new elements, assuming *cur_cnt* elements * are already used. At most *max_cnt* elements can be ever allocated. * If necessary, memory is reallocated and all existing data is copied over, * new pointer to the memory region is stored at *data, new memory region @@ -305,6 +308,8 @@ static int btf_type_size(const struct btf_type *t) return base_size + sizeof(__u32); case BTF_KIND_ENUM: return base_size + vlen * sizeof(struct btf_enum); + case BTF_KIND_ENUM64: + return base_size + vlen * sizeof(struct btf_enum64); case BTF_KIND_ARRAY: return base_size + sizeof(struct btf_array); case BTF_KIND_STRUCT: @@ -334,6 +339,7 @@ static void btf_bswap_type_base(struct btf_type *t) static int btf_bswap_type_rest(struct btf_type *t) { struct btf_var_secinfo *v; + struct btf_enum64 *e64; struct btf_member *m; struct btf_array *a; struct btf_param *p; @@ -361,6 +367,13 @@ static int btf_bswap_type_rest(struct btf_type *t) e->val = bswap_32(e->val); } return 0; + case BTF_KIND_ENUM64: + for (i = 0, e64 = btf_enum64(t); i < vlen; i++, e64++) { + e64->name_off = bswap_32(e64->name_off); + e64->val_lo32 = bswap_32(e64->val_lo32); + e64->val_hi32 = bswap_32(e64->val_hi32); + } + return 0; case BTF_KIND_ARRAY: a = btf_array(t); a->type = bswap_32(a->type); @@ -438,9 +451,163 @@ static int btf_parse_type_sec(struct btf *btf) return 0; } -__u32 btf__get_nr_types(const struct btf *btf) +static int btf_validate_str(const struct btf *btf, __u32 str_off, const char *what, __u32 type_id) { - return btf->start_id + btf->nr_types - 1; + const char *s; + + s = btf__str_by_offset(btf, str_off); + if (!s) { + pr_warn("btf: type [%u]: invalid %s (string offset %u)\n", type_id, what, str_off); + return -EINVAL; + } + + return 0; +} + +static int btf_validate_id(const struct btf *btf, __u32 id, __u32 ctx_id) +{ + const struct btf_type *t; + + t = btf__type_by_id(btf, id); + if (!t) { + pr_warn("btf: type [%u]: invalid referenced type ID %u\n", ctx_id, id); + return -EINVAL; + } + + return 0; +} + +static int btf_validate_type(const struct btf *btf, const struct btf_type *t, __u32 id) +{ + __u32 kind = btf_kind(t); + int err, i, n; + + err = btf_validate_str(btf, t->name_off, "type name", id); + if (err) + return err; + + switch (kind) { + case BTF_KIND_UNKN: + case BTF_KIND_INT: + case BTF_KIND_FWD: + case BTF_KIND_FLOAT: + break; + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_VOLATILE: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_VAR: + case BTF_KIND_DECL_TAG: + case BTF_KIND_TYPE_TAG: + err = btf_validate_id(btf, t->type, id); + if (err) + return err; + break; + case BTF_KIND_ARRAY: { + const struct btf_array *a = btf_array(t); + + err = btf_validate_id(btf, a->type, id); + err = err ?: btf_validate_id(btf, a->index_type, id); + if (err) + return err; + break; + } + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: { + const struct btf_member *m = btf_members(t); + + n = btf_vlen(t); + for (i = 0; i < n; i++, m++) { + err = btf_validate_str(btf, m->name_off, "field name", id); + err = err ?: btf_validate_id(btf, m->type, id); + if (err) + return err; + } + break; + } + case BTF_KIND_ENUM: { + const struct btf_enum *m = btf_enum(t); + + n = btf_vlen(t); + for (i = 0; i < n; i++, m++) { + err = btf_validate_str(btf, m->name_off, "enum name", id); + if (err) + return err; + } + break; + } + case BTF_KIND_ENUM64: { + const struct btf_enum64 *m = btf_enum64(t); + + n = btf_vlen(t); + for (i = 0; i < n; i++, m++) { + err = btf_validate_str(btf, m->name_off, "enum name", id); + if (err) + return err; + } + break; + } + case BTF_KIND_FUNC: { + const struct btf_type *ft; + + err = btf_validate_id(btf, t->type, id); + if (err) + return err; + ft = btf__type_by_id(btf, t->type); + if (btf_kind(ft) != BTF_KIND_FUNC_PROTO) { + pr_warn("btf: type [%u]: referenced type [%u] is not FUNC_PROTO\n", id, t->type); + return -EINVAL; + } + break; + } + case BTF_KIND_FUNC_PROTO: { + const struct btf_param *m = btf_params(t); + + n = btf_vlen(t); + for (i = 0; i < n; i++, m++) { + err = btf_validate_str(btf, m->name_off, "param name", id); + err = err ?: btf_validate_id(btf, m->type, id); + if (err) + return err; + } + break; + } + case BTF_KIND_DATASEC: { + const struct btf_var_secinfo *m = btf_var_secinfos(t); + + n = btf_vlen(t); + for (i = 0; i < n; i++, m++) { + err = btf_validate_id(btf, m->type, id); + if (err) + return err; + } + break; + } + default: + pr_warn("btf: type [%u]: unrecognized kind %u\n", id, kind); + return -EINVAL; + } + return 0; +} + +/* Validate basic sanity of BTF. It's intentionally less thorough than + * kernel's validation and validates only properties of BTF that libbpf relies + * on to be correct (e.g., valid type IDs, valid string offsets, etc) + */ +static int btf_sanity_check(const struct btf *btf) +{ + const struct btf_type *t; + __u32 i, n = btf__type_cnt(btf); + int err; + + for (i = btf->start_id; i < n; i++) { + t = btf_type_by_id(btf, i); + err = btf_validate_type(btf, t, i); + if (err) + return err; + } + return 0; } __u32 btf__type_cnt(const struct btf *btf) @@ -472,9 +639,22 @@ const struct btf_type *btf__type_by_id(const struct btf *btf, __u32 type_id) static int determine_ptr_size(const struct btf *btf) { + static const char * const long_aliases[] = { + "long", + "long int", + "int long", + "unsigned long", + "long unsigned", + "unsigned long int", + "unsigned int long", + "long unsigned int", + "long int unsigned", + "int unsigned long", + "int long unsigned", + }; const struct btf_type *t; const char *name; - int i, n; + int i, j, n; if (btf->base_btf && btf->base_btf->ptr_sz > 0) return btf->base_btf->ptr_sz; @@ -485,15 +665,16 @@ static int determine_ptr_size(const struct btf *btf) if (!btf_is_int(t)) continue; + if (t->size != 4 && t->size != 8) + continue; + name = btf__name_by_offset(btf, t->name_off); if (!name) continue; - if (strcmp(name, "long int") == 0 || - strcmp(name, "long unsigned int") == 0) { - if (t->size != 4 && t->size != 8) - continue; - return t->size; + for (j = 0; j < ARRAY_SIZE(long_aliases); j++) { + if (strcmp(name, long_aliases[j]) == 0) + return t->size; } } @@ -597,6 +778,7 @@ __s64 btf__resolve_size(const struct btf *btf, __u32 type_id) case BTF_KIND_STRUCT: case BTF_KIND_UNION: case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: case BTF_KIND_DATASEC: case BTF_KIND_FLOAT: size = t->size; @@ -644,6 +826,7 @@ int btf__align_of(const struct btf *btf, __u32 id) switch (kind) { case BTF_KIND_INT: case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: case BTF_KIND_FLOAT: return min(btf_ptr_sz(btf), (size_t)t->size); case BTF_KIND_PTR: @@ -667,8 +850,21 @@ int btf__align_of(const struct btf *btf, __u32 id) if (align <= 0) return libbpf_err(align); max_align = max(max_align, align); + + /* if field offset isn't aligned according to field + * type's alignment, then struct must be packed + */ + if (btf_member_bitfield_size(t, i) == 0 && + (m->offset % (8 * align)) != 0) + return 1; } + /* if struct/union size isn't a multiple of its alignment, + * then struct must be packed + */ + if ((t->size % max_align) != 0) + return 1; + return max_align; } default: @@ -776,6 +972,8 @@ void btf__free(struct btf *btf) free(btf->raw_data); free(btf->raw_data_swapped); free(btf->type_offs); + if (btf->owns_base) + btf__free(btf->base_btf); free(btf); } @@ -868,6 +1066,7 @@ static struct btf *btf_new(const void *data, __u32 size, struct btf *base_btf) err = btf_parse_str_sec(btf); err = err ?: btf_parse_type_sec(btf); + err = err ?: btf_sanity_check(btf); if (err) goto done; @@ -885,53 +1084,43 @@ struct btf *btf__new(const void *data, __u32 size) return libbpf_ptr(btf_new(data, size, NULL)); } -static struct btf *btf_parse_elf(const char *path, struct btf *base_btf, - struct btf_ext **btf_ext) +struct btf *btf__new_split(const void *data, __u32 size, struct btf *base_btf) +{ + return libbpf_ptr(btf_new(data, size, base_btf)); +} + +struct btf_elf_secs { + Elf_Data *btf_data; + Elf_Data *btf_ext_data; + Elf_Data *btf_base_data; +}; + +static int btf_find_elf_sections(Elf *elf, const char *path, struct btf_elf_secs *secs) { - Elf_Data *btf_data = NULL, *btf_ext_data = NULL; - int err = 0, fd = -1, idx = 0; - struct btf *btf = NULL; Elf_Scn *scn = NULL; - Elf *elf = NULL; + Elf_Data *data; GElf_Ehdr ehdr; size_t shstrndx; + int idx = 0; - if (elf_version(EV_CURRENT) == EV_NONE) { - pr_warn("failed to init libelf for %s\n", path); - return ERR_PTR(-LIBBPF_ERRNO__LIBELF); - } - - fd = open(path, O_RDONLY | O_CLOEXEC); - if (fd < 0) { - err = -errno; - pr_warn("failed to open %s: %s\n", path, strerror(errno)); - return ERR_PTR(err); - } - - err = -LIBBPF_ERRNO__FORMAT; - - elf = elf_begin(fd, ELF_C_READ, NULL); - if (!elf) { - pr_warn("failed to open %s as ELF file\n", path); - goto done; - } if (!gelf_getehdr(elf, &ehdr)) { pr_warn("failed to get EHDR from %s\n", path); - goto done; + goto err; } if (elf_getshdrstrndx(elf, &shstrndx)) { pr_warn("failed to get section names section index for %s\n", path); - goto done; + goto err; } if (!elf_rawdata(elf_getscn(elf, shstrndx), NULL)) { pr_warn("failed to get e_shstrndx from %s\n", path); - goto done; + goto err; } while ((scn = elf_nextscn(elf, scn)) != NULL) { + Elf_Data **field; GElf_Shdr sh; char *name; @@ -939,44 +1128,103 @@ static struct btf *btf_parse_elf(const char *path, struct btf *base_btf, if (gelf_getshdr(scn, &sh) != &sh) { pr_warn("failed to get section(%d) header from %s\n", idx, path); - goto done; + goto err; } name = elf_strptr(elf, shstrndx, sh.sh_name); if (!name) { pr_warn("failed to get section(%d) name from %s\n", idx, path); - goto done; + goto err; } - if (strcmp(name, BTF_ELF_SEC) == 0) { - btf_data = elf_getdata(scn, 0); - if (!btf_data) { - pr_warn("failed to get section(%d, %s) data from %s\n", - idx, name, path); - goto done; - } - continue; - } else if (btf_ext && strcmp(name, BTF_EXT_ELF_SEC) == 0) { - btf_ext_data = elf_getdata(scn, 0); - if (!btf_ext_data) { - pr_warn("failed to get section(%d, %s) data from %s\n", - idx, name, path); - goto done; - } + + if (strcmp(name, BTF_ELF_SEC) == 0) + field = &secs->btf_data; + else if (strcmp(name, BTF_EXT_ELF_SEC) == 0) + field = &secs->btf_ext_data; + else if (strcmp(name, BTF_BASE_ELF_SEC) == 0) + field = &secs->btf_base_data; + else continue; + + data = elf_getdata(scn, 0); + if (!data) { + pr_warn("failed to get section(%d, %s) data from %s\n", + idx, name, path); + goto err; } + *field = data; + } + + return 0; + +err: + return -LIBBPF_ERRNO__FORMAT; +} + +static struct btf *btf_parse_elf(const char *path, struct btf *base_btf, + struct btf_ext **btf_ext) +{ + struct btf_elf_secs secs = {}; + struct btf *dist_base_btf = NULL; + struct btf *btf = NULL; + int err = 0, fd = -1; + Elf *elf = NULL; + + if (elf_version(EV_CURRENT) == EV_NONE) { + pr_warn("failed to init libelf for %s\n", path); + return ERR_PTR(-LIBBPF_ERRNO__LIBELF); } - err = 0; + fd = open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) { + err = -errno; + pr_warn("failed to open %s: %s\n", path, strerror(errno)); + return ERR_PTR(err); + } - if (!btf_data) { - err = -ENOENT; + elf = elf_begin(fd, ELF_C_READ, NULL); + if (!elf) { + pr_warn("failed to open %s as ELF file\n", path); goto done; } - btf = btf_new(btf_data->d_buf, btf_data->d_size, base_btf); - err = libbpf_get_error(btf); + + err = btf_find_elf_sections(elf, path, &secs); if (err) goto done; + if (!secs.btf_data) { + pr_warn("failed to find '%s' ELF section in %s\n", BTF_ELF_SEC, path); + err = -ENODATA; + goto done; + } + + if (secs.btf_base_data) { + dist_base_btf = btf_new(secs.btf_base_data->d_buf, secs.btf_base_data->d_size, + NULL); + if (IS_ERR(dist_base_btf)) { + err = PTR_ERR(dist_base_btf); + dist_base_btf = NULL; + goto done; + } + } + + btf = btf_new(secs.btf_data->d_buf, secs.btf_data->d_size, + dist_base_btf ?: base_btf); + if (IS_ERR(btf)) { + err = PTR_ERR(btf); + goto done; + } + if (dist_base_btf && base_btf) { + err = btf__relocate(btf, base_btf); + if (err) + goto done; + btf__free(dist_base_btf); + dist_base_btf = NULL; + } + + if (dist_base_btf) + btf->owns_base = true; + switch (gelf_getclass(elf)) { case ELFCLASS32: btf__set_pointer_size(btf, 4); @@ -989,11 +1237,12 @@ static struct btf *btf_parse_elf(const char *path, struct btf *base_btf, break; } - if (btf_ext && btf_ext_data) { - *btf_ext = btf_ext__new(btf_ext_data->d_buf, btf_ext_data->d_size); - err = libbpf_get_error(*btf_ext); - if (err) + if (btf_ext && secs.btf_ext_data) { + *btf_ext = btf_ext__new(secs.btf_ext_data->d_buf, secs.btf_ext_data->d_size); + if (IS_ERR(*btf_ext)) { + err = PTR_ERR(*btf_ext); goto done; + } } else if (btf_ext) { *btf_ext = NULL; } @@ -1007,6 +1256,7 @@ static struct btf *btf_parse_elf(const char *path, struct btf *base_btf, if (btf_ext) btf_ext__free(*btf_ext); + btf__free(dist_base_btf); btf__free(btf); return ERR_PTR(err); @@ -1031,7 +1281,7 @@ static struct btf *btf_parse_raw(const char *path, struct btf *base_btf) int err = 0; long sz; - f = fopen(path, "rb"); + f = fopen(path, "rbe"); if (!f) { err = -errno; goto err_out; @@ -1124,7 +1374,9 @@ struct btf *btf__parse_split(const char *path, struct btf *base_btf) static void *btf_get_raw_data(const struct btf *btf, __u32 *size, bool swap_endian); -int btf_load_into_kernel(struct btf *btf, char *log_buf, size_t log_sz, __u32 log_level) +int btf_load_into_kernel(struct btf *btf, + char *log_buf, size_t log_sz, __u32 log_level, + int token_fd) { LIBBPF_OPTS(bpf_btf_load_opts, opts); __u32 buf_sz = 0, raw_size; @@ -1174,6 +1426,10 @@ int btf_load_into_kernel(struct btf *btf, char *log_buf, size_t log_sz, __u32 lo opts.log_level = log_level; } + opts.token_fd = token_fd; + if (token_fd) + opts.btf_flags |= BPF_F_TOKEN_FD; + btf->fd = bpf_btf_load(raw_data, raw_size, &opts); if (btf->fd < 0) { /* time to turn on verbose mode and try again */ @@ -1201,11 +1457,9 @@ int btf_load_into_kernel(struct btf *btf, char *log_buf, size_t log_sz, __u32 lo int btf__load_into_kernel(struct btf *btf) { - return btf_load_into_kernel(btf, NULL, 0, 0); + return btf_load_into_kernel(btf, NULL, 0, 0, 0); } -int btf__load(struct btf *) __attribute__((alias("btf__load_into_kernel"))); - int btf__fd(const struct btf *btf) { return btf->fd; @@ -1317,9 +1571,9 @@ struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf) void *ptr; int err; - /* we won't know btf_size until we call bpf_obj_get_info_by_fd(). so + /* we won't know btf_size until we call bpf_btf_get_info_by_fd(). so * let's start with a sane default - 4KiB here - and resize it only if - * bpf_obj_get_info_by_fd() needs a bigger buffer. + * bpf_btf_get_info_by_fd() needs a bigger buffer. */ last_size = 4096; ptr = malloc(last_size); @@ -1329,7 +1583,7 @@ struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf) memset(&btf_info, 0, sizeof(btf_info)); btf_info.btf = ptr_to_u64(ptr); btf_info.btf_size = last_size; - err = bpf_obj_get_info_by_fd(btf_fd, &btf_info, &len); + err = bpf_btf_get_info_by_fd(btf_fd, &btf_info, &len); if (!err && btf_info.btf_size > last_size) { void *temp_ptr; @@ -1347,7 +1601,7 @@ struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf) btf_info.btf = ptr_to_u64(ptr); btf_info.btf_size = last_size; - err = bpf_obj_get_info_by_fd(btf_fd, &btf_info, &len); + err = bpf_btf_get_info_by_fd(btf_fd, &btf_info, &len); } if (err || btf_info.btf_size > last_size) { @@ -1382,92 +1636,6 @@ struct btf *btf__load_from_kernel_by_id(__u32 id) return btf__load_from_kernel_by_id_split(id, NULL); } -int btf__get_from_id(__u32 id, struct btf **btf) -{ - struct btf *res; - int err; - - *btf = NULL; - res = btf__load_from_kernel_by_id(id); - err = libbpf_get_error(res); - - if (err) - return libbpf_err(err); - - *btf = res; - return 0; -} - -int btf__get_map_kv_tids(const struct btf *btf, const char *map_name, - __u32 expected_key_size, __u32 expected_value_size, - __u32 *key_type_id, __u32 *value_type_id) -{ - const struct btf_type *container_type; - const struct btf_member *key, *value; - const size_t max_name = 256; - char container_name[max_name]; - __s64 key_size, value_size; - __s32 container_id; - - if (snprintf(container_name, max_name, "____btf_map_%s", map_name) == max_name) { - pr_warn("map:%s length of '____btf_map_%s' is too long\n", - map_name, map_name); - return libbpf_err(-EINVAL); - } - - container_id = btf__find_by_name(btf, container_name); - if (container_id < 0) { - pr_debug("map:%s container_name:%s cannot be found in BTF. Missing BPF_ANNOTATE_KV_PAIR?\n", - map_name, container_name); - return libbpf_err(container_id); - } - - container_type = btf__type_by_id(btf, container_id); - if (!container_type) { - pr_warn("map:%s cannot find BTF type for container_id:%u\n", - map_name, container_id); - return libbpf_err(-EINVAL); - } - - if (!btf_is_struct(container_type) || btf_vlen(container_type) < 2) { - pr_warn("map:%s container_name:%s is an invalid container struct\n", - map_name, container_name); - return libbpf_err(-EINVAL); - } - - key = btf_members(container_type); - value = key + 1; - - key_size = btf__resolve_size(btf, key->type); - if (key_size < 0) { - pr_warn("map:%s invalid BTF key_type_size\n", map_name); - return libbpf_err(key_size); - } - - if (expected_key_size != key_size) { - pr_warn("map:%s btf_key_type_size:%u != map_def_key_size:%u\n", - map_name, (__u32)key_size, expected_key_size); - return libbpf_err(-EINVAL); - } - - value_size = btf__resolve_size(btf, value->type); - if (value_size < 0) { - pr_warn("map:%s invalid BTF value_type_size\n", map_name); - return libbpf_err(value_size); - } - - if (expected_value_size != value_size) { - pr_warn("map:%s btf_value_type_size:%u != map_def_value_size:%u\n", - map_name, (__u32)value_size, expected_value_size); - return libbpf_err(-EINVAL); - } - - *key_type_id = key->type; - *value_type_id = value->type; - - return 0; -} - static void btf_invalidate_raw_data(struct btf *btf) { if (btf->raw_data) { @@ -1623,18 +1791,17 @@ struct btf_pipe { struct hashmap *str_off_map; /* map string offsets from src to dst */ }; -static int btf_rewrite_str(__u32 *str_off, void *ctx) +static int btf_rewrite_str(struct btf_pipe *p, __u32 *str_off) { - struct btf_pipe *p = ctx; - void *mapped_off; + long mapped_off; int off, err; if (!*str_off) /* nothing to do for empty strings */ return 0; if (p->str_off_map && - hashmap__find(p->str_off_map, (void *)(long)*str_off, &mapped_off)) { - *str_off = (__u32)(long)mapped_off; + hashmap__find(p->str_off_map, *str_off, &mapped_off)) { + *str_off = mapped_off; return 0; } @@ -1646,7 +1813,7 @@ static int btf_rewrite_str(__u32 *str_off, void *ctx) * performing expensive string comparisons. */ if (p->str_off_map) { - err = hashmap__append(p->str_off_map, (void *)(long)*str_off, (void *)(long)off); + err = hashmap__append(p->str_off_map, *str_off, off); if (err) return err; } @@ -1655,10 +1822,11 @@ static int btf_rewrite_str(__u32 *str_off, void *ctx) return 0; } -int btf__add_type(struct btf *btf, const struct btf *src_btf, const struct btf_type *src_type) +static int btf_add_type(struct btf_pipe *p, const struct btf_type *src_type) { - struct btf_pipe p = { .src = src_btf, .dst = btf }; + struct btf_field_iter it; struct btf_type *t; + __u32 *str_off; int sz, err; sz = btf_type_size(src_type); @@ -1666,39 +1834,37 @@ int btf__add_type(struct btf *btf, const struct btf *src_btf, const struct btf_t return libbpf_err(sz); /* deconstruct BTF, if necessary, and invalidate raw_data */ - if (btf_ensure_modifiable(btf)) + if (btf_ensure_modifiable(p->dst)) return libbpf_err(-ENOMEM); - t = btf_add_type_mem(btf, sz); + t = btf_add_type_mem(p->dst, sz); if (!t) return libbpf_err(-ENOMEM); memcpy(t, src_type, sz); - err = btf_type_visit_str_offs(t, btf_rewrite_str, &p); + err = btf_field_iter_init(&it, t, BTF_FIELD_ITER_STRS); if (err) return libbpf_err(err); - return btf_commit_type(btf, sz); + while ((str_off = btf_field_iter_next(&it))) { + err = btf_rewrite_str(p, str_off); + if (err) + return libbpf_err(err); + } + + return btf_commit_type(p->dst, sz); } -static int btf_rewrite_type_ids(__u32 *type_id, void *ctx) +int btf__add_type(struct btf *btf, const struct btf *src_btf, const struct btf_type *src_type) { - struct btf *btf = ctx; - - if (!*type_id) /* nothing to do for VOID references */ - return 0; + struct btf_pipe p = { .src = src_btf, .dst = btf }; - /* we haven't updated btf's type count yet, so - * btf->start_id + btf->nr_types - 1 is the type ID offset we should - * add to all newly added BTF types - */ - *type_id += btf->start_id + btf->nr_types - 1; - return 0; + return btf_add_type(&p, src_type); } -static size_t btf_dedup_identity_hash_fn(const void *key, void *ctx); -static bool btf_dedup_equal_fn(const void *k1, const void *k2, void *ctx); +static size_t btf_dedup_identity_hash_fn(long key, void *ctx); +static bool btf_dedup_equal_fn(long k1, long k2, void *ctx); int btf__add_btf(struct btf *btf, const struct btf *src_btf) { @@ -1742,6 +1908,9 @@ int btf__add_btf(struct btf *btf, const struct btf *src_btf) memcpy(t, src_btf->types_data, data_sz); for (i = 0; i < cnt; i++) { + struct btf_field_iter it; + __u32 *type_id, *str_off; + sz = btf_type_size(t); if (sz < 0) { /* unlikely, has to be corrupted src_btf */ @@ -1753,16 +1922,32 @@ int btf__add_btf(struct btf *btf, const struct btf *src_btf) *off = t - btf->types_data; /* add, dedup, and remap strings referenced by this BTF type */ - err = btf_type_visit_str_offs(t, btf_rewrite_str, &p); + err = btf_field_iter_init(&it, t, BTF_FIELD_ITER_STRS); if (err) goto err_out; + while ((str_off = btf_field_iter_next(&it))) { + err = btf_rewrite_str(&p, str_off); + if (err) + goto err_out; + } /* remap all type IDs referenced from this BTF type */ - err = btf_type_visit_type_ids(t, btf_rewrite_type_ids, btf); + err = btf_field_iter_init(&it, t, BTF_FIELD_ITER_IDS); if (err) goto err_out; - /* go to next type data and type offset index entry */ + while ((type_id = btf_field_iter_next(&it))) { + if (!*type_id) /* nothing to do for VOID references */ + continue; + + /* we haven't updated btf's type count yet, so + * btf->start_id + btf->nr_types - 1 is the type ID offset we should + * add to all newly added BTF types + */ + *type_id += btf->start_id + btf->nr_types - 1; + } + + /* go to next type data and type offset index entry */ t += sz; off++; } @@ -1791,7 +1976,8 @@ int btf__add_btf(struct btf *btf, const struct btf *src_btf) memset(btf->strs_data + old_strs_len, 0, btf->hdr->str_len - old_strs_len); /* and now restore original strings section size; types data size - * wasn't modified, so doesn't need restoring, see big comment above */ + * wasn't modified, so doesn't need restoring, see big comment above + */ btf->hdr->str_len = old_strs_len; hashmap__free(p.str_off_map); @@ -2115,20 +2301,8 @@ int btf__add_field(struct btf *btf, const char *name, int type_id, return 0; } -/* - * Append new BTF_KIND_ENUM type with: - * - *name* - name of the enum, can be NULL or empty for anonymous enums; - * - *byte_sz* - size of the enum, in bytes. - * - * Enum initially has no enum values in it (and corresponds to enum forward - * declaration). Enumerator values can be added by btf__add_enum_value() - * immediately after btf__add_enum() succeeds. - * - * Returns: - * - >0, type ID of newly added BTF type; - * - <0, on error. - */ -int btf__add_enum(struct btf *btf, const char *name, __u32 byte_sz) +static int btf_add_enum_common(struct btf *btf, const char *name, __u32 byte_sz, + bool is_signed, __u8 kind) { struct btf_type *t; int sz, name_off = 0; @@ -2153,12 +2327,34 @@ int btf__add_enum(struct btf *btf, const char *name, __u32 byte_sz) /* start out with vlen=0; it will be adjusted when adding enum values */ t->name_off = name_off; - t->info = btf_type_info(BTF_KIND_ENUM, 0, 0); + t->info = btf_type_info(kind, 0, is_signed); t->size = byte_sz; return btf_commit_type(btf, sz); } +/* + * Append new BTF_KIND_ENUM type with: + * - *name* - name of the enum, can be NULL or empty for anonymous enums; + * - *byte_sz* - size of the enum, in bytes. + * + * Enum initially has no enum values in it (and corresponds to enum forward + * declaration). Enumerator values can be added by btf__add_enum_value() + * immediately after btf__add_enum() succeeds. + * + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_enum(struct btf *btf, const char *name, __u32 byte_sz) +{ + /* + * set the signedness to be unsigned, it will change to signed + * if any later enumerator is negative. + */ + return btf_add_enum_common(btf, name, byte_sz, false, BTF_KIND_ENUM); +} + /* * Append new enum value for the current ENUM type with: * - *name* - name of the enumerator value, can't be NULL or empty; @@ -2206,6 +2402,82 @@ int btf__add_enum_value(struct btf *btf, const char *name, __s64 value) t = btf_last_type(btf); btf_type_inc_vlen(t); + /* if negative value, set signedness to signed */ + if (value < 0) + t->info = btf_type_info(btf_kind(t), btf_vlen(t), true); + + btf->hdr->type_len += sz; + btf->hdr->str_off += sz; + return 0; +} + +/* + * Append new BTF_KIND_ENUM64 type with: + * - *name* - name of the enum, can be NULL or empty for anonymous enums; + * - *byte_sz* - size of the enum, in bytes. + * - *is_signed* - whether the enum values are signed or not; + * + * Enum initially has no enum values in it (and corresponds to enum forward + * declaration). Enumerator values can be added by btf__add_enum64_value() + * immediately after btf__add_enum64() succeeds. + * + * Returns: + * - >0, type ID of newly added BTF type; + * - <0, on error. + */ +int btf__add_enum64(struct btf *btf, const char *name, __u32 byte_sz, + bool is_signed) +{ + return btf_add_enum_common(btf, name, byte_sz, is_signed, + BTF_KIND_ENUM64); +} + +/* + * Append new enum value for the current ENUM64 type with: + * - *name* - name of the enumerator value, can't be NULL or empty; + * - *value* - integer value corresponding to enum value *name*; + * Returns: + * - 0, on success; + * - <0, on error. + */ +int btf__add_enum64_value(struct btf *btf, const char *name, __u64 value) +{ + struct btf_enum64 *v; + struct btf_type *t; + int sz, name_off; + + /* last type should be BTF_KIND_ENUM64 */ + if (btf->nr_types == 0) + return libbpf_err(-EINVAL); + t = btf_last_type(btf); + if (!btf_is_enum64(t)) + return libbpf_err(-EINVAL); + + /* non-empty name */ + if (!name || !name[0]) + return libbpf_err(-EINVAL); + + /* decompose and invalidate raw data */ + if (btf_ensure_modifiable(btf)) + return libbpf_err(-ENOMEM); + + sz = sizeof(struct btf_enum64); + v = btf_add_type_mem(btf, sz); + if (!v) + return libbpf_err(-ENOMEM); + + name_off = btf__add_str(btf, name); + if (name_off < 0) + return name_off; + + v->name_off = name_off; + v->val_lo32 = (__u32)value; + v->val_hi32 = value >> 32; + + /* update parent type's vlen */ + t = btf_last_type(btf); + btf_type_inc_vlen(t); + btf->hdr->type_len += sz; btf->hdr->str_off += sz; return 0; @@ -2310,7 +2582,7 @@ int btf__add_restrict(struct btf *btf, int ref_type_id) */ int btf__add_type_tag(struct btf *btf, const char *value, int ref_type_id) { - if (!value|| !value[0]) + if (!value || !value[0]) return libbpf_err(-EINVAL); return btf_add_ref_kind(btf, BTF_KIND_TYPE_TAG, value, ref_type_id); @@ -2847,86 +3119,15 @@ struct btf_ext *btf_ext__new(const __u8 *data, __u32 size) return btf_ext; } -const void *btf_ext__get_raw_data(const struct btf_ext *btf_ext, __u32 *size) +const void *btf_ext__raw_data(const struct btf_ext *btf_ext, __u32 *size) { *size = btf_ext->data_size; return btf_ext->data; } -static int btf_ext_reloc_info(const struct btf *btf, - const struct btf_ext_info *ext_info, - const char *sec_name, __u32 insns_cnt, - void **info, __u32 *cnt) -{ - __u32 sec_hdrlen = sizeof(struct btf_ext_info_sec); - __u32 i, record_size, existing_len, records_len; - struct btf_ext_info_sec *sinfo; - const char *info_sec_name; - __u64 remain_len; - void *data; - - record_size = ext_info->rec_size; - sinfo = ext_info->info; - remain_len = ext_info->len; - while (remain_len > 0) { - records_len = sinfo->num_info * record_size; - info_sec_name = btf__name_by_offset(btf, sinfo->sec_name_off); - if (strcmp(info_sec_name, sec_name)) { - remain_len -= sec_hdrlen + records_len; - sinfo = (void *)sinfo + sec_hdrlen + records_len; - continue; - } - - existing_len = (*cnt) * record_size; - data = realloc(*info, existing_len + records_len); - if (!data) - return libbpf_err(-ENOMEM); - - memcpy(data + existing_len, sinfo->data, records_len); - /* adjust insn_off only, the rest data will be passed - * to the kernel. - */ - for (i = 0; i < sinfo->num_info; i++) { - __u32 *insn_off; - - insn_off = data + existing_len + (i * record_size); - *insn_off = *insn_off / sizeof(struct bpf_insn) + insns_cnt; - } - *info = data; - *cnt += sinfo->num_info; - return 0; - } - - return libbpf_err(-ENOENT); -} - -int btf_ext__reloc_func_info(const struct btf *btf, - const struct btf_ext *btf_ext, - const char *sec_name, __u32 insns_cnt, - void **func_info, __u32 *cnt) -{ - return btf_ext_reloc_info(btf, &btf_ext->func_info, sec_name, - insns_cnt, func_info, cnt); -} - -int btf_ext__reloc_line_info(const struct btf *btf, - const struct btf_ext *btf_ext, - const char *sec_name, __u32 insns_cnt, - void **line_info, __u32 *cnt) -{ - return btf_ext_reloc_info(btf, &btf_ext->line_info, sec_name, - insns_cnt, line_info, cnt); -} - -__u32 btf_ext__func_info_rec_size(const struct btf_ext *btf_ext) -{ - return btf_ext->func_info.rec_size; -} +__attribute__((alias("btf_ext__raw_data"))) +const void *btf_ext__get_raw_data(const struct btf_ext *btf_ext, __u32 *size); -__u32 btf_ext__line_info_rec_size(const struct btf_ext *btf_ext) -{ - return btf_ext->line_info.rec_size; -} struct btf_dedup; @@ -2937,6 +3138,7 @@ static int btf_dedup_strings(struct btf_dedup *d); static int btf_dedup_prim_types(struct btf_dedup *d); static int btf_dedup_struct_types(struct btf_dedup *d); static int btf_dedup_ref_types(struct btf_dedup *d); +static int btf_dedup_resolve_fwds(struct btf_dedup *d); static int btf_dedup_compact_types(struct btf_dedup *d); static int btf_dedup_remap_types(struct btf_dedup *d); @@ -3044,15 +3246,16 @@ static int btf_dedup_remap_types(struct btf_dedup *d); * Algorithm summary * ================= * - * Algorithm completes its work in 6 separate passes: + * Algorithm completes its work in 7 separate passes: * * 1. Strings deduplication. * 2. Primitive types deduplication (int, enum, fwd). * 3. Struct/union types deduplication. - * 4. Reference types deduplication (pointers, typedefs, arrays, funcs, func + * 4. Resolve unambiguous forward declarations. + * 5. Reference types deduplication (pointers, typedefs, arrays, funcs, func * protos, and const/volatile/restrict modifiers). - * 5. Types compaction. - * 6. Types remapping. + * 6. Types compaction. + * 7. Types remapping. * * Algorithm determines canonical type descriptor, which is a single * representative type for each truly unique type. This canonical type is the @@ -3077,9 +3280,7 @@ static int btf_dedup_remap_types(struct btf_dedup *d); * deduplicating structs/unions is described in greater details in comments for * `btf_dedup_is_equiv` function. */ - -DEFAULT_VERSION(btf__dedup_v0_6_0, btf__dedup, LIBBPF_0.6.0) -int btf__dedup_v0_6_0(struct btf *btf, const struct btf_dedup_opts *opts) +int btf__dedup(struct btf *btf, const struct btf_dedup_opts *opts) { struct btf_dedup *d; int err; @@ -3118,6 +3319,11 @@ int btf__dedup_v0_6_0(struct btf *btf, const struct btf_dedup_opts *opts) pr_debug("btf_dedup_struct_types failed:%d\n", err); goto done; } + err = btf_dedup_resolve_fwds(d); + if (err < 0) { + pr_debug("btf_dedup_resolve_fwds failed:%d\n", err); + goto done; + } err = btf_dedup_ref_types(d); if (err < 0) { pr_debug("btf_dedup_ref_types failed:%d\n", err); @@ -3139,19 +3345,6 @@ int btf__dedup_v0_6_0(struct btf *btf, const struct btf_dedup_opts *opts) return libbpf_err(err); } -COMPAT_VERSION(btf__dedup_deprecated, btf__dedup, LIBBPF_0.0.2) -int btf__dedup_deprecated(struct btf *btf, struct btf_ext *btf_ext, const void *unused_opts) -{ - LIBBPF_OPTS(btf_dedup_opts, opts, .btf_ext = btf_ext); - - if (unused_opts) { - pr_warn("please use new version of btf__dedup() that supports options\n"); - return libbpf_err(-ENOTSUP); - } - - return btf__dedup(btf, &opts); -} - #define BTF_UNPROCESSED_ID ((__u32)-1) #define BTF_IN_PROGRESS_ID ((__u32)-2) @@ -3197,12 +3390,11 @@ static long hash_combine(long h, long value) } #define for_each_dedup_cand(d, node, hash) \ - hashmap__for_each_key_entry(d->dedup_table, node, (void *)hash) + hashmap__for_each_key_entry(d->dedup_table, node, hash) static int btf_dedup_table_add(struct btf_dedup *d, long hash, __u32 type_id) { - return hashmap__append(d->dedup_table, - (void *)hash, (void *)(long)type_id); + return hashmap__append(d->dedup_table, hash, type_id); } static int btf_dedup_hypot_map_add(struct btf_dedup *d, @@ -3249,17 +3441,17 @@ static void btf_dedup_free(struct btf_dedup *d) free(d); } -static size_t btf_dedup_identity_hash_fn(const void *key, void *ctx) +static size_t btf_dedup_identity_hash_fn(long key, void *ctx) { - return (size_t)key; + return key; } -static size_t btf_dedup_collision_hash_fn(const void *key, void *ctx) +static size_t btf_dedup_collision_hash_fn(long key, void *ctx) { return 0; } -static bool btf_dedup_equal_fn(const void *k1, const void *k2, void *ctx) +static bool btf_dedup_equal_fn(long k1, long k2, void *ctx) { return k1 == k2; } @@ -3330,11 +3522,19 @@ static int btf_for_each_str_off(struct btf_dedup *d, str_off_visit_fn fn, void * int i, r; for (i = 0; i < d->btf->nr_types; i++) { + struct btf_field_iter it; struct btf_type *t = btf_type_by_id(d->btf, d->btf->start_id + i); + __u32 *str_off; - r = btf_type_visit_str_offs(t, fn, ctx); + r = btf_field_iter_init(&it, t, BTF_FIELD_ITER_STRS); if (r) return r; + + while ((str_off = btf_field_iter_next(&it))) { + r = fn(str_off, ctx); + if (r) + return r; + } } if (!d->btf_ext) @@ -3470,28 +3670,22 @@ static bool btf_equal_int_tag(struct btf_type *t1, struct btf_type *t2) return info1 == info2; } -/* Calculate type signature hash of ENUM. */ +/* Calculate type signature hash of ENUM/ENUM64. */ static long btf_hash_enum(struct btf_type *t) { long h; - /* don't hash vlen and enum members to support enum fwd resolving */ + /* don't hash vlen, enum members and size to support enum fwd resolving */ h = hash_combine(0, t->name_off); - h = hash_combine(h, t->info & ~0xffff); - h = hash_combine(h, t->size); return h; } -/* Check structural equality of two ENUMs. */ -static bool btf_equal_enum(struct btf_type *t1, struct btf_type *t2) +static bool btf_equal_enum_members(struct btf_type *t1, struct btf_type *t2) { const struct btf_enum *m1, *m2; __u16 vlen; int i; - if (!btf_equal_common(t1, t2)) - return false; - vlen = btf_vlen(t1); m1 = btf_enum(t1); m2 = btf_enum(t2); @@ -3504,19 +3698,55 @@ static bool btf_equal_enum(struct btf_type *t1, struct btf_type *t2) return true; } +static bool btf_equal_enum64_members(struct btf_type *t1, struct btf_type *t2) +{ + const struct btf_enum64 *m1, *m2; + __u16 vlen; + int i; + + vlen = btf_vlen(t1); + m1 = btf_enum64(t1); + m2 = btf_enum64(t2); + for (i = 0; i < vlen; i++) { + if (m1->name_off != m2->name_off || m1->val_lo32 != m2->val_lo32 || + m1->val_hi32 != m2->val_hi32) + return false; + m1++; + m2++; + } + return true; +} + +/* Check structural equality of two ENUMs or ENUM64s. */ +static bool btf_equal_enum(struct btf_type *t1, struct btf_type *t2) +{ + if (!btf_equal_common(t1, t2)) + return false; + + /* t1 & t2 kinds are identical because of btf_equal_common */ + if (btf_kind(t1) == BTF_KIND_ENUM) + return btf_equal_enum_members(t1, t2); + else + return btf_equal_enum64_members(t1, t2); +} + static inline bool btf_is_enum_fwd(struct btf_type *t) { - return btf_is_enum(t) && btf_vlen(t) == 0; + return btf_is_any_enum(t) && btf_vlen(t) == 0; } static bool btf_compat_enum(struct btf_type *t1, struct btf_type *t2) { if (!btf_is_enum_fwd(t1) && !btf_is_enum_fwd(t2)) return btf_equal_enum(t1, t2); - /* ignore vlen when comparing */ + /* At this point either t1 or t2 or both are forward declarations, thus: + * - skip comparing vlen because it is zero for forward declarations; + * - skip comparing size to allow enum forward declarations + * to be compatible with enum64 full declarations; + * - skip comparing kind for the same reason. + */ return t1->name_off == t2->name_off && - (t1->info & ~0xffff) == (t2->info & ~0xffff) && - t1->size == t2->size; + btf_is_any_enum(t1) && btf_is_any_enum(t2); } /* @@ -3731,6 +3961,7 @@ static int btf_dedup_prep(struct btf_dedup *d) h = btf_hash_int_decl_tag(t); break; case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: h = btf_hash_enum(t); break; case BTF_KIND_STRUCT: @@ -3790,7 +4021,7 @@ static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id) case BTF_KIND_INT: h = btf_hash_int_decl_tag(t); for_each_dedup_cand(d, hash_entry, h) { - cand_id = (__u32)(long)hash_entry->value; + cand_id = hash_entry->value; cand = btf_type_by_id(d->btf, cand_id); if (btf_equal_int_tag(t, cand)) { new_id = cand_id; @@ -3800,9 +4031,10 @@ static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id) break; case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: h = btf_hash_enum(t); for_each_dedup_cand(d, hash_entry, h) { - cand_id = (__u32)(long)hash_entry->value; + cand_id = hash_entry->value; cand = btf_type_by_id(d->btf, cand_id); if (btf_equal_enum(t, cand)) { new_id = cand_id; @@ -3824,7 +4056,7 @@ static int btf_dedup_prim_type(struct btf_dedup *d, __u32 type_id) case BTF_KIND_FLOAT: h = btf_hash_common(t); for_each_dedup_cand(d, hash_entry, h) { - cand_id = (__u32)(long)hash_entry->value; + cand_id = hash_entry->value; cand = btf_type_by_id(d->btf, cand_id); if (btf_equal_common(t, cand)) { new_id = cand_id; @@ -3903,14 +4135,14 @@ static inline __u16 btf_fwd_kind(struct btf_type *t) } /* Check if given two types are identical ARRAY definitions */ -static int btf_dedup_identical_arrays(struct btf_dedup *d, __u32 id1, __u32 id2) +static bool btf_dedup_identical_arrays(struct btf_dedup *d, __u32 id1, __u32 id2) { struct btf_type *t1, *t2; t1 = btf_type_by_id(d->btf, id1); t2 = btf_type_by_id(d->btf, id2); if (!btf_is_array(t1) || !btf_is_array(t2)) - return 0; + return false; return btf_equal_array(t1, t2); } @@ -3934,7 +4166,9 @@ static bool btf_dedup_identical_structs(struct btf_dedup *d, __u32 id1, __u32 id m1 = btf_members(t1); m2 = btf_members(t2); for (i = 0, n = btf_vlen(t1); i < n; i++, m1++, m2++) { - if (m1->type != m2->type) + if (m1->type != m2->type && + !btf_dedup_identical_arrays(d, m1->type, m2->type) && + !btf_dedup_identical_structs(d, m1->type, m2->type)) return false; } return true; @@ -4113,6 +4347,7 @@ static int btf_dedup_is_equiv(struct btf_dedup *d, __u32 cand_id, return btf_equal_int_tag(cand_type, canon_type); case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: return btf_compat_enum(cand_type, canon_type); case BTF_KIND_FWD: @@ -4324,7 +4559,7 @@ static int btf_dedup_struct_type(struct btf_dedup *d, __u32 type_id) h = btf_hash_struct(t); for_each_dedup_cand(d, hash_entry, h) { - __u32 cand_id = (__u32)(long)hash_entry->value; + __u32 cand_id = hash_entry->value; int eq; /* @@ -4429,7 +4664,7 @@ static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id) h = btf_hash_common(t); for_each_dedup_cand(d, hash_entry, h) { - cand_id = (__u32)(long)hash_entry->value; + cand_id = hash_entry->value; cand = btf_type_by_id(d->btf, cand_id); if (btf_equal_common(t, cand)) { new_id = cand_id; @@ -4446,7 +4681,7 @@ static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id) h = btf_hash_int_decl_tag(t); for_each_dedup_cand(d, hash_entry, h) { - cand_id = (__u32)(long)hash_entry->value; + cand_id = hash_entry->value; cand = btf_type_by_id(d->btf, cand_id); if (btf_equal_int_tag(t, cand)) { new_id = cand_id; @@ -4470,7 +4705,7 @@ static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id) h = btf_hash_array(t); for_each_dedup_cand(d, hash_entry, h) { - cand_id = (__u32)(long)hash_entry->value; + cand_id = hash_entry->value; cand = btf_type_by_id(d->btf, cand_id); if (btf_equal_array(t, cand)) { new_id = cand_id; @@ -4502,7 +4737,7 @@ static int btf_dedup_ref_type(struct btf_dedup *d, __u32 type_id) h = btf_hash_fnproto(t); for_each_dedup_cand(d, hash_entry, h) { - cand_id = (__u32)(long)hash_entry->value; + cand_id = hash_entry->value; cand = btf_type_by_id(d->btf, cand_id); if (btf_equal_fnproto(t, cand)) { new_id = cand_id; @@ -4538,6 +4773,134 @@ static int btf_dedup_ref_types(struct btf_dedup *d) return 0; } +/* + * Collect a map from type names to type ids for all canonical structs + * and unions. If the same name is shared by several canonical types + * use a special value 0 to indicate this fact. + */ +static int btf_dedup_fill_unique_names_map(struct btf_dedup *d, struct hashmap *names_map) +{ + __u32 nr_types = btf__type_cnt(d->btf); + struct btf_type *t; + __u32 type_id; + __u16 kind; + int err; + + /* + * Iterate over base and split module ids in order to get all + * available structs in the map. + */ + for (type_id = 1; type_id < nr_types; ++type_id) { + t = btf_type_by_id(d->btf, type_id); + kind = btf_kind(t); + + if (kind != BTF_KIND_STRUCT && kind != BTF_KIND_UNION) + continue; + + /* Skip non-canonical types */ + if (type_id != d->map[type_id]) + continue; + + err = hashmap__add(names_map, t->name_off, type_id); + if (err == -EEXIST) + err = hashmap__set(names_map, t->name_off, 0, NULL, NULL); + + if (err) + return err; + } + + return 0; +} + +static int btf_dedup_resolve_fwd(struct btf_dedup *d, struct hashmap *names_map, __u32 type_id) +{ + struct btf_type *t = btf_type_by_id(d->btf, type_id); + enum btf_fwd_kind fwd_kind = btf_kflag(t); + __u16 cand_kind, kind = btf_kind(t); + struct btf_type *cand_t; + uintptr_t cand_id; + + if (kind != BTF_KIND_FWD) + return 0; + + /* Skip if this FWD already has a mapping */ + if (type_id != d->map[type_id]) + return 0; + + if (!hashmap__find(names_map, t->name_off, &cand_id)) + return 0; + + /* Zero is a special value indicating that name is not unique */ + if (!cand_id) + return 0; + + cand_t = btf_type_by_id(d->btf, cand_id); + cand_kind = btf_kind(cand_t); + if ((cand_kind == BTF_KIND_STRUCT && fwd_kind != BTF_FWD_STRUCT) || + (cand_kind == BTF_KIND_UNION && fwd_kind != BTF_FWD_UNION)) + return 0; + + d->map[type_id] = cand_id; + + return 0; +} + +/* + * Resolve unambiguous forward declarations. + * + * The lion's share of all FWD declarations is resolved during + * `btf_dedup_struct_types` phase when different type graphs are + * compared against each other. However, if in some compilation unit a + * FWD declaration is not a part of a type graph compared against + * another type graph that declaration's canonical type would not be + * changed. Example: + * + * CU #1: + * + * struct foo; + * struct foo *some_global; + * + * CU #2: + * + * struct foo { int u; }; + * struct foo *another_global; + * + * After `btf_dedup_struct_types` the BTF looks as follows: + * + * [1] STRUCT 'foo' size=4 vlen=1 ... + * [2] INT 'int' size=4 ... + * [3] PTR '(anon)' type_id=1 + * [4] FWD 'foo' fwd_kind=struct + * [5] PTR '(anon)' type_id=4 + * + * This pass assumes that such FWD declarations should be mapped to + * structs or unions with identical name in case if the name is not + * ambiguous. + */ +static int btf_dedup_resolve_fwds(struct btf_dedup *d) +{ + int i, err; + struct hashmap *names_map; + + names_map = hashmap__new(btf_dedup_identity_hash_fn, btf_dedup_equal_fn, NULL); + if (IS_ERR(names_map)) + return PTR_ERR(names_map); + + err = btf_dedup_fill_unique_names_map(d, names_map); + if (err < 0) + goto exit; + + for (i = 0; i < d->btf->nr_types; i++) { + err = btf_dedup_resolve_fwd(d, names_map, d->btf->start_id + i); + if (err < 0) + break; + } + +exit: + hashmap__free(names_map); + return err; +} + /* * Compact types. * @@ -4633,10 +4996,23 @@ static int btf_dedup_remap_types(struct btf_dedup *d) for (i = 0; i < d->btf->nr_types; i++) { struct btf_type *t = btf_type_by_id(d->btf, d->btf->start_id + i); + struct btf_field_iter it; + __u32 *type_id; - r = btf_type_visit_type_ids(t, btf_dedup_remap_type_id, d); + r = btf_field_iter_init(&it, t, BTF_FIELD_ITER_IDS); if (r) return r; + + while ((type_id = btf_field_iter_next(&it))) { + __u32 resolved_id, new_id; + + resolved_id = resolve_type_id(d, *type_id); + new_id = d->hypot_map[resolved_id]; + if (new_id > BTF_MAX_NR_TYPES) + return -EINVAL; + + *type_id = new_id; + } } if (!d->btf_ext) @@ -4655,38 +5031,46 @@ static int btf_dedup_remap_types(struct btf_dedup *d) */ struct btf *btf__load_vmlinux_btf(void) { - struct { - const char *path_fmt; - bool raw_btf; - } locations[] = { - /* try canonical vmlinux BTF through sysfs first */ - { "/sys/kernel/btf/vmlinux", true /* raw BTF */ }, - /* fall back to trying to find vmlinux ELF on disk otherwise */ - { "/boot/vmlinux-%1$s" }, - { "/lib/modules/%1$s/vmlinux-%1$s" }, - { "/lib/modules/%1$s/build/vmlinux" }, - { "/usr/lib/modules/%1$s/kernel/vmlinux" }, - { "/usr/lib/debug/boot/vmlinux-%1$s" }, - { "/usr/lib/debug/boot/vmlinux-%1$s.debug" }, - { "/usr/lib/debug/lib/modules/%1$s/vmlinux" }, + const char *sysfs_btf_path = "/sys/kernel/btf/vmlinux"; + /* fall back locations, trying to find vmlinux on disk */ + const char *locations[] = { + "/boot/vmlinux-%1$s", + "/lib/modules/%1$s/vmlinux-%1$s", + "/lib/modules/%1$s/build/vmlinux", + "/usr/lib/modules/%1$s/kernel/vmlinux", + "/usr/lib/debug/boot/vmlinux-%1$s", + "/usr/lib/debug/boot/vmlinux-%1$s.debug", + "/usr/lib/debug/lib/modules/%1$s/vmlinux", }; char path[PATH_MAX + 1]; struct utsname buf; struct btf *btf; int i, err; - uname(&buf); + /* is canonical sysfs location accessible? */ + if (faccessat(AT_FDCWD, sysfs_btf_path, F_OK, AT_EACCESS) < 0) { + pr_warn("kernel BTF is missing at '%s', was CONFIG_DEBUG_INFO_BTF enabled?\n", + sysfs_btf_path); + } else { + btf = btf__parse(sysfs_btf_path, NULL); + if (!btf) { + err = -errno; + pr_warn("failed to read kernel BTF from '%s': %d\n", sysfs_btf_path, err); + return libbpf_err_ptr(err); + } + pr_debug("loaded kernel BTF from '%s'\n", sysfs_btf_path); + return btf; + } + /* try fallback locations */ + uname(&buf); for (i = 0; i < ARRAY_SIZE(locations); i++) { - snprintf(path, PATH_MAX, locations[i].path_fmt, buf.release); + snprintf(path, PATH_MAX, locations[i], buf.release); - if (access(path, R_OK)) + if (faccessat(AT_FDCWD, path, R_OK, AT_EACCESS)) continue; - if (locations[i].raw_btf) - btf = btf__parse_raw(path); - else - btf = btf__parse_elf(path, NULL); + btf = btf__parse(path, NULL); err = libbpf_get_error(btf); pr_debug("loading kernel BTF '%s': %d\n", path, err); if (err) @@ -4709,125 +5093,6 @@ struct btf *btf__load_module_btf(const char *module_name, struct btf *vmlinux_bt return btf__parse_split(path, vmlinux_btf); } -int btf_type_visit_type_ids(struct btf_type *t, type_id_visit_fn visit, void *ctx) -{ - int i, n, err; - - switch (btf_kind(t)) { - case BTF_KIND_INT: - case BTF_KIND_FLOAT: - case BTF_KIND_ENUM: - return 0; - - case BTF_KIND_FWD: - case BTF_KIND_CONST: - case BTF_KIND_VOLATILE: - case BTF_KIND_RESTRICT: - case BTF_KIND_PTR: - case BTF_KIND_TYPEDEF: - case BTF_KIND_FUNC: - case BTF_KIND_VAR: - case BTF_KIND_DECL_TAG: - case BTF_KIND_TYPE_TAG: - return visit(&t->type, ctx); - - case BTF_KIND_ARRAY: { - struct btf_array *a = btf_array(t); - - err = visit(&a->type, ctx); - err = err ?: visit(&a->index_type, ctx); - return err; - } - - case BTF_KIND_STRUCT: - case BTF_KIND_UNION: { - struct btf_member *m = btf_members(t); - - for (i = 0, n = btf_vlen(t); i < n; i++, m++) { - err = visit(&m->type, ctx); - if (err) - return err; - } - return 0; - } - - case BTF_KIND_FUNC_PROTO: { - struct btf_param *m = btf_params(t); - - err = visit(&t->type, ctx); - if (err) - return err; - for (i = 0, n = btf_vlen(t); i < n; i++, m++) { - err = visit(&m->type, ctx); - if (err) - return err; - } - return 0; - } - - case BTF_KIND_DATASEC: { - struct btf_var_secinfo *m = btf_var_secinfos(t); - - for (i = 0, n = btf_vlen(t); i < n; i++, m++) { - err = visit(&m->type, ctx); - if (err) - return err; - } - return 0; - } - - default: - return -EINVAL; - } -} - -int btf_type_visit_str_offs(struct btf_type *t, str_off_visit_fn visit, void *ctx) -{ - int i, n, err; - - err = visit(&t->name_off, ctx); - if (err) - return err; - - switch (btf_kind(t)) { - case BTF_KIND_STRUCT: - case BTF_KIND_UNION: { - struct btf_member *m = btf_members(t); - - for (i = 0, n = btf_vlen(t); i < n; i++, m++) { - err = visit(&m->name_off, ctx); - if (err) - return err; - } - break; - } - case BTF_KIND_ENUM: { - struct btf_enum *m = btf_enum(t); - - for (i = 0, n = btf_vlen(t); i < n; i++, m++) { - err = visit(&m->name_off, ctx); - if (err) - return err; - } - break; - } - case BTF_KIND_FUNC_PROTO: { - struct btf_param *m = btf_params(t); - - for (i = 0, n = btf_vlen(t); i < n; i++, m++) { - err = visit(&m->name_off, ctx); - if (err) - return err; - } - break; - } - default: - break; - } - - return 0; -} - int btf_ext_visit_type_ids(struct btf_ext *btf_ext, type_id_visit_fn visit, void *ctx) { const struct btf_ext_info *seg; @@ -4907,3 +5172,325 @@ int btf_ext_visit_str_offs(struct btf_ext *btf_ext, str_off_visit_fn visit, void return 0; } + +struct btf_distill { + struct btf_pipe pipe; + int *id_map; + unsigned int split_start_id; + unsigned int split_start_str; + int diff_id; +}; + +static int btf_add_distilled_type_ids(struct btf_distill *dist, __u32 i) +{ + struct btf_type *split_t = btf_type_by_id(dist->pipe.src, i); + struct btf_field_iter it; + __u32 *id; + int err; + + err = btf_field_iter_init(&it, split_t, BTF_FIELD_ITER_IDS); + if (err) + return err; + while ((id = btf_field_iter_next(&it))) { + struct btf_type *base_t; + + if (!*id) + continue; + /* split BTF id, not needed */ + if (*id >= dist->split_start_id) + continue; + /* already added ? */ + if (dist->id_map[*id] > 0) + continue; + + /* only a subset of base BTF types should be referenced from + * split BTF; ensure nothing unexpected is referenced. + */ + base_t = btf_type_by_id(dist->pipe.src, *id); + switch (btf_kind(base_t)) { + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_FWD: + case BTF_KIND_ARRAY: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_TYPEDEF: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + case BTF_KIND_PTR: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_VOLATILE: + case BTF_KIND_FUNC_PROTO: + case BTF_KIND_TYPE_TAG: + dist->id_map[*id] = *id; + break; + default: + pr_warn("unexpected reference to base type[%u] of kind [%u] when creating distilled base BTF.\n", + *id, btf_kind(base_t)); + return -EINVAL; + } + /* If a base type is used, ensure types it refers to are + * marked as used also; so for example if we find a PTR to INT + * we need both the PTR and INT. + * + * The only exception is named struct/unions, since distilled + * base BTF composite types have no members. + */ + if (btf_is_composite(base_t) && base_t->name_off) + continue; + err = btf_add_distilled_type_ids(dist, *id); + if (err) + return err; + } + return 0; +} + +static int btf_add_distilled_types(struct btf_distill *dist) +{ + bool adding_to_base = dist->pipe.dst->start_id == 1; + int id = btf__type_cnt(dist->pipe.dst); + struct btf_type *t; + int i, err = 0; + + + /* Add types for each of the required references to either distilled + * base or split BTF, depending on type characteristics. + */ + for (i = 1; i < dist->split_start_id; i++) { + const char *name; + int kind; + + if (!dist->id_map[i]) + continue; + t = btf_type_by_id(dist->pipe.src, i); + kind = btf_kind(t); + name = btf__name_by_offset(dist->pipe.src, t->name_off); + + switch (kind) { + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_FWD: + /* Named int, float, fwd are added to base. */ + if (!adding_to_base) + continue; + err = btf_add_type(&dist->pipe, t); + break; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + /* Named struct/union are added to base as 0-vlen + * struct/union of same size. Anonymous struct/unions + * are added to split BTF as-is. + */ + if (adding_to_base) { + if (!t->name_off) + continue; + err = btf_add_composite(dist->pipe.dst, kind, name, t->size); + } else { + if (t->name_off) + continue; + err = btf_add_type(&dist->pipe, t); + } + break; + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + /* Named enum[64]s are added to base as a sized + * enum; relocation will match with appropriately-named + * and sized enum or enum64. + * + * Anonymous enums are added to split BTF as-is. + */ + if (adding_to_base) { + if (!t->name_off) + continue; + err = btf__add_enum(dist->pipe.dst, name, t->size); + } else { + if (t->name_off) + continue; + err = btf_add_type(&dist->pipe, t); + } + break; + case BTF_KIND_ARRAY: + case BTF_KIND_TYPEDEF: + case BTF_KIND_PTR: + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_VOLATILE: + case BTF_KIND_FUNC_PROTO: + case BTF_KIND_TYPE_TAG: + /* All other types are added to split BTF. */ + if (adding_to_base) + continue; + err = btf_add_type(&dist->pipe, t); + break; + default: + pr_warn("unexpected kind when adding base type '%s'[%u] of kind [%u] to distilled base BTF.\n", + name, i, kind); + return -EINVAL; + + } + if (err < 0) + break; + dist->id_map[i] = id++; + } + return err; +} + +/* Split BTF ids without a mapping will be shifted downwards since distilled + * base BTF is smaller than the original base BTF. For those that have a + * mapping (either to base or updated split BTF), update the id based on + * that mapping. + */ +static int btf_update_distilled_type_ids(struct btf_distill *dist, __u32 i) +{ + struct btf_type *t = btf_type_by_id(dist->pipe.dst, i); + struct btf_field_iter it; + __u32 *id; + int err; + + err = btf_field_iter_init(&it, t, BTF_FIELD_ITER_IDS); + if (err) + return err; + while ((id = btf_field_iter_next(&it))) { + if (dist->id_map[*id]) + *id = dist->id_map[*id]; + else if (*id >= dist->split_start_id) + *id -= dist->diff_id; + } + return 0; +} + +/* Create updated split BTF with distilled base BTF; distilled base BTF + * consists of BTF information required to clarify the types that split + * BTF refers to, omitting unneeded details. Specifically it will contain + * base types and memberless definitions of named structs, unions and enumerated + * types. Associated reference types like pointers, arrays and anonymous + * structs, unions and enumerated types will be added to split BTF. + * Size is recorded for named struct/unions to help guide matching to the + * target base BTF during later relocation. + * + * The only case where structs, unions or enumerated types are fully represented + * is when they are anonymous; in such cases, the anonymous type is added to + * split BTF in full. + * + * We return newly-created split BTF where the split BTF refers to a newly-created + * distilled base BTF. Both must be freed separately by the caller. + */ +int btf__distill_base(const struct btf *src_btf, struct btf **new_base_btf, + struct btf **new_split_btf) +{ + struct btf *new_base = NULL, *new_split = NULL; + const struct btf *old_base; + unsigned int n = btf__type_cnt(src_btf); + struct btf_distill dist = {}; + struct btf_type *t; + int i, err = 0; + + /* src BTF must be split BTF. */ + old_base = btf__base_btf(src_btf); + if (!new_base_btf || !new_split_btf || !old_base) + return libbpf_err(-EINVAL); + + new_base = btf__new_empty(); + if (!new_base) + return libbpf_err(-ENOMEM); + dist.id_map = calloc(n, sizeof(*dist.id_map)); + if (!dist.id_map) { + err = -ENOMEM; + goto done; + } + dist.pipe.src = src_btf; + dist.pipe.dst = new_base; + dist.pipe.str_off_map = hashmap__new(btf_dedup_identity_hash_fn, btf_dedup_equal_fn, NULL); + if (IS_ERR(dist.pipe.str_off_map)) { + err = -ENOMEM; + goto done; + } + dist.split_start_id = btf__type_cnt(old_base); + dist.split_start_str = old_base->hdr->str_len; + + /* Pass over src split BTF; generate the list of base BTF type ids it + * references; these will constitute our distilled BTF set to be + * distributed over base and split BTF as appropriate. + */ + for (i = src_btf->start_id; i < n; i++) { + err = btf_add_distilled_type_ids(&dist, i); + if (err < 0) + goto done; + } + /* Next add types for each of the required references to base BTF and split BTF + * in turn. + */ + err = btf_add_distilled_types(&dist); + if (err < 0) + goto done; + + /* Create new split BTF with distilled base BTF as its base; the final + * state is split BTF with distilled base BTF that represents enough + * about its base references to allow it to be relocated with the base + * BTF available. + */ + new_split = btf__new_empty_split(new_base); + if (!new_split) { + err = -errno; + goto done; + } + dist.pipe.dst = new_split; + /* First add all split types */ + for (i = src_btf->start_id; i < n; i++) { + t = btf_type_by_id(src_btf, i); + err = btf_add_type(&dist.pipe, t); + if (err < 0) + goto done; + } + /* Now add distilled types to split BTF that are not added to base. */ + err = btf_add_distilled_types(&dist); + if (err < 0) + goto done; + + /* All split BTF ids will be shifted downwards since there are less base + * BTF ids in distilled base BTF. + */ + dist.diff_id = dist.split_start_id - btf__type_cnt(new_base); + + n = btf__type_cnt(new_split); + /* Now update base/split BTF ids. */ + for (i = 1; i < n; i++) { + err = btf_update_distilled_type_ids(&dist, i); + if (err < 0) + break; + } +done: + free(dist.id_map); + hashmap__free(dist.pipe.str_off_map); + if (err) { + btf__free(new_split); + btf__free(new_base); + return libbpf_err(err); + } + *new_base_btf = new_base; + *new_split_btf = new_split; + + return 0; +} + +const struct btf_header *btf_header(const struct btf *btf) +{ + return btf->hdr; +} + +void btf_set_base_btf(struct btf *btf, const struct btf *base_btf) +{ + btf->base_btf = (struct btf *)base_btf; + btf->start_id = btf__type_cnt(base_btf); + btf->start_str_off = base_btf->hdr->str_len; +} + +int btf__relocate(struct btf *btf, const struct btf *base_btf) +{ + int err = btf_relocate(btf, base_btf, NULL); + + if (!err) + btf->owns_base = false; + return libbpf_err(err); +} diff --git a/pkg/plugin/lib/common/libbpf/_src/btf.h b/pkg/plugin/lib/common/libbpf/_src/btf.h index 951ac74757..b68d216837 100644 --- a/pkg/plugin/lib/common/libbpf/_src/btf.h +++ b/pkg/plugin/lib/common/libbpf/_src/btf.h @@ -18,6 +18,7 @@ extern "C" { #define BTF_ELF_SEC ".BTF" #define BTF_EXT_ELF_SEC ".BTF.ext" +#define BTF_BASE_ELF_SEC ".BTF.base" #define MAPS_ELF_SEC ".maps" struct btf; @@ -107,6 +108,27 @@ LIBBPF_API struct btf *btf__new_empty(void); */ LIBBPF_API struct btf *btf__new_empty_split(struct btf *base_btf); +/** + * @brief **btf__distill_base()** creates new versions of the split BTF + * *src_btf* and its base BTF. The new base BTF will only contain the types + * needed to improve robustness of the split BTF to small changes in base BTF. + * When that split BTF is loaded against a (possibly changed) base, this + * distilled base BTF will help update references to that (possibly changed) + * base BTF. + * + * Both the new split and its associated new base BTF must be freed by + * the caller. + * + * If successful, 0 is returned and **new_base_btf** and **new_split_btf** + * will point at new base/split BTF. Both the new split and its associated + * new base BTF must be freed by the caller. + * + * A negative value is returned on error and the thread-local `errno` variable + * is set to the error code as well. + */ +LIBBPF_API int btf__distill_base(const struct btf *src_btf, struct btf **new_base_btf, + struct btf **new_split_btf); + LIBBPF_API struct btf *btf__parse(const char *path, struct btf_ext **btf_ext); LIBBPF_API struct btf *btf__parse_split(const char *path, struct btf *base_btf); LIBBPF_API struct btf *btf__parse_elf(const char *path, struct btf_ext **btf_ext); @@ -116,24 +138,15 @@ LIBBPF_API struct btf *btf__parse_raw_split(const char *path, struct btf *base_b LIBBPF_API struct btf *btf__load_vmlinux_btf(void); LIBBPF_API struct btf *btf__load_module_btf(const char *module_name, struct btf *vmlinux_btf); -LIBBPF_API struct btf *libbpf_find_kernel_btf(void); LIBBPF_API struct btf *btf__load_from_kernel_by_id(__u32 id); LIBBPF_API struct btf *btf__load_from_kernel_by_id_split(__u32 id, struct btf *base_btf); -LIBBPF_DEPRECATED_SINCE(0, 6, "use btf__load_from_kernel_by_id instead") -LIBBPF_API int btf__get_from_id(__u32 id, struct btf **btf); -LIBBPF_DEPRECATED_SINCE(0, 6, "intended for internal libbpf use only") -LIBBPF_API int btf__finalize_data(struct bpf_object *obj, struct btf *btf); -LIBBPF_DEPRECATED_SINCE(0, 6, "use btf__load_into_kernel instead") -LIBBPF_API int btf__load(struct btf *btf); LIBBPF_API int btf__load_into_kernel(struct btf *btf); LIBBPF_API __s32 btf__find_by_name(const struct btf *btf, const char *type_name); LIBBPF_API __s32 btf__find_by_name_kind(const struct btf *btf, const char *type_name, __u32 kind); -LIBBPF_DEPRECATED_SINCE(0, 7, "use btf__type_cnt() instead; note that btf__get_nr_types() == btf__type_cnt() - 1") -LIBBPF_API __u32 btf__get_nr_types(const struct btf *btf); LIBBPF_API __u32 btf__type_cnt(const struct btf *btf); LIBBPF_API const struct btf *btf__base_btf(const struct btf *btf); LIBBPF_API const struct btf_type *btf__type_by_id(const struct btf *btf, @@ -150,29 +163,10 @@ LIBBPF_API void btf__set_fd(struct btf *btf, int fd); LIBBPF_API const void *btf__raw_data(const struct btf *btf, __u32 *size); LIBBPF_API const char *btf__name_by_offset(const struct btf *btf, __u32 offset); LIBBPF_API const char *btf__str_by_offset(const struct btf *btf, __u32 offset); -LIBBPF_DEPRECATED_SINCE(0, 7, "this API is not necessary when BTF-defined maps are used") -LIBBPF_API int btf__get_map_kv_tids(const struct btf *btf, const char *map_name, - __u32 expected_key_size, - __u32 expected_value_size, - __u32 *key_type_id, __u32 *value_type_id); LIBBPF_API struct btf_ext *btf_ext__new(const __u8 *data, __u32 size); LIBBPF_API void btf_ext__free(struct btf_ext *btf_ext); LIBBPF_API const void *btf_ext__raw_data(const struct btf_ext *btf_ext, __u32 *size); -LIBBPF_API LIBBPF_DEPRECATED("btf_ext__reloc_func_info was never meant as a public API and has wrong assumptions embedded in it; it will be removed in the future libbpf versions") -int btf_ext__reloc_func_info(const struct btf *btf, - const struct btf_ext *btf_ext, - const char *sec_name, __u32 insns_cnt, - void **func_info, __u32 *cnt); -LIBBPF_API LIBBPF_DEPRECATED("btf_ext__reloc_line_info was never meant as a public API and has wrong assumptions embedded in it; it will be removed in the future libbpf versions") -int btf_ext__reloc_line_info(const struct btf *btf, - const struct btf_ext *btf_ext, - const char *sec_name, __u32 insns_cnt, - void **line_info, __u32 *cnt); -LIBBPF_API LIBBPF_DEPRECATED("btf_ext__reloc_func_info is deprecated; write custom func_info parsing to fetch rec_size") -__u32 btf_ext__func_info_rec_size(const struct btf_ext *btf_ext); -LIBBPF_API LIBBPF_DEPRECATED("btf_ext__reloc_line_info is deprecated; write custom line_info parsing to fetch rec_size") -__u32 btf_ext__line_info_rec_size(const struct btf_ext *btf_ext); LIBBPF_API int btf__find_str(struct btf *btf, const char *s); LIBBPF_API int btf__add_str(struct btf *btf, const char *s); @@ -215,6 +209,8 @@ LIBBPF_API int btf__add_field(struct btf *btf, const char *name, int field_type_ /* enum construction APIs */ LIBBPF_API int btf__add_enum(struct btf *btf, const char *name, __u32 bytes_sz); LIBBPF_API int btf__add_enum_value(struct btf *btf, const char *name, __s64 value); +LIBBPF_API int btf__add_enum64(struct btf *btf, const char *name, __u32 bytes_sz, bool is_signed); +LIBBPF_API int btf__add_enum64_value(struct btf *btf, const char *name, __u64 value); enum btf_fwd_kind { BTF_FWD_STRUCT = 0, @@ -257,22 +253,26 @@ struct btf_dedup_opts { LIBBPF_API int btf__dedup(struct btf *btf, const struct btf_dedup_opts *opts); -LIBBPF_API int btf__dedup_v0_6_0(struct btf *btf, const struct btf_dedup_opts *opts); - -LIBBPF_DEPRECATED_SINCE(0, 7, "use btf__dedup() instead") -LIBBPF_API int btf__dedup_deprecated(struct btf *btf, struct btf_ext *btf_ext, const void *opts); -#define btf__dedup(...) ___libbpf_overload(___btf_dedup, __VA_ARGS__) -#define ___btf_dedup3(btf, btf_ext, opts) btf__dedup_deprecated(btf, btf_ext, opts) -#define ___btf_dedup2(btf, opts) btf__dedup(btf, opts) +/** + * @brief **btf__relocate()** will check the split BTF *btf* for references + * to base BTF kinds, and verify those references are compatible with + * *base_btf*; if they are, *btf* is adjusted such that is re-parented to + * *base_btf* and type ids and strings are adjusted to accommodate this. + * + * If successful, 0 is returned and **btf** now has **base_btf** as its + * base. + * + * A negative value is returned on error and the thread-local `errno` variable + * is set to the error code as well. + */ +LIBBPF_API int btf__relocate(struct btf *btf, const struct btf *base_btf); struct btf_dump; struct btf_dump_opts { - union { - size_t sz; - void *ctx; /* DEPRECATED: will be gone in v1.0 */ - }; + size_t sz; }; +#define btf_dump_opts__last_field sz typedef void (*btf_dump_printf_fn_t)(void *ctx, const char *fmt, va_list args); @@ -281,51 +281,6 @@ LIBBPF_API struct btf_dump *btf_dump__new(const struct btf *btf, void *ctx, const struct btf_dump_opts *opts); -LIBBPF_API struct btf_dump *btf_dump__new_v0_6_0(const struct btf *btf, - btf_dump_printf_fn_t printf_fn, - void *ctx, - const struct btf_dump_opts *opts); - -LIBBPF_API struct btf_dump *btf_dump__new_deprecated(const struct btf *btf, - const struct btf_ext *btf_ext, - const struct btf_dump_opts *opts, - btf_dump_printf_fn_t printf_fn); - -/* Choose either btf_dump__new() or btf_dump__new_deprecated() based on the - * type of 4th argument. If it's btf_dump's print callback, use deprecated - * API; otherwise, choose the new btf_dump__new(). ___libbpf_override() - * doesn't work here because both variants have 4 input arguments. - * - * (void *) casts are necessary to avoid compilation warnings about type - * mismatches, because even though __builtin_choose_expr() only ever evaluates - * one side the other side still has to satisfy type constraints (this is - * compiler implementation limitation which might be lifted eventually, - * according to the documentation). So passing struct btf_ext in place of - * btf_dump_printf_fn_t would be generating compilation warning. Casting to - * void * avoids this issue. - * - * Also, two type compatibility checks for a function and function pointer are - * required because passing function reference into btf_dump__new() as - * btf_dump__new(..., my_callback, ...) and as btf_dump__new(..., - * &my_callback, ...) (not explicit ampersand in the latter case) actually - * differs as far as __builtin_types_compatible_p() is concerned. Thus two - * checks are combined to detect callback argument. - * - * The rest works just like in case of ___libbpf_override() usage with symbol - * versioning. - * - * C++ compilers don't support __builtin_types_compatible_p(), so at least - * don't screw up compilation for them and let C++ users pick btf_dump__new - * vs btf_dump__new_deprecated explicitly. - */ -#ifndef __cplusplus -#define btf_dump__new(a1, a2, a3, a4) __builtin_choose_expr( \ - __builtin_types_compatible_p(typeof(a4), btf_dump_printf_fn_t) || \ - __builtin_types_compatible_p(typeof(a4), void(void *, const char *, va_list)), \ - btf_dump__new_deprecated((void *)a1, (void *)a2, (void *)a3, (void *)a4), \ - btf_dump__new((void *)a1, (void *)a2, (void *)a3, (void *)a4)) -#endif - LIBBPF_API void btf_dump__free(struct btf_dump *d); LIBBPF_API int btf_dump__dump_type(struct btf_dump *d, __u32 id); @@ -393,9 +348,10 @@ btf_dump__dump_type_data(struct btf_dump *d, __u32 id, #ifndef BTF_KIND_FLOAT #define BTF_KIND_FLOAT 16 /* Floating point */ #endif -/* The kernel header switched to enums, so these two were never #defined */ +/* The kernel header switched to enums, so the following were never #defined */ #define BTF_KIND_DECL_TAG 17 /* Decl Tag */ #define BTF_KIND_TYPE_TAG 18 /* Type Tag */ +#define BTF_KIND_ENUM64 19 /* Enum for up-to 64bit values */ static inline __u16 btf_kind(const struct btf_type *t) { @@ -454,6 +410,11 @@ static inline bool btf_is_enum(const struct btf_type *t) return btf_kind(t) == BTF_KIND_ENUM; } +static inline bool btf_is_enum64(const struct btf_type *t) +{ + return btf_kind(t) == BTF_KIND_ENUM64; +} + static inline bool btf_is_fwd(const struct btf_type *t) { return btf_kind(t) == BTF_KIND_FWD; @@ -524,6 +485,18 @@ static inline bool btf_is_type_tag(const struct btf_type *t) return btf_kind(t) == BTF_KIND_TYPE_TAG; } +static inline bool btf_is_any_enum(const struct btf_type *t) +{ + return btf_is_enum(t) || btf_is_enum64(t); +} + +static inline bool btf_kind_core_compat(const struct btf_type *t1, + const struct btf_type *t2) +{ + return btf_kind(t1) == btf_kind(t2) || + (btf_is_any_enum(t1) && btf_is_any_enum(t2)); +} + static inline __u8 btf_int_encoding(const struct btf_type *t) { return BTF_INT_ENCODING(*(__u32 *)(t + 1)); @@ -549,6 +522,39 @@ static inline struct btf_enum *btf_enum(const struct btf_type *t) return (struct btf_enum *)(t + 1); } +struct btf_enum64; + +static inline struct btf_enum64 *btf_enum64(const struct btf_type *t) +{ + return (struct btf_enum64 *)(t + 1); +} + +static inline __u64 btf_enum64_value(const struct btf_enum64 *e) +{ + /* struct btf_enum64 is introduced in Linux 6.0, which is very + * bleeding-edge. Here we are avoiding relying on struct btf_enum64 + * definition coming from kernel UAPI headers to support wider range + * of system-wide kernel headers. + * + * Given this header can be also included from C++ applications, that + * further restricts C tricks we can use (like using compatible + * anonymous struct). So just treat struct btf_enum64 as + * a three-element array of u32 and access second (lo32) and third + * (hi32) elements directly. + * + * For reference, here is a struct btf_enum64 definition: + * + * const struct btf_enum64 { + * __u32 name_off; + * __u32 val_lo32; + * __u32 val_hi32; + * }; + */ + const __u32 *e64 = (const __u32 *)e; + + return ((__u64)e64[2] << 32) | e64[1]; +} + static inline struct btf_member *btf_members(const struct btf_type *t) { return (struct btf_member *)(t + 1); diff --git a/pkg/plugin/lib/common/libbpf/_src/btf_dump.c b/pkg/plugin/lib/common/libbpf/_src/btf_dump.c index 6b1bc1f437..5dbca76b95 100644 --- a/pkg/plugin/lib/common/libbpf/_src/btf_dump.c +++ b/pkg/plugin/lib/common/libbpf/_src/btf_dump.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -117,14 +118,14 @@ struct btf_dump { struct btf_dump_data *typed_dump; }; -static size_t str_hash_fn(const void *key, void *ctx) +static size_t str_hash_fn(long key, void *ctx) { - return str_hash(key); + return str_hash((void *)key); } -static bool str_equal_fn(const void *a, const void *b, void *ctx) +static bool str_equal_fn(long a, long b, void *ctx) { - return strcmp(a, b) == 0; + return strcmp((void *)a, (void *)b) == 0; } static const char *btf_name_of(const struct btf_dump *d, __u32 name_off) @@ -144,15 +145,17 @@ static void btf_dump_printf(const struct btf_dump *d, const char *fmt, ...) static int btf_dump_mark_referenced(struct btf_dump *d); static int btf_dump_resize(struct btf_dump *d); -DEFAULT_VERSION(btf_dump__new_v0_6_0, btf_dump__new, LIBBPF_0.6.0) -struct btf_dump *btf_dump__new_v0_6_0(const struct btf *btf, - btf_dump_printf_fn_t printf_fn, - void *ctx, - const struct btf_dump_opts *opts) +struct btf_dump *btf_dump__new(const struct btf *btf, + btf_dump_printf_fn_t printf_fn, + void *ctx, + const struct btf_dump_opts *opts) { struct btf_dump *d; int err; + if (!OPTS_VALID(opts, btf_dump_opts)) + return libbpf_err_ptr(-EINVAL); + if (!printf_fn) return libbpf_err_ptr(-EINVAL); @@ -188,17 +191,6 @@ struct btf_dump *btf_dump__new_v0_6_0(const struct btf *btf, return libbpf_err_ptr(err); } -COMPAT_VERSION(btf_dump__new_deprecated, btf_dump__new, LIBBPF_0.0.4) -struct btf_dump *btf_dump__new_deprecated(const struct btf *btf, - const struct btf_ext *btf_ext, - const struct btf_dump_opts *opts, - btf_dump_printf_fn_t printf_fn) -{ - if (!printf_fn) - return libbpf_err_ptr(-EINVAL); - return btf_dump__new_v0_6_0(btf, printf_fn, opts ? opts->ctx : NULL, opts); -} - static int btf_dump_resize(struct btf_dump *d) { int err, last_id = btf__type_cnt(d->btf) - 1; @@ -228,6 +220,17 @@ static int btf_dump_resize(struct btf_dump *d) return 0; } +static void btf_dump_free_names(struct hashmap *map) +{ + size_t bkt; + struct hashmap_entry *cur; + + hashmap__for_each_entry(map, cur, bkt) + free((void *)cur->pkey); + + hashmap__free(map); +} + void btf_dump__free(struct btf_dump *d) { int i; @@ -246,8 +249,8 @@ void btf_dump__free(struct btf_dump *d) free(d->cached_names); free(d->emit_queue); free(d->decl_stack); - hashmap__free(d->type_names); - hashmap__free(d->ident_names); + btf_dump_free_names(d->type_names); + btf_dump_free_names(d->ident_names); free(d); } @@ -318,6 +321,7 @@ static int btf_dump_mark_referenced(struct btf_dump *d) switch (btf_kind(t)) { case BTF_KIND_INT: case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: case BTF_KIND_FWD: case BTF_KIND_FLOAT: break; @@ -538,6 +542,7 @@ static int btf_dump_order_type(struct btf_dump *d, __u32 id, bool through_ptr) return 1; } case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: case BTF_KIND_FWD: /* * non-anonymous or non-referenced enums are top-level @@ -739,6 +744,7 @@ static void btf_dump_emit_type(struct btf_dump *d, __u32 id, __u32 cont_id) tstate->emit_state = EMITTED; break; case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: if (top_level_def) { btf_dump_emit_enum_def(d, id, t, 0); btf_dump_printf(d, ";\n\n"); @@ -828,14 +834,9 @@ static bool btf_is_struct_packed(const struct btf *btf, __u32 id, const struct btf_type *t) { const struct btf_member *m; - int align, i, bit_sz; + int max_align = 1, align, i, bit_sz; __u16 vlen; - align = btf__align_of(btf, id); - /* size of a non-packed struct has to be a multiple of its alignment*/ - if (align && t->size % align) - return true; - m = btf_members(t); vlen = btf_vlen(t); /* all non-bitfield fields have to be naturally aligned */ @@ -844,8 +845,11 @@ static bool btf_is_struct_packed(const struct btf *btf, __u32 id, bit_sz = btf_member_bitfield_size(t, i); if (align && bit_sz == 0 && m->offset % (8 * align) != 0) return true; + max_align = max(align, max_align); } - + /* size of a non-packed struct has to be a multiple of its alignment */ + if (t->size % max_align != 0) + return true; /* * if original struct was marked as packed, but its layout is * naturally aligned, we'll detect that it's not packed @@ -853,44 +857,97 @@ static bool btf_is_struct_packed(const struct btf *btf, __u32 id, return false; } -static int chip_away_bits(int total, int at_most) -{ - return total % at_most ? : at_most; -} - static void btf_dump_emit_bit_padding(const struct btf_dump *d, - int cur_off, int m_off, int m_bit_sz, - int align, int lvl) + int cur_off, int next_off, int next_align, + bool in_bitfield, int lvl) { - int off_diff = m_off - cur_off; - int ptr_bits = d->ptr_sz * 8; + const struct { + const char *name; + int bits; + } pads[] = { + {"long", d->ptr_sz * 8}, {"int", 32}, {"short", 16}, {"char", 8} + }; + int new_off, pad_bits, bits, i; + const char *pad_type; + + if (cur_off >= next_off) + return; /* no gap */ + + /* For filling out padding we want to take advantage of + * natural alignment rules to minimize unnecessary explicit + * padding. First, we find the largest type (among long, int, + * short, or char) that can be used to force naturally aligned + * boundary. Once determined, we'll use such type to fill in + * the remaining padding gap. In some cases we can rely on + * compiler filling some gaps, but sometimes we need to force + * alignment to close natural alignment with markers like + * `long: 0` (this is always the case for bitfields). Note + * that even if struct itself has, let's say 4-byte alignment + * (i.e., it only uses up to int-aligned types), using `long: + * X;` explicit padding doesn't actually change struct's + * overall alignment requirements, but compiler does take into + * account that type's (long, in this example) natural + * alignment requirements when adding implicit padding. We use + * this fact heavily and don't worry about ruining correct + * struct alignment requirement. + */ + for (i = 0; i < ARRAY_SIZE(pads); i++) { + pad_bits = pads[i].bits; + pad_type = pads[i].name; - if (off_diff <= 0) - /* no gap */ - return; - if (m_bit_sz == 0 && off_diff < align * 8) - /* natural padding will take care of a gap */ - return; + new_off = roundup(cur_off, pad_bits); + if (new_off <= next_off) + break; + } - while (off_diff > 0) { - const char *pad_type; - int pad_bits; - - if (ptr_bits > 32 && off_diff > 32) { - pad_type = "long"; - pad_bits = chip_away_bits(off_diff, ptr_bits); - } else if (off_diff > 16) { - pad_type = "int"; - pad_bits = chip_away_bits(off_diff, 32); - } else if (off_diff > 8) { - pad_type = "short"; - pad_bits = chip_away_bits(off_diff, 16); - } else { - pad_type = "char"; - pad_bits = chip_away_bits(off_diff, 8); + if (new_off > cur_off && new_off <= next_off) { + /* We need explicit `: 0` aligning mark if next + * field is right on alignment offset and its + * alignment requirement is less strict than 's + * alignment (so compiler won't naturally align to the + * offset we expect), or if subsequent `: X`, + * will actually completely fit in the remaining hole, + * making compiler basically ignore `: X` + * completely. + */ + if (in_bitfield || + (new_off == next_off && roundup(cur_off, next_align * 8) != new_off) || + (new_off != next_off && next_off - new_off <= new_off - cur_off)) + /* but for bitfields we'll emit explicit bit count */ + btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type, + in_bitfield ? new_off - cur_off : 0); + cur_off = new_off; + } + + /* Now we know we start at naturally aligned offset for a chosen + * padding type (long, int, short, or char), and so the rest is just + * a straightforward filling of remaining padding gap with full + * `: sizeof();` markers, except for the last one, which + * might need smaller than sizeof() padding. + */ + while (cur_off != next_off) { + bits = min(next_off - cur_off, pad_bits); + if (bits == pad_bits) { + btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type, pad_bits); + cur_off += bits; + continue; + } + /* For the remainder padding that doesn't cover entire + * pad_type bit length, we pick the smallest necessary type. + * This is pure aesthetics, we could have just used `long`, + * but having smallest necessary one communicates better the + * scale of the padding gap. + */ + for (i = ARRAY_SIZE(pads) - 1; i >= 0; i--) { + pad_type = pads[i].name; + pad_bits = pads[i].bits; + if (pad_bits < bits) + continue; + + btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type, bits); + cur_off += bits; + break; } - btf_dump_printf(d, "\n%s%s: %d;", pfx(lvl), pad_type, pad_bits); - off_diff -= pad_bits; } } @@ -910,9 +967,11 @@ static void btf_dump_emit_struct_def(struct btf_dump *d, { const struct btf_member *m = btf_members(t); bool is_struct = btf_is_struct(t); - int align, i, packed, off = 0; + bool packed, prev_bitfield = false; + int align, i, off = 0; __u16 vlen = btf_vlen(t); + align = btf__align_of(d->btf, id); packed = is_struct ? btf_is_struct_packed(d->btf, id, t) : 0; btf_dump_printf(d, "%s%s%s {", @@ -922,37 +981,47 @@ static void btf_dump_emit_struct_def(struct btf_dump *d, for (i = 0; i < vlen; i++, m++) { const char *fname; - int m_off, m_sz; + int m_off, m_sz, m_align; + bool in_bitfield; fname = btf_name_of(d, m->name_off); m_sz = btf_member_bitfield_size(t, i); m_off = btf_member_bit_offset(t, i); - align = packed ? 1 : btf__align_of(d->btf, m->type); + m_align = packed ? 1 : btf__align_of(d->btf, m->type); + + in_bitfield = prev_bitfield && m_sz != 0; - btf_dump_emit_bit_padding(d, off, m_off, m_sz, align, lvl + 1); + btf_dump_emit_bit_padding(d, off, m_off, m_align, in_bitfield, lvl + 1); btf_dump_printf(d, "\n%s", pfx(lvl + 1)); btf_dump_emit_type_decl(d, m->type, fname, lvl + 1); if (m_sz) { btf_dump_printf(d, ": %d", m_sz); off = m_off + m_sz; + prev_bitfield = true; } else { m_sz = max((__s64)0, btf__resolve_size(d->btf, m->type)); off = m_off + m_sz * 8; + prev_bitfield = false; } + btf_dump_printf(d, ";"); } /* pad at the end, if necessary */ - if (is_struct) { - align = packed ? 1 : btf__align_of(d->btf, id); - btf_dump_emit_bit_padding(d, off, t->size * 8, 0, align, - lvl + 1); - } + if (is_struct) + btf_dump_emit_bit_padding(d, off, t->size * 8, align, false, lvl + 1); - if (vlen) + /* + * Keep `struct empty {}` on a single line, + * only print newline when there are regular or padding fields. + */ + if (vlen || t->size) { btf_dump_printf(d, "\n"); - btf_dump_printf(d, "%s}", pfx(lvl)); + btf_dump_printf(d, "%s}", pfx(lvl)); + } else { + btf_dump_printf(d, "}"); + } if (packed) btf_dump_printf(d, " __attribute__((packed))"); } @@ -989,38 +1058,118 @@ static void btf_dump_emit_enum_fwd(struct btf_dump *d, __u32 id, btf_dump_printf(d, "enum %s", btf_dump_type_name(d, id)); } -static void btf_dump_emit_enum_def(struct btf_dump *d, __u32 id, - const struct btf_type *t, - int lvl) +static void btf_dump_emit_enum32_val(struct btf_dump *d, + const struct btf_type *t, + int lvl, __u16 vlen) { const struct btf_enum *v = btf_enum(t); - __u16 vlen = btf_vlen(t); + bool is_signed = btf_kflag(t); + const char *fmt_str; const char *name; size_t dup_cnt; int i; + for (i = 0; i < vlen; i++, v++) { + name = btf_name_of(d, v->name_off); + /* enumerators share namespace with typedef idents */ + dup_cnt = btf_dump_name_dups(d, d->ident_names, name); + if (dup_cnt > 1) { + fmt_str = is_signed ? "\n%s%s___%zd = %d," : "\n%s%s___%zd = %u,"; + btf_dump_printf(d, fmt_str, pfx(lvl + 1), name, dup_cnt, v->val); + } else { + fmt_str = is_signed ? "\n%s%s = %d," : "\n%s%s = %u,"; + btf_dump_printf(d, fmt_str, pfx(lvl + 1), name, v->val); + } + } +} + +static void btf_dump_emit_enum64_val(struct btf_dump *d, + const struct btf_type *t, + int lvl, __u16 vlen) +{ + const struct btf_enum64 *v = btf_enum64(t); + bool is_signed = btf_kflag(t); + const char *fmt_str; + const char *name; + size_t dup_cnt; + __u64 val; + int i; + + for (i = 0; i < vlen; i++, v++) { + name = btf_name_of(d, v->name_off); + dup_cnt = btf_dump_name_dups(d, d->ident_names, name); + val = btf_enum64_value(v); + if (dup_cnt > 1) { + fmt_str = is_signed ? "\n%s%s___%zd = %lldLL," + : "\n%s%s___%zd = %lluULL,"; + btf_dump_printf(d, fmt_str, + pfx(lvl + 1), name, dup_cnt, + (unsigned long long)val); + } else { + fmt_str = is_signed ? "\n%s%s = %lldLL," + : "\n%s%s = %lluULL,"; + btf_dump_printf(d, fmt_str, + pfx(lvl + 1), name, + (unsigned long long)val); + } + } +} +static void btf_dump_emit_enum_def(struct btf_dump *d, __u32 id, + const struct btf_type *t, + int lvl) +{ + __u16 vlen = btf_vlen(t); + btf_dump_printf(d, "enum%s%s", t->name_off ? " " : "", btf_dump_type_name(d, id)); - if (vlen) { - btf_dump_printf(d, " {"); - for (i = 0; i < vlen; i++, v++) { - name = btf_name_of(d, v->name_off); - /* enumerators share namespace with typedef idents */ - dup_cnt = btf_dump_name_dups(d, d->ident_names, name); - if (dup_cnt > 1) { - btf_dump_printf(d, "\n%s%s___%zu = %u,", - pfx(lvl + 1), name, dup_cnt, - (__u32)v->val); - } else { - btf_dump_printf(d, "\n%s%s = %u,", - pfx(lvl + 1), name, - (__u32)v->val); + if (!vlen) + return; + + btf_dump_printf(d, " {"); + if (btf_is_enum(t)) + btf_dump_emit_enum32_val(d, t, lvl, vlen); + else + btf_dump_emit_enum64_val(d, t, lvl, vlen); + btf_dump_printf(d, "\n%s}", pfx(lvl)); + + /* special case enums with special sizes */ + if (t->size == 1) { + /* one-byte enums can be forced with mode(byte) attribute */ + btf_dump_printf(d, " __attribute__((mode(byte)))"); + } else if (t->size == 8 && d->ptr_sz == 8) { + /* enum can be 8-byte sized if one of the enumerator values + * doesn't fit in 32-bit integer, or by adding mode(word) + * attribute (but probably only on 64-bit architectures); do + * our best here to try to satisfy the contract without adding + * unnecessary attributes + */ + bool needs_word_mode; + + if (btf_is_enum(t)) { + /* enum can't represent 64-bit values, so we need word mode */ + needs_word_mode = true; + } else { + /* enum64 needs mode(word) if none of its values has + * non-zero upper 32-bits (which means that all values + * fit in 32-bit integers and won't cause compiler to + * bump enum to be 64-bit naturally + */ + int i; + + needs_word_mode = true; + for (i = 0; i < vlen; i++) { + if (btf_enum64(t)[i].val_hi32 != 0) { + needs_word_mode = false; + break; + } } } - btf_dump_printf(d, "\n%s}", pfx(lvl)); + if (needs_word_mode) + btf_dump_printf(d, " __attribute__((mode(word)))"); } + } static void btf_dump_emit_fwd_def(struct btf_dump *d, __u32 id, @@ -1178,6 +1327,7 @@ static void btf_dump_emit_type_decl(struct btf_dump *d, __u32 id, break; case BTF_KIND_INT: case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: case BTF_KIND_FWD: case BTF_KIND_STRUCT: case BTF_KIND_UNION: @@ -1312,6 +1462,7 @@ static void btf_dump_emit_type_chain(struct btf_dump *d, btf_dump_emit_struct_fwd(d, id, t); break; case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: btf_dump_emit_mods(d, decls); /* inline anonymous enum */ if (t->name_off == 0 && !d->skip_anon_defs) @@ -1481,11 +1632,22 @@ static void btf_dump_emit_type_cast(struct btf_dump *d, __u32 id, static size_t btf_dump_name_dups(struct btf_dump *d, struct hashmap *name_map, const char *orig_name) { + char *old_name, *new_name; size_t dup_cnt = 0; + int err; + + new_name = strdup(orig_name); + if (!new_name) + return 1; - hashmap__find(name_map, orig_name, (void **)&dup_cnt); + (void)hashmap__find(name_map, orig_name, &dup_cnt); dup_cnt++; - hashmap__set(name_map, orig_name, (void *)dup_cnt, NULL, NULL); + + err = hashmap__set(name_map, new_name, dup_cnt, &old_name, NULL); + if (err) + free(new_name); + + free(old_name); return dup_cnt; } @@ -1767,6 +1929,7 @@ static int btf_dump_int_data(struct btf_dump *d, if (d->typed_dump->is_array_terminated) break; if (*(char *)data == '\0') { + btf_dump_type_values(d, "'\\0'"); d->typed_dump->is_array_terminated = true; break; } @@ -1869,6 +2032,7 @@ static int btf_dump_array_data(struct btf_dump *d, __u32 i, elem_type_id; __s64 elem_size; bool is_array_member; + bool is_array_terminated; elem_type_id = array->type; elem_type = skip_mods_and_typedefs(d->btf, elem_type_id, NULL); @@ -1904,12 +2068,15 @@ static int btf_dump_array_data(struct btf_dump *d, */ is_array_member = d->typed_dump->is_array_member; d->typed_dump->is_array_member = true; + is_array_terminated = d->typed_dump->is_array_terminated; + d->typed_dump->is_array_terminated = false; for (i = 0; i < array->nelems; i++, data += elem_size) { if (d->typed_dump->is_array_terminated) break; btf_dump_dump_type_data(d, NULL, elem_type, elem_type_id, data, 0, 0); } d->typed_dump->is_array_member = is_array_member; + d->typed_dump->is_array_terminated = is_array_terminated; d->typed_dump->depth--; btf_dump_data_pfx(d); btf_dump_type_values(d, "]"); @@ -1924,7 +2091,7 @@ static int btf_dump_struct_data(struct btf_dump *d, { const struct btf_member *m = btf_members(t); __u16 n = btf_vlen(t); - int i, err; + int i, err = 0; /* note that we increment depth before calling btf_dump_print() below; * this is intentional. btf_dump_data_newline() will not print a @@ -1988,7 +2155,8 @@ static int btf_dump_get_enum_value(struct btf_dump *d, __u32 id, __s64 *value) { - /* handle unaligned enum value */ + bool is_signed = btf_kflag(t); + if (!ptr_is_aligned(d->btf, id, data)) { __u64 val; int err; @@ -2005,13 +2173,13 @@ static int btf_dump_get_enum_value(struct btf_dump *d, *value = *(__s64 *)data; return 0; case 4: - *value = *(__s32 *)data; + *value = is_signed ? (__s64)*(__s32 *)data : *(__u32 *)data; return 0; case 2: - *value = *(__s16 *)data; + *value = is_signed ? *(__s16 *)data : *(__u16 *)data; return 0; case 1: - *value = *(__s8 *)data; + *value = is_signed ? *(__s8 *)data : *(__u8 *)data; return 0; default: pr_warn("unexpected size %d for enum, id:[%u]\n", t->size, id); @@ -2024,7 +2192,7 @@ static int btf_dump_enum_data(struct btf_dump *d, __u32 id, const void *data) { - const struct btf_enum *e; + bool is_signed; __s64 value; int i, err; @@ -2032,14 +2200,31 @@ static int btf_dump_enum_data(struct btf_dump *d, if (err) return err; - for (i = 0, e = btf_enum(t); i < btf_vlen(t); i++, e++) { - if (value != e->val) - continue; - btf_dump_type_values(d, "%s", btf_name_of(d, e->name_off)); - return 0; - } + is_signed = btf_kflag(t); + if (btf_is_enum(t)) { + const struct btf_enum *e; + + for (i = 0, e = btf_enum(t); i < btf_vlen(t); i++, e++) { + if (value != e->val) + continue; + btf_dump_type_values(d, "%s", btf_name_of(d, e->name_off)); + return 0; + } + + btf_dump_type_values(d, is_signed ? "%d" : "%u", value); + } else { + const struct btf_enum64 *e; + + for (i = 0, e = btf_enum64(t); i < btf_vlen(t); i++, e++) { + if (value != btf_enum64_value(e)) + continue; + btf_dump_type_values(d, "%s", btf_name_of(d, e->name_off)); + return 0; + } - btf_dump_type_values(d, "%d", value); + btf_dump_type_values(d, is_signed ? "%lldLL" : "%lluULL", + (unsigned long long)value); + } return 0; } @@ -2070,9 +2255,25 @@ static int btf_dump_type_data_check_overflow(struct btf_dump *d, const struct btf_type *t, __u32 id, const void *data, - __u8 bits_offset) + __u8 bits_offset, + __u8 bit_sz) { - __s64 size = btf__resolve_size(d->btf, id); + __s64 size; + + if (bit_sz) { + /* bits_offset is at most 7. bit_sz is at most 128. */ + __u8 nr_bytes = (bits_offset + bit_sz + 7) / 8; + + /* When bit_sz is non zero, it is called from + * btf_dump_struct_data() where it only cares about + * negative error value. + * Return nr_bytes in success case to make it + * consistent as the regular integer case below. + */ + return data + nr_bytes > d->typed_dump->data_end ? -E2BIG : nr_bytes; + } + + size = btf__resolve_size(d->btf, id); if (size < 0 || size >= INT_MAX) { pr_warn("unexpected size [%zu] for id [%u]\n", @@ -2099,6 +2300,7 @@ static int btf_dump_type_data_check_overflow(struct btf_dump *d, case BTF_KIND_FLOAT: case BTF_KIND_PTR: case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: if (data + bits_offset / 8 + size > d->typed_dump->data_end) return -E2BIG; break; @@ -2203,6 +2405,7 @@ static int btf_dump_type_data_check_zero(struct btf_dump *d, return -ENODATA; } case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: err = btf_dump_get_enum_value(d, t, data, id, &value); if (err) return err; @@ -2225,7 +2428,7 @@ static int btf_dump_dump_type_data(struct btf_dump *d, { int size, err = 0; - size = btf_dump_type_data_check_overflow(d, t, id, data, bits_offset); + size = btf_dump_type_data_check_overflow(d, t, id, data, bits_offset, bit_sz); if (size < 0) return size; err = btf_dump_type_data_check_zero(d, t, id, data, bits_offset, bit_sz); @@ -2275,6 +2478,7 @@ static int btf_dump_dump_type_data(struct btf_dump *d, err = btf_dump_struct_data(d, t, id, data); break; case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: /* handle bitfield and int enum values */ if (bit_sz) { __u64 print_num; @@ -2325,7 +2529,7 @@ int btf_dump__dump_type_data(struct btf_dump *d, __u32 id, d->typed_dump->indent_lvl = OPTS_GET(opts, indent_level, 0); /* default indent string is a tab */ - if (!opts->indent_str) + if (!OPTS_GET(opts, indent_str, NULL)) d->typed_dump->indent_str[0] = '\t'; else libbpf_strlcpy(d->typed_dump->indent_str, opts->indent_str, diff --git a/pkg/plugin/lib/common/libbpf/_src/btf_iter.c b/pkg/plugin/lib/common/libbpf/_src/btf_iter.c new file mode 100644 index 0000000000..9a6c822c22 --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_src/btf_iter.c @@ -0,0 +1,177 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2021 Facebook */ +/* Copyright (c) 2024, Oracle and/or its affiliates. */ + +#ifdef __KERNEL__ +#include +#include + +#define btf_var_secinfos(t) (struct btf_var_secinfo *)btf_type_var_secinfo(t) + +#else +#include "btf.h" +#include "libbpf_internal.h" +#endif + +int btf_field_iter_init(struct btf_field_iter *it, struct btf_type *t, + enum btf_field_iter_kind iter_kind) +{ + it->p = NULL; + it->m_idx = -1; + it->off_idx = 0; + it->vlen = 0; + + switch (iter_kind) { + case BTF_FIELD_ITER_IDS: + switch (btf_kind(t)) { + case BTF_KIND_UNKN: + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + it->desc = (struct btf_field_desc) {}; + break; + case BTF_KIND_FWD: + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_VAR: + case BTF_KIND_DECL_TAG: + case BTF_KIND_TYPE_TAG: + it->desc = (struct btf_field_desc) { 1, {offsetof(struct btf_type, type)} }; + break; + case BTF_KIND_ARRAY: + it->desc = (struct btf_field_desc) { + 2, {sizeof(struct btf_type) + offsetof(struct btf_array, type), + sizeof(struct btf_type) + offsetof(struct btf_array, index_type)} + }; + break; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + it->desc = (struct btf_field_desc) { + 0, {}, + sizeof(struct btf_member), + 1, {offsetof(struct btf_member, type)} + }; + break; + case BTF_KIND_FUNC_PROTO: + it->desc = (struct btf_field_desc) { + 1, {offsetof(struct btf_type, type)}, + sizeof(struct btf_param), + 1, {offsetof(struct btf_param, type)} + }; + break; + case BTF_KIND_DATASEC: + it->desc = (struct btf_field_desc) { + 0, {}, + sizeof(struct btf_var_secinfo), + 1, {offsetof(struct btf_var_secinfo, type)} + }; + break; + default: + return -EINVAL; + } + break; + case BTF_FIELD_ITER_STRS: + switch (btf_kind(t)) { + case BTF_KIND_UNKN: + it->desc = (struct btf_field_desc) {}; + break; + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_FWD: + case BTF_KIND_ARRAY: + case BTF_KIND_CONST: + case BTF_KIND_VOLATILE: + case BTF_KIND_RESTRICT: + case BTF_KIND_PTR: + case BTF_KIND_TYPEDEF: + case BTF_KIND_FUNC: + case BTF_KIND_VAR: + case BTF_KIND_DECL_TAG: + case BTF_KIND_TYPE_TAG: + case BTF_KIND_DATASEC: + it->desc = (struct btf_field_desc) { + 1, {offsetof(struct btf_type, name_off)} + }; + break; + case BTF_KIND_ENUM: + it->desc = (struct btf_field_desc) { + 1, {offsetof(struct btf_type, name_off)}, + sizeof(struct btf_enum), + 1, {offsetof(struct btf_enum, name_off)} + }; + break; + case BTF_KIND_ENUM64: + it->desc = (struct btf_field_desc) { + 1, {offsetof(struct btf_type, name_off)}, + sizeof(struct btf_enum64), + 1, {offsetof(struct btf_enum64, name_off)} + }; + break; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + it->desc = (struct btf_field_desc) { + 1, {offsetof(struct btf_type, name_off)}, + sizeof(struct btf_member), + 1, {offsetof(struct btf_member, name_off)} + }; + break; + case BTF_KIND_FUNC_PROTO: + it->desc = (struct btf_field_desc) { + 1, {offsetof(struct btf_type, name_off)}, + sizeof(struct btf_param), + 1, {offsetof(struct btf_param, name_off)} + }; + break; + default: + return -EINVAL; + } + break; + default: + return -EINVAL; + } + + if (it->desc.m_sz) + it->vlen = btf_vlen(t); + + it->p = t; + return 0; +} + +__u32 *btf_field_iter_next(struct btf_field_iter *it) +{ + if (!it->p) + return NULL; + + if (it->m_idx < 0) { + if (it->off_idx < it->desc.t_off_cnt) + return it->p + it->desc.t_offs[it->off_idx++]; + /* move to per-member iteration */ + it->m_idx = 0; + it->p += sizeof(struct btf_type); + it->off_idx = 0; + } + + /* if type doesn't have members, stop */ + if (it->desc.m_sz == 0) { + it->p = NULL; + return NULL; + } + + if (it->off_idx >= it->desc.m_off_cnt) { + /* exhausted this member's fields, go to the next member */ + it->m_idx++; + it->p += it->desc.m_sz; + it->off_idx = 0; + } + + if (it->m_idx < it->vlen) + return it->p + it->desc.m_offs[it->off_idx++]; + + it->p = NULL; + return NULL; +} diff --git a/pkg/plugin/lib/common/libbpf/_src/btf_relocate.c b/pkg/plugin/lib/common/libbpf/_src/btf_relocate.c new file mode 100644 index 0000000000..17f8b32f94 --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_src/btf_relocate.c @@ -0,0 +1,519 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright (c) 2024, Oracle and/or its affiliates. */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#ifdef __KERNEL__ +#include +#include +#include +#include +#include +#include + +#define btf_type_by_id (struct btf_type *)btf_type_by_id +#define btf__type_cnt btf_nr_types +#define btf__base_btf btf_base_btf +#define btf__name_by_offset btf_name_by_offset +#define btf__str_by_offset btf_str_by_offset +#define btf_kflag btf_type_kflag + +#define calloc(nmemb, sz) kvcalloc(nmemb, sz, GFP_KERNEL | __GFP_NOWARN) +#define free(ptr) kvfree(ptr) +#define qsort(base, num, sz, cmp) sort(base, num, sz, cmp, NULL) + +#else + +#include "btf.h" +#include "bpf.h" +#include "libbpf.h" +#include "libbpf_internal.h" + +#endif /* __KERNEL__ */ + +struct btf; + +struct btf_relocate { + struct btf *btf; + const struct btf *base_btf; + const struct btf *dist_base_btf; + unsigned int nr_base_types; + unsigned int nr_split_types; + unsigned int nr_dist_base_types; + int dist_str_len; + int base_str_len; + __u32 *id_map; + __u32 *str_map; +}; + +/* Set temporarily in relocation id_map if distilled base struct/union is + * embedded in a split BTF struct/union; in such a case, size information must + * match between distilled base BTF and base BTF representation of type. + */ +#define BTF_IS_EMBEDDED ((__u32)-1) + +/* triple used in sorting/searching distilled base BTF. */ +struct btf_name_info { + const char *name; + /* set when search requires a size match */ + bool needs_size: 1; + unsigned int size: 31; + __u32 id; +}; + +static int btf_relocate_rewrite_type_id(struct btf_relocate *r, __u32 i) +{ + struct btf_type *t = btf_type_by_id(r->btf, i); + struct btf_field_iter it; + __u32 *id; + int err; + + err = btf_field_iter_init(&it, t, BTF_FIELD_ITER_IDS); + if (err) + return err; + + while ((id = btf_field_iter_next(&it))) + *id = r->id_map[*id]; + return 0; +} + +/* Simple string comparison used for sorting within BTF, since all distilled + * types are named. If strings match, and size is non-zero for both elements + * fall back to using size for ordering. + */ +static int cmp_btf_name_size(const void *n1, const void *n2) +{ + const struct btf_name_info *ni1 = n1; + const struct btf_name_info *ni2 = n2; + int name_diff = strcmp(ni1->name, ni2->name); + + if (!name_diff && ni1->needs_size && ni2->needs_size) + return ni2->size - ni1->size; + return name_diff; +} + +/* Binary search with a small twist; find leftmost element that matches + * so that we can then iterate through all exact matches. So for example + * searching { "a", "bb", "bb", "c" } we would always match on the + * leftmost "bb". + */ +static struct btf_name_info *search_btf_name_size(struct btf_name_info *key, + struct btf_name_info *vals, + int nelems) +{ + struct btf_name_info *ret = NULL; + int high = nelems - 1; + int low = 0; + + while (low <= high) { + int mid = (low + high)/2; + struct btf_name_info *val = &vals[mid]; + int diff = cmp_btf_name_size(key, val); + + if (diff == 0) + ret = val; + /* even if found, keep searching for leftmost match */ + if (diff <= 0) + high = mid - 1; + else + low = mid + 1; + } + return ret; +} + +/* If a member of a split BTF struct/union refers to a base BTF + * struct/union, mark that struct/union id temporarily in the id_map + * with BTF_IS_EMBEDDED. Members can be const/restrict/volatile/typedef + * reference types, but if a pointer is encountered, the type is no longer + * considered embedded. + */ +static int btf_mark_embedded_composite_type_ids(struct btf_relocate *r, __u32 i) +{ + struct btf_type *t = btf_type_by_id(r->btf, i); + struct btf_field_iter it; + __u32 *id; + int err; + + if (!btf_is_composite(t)) + return 0; + + err = btf_field_iter_init(&it, t, BTF_FIELD_ITER_IDS); + if (err) + return err; + + while ((id = btf_field_iter_next(&it))) { + __u32 next_id = *id; + + while (next_id) { + t = btf_type_by_id(r->btf, next_id); + switch (btf_kind(t)) { + case BTF_KIND_CONST: + case BTF_KIND_RESTRICT: + case BTF_KIND_VOLATILE: + case BTF_KIND_TYPEDEF: + case BTF_KIND_TYPE_TAG: + next_id = t->type; + break; + case BTF_KIND_ARRAY: { + struct btf_array *a = btf_array(t); + + next_id = a->type; + break; + } + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + if (next_id < r->nr_dist_base_types) + r->id_map[next_id] = BTF_IS_EMBEDDED; + next_id = 0; + break; + default: + next_id = 0; + break; + } + } + } + + return 0; +} + +/* Build a map from distilled base BTF ids to base BTF ids. To do so, iterate + * through base BTF looking up distilled type (using binary search) equivalents. + */ +static int btf_relocate_map_distilled_base(struct btf_relocate *r) +{ + struct btf_name_info *info, *info_end; + struct btf_type *base_t, *dist_t; + __u8 *base_name_cnt = NULL; + int err = 0; + __u32 id; + + /* generate a sort index array of name/type ids sorted by name for + * distilled base BTF to speed name-based lookups. + */ + info = calloc(r->nr_dist_base_types, sizeof(*info)); + if (!info) { + err = -ENOMEM; + goto done; + } + info_end = info + r->nr_dist_base_types; + for (id = 0; id < r->nr_dist_base_types; id++) { + dist_t = btf_type_by_id(r->dist_base_btf, id); + info[id].name = btf__name_by_offset(r->dist_base_btf, dist_t->name_off); + info[id].id = id; + info[id].size = dist_t->size; + info[id].needs_size = true; + } + qsort(info, r->nr_dist_base_types, sizeof(*info), cmp_btf_name_size); + + /* Mark distilled base struct/union members of split BTF structs/unions + * in id_map with BTF_IS_EMBEDDED; this signals that these types + * need to match both name and size, otherwise embedding the base + * struct/union in the split type is invalid. + */ + for (id = r->nr_dist_base_types; id < r->nr_split_types; id++) { + err = btf_mark_embedded_composite_type_ids(r, id); + if (err) + goto done; + } + + /* Collect name counts for composite types in base BTF. If multiple + * instances of a struct/union of the same name exist, we need to use + * size to determine which to map to since name alone is ambiguous. + */ + base_name_cnt = calloc(r->base_str_len, sizeof(*base_name_cnt)); + if (!base_name_cnt) { + err = -ENOMEM; + goto done; + } + for (id = 1; id < r->nr_base_types; id++) { + base_t = btf_type_by_id(r->base_btf, id); + if (!btf_is_composite(base_t) || !base_t->name_off) + continue; + if (base_name_cnt[base_t->name_off] < 255) + base_name_cnt[base_t->name_off]++; + } + + /* Now search base BTF for matching distilled base BTF types. */ + for (id = 1; id < r->nr_base_types; id++) { + struct btf_name_info *dist_info, base_info = {}; + int dist_kind, base_kind; + + base_t = btf_type_by_id(r->base_btf, id); + /* distilled base consists of named types only. */ + if (!base_t->name_off) + continue; + base_kind = btf_kind(base_t); + base_info.id = id; + base_info.name = btf__name_by_offset(r->base_btf, base_t->name_off); + switch (base_kind) { + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + /* These types should match both name and size */ + base_info.needs_size = true; + base_info.size = base_t->size; + break; + case BTF_KIND_FWD: + /* No size considerations for fwds. */ + break; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + /* Size only needs to be used for struct/union if there + * are multiple types in base BTF with the same name. + * If there are multiple _distilled_ types with the same + * name (a very unlikely scenario), that doesn't matter + * unless corresponding _base_ types to match them are + * missing. + */ + base_info.needs_size = base_name_cnt[base_t->name_off] > 1; + base_info.size = base_t->size; + break; + default: + continue; + } + /* iterate over all matching distilled base types */ + for (dist_info = search_btf_name_size(&base_info, info, r->nr_dist_base_types); + dist_info != NULL && dist_info < info_end && + cmp_btf_name_size(&base_info, dist_info) == 0; + dist_info++) { + if (!dist_info->id || dist_info->id >= r->nr_dist_base_types) { + pr_warn("base BTF id [%d] maps to invalid distilled base BTF id [%d]\n", + id, dist_info->id); + err = -EINVAL; + goto done; + } + dist_t = btf_type_by_id(r->dist_base_btf, dist_info->id); + dist_kind = btf_kind(dist_t); + + /* Validate that the found distilled type is compatible. + * Do not error out on mismatch as another match may + * occur for an identically-named type. + */ + switch (dist_kind) { + case BTF_KIND_FWD: + switch (base_kind) { + case BTF_KIND_FWD: + if (btf_kflag(dist_t) != btf_kflag(base_t)) + continue; + break; + case BTF_KIND_STRUCT: + if (btf_kflag(base_t)) + continue; + break; + case BTF_KIND_UNION: + if (!btf_kflag(base_t)) + continue; + break; + default: + continue; + } + break; + case BTF_KIND_INT: + if (dist_kind != base_kind || + btf_int_encoding(base_t) != btf_int_encoding(dist_t)) + continue; + break; + case BTF_KIND_FLOAT: + if (dist_kind != base_kind) + continue; + break; + case BTF_KIND_ENUM: + /* ENUM and ENUM64 are encoded as sized ENUM in + * distilled base BTF. + */ + if (base_kind != dist_kind && base_kind != BTF_KIND_ENUM64) + continue; + break; + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + /* size verification is required for embedded + * struct/unions. + */ + if (r->id_map[dist_info->id] == BTF_IS_EMBEDDED && + base_t->size != dist_t->size) + continue; + break; + default: + continue; + } + if (r->id_map[dist_info->id] && + r->id_map[dist_info->id] != BTF_IS_EMBEDDED) { + /* we already have a match; this tells us that + * multiple base types of the same name + * have the same size, since for cases where + * multiple types have the same name we match + * on name and size. In this case, we have + * no way of determining which to relocate + * to in base BTF, so error out. + */ + pr_warn("distilled base BTF type '%s' [%u], size %u has multiple candidates of the same size (ids [%u, %u]) in base BTF\n", + base_info.name, dist_info->id, + base_t->size, id, r->id_map[dist_info->id]); + err = -EINVAL; + goto done; + } + /* map id and name */ + r->id_map[dist_info->id] = id; + r->str_map[dist_t->name_off] = base_t->name_off; + } + } + /* ensure all distilled BTF ids now have a mapping... */ + for (id = 1; id < r->nr_dist_base_types; id++) { + const char *name; + + if (r->id_map[id] && r->id_map[id] != BTF_IS_EMBEDDED) + continue; + dist_t = btf_type_by_id(r->dist_base_btf, id); + name = btf__name_by_offset(r->dist_base_btf, dist_t->name_off); + pr_warn("distilled base BTF type '%s' [%d] is not mapped to base BTF id\n", + name, id); + err = -EINVAL; + break; + } +done: + free(base_name_cnt); + free(info); + return err; +} + +/* distilled base should only have named int/float/enum/fwd/struct/union types. */ +static int btf_relocate_validate_distilled_base(struct btf_relocate *r) +{ + unsigned int i; + + for (i = 1; i < r->nr_dist_base_types; i++) { + struct btf_type *t = btf_type_by_id(r->dist_base_btf, i); + int kind = btf_kind(t); + + switch (kind) { + case BTF_KIND_INT: + case BTF_KIND_FLOAT: + case BTF_KIND_ENUM: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_FWD: + if (t->name_off) + break; + pr_warn("type [%d], kind [%d] is invalid for distilled base BTF; it is anonymous\n", + i, kind); + return -EINVAL; + default: + pr_warn("type [%d] in distilled based BTF has unexpected kind [%d]\n", + i, kind); + return -EINVAL; + } + } + return 0; +} + +static int btf_relocate_rewrite_strs(struct btf_relocate *r, __u32 i) +{ + struct btf_type *t = btf_type_by_id(r->btf, i); + struct btf_field_iter it; + __u32 *str_off; + int off, err; + + err = btf_field_iter_init(&it, t, BTF_FIELD_ITER_STRS); + if (err) + return err; + + while ((str_off = btf_field_iter_next(&it))) { + if (!*str_off) + continue; + if (*str_off >= r->dist_str_len) { + *str_off += r->base_str_len - r->dist_str_len; + } else { + off = r->str_map[*str_off]; + if (!off) { + pr_warn("string '%s' [offset %u] is not mapped to base BTF", + btf__str_by_offset(r->btf, off), *str_off); + return -ENOENT; + } + *str_off = off; + } + } + return 0; +} + +/* If successful, output of relocation is updated BTF with base BTF pointing + * at base_btf, and type ids, strings adjusted accordingly. + */ +int btf_relocate(struct btf *btf, const struct btf *base_btf, __u32 **id_map) +{ + unsigned int nr_types = btf__type_cnt(btf); + const struct btf_header *dist_base_hdr; + const struct btf_header *base_hdr; + struct btf_relocate r = {}; + int err = 0; + __u32 id, i; + + r.dist_base_btf = btf__base_btf(btf); + if (!base_btf || r.dist_base_btf == base_btf) + return -EINVAL; + + r.nr_dist_base_types = btf__type_cnt(r.dist_base_btf); + r.nr_base_types = btf__type_cnt(base_btf); + r.nr_split_types = nr_types - r.nr_dist_base_types; + r.btf = btf; + r.base_btf = base_btf; + + r.id_map = calloc(nr_types, sizeof(*r.id_map)); + r.str_map = calloc(btf_header(r.dist_base_btf)->str_len, sizeof(*r.str_map)); + dist_base_hdr = btf_header(r.dist_base_btf); + base_hdr = btf_header(r.base_btf); + r.dist_str_len = dist_base_hdr->str_len; + r.base_str_len = base_hdr->str_len; + if (!r.id_map || !r.str_map) { + err = -ENOMEM; + goto err_out; + } + + err = btf_relocate_validate_distilled_base(&r); + if (err) + goto err_out; + + /* Split BTF ids need to be adjusted as base and distilled base + * have different numbers of types, changing the start id of split + * BTF. + */ + for (id = r.nr_dist_base_types; id < nr_types; id++) + r.id_map[id] = id + r.nr_base_types - r.nr_dist_base_types; + + /* Build a map from distilled base ids to actual base BTF ids; it is used + * to update split BTF id references. Also build a str_map mapping from + * distilled base BTF names to base BTF names. + */ + err = btf_relocate_map_distilled_base(&r); + if (err) + goto err_out; + + /* Next, rewrite type ids in split BTF, replacing split ids with updated + * ids based on number of types in base BTF, and base ids with + * relocated ids from base_btf. + */ + for (i = 0, id = r.nr_dist_base_types; i < r.nr_split_types; i++, id++) { + err = btf_relocate_rewrite_type_id(&r, id); + if (err) + goto err_out; + } + /* String offsets now need to be updated using the str_map. */ + for (i = 0; i < r.nr_split_types; i++) { + err = btf_relocate_rewrite_strs(&r, i + r.nr_dist_base_types); + if (err) + goto err_out; + } + /* Finally reset base BTF to be base_btf */ + btf_set_base_btf(btf, base_btf); + + if (id_map) { + *id_map = r.id_map; + r.id_map = NULL; + } +err_out: + free(r.id_map); + free(r.str_map); + return err; +} diff --git a/pkg/plugin/lib/common/libbpf/_src/elf.c b/pkg/plugin/lib/common/libbpf/_src/elf.c new file mode 100644 index 0000000000..c92e023941 --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_src/elf.c @@ -0,0 +1,558 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include +#include +#include +#include + +#include "libbpf_internal.h" +#include "str_error.h" + +/* A SHT_GNU_versym section holds 16-bit words. This bit is set if + * the symbol is hidden and can only be seen when referenced using an + * explicit version number. This is a GNU extension. + */ +#define VERSYM_HIDDEN 0x8000 + +/* This is the mask for the rest of the data in a word read from a + * SHT_GNU_versym section. + */ +#define VERSYM_VERSION 0x7fff + +int elf_open(const char *binary_path, struct elf_fd *elf_fd) +{ + char errmsg[STRERR_BUFSIZE]; + int fd, ret; + Elf *elf; + + if (elf_version(EV_CURRENT) == EV_NONE) { + pr_warn("elf: failed to init libelf for %s\n", binary_path); + return -LIBBPF_ERRNO__LIBELF; + } + fd = open(binary_path, O_RDONLY | O_CLOEXEC); + if (fd < 0) { + ret = -errno; + pr_warn("elf: failed to open %s: %s\n", binary_path, + libbpf_strerror_r(ret, errmsg, sizeof(errmsg))); + return ret; + } + elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); + if (!elf) { + pr_warn("elf: could not read elf from %s: %s\n", binary_path, elf_errmsg(-1)); + close(fd); + return -LIBBPF_ERRNO__FORMAT; + } + elf_fd->fd = fd; + elf_fd->elf = elf; + return 0; +} + +void elf_close(struct elf_fd *elf_fd) +{ + if (!elf_fd) + return; + elf_end(elf_fd->elf); + close(elf_fd->fd); +} + +/* Return next ELF section of sh_type after scn, or first of that type if scn is NULL. */ +static Elf_Scn *elf_find_next_scn_by_type(Elf *elf, int sh_type, Elf_Scn *scn) +{ + while ((scn = elf_nextscn(elf, scn)) != NULL) { + GElf_Shdr sh; + + if (!gelf_getshdr(scn, &sh)) + continue; + if (sh.sh_type == sh_type) + return scn; + } + return NULL; +} + +struct elf_sym { + const char *name; + GElf_Sym sym; + GElf_Shdr sh; + int ver; + bool hidden; +}; + +struct elf_sym_iter { + Elf *elf; + Elf_Data *syms; + Elf_Data *versyms; + Elf_Data *verdefs; + size_t nr_syms; + size_t strtabidx; + size_t verdef_strtabidx; + size_t next_sym_idx; + struct elf_sym sym; + int st_type; +}; + +static int elf_sym_iter_new(struct elf_sym_iter *iter, + Elf *elf, const char *binary_path, + int sh_type, int st_type) +{ + Elf_Scn *scn = NULL; + GElf_Ehdr ehdr; + GElf_Shdr sh; + + memset(iter, 0, sizeof(*iter)); + + if (!gelf_getehdr(elf, &ehdr)) { + pr_warn("elf: failed to get ehdr from %s: %s\n", binary_path, elf_errmsg(-1)); + return -EINVAL; + } + + scn = elf_find_next_scn_by_type(elf, sh_type, NULL); + if (!scn) { + pr_debug("elf: failed to find symbol table ELF sections in '%s'\n", + binary_path); + return -ENOENT; + } + + if (!gelf_getshdr(scn, &sh)) + return -EINVAL; + + iter->strtabidx = sh.sh_link; + iter->syms = elf_getdata(scn, 0); + if (!iter->syms) { + pr_warn("elf: failed to get symbols for symtab section in '%s': %s\n", + binary_path, elf_errmsg(-1)); + return -EINVAL; + } + iter->nr_syms = iter->syms->d_size / sh.sh_entsize; + iter->elf = elf; + iter->st_type = st_type; + + /* Version symbol table is meaningful to dynsym only */ + if (sh_type != SHT_DYNSYM) + return 0; + + scn = elf_find_next_scn_by_type(elf, SHT_GNU_versym, NULL); + if (!scn) + return 0; + iter->versyms = elf_getdata(scn, 0); + + scn = elf_find_next_scn_by_type(elf, SHT_GNU_verdef, NULL); + if (!scn) + return 0; + + iter->verdefs = elf_getdata(scn, 0); + if (!iter->verdefs || !gelf_getshdr(scn, &sh)) { + pr_warn("elf: failed to get verdef ELF section in '%s'\n", binary_path); + return -EINVAL; + } + iter->verdef_strtabidx = sh.sh_link; + + return 0; +} + +static struct elf_sym *elf_sym_iter_next(struct elf_sym_iter *iter) +{ + struct elf_sym *ret = &iter->sym; + GElf_Sym *sym = &ret->sym; + const char *name = NULL; + GElf_Versym versym; + Elf_Scn *sym_scn; + size_t idx; + + for (idx = iter->next_sym_idx; idx < iter->nr_syms; idx++) { + if (!gelf_getsym(iter->syms, idx, sym)) + continue; + if (GELF_ST_TYPE(sym->st_info) != iter->st_type) + continue; + name = elf_strptr(iter->elf, iter->strtabidx, sym->st_name); + if (!name) + continue; + sym_scn = elf_getscn(iter->elf, sym->st_shndx); + if (!sym_scn) + continue; + if (!gelf_getshdr(sym_scn, &ret->sh)) + continue; + + iter->next_sym_idx = idx + 1; + ret->name = name; + ret->ver = 0; + ret->hidden = false; + + if (iter->versyms) { + if (!gelf_getversym(iter->versyms, idx, &versym)) + continue; + ret->ver = versym & VERSYM_VERSION; + ret->hidden = versym & VERSYM_HIDDEN; + } + return ret; + } + + return NULL; +} + +static const char *elf_get_vername(struct elf_sym_iter *iter, int ver) +{ + GElf_Verdaux verdaux; + GElf_Verdef verdef; + int offset; + + if (!iter->verdefs) + return NULL; + + offset = 0; + while (gelf_getverdef(iter->verdefs, offset, &verdef)) { + if (verdef.vd_ndx != ver) { + if (!verdef.vd_next) + break; + + offset += verdef.vd_next; + continue; + } + + if (!gelf_getverdaux(iter->verdefs, offset + verdef.vd_aux, &verdaux)) + break; + + return elf_strptr(iter->elf, iter->verdef_strtabidx, verdaux.vda_name); + + } + return NULL; +} + +static bool symbol_match(struct elf_sym_iter *iter, int sh_type, struct elf_sym *sym, + const char *name, size_t name_len, const char *lib_ver) +{ + const char *ver_name; + + /* Symbols are in forms of func, func@LIB_VER or func@@LIB_VER + * make sure the func part matches the user specified name + */ + if (strncmp(sym->name, name, name_len) != 0) + return false; + + /* ...but we don't want a search for "foo" to match 'foo2" also, so any + * additional characters in sname should be of the form "@@LIB". + */ + if (sym->name[name_len] != '\0' && sym->name[name_len] != '@') + return false; + + /* If user does not specify symbol version, then we got a match */ + if (!lib_ver) + return true; + + /* If user specifies symbol version, for dynamic symbols, + * get version name from ELF verdef section for comparison. + */ + if (sh_type == SHT_DYNSYM) { + ver_name = elf_get_vername(iter, sym->ver); + if (!ver_name) + return false; + return strcmp(ver_name, lib_ver) == 0; + } + + /* For normal symbols, it is already in form of func@LIB_VER */ + return strcmp(sym->name, name) == 0; +} + +/* Transform symbol's virtual address (absolute for binaries and relative + * for shared libs) into file offset, which is what kernel is expecting + * for uprobe/uretprobe attachment. + * See Documentation/trace/uprobetracer.rst for more details. This is done + * by looking up symbol's containing section's header and using iter's virtual + * address (sh_addr) and corresponding file offset (sh_offset) to transform + * sym.st_value (virtual address) into desired final file offset. + */ +static unsigned long elf_sym_offset(struct elf_sym *sym) +{ + return sym->sym.st_value - sym->sh.sh_addr + sym->sh.sh_offset; +} + +/* Find offset of function name in the provided ELF object. "binary_path" is + * the path to the ELF binary represented by "elf", and only used for error + * reporting matters. "name" matches symbol name or name@@LIB for library + * functions. + */ +long elf_find_func_offset(Elf *elf, const char *binary_path, const char *name) +{ + int i, sh_types[2] = { SHT_DYNSYM, SHT_SYMTAB }; + const char *at_symbol, *lib_ver; + bool is_shared_lib; + long ret = -ENOENT; + size_t name_len; + GElf_Ehdr ehdr; + + if (!gelf_getehdr(elf, &ehdr)) { + pr_warn("elf: failed to get ehdr from %s: %s\n", binary_path, elf_errmsg(-1)); + ret = -LIBBPF_ERRNO__FORMAT; + goto out; + } + /* for shared lib case, we do not need to calculate relative offset */ + is_shared_lib = ehdr.e_type == ET_DYN; + + /* Does name specify "@@LIB_VER" or "@LIB_VER" ? */ + at_symbol = strchr(name, '@'); + if (at_symbol) { + name_len = at_symbol - name; + /* skip second @ if it's @@LIB_VER case */ + if (at_symbol[1] == '@') + at_symbol++; + lib_ver = at_symbol + 1; + } else { + name_len = strlen(name); + lib_ver = NULL; + } + + /* Search SHT_DYNSYM, SHT_SYMTAB for symbol. This search order is used because if + * a binary is stripped, it may only have SHT_DYNSYM, and a fully-statically + * linked binary may not have SHT_DYMSYM, so absence of a section should not be + * reported as a warning/error. + */ + for (i = 0; i < ARRAY_SIZE(sh_types); i++) { + struct elf_sym_iter iter; + struct elf_sym *sym; + int last_bind = -1; + int cur_bind; + + ret = elf_sym_iter_new(&iter, elf, binary_path, sh_types[i], STT_FUNC); + if (ret == -ENOENT) + continue; + if (ret) + goto out; + + while ((sym = elf_sym_iter_next(&iter))) { + if (!symbol_match(&iter, sh_types[i], sym, name, name_len, lib_ver)) + continue; + + cur_bind = GELF_ST_BIND(sym->sym.st_info); + + if (ret > 0) { + /* handle multiple matches */ + if (elf_sym_offset(sym) == ret) { + /* same offset, no problem */ + continue; + } else if (last_bind != STB_WEAK && cur_bind != STB_WEAK) { + /* Only accept one non-weak bind. */ + pr_warn("elf: ambiguous match for '%s', '%s' in '%s'\n", + sym->name, name, binary_path); + ret = -LIBBPF_ERRNO__FORMAT; + goto out; + } else if (cur_bind == STB_WEAK) { + /* already have a non-weak bind, and + * this is a weak bind, so ignore. + */ + continue; + } + } + + ret = elf_sym_offset(sym); + last_bind = cur_bind; + } + if (ret > 0) + break; + } + + if (ret > 0) { + pr_debug("elf: symbol address match for '%s' in '%s': 0x%lx\n", name, binary_path, + ret); + } else { + if (ret == 0) { + pr_warn("elf: '%s' is 0 in symtab for '%s': %s\n", name, binary_path, + is_shared_lib ? "should not be 0 in a shared library" : + "try using shared library path instead"); + ret = -ENOENT; + } else { + pr_warn("elf: failed to find symbol '%s' in '%s'\n", name, binary_path); + } + } +out: + return ret; +} + +/* Find offset of function name in ELF object specified by path. "name" matches + * symbol name or name@@LIB for library functions. + */ +long elf_find_func_offset_from_file(const char *binary_path, const char *name) +{ + struct elf_fd elf_fd; + long ret = -ENOENT; + + ret = elf_open(binary_path, &elf_fd); + if (ret) + return ret; + ret = elf_find_func_offset(elf_fd.elf, binary_path, name); + elf_close(&elf_fd); + return ret; +} + +struct symbol { + const char *name; + int bind; + int idx; +}; + +static int symbol_cmp(const void *a, const void *b) +{ + const struct symbol *sym_a = a; + const struct symbol *sym_b = b; + + return strcmp(sym_a->name, sym_b->name); +} + +/* + * Return offsets in @poffsets for symbols specified in @syms array argument. + * On success returns 0 and offsets are returned in allocated array with @cnt + * size, that needs to be released by the caller. + */ +int elf_resolve_syms_offsets(const char *binary_path, int cnt, + const char **syms, unsigned long **poffsets, + int st_type) +{ + int sh_types[2] = { SHT_DYNSYM, SHT_SYMTAB }; + int err = 0, i, cnt_done = 0; + unsigned long *offsets; + struct symbol *symbols; + struct elf_fd elf_fd; + + err = elf_open(binary_path, &elf_fd); + if (err) + return err; + + offsets = calloc(cnt, sizeof(*offsets)); + symbols = calloc(cnt, sizeof(*symbols)); + + if (!offsets || !symbols) { + err = -ENOMEM; + goto out; + } + + for (i = 0; i < cnt; i++) { + symbols[i].name = syms[i]; + symbols[i].idx = i; + } + + qsort(symbols, cnt, sizeof(*symbols), symbol_cmp); + + for (i = 0; i < ARRAY_SIZE(sh_types); i++) { + struct elf_sym_iter iter; + struct elf_sym *sym; + + err = elf_sym_iter_new(&iter, elf_fd.elf, binary_path, sh_types[i], st_type); + if (err == -ENOENT) + continue; + if (err) + goto out; + + while ((sym = elf_sym_iter_next(&iter))) { + unsigned long sym_offset = elf_sym_offset(sym); + int bind = GELF_ST_BIND(sym->sym.st_info); + struct symbol *found, tmp = { + .name = sym->name, + }; + unsigned long *offset; + + found = bsearch(&tmp, symbols, cnt, sizeof(*symbols), symbol_cmp); + if (!found) + continue; + + offset = &offsets[found->idx]; + if (*offset > 0) { + /* same offset, no problem */ + if (*offset == sym_offset) + continue; + /* handle multiple matches */ + if (found->bind != STB_WEAK && bind != STB_WEAK) { + /* Only accept one non-weak bind. */ + pr_warn("elf: ambiguous match found '%s@%lu' in '%s' previous offset %lu\n", + sym->name, sym_offset, binary_path, *offset); + err = -ESRCH; + goto out; + } else if (bind == STB_WEAK) { + /* already have a non-weak bind, and + * this is a weak bind, so ignore. + */ + continue; + } + } else { + cnt_done++; + } + *offset = sym_offset; + found->bind = bind; + } + } + + if (cnt != cnt_done) { + err = -ENOENT; + goto out; + } + + *poffsets = offsets; + +out: + free(symbols); + if (err) + free(offsets); + elf_close(&elf_fd); + return err; +} + +/* + * Return offsets in @poffsets for symbols specified by @pattern argument. + * On success returns 0 and offsets are returned in allocated @poffsets + * array with the @pctn size, that needs to be released by the caller. + */ +int elf_resolve_pattern_offsets(const char *binary_path, const char *pattern, + unsigned long **poffsets, size_t *pcnt) +{ + int sh_types[2] = { SHT_SYMTAB, SHT_DYNSYM }; + unsigned long *offsets = NULL; + size_t cap = 0, cnt = 0; + struct elf_fd elf_fd; + int err = 0, i; + + err = elf_open(binary_path, &elf_fd); + if (err) + return err; + + for (i = 0; i < ARRAY_SIZE(sh_types); i++) { + struct elf_sym_iter iter; + struct elf_sym *sym; + + err = elf_sym_iter_new(&iter, elf_fd.elf, binary_path, sh_types[i], STT_FUNC); + if (err == -ENOENT) + continue; + if (err) + goto out; + + while ((sym = elf_sym_iter_next(&iter))) { + if (!glob_match(sym->name, pattern)) + continue; + + err = libbpf_ensure_mem((void **) &offsets, &cap, sizeof(*offsets), + cnt + 1); + if (err) + goto out; + + offsets[cnt++] = elf_sym_offset(sym); + } + + /* If we found anything in the first symbol section, + * do not search others to avoid duplicates. + */ + if (cnt) + break; + } + + if (cnt) { + *poffsets = offsets; + *pcnt = cnt; + } else { + err = -ENOENT; + } + +out: + if (err) + free(offsets); + elf_close(&elf_fd); + return err; +} diff --git a/pkg/plugin/lib/common/libbpf/_src/features.c b/pkg/plugin/lib/common/libbpf/_src/features.c new file mode 100644 index 0000000000..50befe125d --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_src/features.c @@ -0,0 +1,613 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* Copyright (c) 2023 Meta Platforms, Inc. and affiliates. */ +#include +#include +#include "bpf.h" +#include "libbpf.h" +#include "libbpf_common.h" +#include "libbpf_internal.h" +#include "str_error.h" + +static inline __u64 ptr_to_u64(const void *ptr) +{ + return (__u64)(unsigned long)ptr; +} + +int probe_fd(int fd) +{ + if (fd >= 0) + close(fd); + return fd >= 0; +} + +static int probe_kern_prog_name(int token_fd) +{ + const size_t attr_sz = offsetofend(union bpf_attr, prog_token_fd); + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + union bpf_attr attr; + int ret; + + memset(&attr, 0, attr_sz); + attr.prog_type = BPF_PROG_TYPE_SOCKET_FILTER; + attr.license = ptr_to_u64("GPL"); + attr.insns = ptr_to_u64(insns); + attr.insn_cnt = (__u32)ARRAY_SIZE(insns); + attr.prog_token_fd = token_fd; + if (token_fd) + attr.prog_flags |= BPF_F_TOKEN_FD; + libbpf_strlcpy(attr.prog_name, "libbpf_nametest", sizeof(attr.prog_name)); + + /* make sure loading with name works */ + ret = sys_bpf_prog_load(&attr, attr_sz, PROG_LOAD_ATTEMPTS); + return probe_fd(ret); +} + +static int probe_kern_global_data(int token_fd) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + struct bpf_insn insns[] = { + BPF_LD_MAP_VALUE(BPF_REG_1, 0, 16), + BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 42), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + LIBBPF_OPTS(bpf_map_create_opts, map_opts, + .token_fd = token_fd, + .map_flags = token_fd ? BPF_F_TOKEN_FD : 0, + ); + LIBBPF_OPTS(bpf_prog_load_opts, prog_opts, + .token_fd = token_fd, + .prog_flags = token_fd ? BPF_F_TOKEN_FD : 0, + ); + int ret, map, insn_cnt = ARRAY_SIZE(insns); + + map = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_global", sizeof(int), 32, 1, &map_opts); + if (map < 0) { + ret = -errno; + cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); + pr_warn("Error in %s():%s(%d). Couldn't create simple array map.\n", + __func__, cp, -ret); + return ret; + } + + insns[0].imm = map; + + ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, &prog_opts); + close(map); + return probe_fd(ret); +} + +static int probe_kern_btf(int token_fd) +{ + static const char strs[] = "\0int"; + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs), token_fd)); +} + +static int probe_kern_btf_func(int token_fd) +{ + static const char strs[] = "\0int\0x\0a"; + /* void x(int a) {} */ + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* FUNC_PROTO */ /* [2] */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 0), + BTF_PARAM_ENC(7, 1), + /* FUNC x */ /* [3] */ + BTF_TYPE_ENC(5, BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0), 2), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs), token_fd)); +} + +static int probe_kern_btf_func_global(int token_fd) +{ + static const char strs[] = "\0int\0x\0a"; + /* static void x(int a) {} */ + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* FUNC_PROTO */ /* [2] */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 0), + BTF_PARAM_ENC(7, 1), + /* FUNC x BTF_FUNC_GLOBAL */ /* [3] */ + BTF_TYPE_ENC(5, BTF_INFO_ENC(BTF_KIND_FUNC, 0, BTF_FUNC_GLOBAL), 2), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs), token_fd)); +} + +static int probe_kern_btf_datasec(int token_fd) +{ + static const char strs[] = "\0x\0.data"; + /* static int a; */ + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* VAR x */ /* [2] */ + BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1), + BTF_VAR_STATIC, + /* DATASEC val */ /* [3] */ + BTF_TYPE_ENC(3, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4), + BTF_VAR_SECINFO_ENC(2, 0, 4), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs), token_fd)); +} + +static int probe_kern_btf_qmark_datasec(int token_fd) +{ + static const char strs[] = "\0x\0?.data"; + /* static int a; */ + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* VAR x */ /* [2] */ + BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1), + BTF_VAR_STATIC, + /* DATASEC ?.data */ /* [3] */ + BTF_TYPE_ENC(3, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4), + BTF_VAR_SECINFO_ENC(2, 0, 4), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs), token_fd)); +} + +static int probe_kern_btf_float(int token_fd) +{ + static const char strs[] = "\0float"; + __u32 types[] = { + /* float */ + BTF_TYPE_FLOAT_ENC(1, 4), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs), token_fd)); +} + +static int probe_kern_btf_decl_tag(int token_fd) +{ + static const char strs[] = "\0tag"; + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* VAR x */ /* [2] */ + BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1), + BTF_VAR_STATIC, + /* attr */ + BTF_TYPE_DECL_TAG_ENC(1, 2, -1), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs), token_fd)); +} + +static int probe_kern_btf_type_tag(int token_fd) +{ + static const char strs[] = "\0tag"; + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ + /* attr */ + BTF_TYPE_TYPE_TAG_ENC(1, 1), /* [2] */ + /* ptr */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2), /* [3] */ + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs), token_fd)); +} + +static int probe_kern_array_mmap(int token_fd) +{ + LIBBPF_OPTS(bpf_map_create_opts, opts, + .map_flags = BPF_F_MMAPABLE | (token_fd ? BPF_F_TOKEN_FD : 0), + .token_fd = token_fd, + ); + int fd; + + fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_mmap", sizeof(int), sizeof(int), 1, &opts); + return probe_fd(fd); +} + +static int probe_kern_exp_attach_type(int token_fd) +{ + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE, + .token_fd = token_fd, + .prog_flags = token_fd ? BPF_F_TOKEN_FD : 0, + ); + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int fd, insn_cnt = ARRAY_SIZE(insns); + + /* use any valid combination of program type and (optional) + * non-zero expected attach type (i.e., not a BPF_CGROUP_INET_INGRESS) + * to see if kernel supports expected_attach_type field for + * BPF_PROG_LOAD command + */ + fd = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, NULL, "GPL", insns, insn_cnt, &opts); + return probe_fd(fd); +} + +static int probe_kern_probe_read_kernel(int token_fd) +{ + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .token_fd = token_fd, + .prog_flags = token_fd ? BPF_F_TOKEN_FD : 0, + ); + struct bpf_insn insns[] = { + BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), /* r1 = r10 (fp) */ + BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), /* r1 += -8 */ + BPF_MOV64_IMM(BPF_REG_2, 8), /* r2 = 8 */ + BPF_MOV64_IMM(BPF_REG_3, 0), /* r3 = 0 */ + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_probe_read_kernel), + BPF_EXIT_INSN(), + }; + int fd, insn_cnt = ARRAY_SIZE(insns); + + fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, &opts); + return probe_fd(fd); +} + +static int probe_prog_bind_map(int token_fd) +{ + char *cp, errmsg[STRERR_BUFSIZE]; + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + LIBBPF_OPTS(bpf_map_create_opts, map_opts, + .token_fd = token_fd, + .map_flags = token_fd ? BPF_F_TOKEN_FD : 0, + ); + LIBBPF_OPTS(bpf_prog_load_opts, prog_opts, + .token_fd = token_fd, + .prog_flags = token_fd ? BPF_F_TOKEN_FD : 0, + ); + int ret, map, prog, insn_cnt = ARRAY_SIZE(insns); + + map = bpf_map_create(BPF_MAP_TYPE_ARRAY, "libbpf_det_bind", sizeof(int), 32, 1, &map_opts); + if (map < 0) { + ret = -errno; + cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); + pr_warn("Error in %s():%s(%d). Couldn't create simple array map.\n", + __func__, cp, -ret); + return ret; + } + + prog = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, &prog_opts); + if (prog < 0) { + close(map); + return 0; + } + + ret = bpf_prog_bind_map(prog, map, NULL); + + close(map); + close(prog); + + return ret >= 0; +} + +static int probe_module_btf(int token_fd) +{ + static const char strs[] = "\0int"; + __u32 types[] = { + /* int */ + BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), + }; + struct bpf_btf_info info; + __u32 len = sizeof(info); + char name[16]; + int fd, err; + + fd = libbpf__load_raw_btf((char *)types, sizeof(types), strs, sizeof(strs), token_fd); + if (fd < 0) + return 0; /* BTF not supported at all */ + + memset(&info, 0, sizeof(info)); + info.name = ptr_to_u64(name); + info.name_len = sizeof(name); + + /* check that BPF_OBJ_GET_INFO_BY_FD supports specifying name pointer; + * kernel's module BTF support coincides with support for + * name/name_len fields in struct bpf_btf_info. + */ + err = bpf_btf_get_info_by_fd(fd, &info, &len); + close(fd); + return !err; +} + +static int probe_perf_link(int token_fd) +{ + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .token_fd = token_fd, + .prog_flags = token_fd ? BPF_F_TOKEN_FD : 0, + ); + int prog_fd, link_fd, err; + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", + insns, ARRAY_SIZE(insns), &opts); + if (prog_fd < 0) + return -errno; + + /* use invalid perf_event FD to get EBADF, if link is supported; + * otherwise EINVAL should be returned + */ + link_fd = bpf_link_create(prog_fd, -1, BPF_PERF_EVENT, NULL); + err = -errno; /* close() can clobber errno */ + + if (link_fd >= 0) + close(link_fd); + close(prog_fd); + + return link_fd < 0 && err == -EBADF; +} + +static int probe_uprobe_multi_link(int token_fd) +{ + LIBBPF_OPTS(bpf_prog_load_opts, load_opts, + .expected_attach_type = BPF_TRACE_UPROBE_MULTI, + .token_fd = token_fd, + .prog_flags = token_fd ? BPF_F_TOKEN_FD : 0, + ); + LIBBPF_OPTS(bpf_link_create_opts, link_opts); + struct bpf_insn insns[] = { + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }; + int prog_fd, link_fd, err; + unsigned long offset = 0; + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_KPROBE, NULL, "GPL", + insns, ARRAY_SIZE(insns), &load_opts); + if (prog_fd < 0) + return -errno; + + /* Creating uprobe in '/' binary should fail with -EBADF. */ + link_opts.uprobe_multi.path = "/"; + link_opts.uprobe_multi.offsets = &offset; + link_opts.uprobe_multi.cnt = 1; + + link_fd = bpf_link_create(prog_fd, -1, BPF_TRACE_UPROBE_MULTI, &link_opts); + err = -errno; /* close() can clobber errno */ + + if (link_fd >= 0 || err != -EBADF) { + if (link_fd >= 0) + close(link_fd); + close(prog_fd); + return 0; + } + + /* Initial multi-uprobe support in kernel didn't handle PID filtering + * correctly (it was doing thread filtering, not process filtering). + * So now we'll detect if PID filtering logic was fixed, and, if not, + * we'll pretend multi-uprobes are not supported, if not. + * Multi-uprobes are used in USDT attachment logic, and we need to be + * conservative here, because multi-uprobe selection happens early at + * load time, while the use of PID filtering is known late at + * attachment time, at which point it's too late to undo multi-uprobe + * selection. + * + * Creating uprobe with pid == -1 for (invalid) '/' binary will fail + * early with -EINVAL on kernels with fixed PID filtering logic; + * otherwise -ESRCH would be returned if passed correct binary path + * (but we'll just get -BADF, of course). + */ + link_opts.uprobe_multi.pid = -1; /* invalid PID */ + link_opts.uprobe_multi.path = "/"; /* invalid path */ + link_opts.uprobe_multi.offsets = &offset; + link_opts.uprobe_multi.cnt = 1; + + link_fd = bpf_link_create(prog_fd, -1, BPF_TRACE_UPROBE_MULTI, &link_opts); + err = -errno; /* close() can clobber errno */ + + if (link_fd >= 0) + close(link_fd); + close(prog_fd); + + return link_fd < 0 && err == -EINVAL; +} + +static int probe_kern_bpf_cookie(int token_fd) +{ + struct bpf_insn insns[] = { + BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_attach_cookie), + BPF_EXIT_INSN(), + }; + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .token_fd = token_fd, + .prog_flags = token_fd ? BPF_F_TOKEN_FD : 0, + ); + int ret, insn_cnt = ARRAY_SIZE(insns); + + ret = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, &opts); + return probe_fd(ret); +} + +static int probe_kern_btf_enum64(int token_fd) +{ + static const char strs[] = "\0enum64"; + __u32 types[] = { + BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_ENUM64, 0, 0), 8), + }; + + return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), + strs, sizeof(strs), token_fd)); +} + +static int probe_kern_arg_ctx_tag(int token_fd) +{ + static const char strs[] = "\0a\0b\0arg:ctx\0"; + const __u32 types[] = { + /* [1] INT */ + BTF_TYPE_INT_ENC(1 /* "a" */, BTF_INT_SIGNED, 0, 32, 4), + /* [2] PTR -> VOID */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 0), + /* [3] FUNC_PROTO `int(void *a)` */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 1), + BTF_PARAM_ENC(1 /* "a" */, 2), + /* [4] FUNC 'a' -> FUNC_PROTO (main prog) */ + BTF_TYPE_ENC(1 /* "a" */, BTF_INFO_ENC(BTF_KIND_FUNC, 0, BTF_FUNC_GLOBAL), 3), + /* [5] FUNC_PROTO `int(void *b __arg_ctx)` */ + BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 1), + BTF_PARAM_ENC(3 /* "b" */, 2), + /* [6] FUNC 'b' -> FUNC_PROTO (subprog) */ + BTF_TYPE_ENC(3 /* "b" */, BTF_INFO_ENC(BTF_KIND_FUNC, 0, BTF_FUNC_GLOBAL), 5), + /* [7] DECL_TAG 'arg:ctx' -> func 'b' arg 'b' */ + BTF_TYPE_DECL_TAG_ENC(5 /* "arg:ctx" */, 6, 0), + }; + const struct bpf_insn insns[] = { + /* main prog */ + BPF_CALL_REL(+1), + BPF_EXIT_INSN(), + /* global subprog */ + BPF_EMIT_CALL(BPF_FUNC_get_func_ip), /* needs PTR_TO_CTX */ + BPF_EXIT_INSN(), + }; + const struct bpf_func_info_min func_infos[] = { + { 0, 4 }, /* main prog -> FUNC 'a' */ + { 2, 6 }, /* subprog -> FUNC 'b' */ + }; + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .token_fd = token_fd, + .prog_flags = token_fd ? BPF_F_TOKEN_FD : 0, + ); + int prog_fd, btf_fd, insn_cnt = ARRAY_SIZE(insns); + + btf_fd = libbpf__load_raw_btf((char *)types, sizeof(types), strs, sizeof(strs), token_fd); + if (btf_fd < 0) + return 0; + + opts.prog_btf_fd = btf_fd; + opts.func_info = &func_infos; + opts.func_info_cnt = ARRAY_SIZE(func_infos); + opts.func_info_rec_size = sizeof(func_infos[0]); + + prog_fd = bpf_prog_load(BPF_PROG_TYPE_KPROBE, "det_arg_ctx", + "GPL", insns, insn_cnt, &opts); + close(btf_fd); + + return probe_fd(prog_fd); +} + +typedef int (*feature_probe_fn)(int /* token_fd */); + +static struct kern_feature_cache feature_cache; + +static struct kern_feature_desc { + const char *desc; + feature_probe_fn probe; +} feature_probes[__FEAT_CNT] = { + [FEAT_PROG_NAME] = { + "BPF program name", probe_kern_prog_name, + }, + [FEAT_GLOBAL_DATA] = { + "global variables", probe_kern_global_data, + }, + [FEAT_BTF] = { + "minimal BTF", probe_kern_btf, + }, + [FEAT_BTF_FUNC] = { + "BTF functions", probe_kern_btf_func, + }, + [FEAT_BTF_GLOBAL_FUNC] = { + "BTF global function", probe_kern_btf_func_global, + }, + [FEAT_BTF_DATASEC] = { + "BTF data section and variable", probe_kern_btf_datasec, + }, + [FEAT_ARRAY_MMAP] = { + "ARRAY map mmap()", probe_kern_array_mmap, + }, + [FEAT_EXP_ATTACH_TYPE] = { + "BPF_PROG_LOAD expected_attach_type attribute", + probe_kern_exp_attach_type, + }, + [FEAT_PROBE_READ_KERN] = { + "bpf_probe_read_kernel() helper", probe_kern_probe_read_kernel, + }, + [FEAT_PROG_BIND_MAP] = { + "BPF_PROG_BIND_MAP support", probe_prog_bind_map, + }, + [FEAT_MODULE_BTF] = { + "module BTF support", probe_module_btf, + }, + [FEAT_BTF_FLOAT] = { + "BTF_KIND_FLOAT support", probe_kern_btf_float, + }, + [FEAT_PERF_LINK] = { + "BPF perf link support", probe_perf_link, + }, + [FEAT_BTF_DECL_TAG] = { + "BTF_KIND_DECL_TAG support", probe_kern_btf_decl_tag, + }, + [FEAT_BTF_TYPE_TAG] = { + "BTF_KIND_TYPE_TAG support", probe_kern_btf_type_tag, + }, + [FEAT_MEMCG_ACCOUNT] = { + "memcg-based memory accounting", probe_memcg_account, + }, + [FEAT_BPF_COOKIE] = { + "BPF cookie support", probe_kern_bpf_cookie, + }, + [FEAT_BTF_ENUM64] = { + "BTF_KIND_ENUM64 support", probe_kern_btf_enum64, + }, + [FEAT_SYSCALL_WRAPPER] = { + "Kernel using syscall wrapper", probe_kern_syscall_wrapper, + }, + [FEAT_UPROBE_MULTI_LINK] = { + "BPF multi-uprobe link support", probe_uprobe_multi_link, + }, + [FEAT_ARG_CTX_TAG] = { + "kernel-side __arg_ctx tag", probe_kern_arg_ctx_tag, + }, + [FEAT_BTF_QMARK_DATASEC] = { + "BTF DATASEC names starting from '?'", probe_kern_btf_qmark_datasec, + }, +}; + +bool feat_supported(struct kern_feature_cache *cache, enum kern_feature_id feat_id) +{ + struct kern_feature_desc *feat = &feature_probes[feat_id]; + int ret; + + /* assume global feature cache, unless custom one is provided */ + if (!cache) + cache = &feature_cache; + + if (READ_ONCE(cache->res[feat_id]) == FEAT_UNKNOWN) { + ret = feat->probe(cache->token_fd); + if (ret > 0) { + WRITE_ONCE(cache->res[feat_id], FEAT_SUPPORTED); + } else if (ret == 0) { + WRITE_ONCE(cache->res[feat_id], FEAT_MISSING); + } else { + pr_warn("Detection of kernel %s support failed: %d\n", feat->desc, ret); + WRITE_ONCE(cache->res[feat_id], FEAT_MISSING); + } + } + + return READ_ONCE(cache->res[feat_id]) == FEAT_SUPPORTED; +} diff --git a/pkg/plugin/lib/common/libbpf/_src/gen_loader.c b/pkg/plugin/lib/common/libbpf/_src/gen_loader.c index 927745b080..cf3323fd47 100644 --- a/pkg/plugin/lib/common/libbpf/_src/gen_loader.c +++ b/pkg/plugin/lib/common/libbpf/_src/gen_loader.c @@ -533,7 +533,7 @@ void bpf_gen__record_attach_target(struct bpf_gen *gen, const char *attach_name, gen->attach_kind = kind; ret = snprintf(gen->attach_target, sizeof(gen->attach_target), "%s%s", prefix, attach_name); - if (ret == sizeof(gen->attach_target)) + if (ret >= sizeof(gen->attach_target)) gen->error = -ENOSPC; } @@ -560,7 +560,7 @@ static void emit_find_attach_target(struct bpf_gen *gen) } void bpf_gen__record_extern(struct bpf_gen *gen, const char *name, bool is_weak, - bool is_typeless, int kind, int insn_idx) + bool is_typeless, bool is_ld64, int kind, int insn_idx) { struct ksym_relo_desc *relo; @@ -574,6 +574,7 @@ void bpf_gen__record_extern(struct bpf_gen *gen, const char *name, bool is_weak, relo->name = name; relo->is_weak = is_weak; relo->is_typeless = is_typeless; + relo->is_ld64 = is_ld64; relo->kind = kind; relo->insn_idx = insn_idx; gen->relo_cnt++; @@ -586,9 +587,11 @@ static struct ksym_desc *get_ksym_desc(struct bpf_gen *gen, struct ksym_relo_des int i; for (i = 0; i < gen->nr_ksyms; i++) { - if (!strcmp(gen->ksyms[i].name, relo->name)) { - gen->ksyms[i].ref++; - return &gen->ksyms[i]; + kdesc = &gen->ksyms[i]; + if (kdesc->kind == relo->kind && kdesc->is_ld64 == relo->is_ld64 && + !strcmp(kdesc->name, relo->name)) { + kdesc->ref++; + return kdesc; } } kdesc = libbpf_reallocarray(gen->ksyms, gen->nr_ksyms + 1, sizeof(*kdesc)); @@ -603,6 +606,7 @@ static struct ksym_desc *get_ksym_desc(struct bpf_gen *gen, struct ksym_relo_des kdesc->ref = 1; kdesc->off = 0; kdesc->insn = 0; + kdesc->is_ld64 = relo->is_ld64; return kdesc; } @@ -699,17 +703,17 @@ static void emit_relo_kfunc_btf(struct bpf_gen *gen, struct ksym_relo_desc *relo /* obtain fd in BPF_REG_9 */ emit(gen, BPF_MOV64_REG(BPF_REG_9, BPF_REG_7)); emit(gen, BPF_ALU64_IMM(BPF_RSH, BPF_REG_9, 32)); - /* jump to fd_array store if fd denotes module BTF */ - emit(gen, BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 0, 2)); - /* set the default value for off */ - emit(gen, BPF_ST_MEM(BPF_H, BPF_REG_8, offsetof(struct bpf_insn, off), 0)); - /* skip BTF fd store for vmlinux BTF */ - emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 4)); /* load fd_array slot pointer */ emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_0, BPF_PSEUDO_MAP_IDX_VALUE, 0, 0, 0, blob_fd_array_off(gen, btf_fd_idx))); - /* store BTF fd in slot */ + /* store BTF fd in slot, 0 for vmlinux */ emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_0, BPF_REG_9, 0)); + /* jump to insn[insn_idx].off store if fd denotes module BTF */ + emit(gen, BPF_JMP_IMM(BPF_JNE, BPF_REG_9, 0, 2)); + /* set the default value for off */ + emit(gen, BPF_ST_MEM(BPF_H, BPF_REG_8, offsetof(struct bpf_insn, off), 0)); + /* skip BTF fd store for vmlinux BTF */ + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 1)); /* store index into insn[insn_idx].off */ emit(gen, BPF_ST_MEM(BPF_H, BPF_REG_8, offsetof(struct bpf_insn, off), btf_fd_idx)); log: @@ -804,11 +808,13 @@ static void emit_relo_ksym_btf(struct bpf_gen *gen, struct ksym_relo_desc *relo, return; /* try to copy from existing ldimm64 insn */ if (kdesc->ref > 1) { - move_blob2blob(gen, insn + offsetof(struct bpf_insn, imm), 4, - kdesc->insn + offsetof(struct bpf_insn, imm)); move_blob2blob(gen, insn + sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm), 4, kdesc->insn + sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm)); - /* jump over src_reg adjustment if imm is not 0, reuse BPF_REG_0 from move_blob2blob */ + move_blob2blob(gen, insn + offsetof(struct bpf_insn, imm), 4, + kdesc->insn + offsetof(struct bpf_insn, imm)); + /* jump over src_reg adjustment if imm (btf_id) is not 0, reuse BPF_REG_0 from move_blob2blob + * If btf_id is zero, clear BPF_PSEUDO_BTF_ID flag in src_reg of ld_imm64 insn + */ emit(gen, BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 3)); goto clear_src_reg; } @@ -831,7 +837,7 @@ static void emit_relo_ksym_btf(struct bpf_gen *gen, struct ksym_relo_desc *relo, emit(gen, BPF_STX_MEM(BPF_W, BPF_REG_8, BPF_REG_7, sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm))); /* skip src_reg adjustment */ - emit(gen, BPF_JMP_IMM(BPF_JSGE, BPF_REG_7, 0, 3)); + emit(gen, BPF_JMP_IMM(BPF_JA, 0, 0, 3)); clear_src_reg: /* clear bpf_object__relocate_data's src_reg assignment, otherwise we get a verifier failure */ reg_mask = src_reg_mask(); @@ -862,23 +868,17 @@ static void emit_relo(struct bpf_gen *gen, struct ksym_relo_desc *relo, int insn { int insn; - pr_debug("gen: emit_relo (%d): %s at %d\n", relo->kind, relo->name, relo->insn_idx); + pr_debug("gen: emit_relo (%d): %s at %d %s\n", + relo->kind, relo->name, relo->insn_idx, relo->is_ld64 ? "ld64" : "call"); insn = insns + sizeof(struct bpf_insn) * relo->insn_idx; emit2(gen, BPF_LD_IMM64_RAW_FULL(BPF_REG_8, BPF_PSEUDO_MAP_IDX_VALUE, 0, 0, 0, insn)); - switch (relo->kind) { - case BTF_KIND_VAR: + if (relo->is_ld64) { if (relo->is_typeless) emit_relo_ksym_typeless(gen, relo, insn); else emit_relo_ksym_btf(gen, relo, insn); - break; - case BTF_KIND_FUNC: + } else { emit_relo_kfunc_btf(gen, relo, insn); - break; - default: - pr_warn("Unknown relocation kind '%d'\n", relo->kind); - gen->error = -EDOM; - return; } } @@ -901,18 +901,20 @@ static void cleanup_core_relo(struct bpf_gen *gen) static void cleanup_relos(struct bpf_gen *gen, int insns) { + struct ksym_desc *kdesc; int i, insn; for (i = 0; i < gen->nr_ksyms; i++) { + kdesc = &gen->ksyms[i]; /* only close fds for typed ksyms and kfuncs */ - if (gen->ksyms[i].kind == BTF_KIND_VAR && !gen->ksyms[i].typeless) { + if (kdesc->is_ld64 && !kdesc->typeless) { /* close fd recorded in insn[insn_idx + 1].imm */ - insn = gen->ksyms[i].insn; + insn = kdesc->insn; insn += sizeof(struct bpf_insn) + offsetof(struct bpf_insn, imm); emit_sys_close_blob(gen, insn); - } else if (gen->ksyms[i].kind == BTF_KIND_FUNC) { - emit_sys_close_blob(gen, blob_fd_array_off(gen, gen->ksyms[i].off)); - if (gen->ksyms[i].off < MAX_FD_ARRAY_SZ) + } else if (!kdesc->is_ld64) { + emit_sys_close_blob(gen, blob_fd_array_off(gen, kdesc->off)); + if (kdesc->off < MAX_FD_ARRAY_SZ) gen->nr_fd_array--; } } diff --git a/pkg/plugin/lib/common/libbpf/_src/hashmap.c b/pkg/plugin/lib/common/libbpf/_src/hashmap.c index aeb09c2887..140ee40556 100644 --- a/pkg/plugin/lib/common/libbpf/_src/hashmap.c +++ b/pkg/plugin/lib/common/libbpf/_src/hashmap.c @@ -128,7 +128,7 @@ static int hashmap_grow(struct hashmap *map) } static bool hashmap_find_entry(const struct hashmap *map, - const void *key, size_t hash, + const long key, size_t hash, struct hashmap_entry ***pprev, struct hashmap_entry **entry) { @@ -151,18 +151,18 @@ static bool hashmap_find_entry(const struct hashmap *map, return false; } -int hashmap__insert(struct hashmap *map, const void *key, void *value, - enum hashmap_insert_strategy strategy, - const void **old_key, void **old_value) +int hashmap_insert(struct hashmap *map, long key, long value, + enum hashmap_insert_strategy strategy, + long *old_key, long *old_value) { struct hashmap_entry *entry; size_t h; int err; if (old_key) - *old_key = NULL; + *old_key = 0; if (old_value) - *old_value = NULL; + *old_value = 0; h = hash_bits(map->hash_fn(key, map->ctx), map->cap_bits); if (strategy != HASHMAP_APPEND && @@ -203,7 +203,7 @@ int hashmap__insert(struct hashmap *map, const void *key, void *value, return 0; } -bool hashmap__find(const struct hashmap *map, const void *key, void **value) +bool hashmap_find(const struct hashmap *map, long key, long *value) { struct hashmap_entry *entry; size_t h; @@ -217,8 +217,8 @@ bool hashmap__find(const struct hashmap *map, const void *key, void **value) return true; } -bool hashmap__delete(struct hashmap *map, const void *key, - const void **old_key, void **old_value) +bool hashmap_delete(struct hashmap *map, long key, + long *old_key, long *old_value) { struct hashmap_entry **pprev, *entry; size_t h; diff --git a/pkg/plugin/lib/common/libbpf/_src/hashmap.h b/pkg/plugin/lib/common/libbpf/_src/hashmap.h index 10a4c4cd13..c12f8320e6 100644 --- a/pkg/plugin/lib/common/libbpf/_src/hashmap.h +++ b/pkg/plugin/lib/common/libbpf/_src/hashmap.h @@ -40,12 +40,32 @@ static inline size_t str_hash(const char *s) return h; } -typedef size_t (*hashmap_hash_fn)(const void *key, void *ctx); -typedef bool (*hashmap_equal_fn)(const void *key1, const void *key2, void *ctx); +typedef size_t (*hashmap_hash_fn)(long key, void *ctx); +typedef bool (*hashmap_equal_fn)(long key1, long key2, void *ctx); +/* + * Hashmap interface is polymorphic, keys and values could be either + * long-sized integers or pointers, this is achieved as follows: + * - interface functions that operate on keys and values are hidden + * behind auxiliary macros, e.g. hashmap_insert <-> hashmap__insert; + * - these auxiliary macros cast the key and value parameters as + * long or long *, so the user does not have to specify the casts explicitly; + * - for pointer parameters (e.g. old_key) the size of the pointed + * type is verified by hashmap_cast_ptr using _Static_assert; + * - when iterating using hashmap__for_each_* forms + * hasmap_entry->key should be used for integer keys and + * hasmap_entry->pkey should be used for pointer keys, + * same goes for values. + */ struct hashmap_entry { - const void *key; - void *value; + union { + long key; + const void *pkey; + }; + union { + long value; + void *pvalue; + }; struct hashmap_entry *next; }; @@ -60,16 +80,6 @@ struct hashmap { size_t sz; }; -#define HASHMAP_INIT(hash_fn, equal_fn, ctx) { \ - .hash_fn = (hash_fn), \ - .equal_fn = (equal_fn), \ - .ctx = (ctx), \ - .buckets = NULL, \ - .cap = 0, \ - .cap_bits = 0, \ - .sz = 0, \ -} - void hashmap__init(struct hashmap *map, hashmap_hash_fn hash_fn, hashmap_equal_fn equal_fn, void *ctx); struct hashmap *hashmap__new(hashmap_hash_fn hash_fn, @@ -102,6 +112,13 @@ enum hashmap_insert_strategy { HASHMAP_APPEND, }; +#define hashmap_cast_ptr(p) ({ \ + _Static_assert((__builtin_constant_p((p)) ? (p) == NULL : 0) || \ + sizeof(*(p)) == sizeof(long), \ + #p " pointee should be a long-sized integer or a pointer"); \ + (long *)(p); \ +}) + /* * hashmap__insert() adds key/value entry w/ various semantics, depending on * provided strategy value. If a given key/value pair replaced already @@ -109,42 +126,38 @@ enum hashmap_insert_strategy { * through old_key and old_value to allow calling code do proper memory * management. */ -int hashmap__insert(struct hashmap *map, const void *key, void *value, - enum hashmap_insert_strategy strategy, - const void **old_key, void **old_value); +int hashmap_insert(struct hashmap *map, long key, long value, + enum hashmap_insert_strategy strategy, + long *old_key, long *old_value); -static inline int hashmap__add(struct hashmap *map, - const void *key, void *value) -{ - return hashmap__insert(map, key, value, HASHMAP_ADD, NULL, NULL); -} +#define hashmap__insert(map, key, value, strategy, old_key, old_value) \ + hashmap_insert((map), (long)(key), (long)(value), (strategy), \ + hashmap_cast_ptr(old_key), \ + hashmap_cast_ptr(old_value)) -static inline int hashmap__set(struct hashmap *map, - const void *key, void *value, - const void **old_key, void **old_value) -{ - return hashmap__insert(map, key, value, HASHMAP_SET, - old_key, old_value); -} +#define hashmap__add(map, key, value) \ + hashmap__insert((map), (key), (value), HASHMAP_ADD, NULL, NULL) -static inline int hashmap__update(struct hashmap *map, - const void *key, void *value, - const void **old_key, void **old_value) -{ - return hashmap__insert(map, key, value, HASHMAP_UPDATE, - old_key, old_value); -} +#define hashmap__set(map, key, value, old_key, old_value) \ + hashmap__insert((map), (key), (value), HASHMAP_SET, (old_key), (old_value)) -static inline int hashmap__append(struct hashmap *map, - const void *key, void *value) -{ - return hashmap__insert(map, key, value, HASHMAP_APPEND, NULL, NULL); -} +#define hashmap__update(map, key, value, old_key, old_value) \ + hashmap__insert((map), (key), (value), HASHMAP_UPDATE, (old_key), (old_value)) + +#define hashmap__append(map, key, value) \ + hashmap__insert((map), (key), (value), HASHMAP_APPEND, NULL, NULL) + +bool hashmap_delete(struct hashmap *map, long key, long *old_key, long *old_value); + +#define hashmap__delete(map, key, old_key, old_value) \ + hashmap_delete((map), (long)(key), \ + hashmap_cast_ptr(old_key), \ + hashmap_cast_ptr(old_value)) -bool hashmap__delete(struct hashmap *map, const void *key, - const void **old_key, void **old_value); +bool hashmap_find(const struct hashmap *map, long key, long *value); -bool hashmap__find(const struct hashmap *map, const void *key, void **value); +#define hashmap__find(map, key, value) \ + hashmap_find((map), (long)(key), hashmap_cast_ptr(value)) /* * hashmap__for_each_entry - iterate over all entries in hashmap diff --git a/pkg/plugin/lib/common/libbpf/_src/libbpf.c b/pkg/plugin/lib/common/libbpf/_src/libbpf.c index 73a5192def..a3be6f8fac 100644 --- a/pkg/plugin/lib/common/libbpf/_src/libbpf.c +++ b/pkg/plugin/lib/common/libbpf/_src/libbpf.c @@ -31,11 +31,10 @@ #include #include #include -#include #include #include +#include #include -#include #include #include #include @@ -55,11 +54,14 @@ #include "libbpf_internal.h" #include "hashmap.h" #include "bpf_gen_internal.h" +#include "zip.h" #ifndef BPF_FS_MAGIC #define BPF_FS_MAGIC 0xcafe4a11 #endif +#define BPF_FS_DEFAULT_PATH "/sys/fs/bpf" + #define BPF_INSN_SZ (sizeof(struct bpf_insn)) /* vsprintf() in __base_pr() uses nonliteral format string. It may break @@ -71,11 +73,186 @@ static struct bpf_map *bpf_object__add_map(struct bpf_object *obj); static bool prog_is_subprog(const struct bpf_object *obj, const struct bpf_program *prog); +static int map_set_def_max_entries(struct bpf_map *map); + +static const char * const attach_type_name[] = { + [BPF_CGROUP_INET_INGRESS] = "cgroup_inet_ingress", + [BPF_CGROUP_INET_EGRESS] = "cgroup_inet_egress", + [BPF_CGROUP_INET_SOCK_CREATE] = "cgroup_inet_sock_create", + [BPF_CGROUP_INET_SOCK_RELEASE] = "cgroup_inet_sock_release", + [BPF_CGROUP_SOCK_OPS] = "cgroup_sock_ops", + [BPF_CGROUP_DEVICE] = "cgroup_device", + [BPF_CGROUP_INET4_BIND] = "cgroup_inet4_bind", + [BPF_CGROUP_INET6_BIND] = "cgroup_inet6_bind", + [BPF_CGROUP_INET4_CONNECT] = "cgroup_inet4_connect", + [BPF_CGROUP_INET6_CONNECT] = "cgroup_inet6_connect", + [BPF_CGROUP_UNIX_CONNECT] = "cgroup_unix_connect", + [BPF_CGROUP_INET4_POST_BIND] = "cgroup_inet4_post_bind", + [BPF_CGROUP_INET6_POST_BIND] = "cgroup_inet6_post_bind", + [BPF_CGROUP_INET4_GETPEERNAME] = "cgroup_inet4_getpeername", + [BPF_CGROUP_INET6_GETPEERNAME] = "cgroup_inet6_getpeername", + [BPF_CGROUP_UNIX_GETPEERNAME] = "cgroup_unix_getpeername", + [BPF_CGROUP_INET4_GETSOCKNAME] = "cgroup_inet4_getsockname", + [BPF_CGROUP_INET6_GETSOCKNAME] = "cgroup_inet6_getsockname", + [BPF_CGROUP_UNIX_GETSOCKNAME] = "cgroup_unix_getsockname", + [BPF_CGROUP_UDP4_SENDMSG] = "cgroup_udp4_sendmsg", + [BPF_CGROUP_UDP6_SENDMSG] = "cgroup_udp6_sendmsg", + [BPF_CGROUP_UNIX_SENDMSG] = "cgroup_unix_sendmsg", + [BPF_CGROUP_SYSCTL] = "cgroup_sysctl", + [BPF_CGROUP_UDP4_RECVMSG] = "cgroup_udp4_recvmsg", + [BPF_CGROUP_UDP6_RECVMSG] = "cgroup_udp6_recvmsg", + [BPF_CGROUP_UNIX_RECVMSG] = "cgroup_unix_recvmsg", + [BPF_CGROUP_GETSOCKOPT] = "cgroup_getsockopt", + [BPF_CGROUP_SETSOCKOPT] = "cgroup_setsockopt", + [BPF_SK_SKB_STREAM_PARSER] = "sk_skb_stream_parser", + [BPF_SK_SKB_STREAM_VERDICT] = "sk_skb_stream_verdict", + [BPF_SK_SKB_VERDICT] = "sk_skb_verdict", + [BPF_SK_MSG_VERDICT] = "sk_msg_verdict", + [BPF_LIRC_MODE2] = "lirc_mode2", + [BPF_FLOW_DISSECTOR] = "flow_dissector", + [BPF_TRACE_RAW_TP] = "trace_raw_tp", + [BPF_TRACE_FENTRY] = "trace_fentry", + [BPF_TRACE_FEXIT] = "trace_fexit", + [BPF_MODIFY_RETURN] = "modify_return", + [BPF_LSM_MAC] = "lsm_mac", + [BPF_LSM_CGROUP] = "lsm_cgroup", + [BPF_SK_LOOKUP] = "sk_lookup", + [BPF_TRACE_ITER] = "trace_iter", + [BPF_XDP_DEVMAP] = "xdp_devmap", + [BPF_XDP_CPUMAP] = "xdp_cpumap", + [BPF_XDP] = "xdp", + [BPF_SK_REUSEPORT_SELECT] = "sk_reuseport_select", + [BPF_SK_REUSEPORT_SELECT_OR_MIGRATE] = "sk_reuseport_select_or_migrate", + [BPF_PERF_EVENT] = "perf_event", + [BPF_TRACE_KPROBE_MULTI] = "trace_kprobe_multi", + [BPF_STRUCT_OPS] = "struct_ops", + [BPF_NETFILTER] = "netfilter", + [BPF_TCX_INGRESS] = "tcx_ingress", + [BPF_TCX_EGRESS] = "tcx_egress", + [BPF_TRACE_UPROBE_MULTI] = "trace_uprobe_multi", + [BPF_NETKIT_PRIMARY] = "netkit_primary", + [BPF_NETKIT_PEER] = "netkit_peer", + [BPF_TRACE_KPROBE_SESSION] = "trace_kprobe_session", +}; + +static const char * const link_type_name[] = { + [BPF_LINK_TYPE_UNSPEC] = "unspec", + [BPF_LINK_TYPE_RAW_TRACEPOINT] = "raw_tracepoint", + [BPF_LINK_TYPE_TRACING] = "tracing", + [BPF_LINK_TYPE_CGROUP] = "cgroup", + [BPF_LINK_TYPE_ITER] = "iter", + [BPF_LINK_TYPE_NETNS] = "netns", + [BPF_LINK_TYPE_XDP] = "xdp", + [BPF_LINK_TYPE_PERF_EVENT] = "perf_event", + [BPF_LINK_TYPE_KPROBE_MULTI] = "kprobe_multi", + [BPF_LINK_TYPE_STRUCT_OPS] = "struct_ops", + [BPF_LINK_TYPE_NETFILTER] = "netfilter", + [BPF_LINK_TYPE_TCX] = "tcx", + [BPF_LINK_TYPE_UPROBE_MULTI] = "uprobe_multi", + [BPF_LINK_TYPE_NETKIT] = "netkit", + [BPF_LINK_TYPE_SOCKMAP] = "sockmap", +}; + +static const char * const map_type_name[] = { + [BPF_MAP_TYPE_UNSPEC] = "unspec", + [BPF_MAP_TYPE_HASH] = "hash", + [BPF_MAP_TYPE_ARRAY] = "array", + [BPF_MAP_TYPE_PROG_ARRAY] = "prog_array", + [BPF_MAP_TYPE_PERF_EVENT_ARRAY] = "perf_event_array", + [BPF_MAP_TYPE_PERCPU_HASH] = "percpu_hash", + [BPF_MAP_TYPE_PERCPU_ARRAY] = "percpu_array", + [BPF_MAP_TYPE_STACK_TRACE] = "stack_trace", + [BPF_MAP_TYPE_CGROUP_ARRAY] = "cgroup_array", + [BPF_MAP_TYPE_LRU_HASH] = "lru_hash", + [BPF_MAP_TYPE_LRU_PERCPU_HASH] = "lru_percpu_hash", + [BPF_MAP_TYPE_LPM_TRIE] = "lpm_trie", + [BPF_MAP_TYPE_ARRAY_OF_MAPS] = "array_of_maps", + [BPF_MAP_TYPE_HASH_OF_MAPS] = "hash_of_maps", + [BPF_MAP_TYPE_DEVMAP] = "devmap", + [BPF_MAP_TYPE_DEVMAP_HASH] = "devmap_hash", + [BPF_MAP_TYPE_SOCKMAP] = "sockmap", + [BPF_MAP_TYPE_CPUMAP] = "cpumap", + [BPF_MAP_TYPE_XSKMAP] = "xskmap", + [BPF_MAP_TYPE_SOCKHASH] = "sockhash", + [BPF_MAP_TYPE_CGROUP_STORAGE] = "cgroup_storage", + [BPF_MAP_TYPE_REUSEPORT_SOCKARRAY] = "reuseport_sockarray", + [BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE] = "percpu_cgroup_storage", + [BPF_MAP_TYPE_QUEUE] = "queue", + [BPF_MAP_TYPE_STACK] = "stack", + [BPF_MAP_TYPE_SK_STORAGE] = "sk_storage", + [BPF_MAP_TYPE_STRUCT_OPS] = "struct_ops", + [BPF_MAP_TYPE_RINGBUF] = "ringbuf", + [BPF_MAP_TYPE_INODE_STORAGE] = "inode_storage", + [BPF_MAP_TYPE_TASK_STORAGE] = "task_storage", + [BPF_MAP_TYPE_BLOOM_FILTER] = "bloom_filter", + [BPF_MAP_TYPE_USER_RINGBUF] = "user_ringbuf", + [BPF_MAP_TYPE_CGRP_STORAGE] = "cgrp_storage", + [BPF_MAP_TYPE_ARENA] = "arena", +}; + +static const char * const prog_type_name[] = { + [BPF_PROG_TYPE_UNSPEC] = "unspec", + [BPF_PROG_TYPE_SOCKET_FILTER] = "socket_filter", + [BPF_PROG_TYPE_KPROBE] = "kprobe", + [BPF_PROG_TYPE_SCHED_CLS] = "sched_cls", + [BPF_PROG_TYPE_SCHED_ACT] = "sched_act", + [BPF_PROG_TYPE_TRACEPOINT] = "tracepoint", + [BPF_PROG_TYPE_XDP] = "xdp", + [BPF_PROG_TYPE_PERF_EVENT] = "perf_event", + [BPF_PROG_TYPE_CGROUP_SKB] = "cgroup_skb", + [BPF_PROG_TYPE_CGROUP_SOCK] = "cgroup_sock", + [BPF_PROG_TYPE_LWT_IN] = "lwt_in", + [BPF_PROG_TYPE_LWT_OUT] = "lwt_out", + [BPF_PROG_TYPE_LWT_XMIT] = "lwt_xmit", + [BPF_PROG_TYPE_SOCK_OPS] = "sock_ops", + [BPF_PROG_TYPE_SK_SKB] = "sk_skb", + [BPF_PROG_TYPE_CGROUP_DEVICE] = "cgroup_device", + [BPF_PROG_TYPE_SK_MSG] = "sk_msg", + [BPF_PROG_TYPE_RAW_TRACEPOINT] = "raw_tracepoint", + [BPF_PROG_TYPE_CGROUP_SOCK_ADDR] = "cgroup_sock_addr", + [BPF_PROG_TYPE_LWT_SEG6LOCAL] = "lwt_seg6local", + [BPF_PROG_TYPE_LIRC_MODE2] = "lirc_mode2", + [BPF_PROG_TYPE_SK_REUSEPORT] = "sk_reuseport", + [BPF_PROG_TYPE_FLOW_DISSECTOR] = "flow_dissector", + [BPF_PROG_TYPE_CGROUP_SYSCTL] = "cgroup_sysctl", + [BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE] = "raw_tracepoint_writable", + [BPF_PROG_TYPE_CGROUP_SOCKOPT] = "cgroup_sockopt", + [BPF_PROG_TYPE_TRACING] = "tracing", + [BPF_PROG_TYPE_STRUCT_OPS] = "struct_ops", + [BPF_PROG_TYPE_EXT] = "ext", + [BPF_PROG_TYPE_LSM] = "lsm", + [BPF_PROG_TYPE_SK_LOOKUP] = "sk_lookup", + [BPF_PROG_TYPE_SYSCALL] = "syscall", + [BPF_PROG_TYPE_NETFILTER] = "netfilter", +}; static int __base_pr(enum libbpf_print_level level, const char *format, va_list args) { - if (level == LIBBPF_DEBUG) + const char *env_var = "LIBBPF_LOG_LEVEL"; + static enum libbpf_print_level min_level = LIBBPF_INFO; + static bool initialized; + + if (!initialized) { + char *verbosity; + + initialized = true; + verbosity = getenv(env_var); + if (verbosity) { + if (strcasecmp(verbosity, "warn") == 0) + min_level = LIBBPF_WARN; + else if (strcasecmp(verbosity, "debug") == 0) + min_level = LIBBPF_DEBUG; + else if (strcasecmp(verbosity, "info") == 0) + min_level = LIBBPF_INFO; + else + fprintf(stderr, "libbpf: unrecognized '%s' envvar value: '%s', should be one of 'warn', 'debug', or 'info'.\n", + env_var, verbosity); + } + } + + /* if too verbose, skip logging */ + if (level > min_level) return 0; return vfprintf(stderr, format, args); @@ -85,9 +262,10 @@ static libbpf_print_fn_t __libbpf_pr = __base_pr; libbpf_print_fn_t libbpf_set_print(libbpf_print_fn_t fn) { - libbpf_print_fn_t old_print_fn = __libbpf_pr; + libbpf_print_fn_t old_print_fn; + + old_print_fn = __atomic_exchange_n(&__libbpf_pr, fn, __ATOMIC_RELAXED); - __libbpf_pr = fn; return old_print_fn; } @@ -95,13 +273,20 @@ __printf(2, 3) void libbpf_print(enum libbpf_print_level level, const char *format, ...) { va_list args; + int old_errno; + libbpf_print_fn_t print_fn; - if (!__libbpf_pr) + print_fn = __atomic_load_n(&__libbpf_pr, __ATOMIC_RELAXED); + if (!print_fn) return; + old_errno = errno; + va_start(args, format); __libbpf_pr(level, format, args); va_end(args); + + errno = old_errno; } static void pr_perm_msg(int err) @@ -151,12 +336,9 @@ static inline __u64 ptr_to_u64(const void *ptr) return (__u64) (unsigned long) ptr; } -/* this goes away in libbpf 1.0 */ -enum libbpf_strict_mode libbpf_mode = LIBBPF_STRICT_NONE; - int libbpf_set_strict_mode(enum libbpf_strict_mode mode) { - libbpf_mode = mode; + /* as of v1.0 libbpf_set_strict_mode() is a no-op */ return 0; } @@ -183,8 +365,8 @@ enum reloc_type { RELO_LD64, RELO_CALL, RELO_DATA, - RELO_EXTERN_VAR, - RELO_EXTERN_FUNC, + RELO_EXTERN_LD64, + RELO_EXTERN_CALL, RELO_SUBPROG_ADDR, RELO_CORE, }; @@ -197,6 +379,7 @@ struct reloc_desc { struct { int map_idx; int sym_off; + int ext_idx; }; }; }; @@ -215,16 +398,15 @@ enum sec_def_flags { SEC_ATTACHABLE = 2, SEC_ATTACHABLE_OPT = SEC_ATTACHABLE | SEC_EXP_ATTACH_OPT, /* attachment target is specified through BTF ID in either kernel or - * other BPF program's BTF object */ + * other BPF program's BTF object + */ SEC_ATTACH_BTF = 4, /* BPF program type allows sleeping/blocking in kernel */ SEC_SLEEPABLE = 8, - /* allow non-strict prefix matching */ - SEC_SLOPPY_PFX = 16, /* BPF program support non-linear XDP buffer */ - SEC_XDP_FRAGS = 32, - /* deprecated sec definitions not supposed to be used */ - SEC_DEPRECATED = 64, + SEC_XDP_FRAGS = 16, + /* Setup proper attach type for usdt probes. */ + SEC_USDT = 32, }; struct bpf_sec_def { @@ -244,9 +426,10 @@ struct bpf_sec_def { * linux/filter.h. */ struct bpf_program { - const struct bpf_sec_def *sec_def; + char *name; char *sec_name; size_t sec_idx; + const struct bpf_sec_def *sec_def; /* this program's instruction offset (in number of instructions) * within its containing ELF section */ @@ -266,12 +449,6 @@ struct bpf_program { */ size_t sub_insn_off; - char *name; - /* name with / replaced by _; makes recursive pinning - * in bpf_object__pin_programs easier - */ - char *pin_name; - /* instructions that belong to BPF program; insns[0] is located at * sec_insn_off instruction within its ELF section in ELF file, so * when mapping ELF file instruction index to the local instruction, @@ -292,24 +469,22 @@ struct bpf_program { size_t log_size; __u32 log_level; - struct { - int nr; - int *fds; - } instances; - bpf_program_prep_t preprocessor; - struct bpf_object *obj; - void *priv; - bpf_program_clear_priv_t clear_priv; + int fd; bool autoload; + bool autoattach; + bool sym_global; bool mark_btf_static; enum bpf_prog_type type; enum bpf_attach_type expected_attach_type; + int exception_cb_idx; + int prog_ifindex; __u32 attach_btf_obj_fd; __u32 attach_btf_id; __u32 attach_prog_fd; + void *func_info; __u32 func_info_rec_size; __u32 func_info_cnt; @@ -347,6 +522,8 @@ struct bpf_struct_ops { #define KCONFIG_SEC ".kconfig" #define KSYMS_SEC ".ksyms" #define STRUCT_OPS_SEC ".struct_ops" +#define STRUCT_OPS_LINK_SEC ".struct_ops.link" +#define ARENA_SEC ".addr_space.1" enum libbpf_map_type { LIBBPF_MAP_UNSPEC, @@ -356,11 +533,20 @@ enum libbpf_map_type { LIBBPF_MAP_KCONFIG, }; +struct bpf_map_def { + unsigned int type; + unsigned int key_size; + unsigned int value_size; + unsigned int max_entries; + unsigned int map_flags; +}; + struct bpf_map { + struct bpf_object *obj; char *name; /* real_name is defined for special internal maps (.rodata*, * .data*, .bss, .kconfig) and preserves their original ELF section - * name. This is important to be be able to find corresponding BTF + * name. This is important to be able to find corresponding BTF * DATASEC information. */ char *real_name; @@ -372,11 +558,10 @@ struct bpf_map { struct bpf_map_def def; __u32 numa_node; __u32 btf_var_idx; + int mod_btf_fd; __u32 btf_key_type_id; __u32 btf_value_type_id; __u32 btf_vmlinux_value_type_id; - void *priv; - bpf_map_clear_priv_t clear_priv; enum libbpf_map_type libbpf_type; void *mmaped; struct bpf_struct_ops *st_ops; @@ -386,7 +571,8 @@ struct bpf_map { char *pin_path; bool pinned; bool reused; - bool skipped; + bool autocreate; + bool autoattach; __u64 map_extra; }; @@ -411,6 +597,7 @@ struct extern_desc { int btf_id; int sec_btf_id; const char *name; + char *essent_name; bool is_set; bool is_weak; union { @@ -439,8 +626,6 @@ struct extern_desc { }; }; -static LIST_HEAD(bpf_objects_list); - struct module_btf { struct btf *btf; char *name; @@ -455,6 +640,7 @@ enum sec_type { SEC_BSS, SEC_DATA, SEC_RODATA, + SEC_ST_OPS, }; struct elf_sec_desc { @@ -470,17 +656,17 @@ struct elf_state { Elf *elf; Elf64_Ehdr *ehdr; Elf_Data *symbols; - Elf_Data *st_ops_data; + Elf_Data *arena_data; size_t shstrndx; /* section index for section name strings */ size_t strtabidx; struct elf_sec_desc *secs; - int sec_cnt; - int maps_shndx; + size_t sec_cnt; int btf_maps_shndx; __u32 btf_maps_sec_btf_id; int text_shndx; int symbols_shndx; - int st_ops_shndx; + bool has_st_ops; + int arena_data_shndx; }; struct usdt_manager; @@ -509,12 +695,6 @@ struct bpf_object { /* Information when doing ELF related work. Only valid if efile.elf is not NULL */ struct elf_state efile; - /* - * All loaded bpf_object are linked in a list, which is - * hidden to caller. bpf_objects__ handlers deal with - * all objects. - */ - struct list_head list; struct btf *btf; struct btf_ext *btf_ext; @@ -540,15 +720,20 @@ struct bpf_object { size_t log_size; __u32 log_level; - void *priv; - bpf_object_clear_priv_t clear_priv; - int *fd_array; size_t fd_array_cap; size_t fd_array_cnt; struct usdt_manager *usdt_man; + struct bpf_map *arena_map; + void *arena_data; + size_t arena_data_sz; + + struct kern_feature_cache *feat_cache; + char *token_path; + int token_fd; + char path[]; }; @@ -564,25 +749,10 @@ static Elf64_Rel *elf_rel_by_idx(Elf_Data *data, size_t idx); void bpf_program__unload(struct bpf_program *prog) { - int i; - if (!prog) return; - /* - * If the object is opened but the program was never loaded, - * it is possible that prog->instances.nr == -1. - */ - if (prog->instances.nr > 0) { - for (i = 0; i < prog->instances.nr; i++) - zclose(prog->instances.fds[i]); - } else if (prog->instances.nr != -1) { - pr_warn("Internal error: instances.nr is %d\n", - prog->instances.nr); - } - - prog->instances.nr = -1; - zfree(&prog->instances.fds); + zclose(prog->fd); zfree(&prog->func_info); zfree(&prog->line_info); @@ -593,16 +763,9 @@ static void bpf_program__exit(struct bpf_program *prog) if (!prog) return; - if (prog->clear_priv) - prog->clear_priv(prog, prog->priv); - - prog->priv = NULL; - prog->clear_priv = NULL; - bpf_program__unload(prog); zfree(&prog->name); zfree(&prog->sec_name); - zfree(&prog->pin_name); zfree(&prog->insns); zfree(&prog->reloc_desc); @@ -611,26 +774,6 @@ static void bpf_program__exit(struct bpf_program *prog) prog->sec_idx = -1; } -static char *__bpf_program__pin_name(struct bpf_program *prog) -{ - char *name, *p; - - if (libbpf_mode & LIBBPF_STRICT_SEC_NAME) - name = strdup(prog->name); - else - name = strdup(prog->sec_name); - - if (!name) - return NULL; - - p = name; - - while ((p = strchr(p, '/'))) - *p = '_'; - - return name; -} - static bool insn_is_subprog_call(const struct bpf_insn *insn) { return BPF_CLASS(insn->code) == BPF_JMP && @@ -672,6 +815,8 @@ bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog, prog->insns_cnt = prog->sec_insn_cnt; prog->type = BPF_PROG_TYPE_UNSPEC; + prog->fd = -1; + prog->exception_cb_idx = -1; /* libbpf's convention for SEC("?abc...") is that it's just like * SEC("abc...") but the corresponding bpf_program starts out with @@ -685,8 +830,7 @@ bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog, prog->autoload = true; } - prog->instances.fds = NULL; - prog->instances.nr = -1; + prog->autoattach = true; /* inherit object's log_level */ prog->log_level = obj->log_level; @@ -699,10 +843,6 @@ bpf_object__init_prog(struct bpf_object *obj, struct bpf_program *prog, if (!prog->name) goto errout; - prog->pin_name = __bpf_program__pin_name(prog); - if (!prog->pin_name) - goto errout; - prog->insns = malloc(insn_data_sz); if (!prog->insns) goto errout; @@ -730,7 +870,6 @@ bpf_object__add_programs(struct bpf_object *obj, Elf_Data *sec_data, progs = obj->programs; nr_progs = obj->nr_programs; nr_syms = symbols->d_size / sizeof(Elf64_Sym); - sec_off = 0; for (i = 0; i < nr_syms; i++) { sym = elf_sym_by_idx(obj, i); @@ -784,14 +923,16 @@ bpf_object__add_programs(struct bpf_object *obj, Elf_Data *sec_data, if (err) return err; + if (ELF64_ST_BIND(sym->st_info) != STB_LOCAL) + prog->sym_global = true; + /* if function is a global/weak symbol, but has restricted * (STV_HIDDEN or STV_INTERNAL) visibility, mark its BTF FUNC * as static to enable more permissive BPF verification mode * with more outside context available to BPF verifier */ - if (ELF64_ST_BIND(sym->st_info) != STB_LOCAL - && (ELF64_ST_VISIBILITY(sym->st_other) == STV_HIDDEN - || ELF64_ST_VISIBILITY(sym->st_other) == STV_INTERNAL)) + if (prog->sym_global && (ELF64_ST_VISIBILITY(sym->st_other) == STV_HIDDEN + || ELF64_ST_VISIBILITY(sym->st_other) == STV_INTERNAL)) prog->mark_btf_static = true; nr_progs++; @@ -801,42 +942,6 @@ bpf_object__add_programs(struct bpf_object *obj, Elf_Data *sec_data, return 0; } -__u32 get_kernel_version(void) -{ - /* On Ubuntu LINUX_VERSION_CODE doesn't correspond to info.release, - * but Ubuntu provides /proc/version_signature file, as described at - * https://ubuntu.com/kernel, with an example contents below, which we - * can use to get a proper LINUX_VERSION_CODE. - * - * Ubuntu 5.4.0-12.15-generic 5.4.8 - * - * In the above, 5.4.8 is what kernel is actually expecting, while - * uname() call will return 5.4.0 in info.release. - */ - const char *ubuntu_kver_file = "/proc/version_signature"; - __u32 major, minor, patch; - struct utsname info; - - if (access(ubuntu_kver_file, R_OK) == 0) { - FILE *f; - - f = fopen(ubuntu_kver_file, "r"); - if (f) { - if (fscanf(f, "%*s %*s %d.%d.%d\n", &major, &minor, &patch) == 3) { - fclose(f); - return KERNEL_VERSION(major, minor, patch); - } - fclose(f); - } - /* something went wrong, fall back to uname() approach */ - } - - uname(&info); - if (sscanf(info.release, "%u.%u.%u", &major, &minor, &patch) != 3) - return 0; - return KERNEL_VERSION(major, minor, patch); -} - static const struct btf_member * find_member_by_offset(const struct btf_type *t, __u32 bit_offset) { @@ -866,22 +971,33 @@ find_member_by_name(const struct btf *btf, const struct btf_type *t, return NULL; } +static int find_ksym_btf_id(struct bpf_object *obj, const char *ksym_name, + __u16 kind, struct btf **res_btf, + struct module_btf **res_mod_btf); + #define STRUCT_OPS_VALUE_PREFIX "bpf_struct_ops_" static int find_btf_by_prefix_kind(const struct btf *btf, const char *prefix, const char *name, __u32 kind); static int -find_struct_ops_kern_types(const struct btf *btf, const char *tname, +find_struct_ops_kern_types(struct bpf_object *obj, const char *tname_raw, + struct module_btf **mod_btf, const struct btf_type **type, __u32 *type_id, const struct btf_type **vtype, __u32 *vtype_id, const struct btf_member **data_member) { const struct btf_type *kern_type, *kern_vtype; const struct btf_member *kern_data_member; + struct btf *btf; __s32 kern_vtype_id, kern_type_id; + char tname[256]; __u32 i; - kern_type_id = btf__find_by_name_kind(btf, tname, BTF_KIND_STRUCT); + snprintf(tname, sizeof(tname), "%.*s", + (int)bpf_core_essential_name_len(tname_raw), tname_raw); + + kern_type_id = find_ksym_btf_id(obj, tname, BTF_KIND_STRUCT, + &btf, mod_btf); if (kern_type_id < 0) { pr_warn("struct_ops init_kern: struct %s is not found in kernel BTF\n", tname); @@ -934,15 +1050,72 @@ static bool bpf_map__is_struct_ops(const struct bpf_map *map) return map->def.type == BPF_MAP_TYPE_STRUCT_OPS; } +static bool is_valid_st_ops_program(struct bpf_object *obj, + const struct bpf_program *prog) +{ + int i; + + for (i = 0; i < obj->nr_programs; i++) { + if (&obj->programs[i] == prog) + return prog->type == BPF_PROG_TYPE_STRUCT_OPS; + } + + return false; +} + +/* For each struct_ops program P, referenced from some struct_ops map M, + * enable P.autoload if there are Ms for which M.autocreate is true, + * disable P.autoload if for all Ms M.autocreate is false. + * Don't change P.autoload for programs that are not referenced from any maps. + */ +static int bpf_object_adjust_struct_ops_autoload(struct bpf_object *obj) +{ + struct bpf_program *prog, *slot_prog; + struct bpf_map *map; + int i, j, k, vlen; + + for (i = 0; i < obj->nr_programs; ++i) { + int should_load = false; + int use_cnt = 0; + + prog = &obj->programs[i]; + if (prog->type != BPF_PROG_TYPE_STRUCT_OPS) + continue; + + for (j = 0; j < obj->nr_maps; ++j) { + map = &obj->maps[j]; + if (!bpf_map__is_struct_ops(map)) + continue; + + vlen = btf_vlen(map->st_ops->type); + for (k = 0; k < vlen; ++k) { + slot_prog = map->st_ops->progs[k]; + if (prog != slot_prog) + continue; + + use_cnt++; + if (map->autocreate) + should_load = true; + } + } + if (use_cnt) + prog->autoload = should_load; + } + + return 0; +} + /* Init the map's fields that depend on kern_btf */ -static int bpf_map__init_kern_struct_ops(struct bpf_map *map, - const struct btf *btf, - const struct btf *kern_btf) +static int bpf_map__init_kern_struct_ops(struct bpf_map *map) { const struct btf_member *member, *kern_member, *kern_data_member; const struct btf_type *type, *kern_type, *kern_vtype; __u32 i, kern_type_id, kern_vtype_id, kern_data_off; + struct bpf_object *obj = map->obj; + const struct btf *btf = obj->btf; struct bpf_struct_ops *st_ops; + const struct btf *kern_btf; + struct module_btf *mod_btf; void *data, *kern_data; const char *tname; int err; @@ -950,16 +1123,19 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map, st_ops = map->st_ops; type = st_ops->type; tname = st_ops->tname; - err = find_struct_ops_kern_types(kern_btf, tname, + err = find_struct_ops_kern_types(obj, tname, &mod_btf, &kern_type, &kern_type_id, &kern_vtype, &kern_vtype_id, &kern_data_member); if (err) return err; + kern_btf = mod_btf ? mod_btf->btf : obj->btf_vmlinux; + pr_debug("struct_ops init_kern %s: type_id:%u kern_type_id:%u kern_vtype_id:%u\n", map->name, st_ops->type_id, kern_type_id, kern_vtype_id); + map->mod_btf_fd = mod_btf ? mod_btf->fd : -1; map->def.value_size = kern_vtype->size; map->btf_vmlinux_value_type_id = kern_vtype_id; @@ -976,17 +1152,46 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map, const struct btf_type *mtype, *kern_mtype; __u32 mtype_id, kern_mtype_id; void *mdata, *kern_mdata; + struct bpf_program *prog; __s64 msize, kern_msize; __u32 moff, kern_moff; __u32 kern_member_idx; const char *mname; mname = btf__name_by_offset(btf, member->name_off); + moff = member->offset / 8; + mdata = data + moff; + msize = btf__resolve_size(btf, member->type); + if (msize < 0) { + pr_warn("struct_ops init_kern %s: failed to resolve the size of member %s\n", + map->name, mname); + return msize; + } + kern_member = find_member_by_name(kern_btf, kern_type, mname); if (!kern_member) { - pr_warn("struct_ops init_kern %s: Cannot find member %s in kernel BTF\n", + if (!libbpf_is_mem_zeroed(mdata, msize)) { + pr_warn("struct_ops init_kern %s: Cannot find member %s in kernel BTF\n", + map->name, mname); + return -ENOTSUP; + } + + if (st_ops->progs[i]) { + /* If we had declaratively set struct_ops callback, we need to + * force its autoload to false, because it doesn't have + * a chance of succeeding from POV of the current struct_ops map. + * If this program is still referenced somewhere else, though, + * then bpf_object_adjust_struct_ops_autoload() will update its + * autoload accordingly. + */ + st_ops->progs[i]->autoload = false; + st_ops->progs[i] = NULL; + } + + /* Skip all-zero/NULL fields if they are not present in the kernel BTF */ + pr_info("struct_ops %s: member %s not found in kernel, skipping it as it's set to zero\n", map->name, mname); - return -ENOTSUP; + continue; } kern_member_idx = kern_member - btf_members(kern_type); @@ -997,10 +1202,7 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map, return -ENOTSUP; } - moff = member->offset / 8; kern_moff = kern_member->offset / 8; - - mdata = data + moff; kern_mdata = kern_data + kern_moff; mtype = skip_mods_and_typedefs(btf, member->type, &mtype_id); @@ -1015,12 +1217,25 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map, } if (btf_is_ptr(mtype)) { - struct bpf_program *prog; + prog = *(void **)mdata; + /* just like for !kern_member case above, reset declaratively + * set (at compile time) program's autload to false, + * if user replaced it with another program or NULL + */ + if (st_ops->progs[i] && st_ops->progs[i] != prog) + st_ops->progs[i]->autoload = false; - prog = st_ops->progs[i]; + /* Update the value from the shadow type */ + st_ops->progs[i] = prog; if (!prog) continue; + if (!is_valid_st_ops_program(obj, prog)) { + pr_warn("struct_ops init_kern %s: member %s is not a struct_ops program\n", + map->name, mname); + return -ENOTSUP; + } + kern_mtype = skip_mods_and_typedefs(kern_btf, kern_mtype->type, &kern_mtype_id); @@ -1035,8 +1250,34 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map, return -ENOTSUP; } - prog->attach_btf_id = kern_type_id; - prog->expected_attach_type = kern_member_idx; + if (mod_btf) + prog->attach_btf_obj_fd = mod_btf->fd; + + /* if we haven't yet processed this BPF program, record proper + * attach_btf_id and member_idx + */ + if (!prog->attach_btf_id) { + prog->attach_btf_id = kern_type_id; + prog->expected_attach_type = kern_member_idx; + } + + /* struct_ops BPF prog can be re-used between multiple + * .struct_ops & .struct_ops.link as long as it's the + * same struct_ops struct definition and the same + * function pointer field + */ + if (prog->attach_btf_id != kern_type_id) { + pr_warn("struct_ops init_kern %s func ptr %s: invalid reuse of prog %s in sec %s with type %u: attach_btf_id %u != kern_type_id %u\n", + map->name, mname, prog->name, prog->sec_name, prog->type, + prog->attach_btf_id, kern_type_id); + return -EINVAL; + } + if (prog->expected_attach_type != kern_member_idx) { + pr_warn("struct_ops init_kern %s func ptr %s: invalid reuse of prog %s in sec %s with type %u: expected_attach_type %u != kern_member_idx %u\n", + map->name, mname, prog->name, prog->sec_name, prog->type, + prog->expected_attach_type, kern_member_idx); + return -EINVAL; + } st_ops->kern_func_off[i] = kern_data_off + kern_moff; @@ -1047,9 +1288,8 @@ static int bpf_map__init_kern_struct_ops(struct bpf_map *map, continue; } - msize = btf__resolve_size(btf, mtype_id); kern_msize = btf__resolve_size(kern_btf, kern_mtype_id); - if (msize < 0 || kern_msize < 0 || msize != kern_msize) { + if (kern_msize < 0 || msize != kern_msize) { pr_warn("struct_ops init_kern %s: Error in size of member %s: %zd != %zd(kernel)\n", map->name, mname, (ssize_t)msize, (ssize_t)kern_msize); @@ -1077,8 +1317,10 @@ static int bpf_object__init_kern_struct_ops_maps(struct bpf_object *obj) if (!bpf_map__is_struct_ops(map)) continue; - err = bpf_map__init_kern_struct_ops(map, obj->btf, - obj->btf_vmlinux); + if (!map->autocreate) + continue; + + err = bpf_map__init_kern_struct_ops(map); if (err) return err; } @@ -1086,7 +1328,8 @@ static int bpf_object__init_kern_struct_ops_maps(struct bpf_object *obj) return 0; } -static int bpf_object__init_struct_ops_maps(struct bpf_object *obj) +static int init_struct_ops_maps(struct bpf_object *obj, const char *sec_name, + int shndx, Elf_Data *data) { const struct btf_type *type, *datasec; const struct btf_var_secinfo *vsi; @@ -1097,15 +1340,15 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj) struct bpf_map *map; __u32 i; - if (obj->efile.st_ops_shndx == -1) + if (shndx == -1) return 0; btf = obj->btf; - datasec_id = btf__find_by_name_kind(btf, STRUCT_OPS_SEC, + datasec_id = btf__find_by_name_kind(btf, sec_name, BTF_KIND_DATASEC); if (datasec_id < 0) { pr_warn("struct_ops init: DATASEC %s not found\n", - STRUCT_OPS_SEC); + sec_name); return -EINVAL; } @@ -1118,7 +1361,7 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj) type_id = btf__resolve_type(obj->btf, vsi->type); if (type_id < 0) { pr_warn("struct_ops init: Cannot resolve var type_id %u in DATASEC %s\n", - vsi->type, STRUCT_OPS_SEC); + vsi->type, sec_name); return -EINVAL; } @@ -1137,16 +1380,28 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj) if (IS_ERR(map)) return PTR_ERR(map); - map->sec_idx = obj->efile.st_ops_shndx; + map->sec_idx = shndx; map->sec_offset = vsi->offset; map->name = strdup(var_name); if (!map->name) return -ENOMEM; + map->btf_value_type_id = type_id; + + /* Follow same convention as for programs autoload: + * SEC("?.struct_ops") means map is not created by default. + */ + if (sec_name[0] == '?') { + map->autocreate = false; + /* from now on forget there was ? in section name */ + sec_name++; + } map->def.type = BPF_MAP_TYPE_STRUCT_OPS; map->def.key_size = sizeof(int); map->def.value_size = type->size; map->def.max_entries = 1; + map->def.map_flags = strcmp(sec_name, STRUCT_OPS_LINK_SEC) == 0 ? BPF_F_LINK : 0; + map->autoattach = true; map->st_ops = calloc(1, sizeof(*map->st_ops)); if (!map->st_ops) @@ -1159,14 +1414,14 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj) if (!st_ops->data || !st_ops->progs || !st_ops->kern_func_off) return -ENOMEM; - if (vsi->offset + type->size > obj->efile.st_ops_data->d_size) { + if (vsi->offset + type->size > data->d_size) { pr_warn("struct_ops init: var %s is beyond the end of DATASEC %s\n", - var_name, STRUCT_OPS_SEC); + var_name, sec_name); return -EINVAL; } memcpy(st_ops->data, - obj->efile.st_ops_data->d_buf + vsi->offset, + data->d_buf + vsi->offset, type->size); st_ops->tname = tname; st_ops->type = type; @@ -1179,12 +1434,34 @@ static int bpf_object__init_struct_ops_maps(struct bpf_object *obj) return 0; } +static int bpf_object_init_struct_ops(struct bpf_object *obj) +{ + const char *sec_name; + int sec_idx, err; + + for (sec_idx = 0; sec_idx < obj->efile.sec_cnt; ++sec_idx) { + struct elf_sec_desc *desc = &obj->efile.secs[sec_idx]; + + if (desc->sec_type != SEC_ST_OPS) + continue; + + sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, sec_idx)); + if (!sec_name) + return -LIBBPF_ERRNO__FORMAT; + + err = init_struct_ops_maps(obj, sec_name, sec_idx, desc->data); + if (err) + return err; + } + + return 0; +} + static struct bpf_object *bpf_object__new(const char *path, const void *obj_buf, size_t obj_buf_sz, const char *obj_name) { - bool strict = (libbpf_mode & LIBBPF_STRICT_NO_OBJECT_LIST); struct bpf_object *obj; char *end; @@ -1214,17 +1491,12 @@ static struct bpf_object *bpf_object__new(const char *path, */ obj->efile.obj_buf = obj_buf; obj->efile.obj_buf_sz = obj_buf_sz; - obj->efile.maps_shndx = -1; obj->efile.btf_maps_shndx = -1; - obj->efile.st_ops_shndx = -1; obj->kconfig_map_idx = -1; obj->kern_version = get_kernel_version(); obj->loaded = false; - INIT_LIST_HEAD(&obj->list); - if (!strict) - list_add(&obj->list, &bpf_objects_list); return obj; } @@ -1236,7 +1508,7 @@ static void bpf_object__elf_finish(struct bpf_object *obj) elf_end(obj->efile.elf); obj->efile.elf = NULL; obj->efile.symbols = NULL; - obj->efile.st_ops_data = NULL; + obj->efile.arena_data = NULL; zfree(&obj->efile.secs); obj->efile.sec_cnt = 0; @@ -1257,10 +1529,7 @@ static int bpf_object__elf_init(struct bpf_object *obj) } if (obj->efile.obj_buf_sz > 0) { - /* - * obj_buf should have been validated by - * bpf_object__open_buffer(). - */ + /* obj_buf should have been validated by bpf_object__open_mem(). */ elf = elf_memory((char *)obj->efile.obj_buf, obj->efile.obj_buf_sz); } else { obj->efile.fd = open(obj->path, O_RDONLY | O_CLOEXEC); @@ -1310,7 +1579,7 @@ static int bpf_object__elf_init(struct bpf_object *obj) goto errout; } - /* Elf is corrupted/truncated, avoid calling elf_strptr. */ + /* ELF is corrupted/truncated, avoid calling elf_strptr. */ if (!elf_rawdata(elf_getscn(elf, obj->efile.shstrndx), NULL)) { pr_warn("elf: failed to get section names strings from %s: %s\n", obj->path, elf_errmsg(-1)); @@ -1349,6 +1618,10 @@ static int bpf_object__check_endianness(struct bpf_object *obj) static int bpf_object__init_license(struct bpf_object *obj, void *data, size_t size) { + if (!data) { + pr_warn("invalid license section in %s\n", obj->path); + return -LIBBPF_ERRNO__FORMAT; + } /* libbpf_strlcpy() only copies first N - 1 bytes, so size + 1 won't * go over allowed ELF data section buffer */ @@ -1362,7 +1635,7 @@ bpf_object__init_kversion(struct bpf_object *obj, void *data, size_t size) { __u32 kver; - if (size != sizeof(kver)) { + if (!data || size != sizeof(kver)) { pr_warn("invalid kver section in %s\n", obj->path); return -LIBBPF_ERRNO__FORMAT; } @@ -1398,15 +1671,12 @@ static int find_elf_sec_sz(const struct bpf_object *obj, const char *name, __u32 return -ENOENT; } -static int find_elf_var_offset(const struct bpf_object *obj, const char *name, __u32 *off) +static Elf64_Sym *find_elf_var_sym(const struct bpf_object *obj, const char *name) { Elf_Data *symbols = obj->efile.symbols; const char *sname; size_t si; - if (!name || !off) - return -EINVAL; - for (si = 0; si < symbols->d_size / sizeof(Elf64_Sym); si++) { Elf64_Sym *sym = elf_sym_by_idx(obj, si); @@ -1420,61 +1690,115 @@ static int find_elf_var_offset(const struct bpf_object *obj, const char *name, _ sname = elf_sym_str(obj, sym->st_name); if (!sname) { pr_warn("failed to get sym name string for var %s\n", name); - return -EIO; - } - if (strcmp(name, sname) == 0) { - *off = sym->st_value; - return 0; + return ERR_PTR(-EIO); } + if (strcmp(name, sname) == 0) + return sym; } - return -ENOENT; + return ERR_PTR(-ENOENT); } -static struct bpf_map *bpf_object__add_map(struct bpf_object *obj) +/* Some versions of Android don't provide memfd_create() in their libc + * implementation, so avoid complications and just go straight to Linux + * syscall. + */ +static int sys_memfd_create(const char *name, unsigned flags) { - struct bpf_map *new_maps; - size_t new_cap; - int i; + return syscall(__NR_memfd_create, name, flags); +} - if (obj->nr_maps < obj->maps_cap) - return &obj->maps[obj->nr_maps++]; +#ifndef MFD_CLOEXEC +#define MFD_CLOEXEC 0x0001U +#endif - new_cap = max((size_t)4, obj->maps_cap * 3 / 2); - new_maps = libbpf_reallocarray(obj->maps, new_cap, sizeof(*obj->maps)); - if (!new_maps) { - pr_warn("alloc maps for object failed\n"); - return ERR_PTR(-ENOMEM); - } +static int create_placeholder_fd(void) +{ + int fd; - obj->maps_cap = new_cap; - obj->maps = new_maps; + fd = ensure_good_fd(sys_memfd_create("libbpf-placeholder-fd", MFD_CLOEXEC)); + if (fd < 0) + return -errno; + return fd; +} - /* zero out new maps */ - memset(obj->maps + obj->nr_maps, 0, - (obj->maps_cap - obj->nr_maps) * sizeof(*obj->maps)); - /* - * fill all fd with -1 so won't close incorrect fd (fd=0 is stdin) - * when failure (zclose won't close negative fd)). +static struct bpf_map *bpf_object__add_map(struct bpf_object *obj) +{ + struct bpf_map *map; + int err; + + err = libbpf_ensure_mem((void **)&obj->maps, &obj->maps_cap, + sizeof(*obj->maps), obj->nr_maps + 1); + if (err) + return ERR_PTR(err); + + map = &obj->maps[obj->nr_maps++]; + map->obj = obj; + /* Preallocate map FD without actually creating BPF map just yet. + * These map FD "placeholders" will be reused later without changing + * FD value when map is actually created in the kernel. + * + * This is useful to be able to perform BPF program relocations + * without having to create BPF maps before that step. This allows us + * to finalize and load BTF very late in BPF object's loading phase, + * right before BPF maps have to be created and BPF programs have to + * be loaded. By having these map FD placeholders we can perform all + * the sanitizations, relocations, and any other adjustments before we + * start creating actual BPF kernel objects (BTF, maps, progs). */ - for (i = obj->nr_maps; i < obj->maps_cap; i++) { - obj->maps[i].fd = -1; - obj->maps[i].inner_map_fd = -1; - } + map->fd = create_placeholder_fd(); + if (map->fd < 0) + return ERR_PTR(map->fd); + map->inner_map_fd = -1; + map->autocreate = true; - return &obj->maps[obj->nr_maps++]; + return map; } -static size_t bpf_map_mmap_sz(const struct bpf_map *map) +static size_t array_map_mmap_sz(unsigned int value_sz, unsigned int max_entries) { - long page_sz = sysconf(_SC_PAGE_SIZE); + const long page_sz = sysconf(_SC_PAGE_SIZE); size_t map_sz; - map_sz = (size_t)roundup(map->def.value_size, 8) * map->def.max_entries; + map_sz = (size_t)roundup(value_sz, 8) * max_entries; map_sz = roundup(map_sz, page_sz); return map_sz; } +static size_t bpf_map_mmap_sz(const struct bpf_map *map) +{ + const long page_sz = sysconf(_SC_PAGE_SIZE); + + switch (map->def.type) { + case BPF_MAP_TYPE_ARRAY: + return array_map_mmap_sz(map->def.value_size, map->def.max_entries); + case BPF_MAP_TYPE_ARENA: + return page_sz * map->def.max_entries; + default: + return 0; /* not supported */ + } +} + +static int bpf_map_mmap_resize(struct bpf_map *map, size_t old_sz, size_t new_sz) +{ + void *mmaped; + + if (!map->mmaped) + return -EINVAL; + + if (old_sz == new_sz) + return 0; + + mmaped = mmap(NULL, new_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); + if (mmaped == MAP_FAILED) + return -errno; + + memcpy(mmaped, map->mmaped, min(old_sz, new_sz)); + munmap(map->mmaped, old_sz); + map->mmaped = mmaped; + return 0; +} + static char *internal_map_name(struct bpf_object *obj, const char *real_name) { char map_name[BPF_OBJ_NAME_LEN], *p; @@ -1534,7 +1858,38 @@ static char *internal_map_name(struct bpf_object *obj, const char *real_name) } static int -bpf_map_find_btf_info(struct bpf_object *obj, struct bpf_map *map); +map_fill_btf_type_info(struct bpf_object *obj, struct bpf_map *map); + +/* Internal BPF map is mmap()'able only if at least one of corresponding + * DATASEC's VARs are to be exposed through BPF skeleton. I.e., it's a GLOBAL + * variable and it's not marked as __hidden (which turns it into, effectively, + * a STATIC variable). + */ +static bool map_is_mmapable(struct bpf_object *obj, struct bpf_map *map) +{ + const struct btf_type *t, *vt; + struct btf_var_secinfo *vsi; + int i, n; + + if (!map->btf_value_type_id) + return false; + + t = btf__type_by_id(obj->btf, map->btf_value_type_id); + if (!btf_is_datasec(t)) + return false; + + vsi = btf_var_secinfos(t); + for (i = 0, n = btf_vlen(t); i < n; i++, vsi++) { + vt = btf__type_by_id(obj->btf, vsi->type); + if (!btf_is_var(vt)) + continue; + + if (btf_var(vt)->linkage != BTF_VAR_STATIC) + return true; + } + + return false; +} static int bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, @@ -1542,6 +1897,7 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, { struct bpf_map_def *def; struct bpf_map *map; + size_t mmap_sz; int err; map = bpf_object__add_map(obj); @@ -1565,13 +1921,19 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, def->value_size = data_sz; def->max_entries = 1; def->map_flags = type == LIBBPF_MAP_RODATA || type == LIBBPF_MAP_KCONFIG - ? BPF_F_RDONLY_PROG : 0; - def->map_flags |= BPF_F_MMAPABLE; + ? BPF_F_RDONLY_PROG : 0; + + /* failures are fine because of maps like .rodata.str1.1 */ + (void) map_fill_btf_type_info(obj, map); + + if (map_is_mmapable(obj, map)) + def->map_flags |= BPF_F_MMAPABLE; pr_debug("map '%s' (global data): at sec_idx %d, offset %zu, flags %x.\n", map->name, map->sec_idx, map->sec_offset, def->map_flags); - map->mmaped = mmap(NULL, bpf_map_mmap_sz(map), PROT_READ | PROT_WRITE, + mmap_sz = bpf_map_mmap_sz(map); + map->mmaped = mmap(NULL, mmap_sz, PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0); if (map->mmaped == MAP_FAILED) { err = -errno; @@ -1583,9 +1945,6 @@ bpf_object__init_internal_map(struct bpf_object *obj, enum libbpf_map_type type, return err; } - /* failures are fine because of maps like .rodata.str1.1 */ - (void) bpf_map_find_btf_info(obj, map); - if (data) memcpy(map->mmaped, data, data_sz); @@ -1605,6 +1964,10 @@ static int bpf_object__init_global_data_maps(struct bpf_object *obj) for (sec_idx = 1; sec_idx < obj->efile.sec_cnt; sec_idx++) { sec_desc = &obj->efile.secs[sec_idx]; + /* Skip recognized sections with size 0. */ + if (!sec_desc->data || sec_desc->data->d_size == 0) + continue; + switch (sec_desc->sec_type) { case SEC_DATA: sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, sec_idx)); @@ -1651,13 +2014,27 @@ static struct extern_desc *find_extern_by_name(const struct bpf_object *obj, return NULL; } +static struct extern_desc *find_extern_by_name_with_len(const struct bpf_object *obj, + const void *name, int len) +{ + const char *ext_name; + int i; + + for (i = 0; i < obj->nr_extern; i++) { + ext_name = obj->externs[i].name; + if (strlen(ext_name) == len && strncmp(ext_name, name, len) == 0) + return &obj->externs[i]; + } + return NULL; +} + static int set_kcfg_value_tri(struct extern_desc *ext, void *ext_val, char value) { switch (ext->kcfg.type) { case KCFG_BOOL: if (value == 'm') { - pr_warn("extern (kcfg) %s=%c should be tristate or char\n", + pr_warn("extern (kcfg) '%s': value '%c' implies tristate or char type\n", ext->name, value); return -EINVAL; } @@ -1678,7 +2055,7 @@ static int set_kcfg_value_tri(struct extern_desc *ext, void *ext_val, case KCFG_INT: case KCFG_CHAR_ARR: default: - pr_warn("extern (kcfg) %s=%c should be bool, tristate, or char\n", + pr_warn("extern (kcfg) '%s': value '%c' implies bool, tristate, or char type\n", ext->name, value); return -EINVAL; } @@ -1692,7 +2069,8 @@ static int set_kcfg_value_str(struct extern_desc *ext, char *ext_val, size_t len; if (ext->kcfg.type != KCFG_CHAR_ARR) { - pr_warn("extern (kcfg) %s=%s should be char array\n", ext->name, value); + pr_warn("extern (kcfg) '%s': value '%s' implies char array type\n", + ext->name, value); return -EINVAL; } @@ -1706,7 +2084,7 @@ static int set_kcfg_value_str(struct extern_desc *ext, char *ext_val, /* strip quotes */ len -= 2; if (len >= ext->kcfg.sz) { - pr_warn("extern (kcfg) '%s': long string config %s of (%zu bytes) truncated to %d bytes\n", + pr_warn("extern (kcfg) '%s': long string '%s' of (%zu bytes) truncated to %d bytes\n", ext->name, value, len, ext->kcfg.sz - 1); len = ext->kcfg.sz - 1; } @@ -1763,26 +2141,41 @@ static bool is_kcfg_value_in_range(const struct extern_desc *ext, __u64 v) static int set_kcfg_value_num(struct extern_desc *ext, void *ext_val, __u64 value) { - if (ext->kcfg.type != KCFG_INT && ext->kcfg.type != KCFG_CHAR) { - pr_warn("extern (kcfg) %s=%llu should be integer\n", + if (ext->kcfg.type != KCFG_INT && ext->kcfg.type != KCFG_CHAR && + ext->kcfg.type != KCFG_BOOL) { + pr_warn("extern (kcfg) '%s': value '%llu' implies integer, char, or boolean type\n", + ext->name, (unsigned long long)value); + return -EINVAL; + } + if (ext->kcfg.type == KCFG_BOOL && value > 1) { + pr_warn("extern (kcfg) '%s': value '%llu' isn't boolean compatible\n", ext->name, (unsigned long long)value); return -EINVAL; + } if (!is_kcfg_value_in_range(ext, value)) { - pr_warn("extern (kcfg) %s=%llu value doesn't fit in %d bytes\n", + pr_warn("extern (kcfg) '%s': value '%llu' doesn't fit in %d bytes\n", ext->name, (unsigned long long)value, ext->kcfg.sz); return -ERANGE; } switch (ext->kcfg.sz) { - case 1: *(__u8 *)ext_val = value; break; - case 2: *(__u16 *)ext_val = value; break; - case 4: *(__u32 *)ext_val = value; break; - case 8: *(__u64 *)ext_val = value; break; - default: - return -EINVAL; - } - ext->is_set = true; - return 0; + case 1: + *(__u8 *)ext_val = value; + break; + case 2: + *(__u16 *)ext_val = value; + break; + case 4: + *(__u32 *)ext_val = value; + break; + case 8: + *(__u64 *)ext_val = value; + break; + default: + return -EINVAL; + } + ext->is_set = true; + return 0; } static int bpf_object__process_kconfig_line(struct bpf_object *obj, @@ -1833,16 +2226,19 @@ static int bpf_object__process_kconfig_line(struct bpf_object *obj, /* assume integer */ err = parse_u64(value, &num); if (err) { - pr_warn("extern (kcfg) %s=%s should be integer\n", - ext->name, value); + pr_warn("extern (kcfg) '%s': value '%s' isn't a valid integer\n", ext->name, value); return err; } + if (ext->kcfg.type != KCFG_INT && ext->kcfg.type != KCFG_CHAR) { + pr_warn("extern (kcfg) '%s': value '%s' implies integer type\n", ext->name, value); + return -EINVAL; + } err = set_kcfg_value_num(ext, ext_val, num); break; } if (err) return err; - pr_debug("extern (kcfg) %s=%s\n", ext->name, value); + pr_debug("extern (kcfg) '%s': set to %s\n", ext->name, value); return 0; } @@ -1861,9 +2257,9 @@ static int bpf_object__read_kconfig_file(struct bpf_object *obj, void *data) return -ENAMETOOLONG; /* gzopen also accepts uncompressed files. */ - file = gzopen(buf, "r"); + file = gzopen(buf, "re"); if (!file) - file = gzopen("/proc/config.gz", "r"); + file = gzopen("/proc/config.gz", "re"); if (!file) { pr_warn("failed to open system Kconfig\n"); @@ -1938,143 +2334,6 @@ static int bpf_object__init_kconfig_map(struct bpf_object *obj) return 0; } -static int bpf_object__init_user_maps(struct bpf_object *obj, bool strict) -{ - Elf_Data *symbols = obj->efile.symbols; - int i, map_def_sz = 0, nr_maps = 0, nr_syms; - Elf_Data *data = NULL; - Elf_Scn *scn; - - if (obj->efile.maps_shndx < 0) - return 0; - - if (libbpf_mode & LIBBPF_STRICT_MAP_DEFINITIONS) { - pr_warn("legacy map definitions in SEC(\"maps\") are not supported\n"); - return -EOPNOTSUPP; - } - - if (!symbols) - return -EINVAL; - - scn = elf_sec_by_idx(obj, obj->efile.maps_shndx); - data = elf_sec_data(obj, scn); - if (!scn || !data) { - pr_warn("elf: failed to get legacy map definitions for %s\n", - obj->path); - return -EINVAL; - } - - /* - * Count number of maps. Each map has a name. - * Array of maps is not supported: only the first element is - * considered. - * - * TODO: Detect array of map and report error. - */ - nr_syms = symbols->d_size / sizeof(Elf64_Sym); - for (i = 0; i < nr_syms; i++) { - Elf64_Sym *sym = elf_sym_by_idx(obj, i); - - if (sym->st_shndx != obj->efile.maps_shndx) - continue; - if (ELF64_ST_TYPE(sym->st_info) == STT_SECTION) - continue; - nr_maps++; - } - /* Assume equally sized map definitions */ - pr_debug("elf: found %d legacy map definitions (%zd bytes) in %s\n", - nr_maps, data->d_size, obj->path); - - if (!data->d_size || nr_maps == 0 || (data->d_size % nr_maps) != 0) { - pr_warn("elf: unable to determine legacy map definition size in %s\n", - obj->path); - return -EINVAL; - } - map_def_sz = data->d_size / nr_maps; - - /* Fill obj->maps using data in "maps" section. */ - for (i = 0; i < nr_syms; i++) { - Elf64_Sym *sym = elf_sym_by_idx(obj, i); - const char *map_name; - struct bpf_map_def *def; - struct bpf_map *map; - - if (sym->st_shndx != obj->efile.maps_shndx) - continue; - if (ELF64_ST_TYPE(sym->st_info) == STT_SECTION) - continue; - - map = bpf_object__add_map(obj); - if (IS_ERR(map)) - return PTR_ERR(map); - - map_name = elf_sym_str(obj, sym->st_name); - if (!map_name) { - pr_warn("failed to get map #%d name sym string for obj %s\n", - i, obj->path); - return -LIBBPF_ERRNO__FORMAT; - } - - pr_warn("map '%s' (legacy): legacy map definitions are deprecated, use BTF-defined maps instead\n", map_name); - - if (ELF64_ST_BIND(sym->st_info) == STB_LOCAL) { - pr_warn("map '%s' (legacy): static maps are not supported\n", map_name); - return -ENOTSUP; - } - - map->libbpf_type = LIBBPF_MAP_UNSPEC; - map->sec_idx = sym->st_shndx; - map->sec_offset = sym->st_value; - pr_debug("map '%s' (legacy): at sec_idx %d, offset %zu.\n", - map_name, map->sec_idx, map->sec_offset); - if (sym->st_value + map_def_sz > data->d_size) { - pr_warn("corrupted maps section in %s: last map \"%s\" too small\n", - obj->path, map_name); - return -EINVAL; - } - - map->name = strdup(map_name); - if (!map->name) { - pr_warn("map '%s': failed to alloc map name\n", map_name); - return -ENOMEM; - } - pr_debug("map %d is \"%s\"\n", i, map->name); - def = (struct bpf_map_def *)(data->d_buf + sym->st_value); - /* - * If the definition of the map in the object file fits in - * bpf_map_def, copy it. Any extra fields in our version - * of bpf_map_def will default to zero as a result of the - * calloc above. - */ - if (map_def_sz <= sizeof(struct bpf_map_def)) { - memcpy(&map->def, def, map_def_sz); - } else { - /* - * Here the map structure being read is bigger than what - * we expect, truncate if the excess bits are all zero. - * If they are not zero, reject this map as - * incompatible. - */ - char *b; - - for (b = ((char *)def) + sizeof(struct bpf_map_def); - b < ((char *)def) + map_def_sz; b++) { - if (*b != 0) { - pr_warn("maps section in %s: \"%s\" has unrecognized, non-zero options\n", - obj->path, map_name); - if (strict) - return -EINVAL; - } - } - memcpy(&map->def, def, sizeof(struct bpf_map_def)); - } - - /* btf info may not exist but fill it in if it does exist */ - (void) bpf_map_find_btf_info(obj, map); - } - return 0; -} - const struct btf_type * skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id) { @@ -2128,6 +2387,7 @@ static const char *__btf_kind_str(__u16 kind) case BTF_KIND_FLOAT: return "float"; case BTF_KIND_DECL_TAG: return "decl_tag"; case BTF_KIND_TYPE_TAG: return "type_tag"; + case BTF_KIND_ENUM64: return "enum64"; default: return "unknown"; } } @@ -2174,23 +2434,81 @@ static bool get_map_field_int(const char *map_name, const struct btf *btf, return true; } -static int build_map_pin_path(struct bpf_map *map, const char *path) +static bool get_map_field_long(const char *map_name, const struct btf *btf, + const struct btf_member *m, __u64 *res) { - char buf[PATH_MAX]; - int len; + const struct btf_type *t = skip_mods_and_typedefs(btf, m->type, NULL); + const char *name = btf__name_by_offset(btf, m->name_off); - if (!path) - path = "/sys/fs/bpf"; + if (btf_is_ptr(t)) { + __u32 res32; + bool ret; + + ret = get_map_field_int(map_name, btf, m, &res32); + if (ret) + *res = (__u64)res32; + return ret; + } + + if (!btf_is_enum(t) && !btf_is_enum64(t)) { + pr_warn("map '%s': attr '%s': expected ENUM or ENUM64, got %s.\n", + map_name, name, btf_kind_str(t)); + return false; + } - len = snprintf(buf, PATH_MAX, "%s/%s", path, bpf_map__name(map)); + if (btf_vlen(t) != 1) { + pr_warn("map '%s': attr '%s': invalid __ulong\n", + map_name, name); + return false; + } + + if (btf_is_enum(t)) { + const struct btf_enum *e = btf_enum(t); + + *res = e->val; + } else { + const struct btf_enum64 *e = btf_enum64(t); + + *res = btf_enum64_value(e); + } + return true; +} + +static int pathname_concat(char *buf, size_t buf_sz, const char *path, const char *name) +{ + int len; + + len = snprintf(buf, buf_sz, "%s/%s", path, name); if (len < 0) return -EINVAL; - else if (len >= PATH_MAX) + if (len >= buf_sz) return -ENAMETOOLONG; + return 0; +} + +static int build_map_pin_path(struct bpf_map *map, const char *path) +{ + char buf[PATH_MAX]; + int err; + + if (!path) + path = BPF_FS_DEFAULT_PATH; + + err = pathname_concat(buf, sizeof(buf), path, bpf_map__name(map)); + if (err) + return err; + return bpf_map__set_pin_path(map, buf); } +/* should match definition in bpf_helpers.h */ +enum libbpf_pin_type { + LIBBPF_PIN_NONE, + /* PIN_BY_NAME: pin maps by name (in /sys/fs/bpf by default) */ + LIBBPF_PIN_BY_NAME, +}; + int parse_btf_map_def(const char *map_name, struct btf *btf, const struct btf_type *def_t, bool strict, struct btf_map_def *map_def, struct btf_map_def *inner_def) @@ -2389,9 +2707,9 @@ int parse_btf_map_def(const char *map_name, struct btf *btf, map_def->pinning = val; map_def->parts |= MAP_DEF_PINNING; } else if (strcmp(name, "map_extra") == 0) { - __u32 map_extra; + __u64 map_extra; - if (!get_map_field_int(map_name, btf, m, &map_extra)) + if (!get_map_field_long(map_name, btf, m, &map_extra)) return -EINVAL; map_def->map_extra = map_extra; map_def->parts |= MAP_DEF_MAP_EXTRA; @@ -2412,6 +2730,43 @@ int parse_btf_map_def(const char *map_name, struct btf *btf, return 0; } +static size_t adjust_ringbuf_sz(size_t sz) +{ + __u32 page_sz = sysconf(_SC_PAGE_SIZE); + __u32 mul; + + /* if user forgot to set any size, make sure they see error */ + if (sz == 0) + return 0; + /* Kernel expects BPF_MAP_TYPE_RINGBUF's max_entries to be + * a power-of-2 multiple of kernel's page size. If user diligently + * satisified these conditions, pass the size through. + */ + if ((sz % page_sz) == 0 && is_pow_of_2(sz / page_sz)) + return sz; + + /* Otherwise find closest (page_sz * power_of_2) product bigger than + * user-set size to satisfy both user size request and kernel + * requirements and substitute correct max_entries for map creation. + */ + for (mul = 1; mul <= UINT_MAX / page_sz; mul <<= 1) { + if (mul * page_sz > sz) + return mul * page_sz; + } + + /* if it's impossible to satisfy the conditions (i.e., user size is + * very close to UINT_MAX but is not a power-of-2 multiple of + * page_size) then just return original size and let kernel reject it + */ + return sz; +} + +static bool map_is_ringbuf(const struct bpf_map *map) +{ + return map->def.type == BPF_MAP_TYPE_RINGBUF || + map->def.type == BPF_MAP_TYPE_USER_RINGBUF; +} + static void fill_map_from_def(struct bpf_map *map, const struct btf_map_def *def) { map->def.type = def->map_type; @@ -2425,6 +2780,10 @@ static void fill_map_from_def(struct bpf_map *map, const struct btf_map_def *def map->btf_key_type_id = def->key_type_id; map->btf_value_type_id = def->value_type_id; + /* auto-adjust BPF ringbuf map max_entries to be a multiple of page size */ + if (map_is_ringbuf(map)) + map->def.max_entries = adjust_ringbuf_sz(map->def.max_entries); + if (def->parts & MAP_DEF_MAP_TYPE) pr_debug("map '%s': found type = %u.\n", map->name, def->map_type); @@ -2549,7 +2908,9 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, map->inner_map = calloc(1, sizeof(*map->inner_map)); if (!map->inner_map) return -ENOMEM; - map->inner_map->fd = -1; + map->inner_map->fd = create_placeholder_fd(); + if (map->inner_map->fd < 0) + return map->inner_map->fd; map->inner_map->sec_idx = sec_idx; map->inner_map->name = malloc(strlen(map_name) + sizeof(".inner") + 1); if (!map->inner_map->name) @@ -2559,13 +2920,39 @@ static int bpf_object__init_user_btf_map(struct bpf_object *obj, fill_map_from_def(map->inner_map, &inner_def); } - err = bpf_map_find_btf_info(obj, map); + err = map_fill_btf_type_info(obj, map); if (err) return err; return 0; } +static int init_arena_map_data(struct bpf_object *obj, struct bpf_map *map, + const char *sec_name, int sec_idx, + void *data, size_t data_sz) +{ + const long page_sz = sysconf(_SC_PAGE_SIZE); + size_t mmap_sz; + + mmap_sz = bpf_map_mmap_sz(obj->arena_map); + if (roundup(data_sz, page_sz) > mmap_sz) { + pr_warn("elf: sec '%s': declared ARENA map size (%zu) is too small to hold global __arena variables of size %zu\n", + sec_name, mmap_sz, data_sz); + return -E2BIG; + } + + obj->arena_data = malloc(data_sz); + if (!obj->arena_data) + return -ENOMEM; + memcpy(obj->arena_data, data, data_sz); + obj->arena_data_sz = data_sz; + + /* make bpf_map__init_value() work for ARENA maps */ + map->mmaped = obj->arena_data; + + return 0; +} + static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, const char *pin_root_path) { @@ -2615,6 +3002,33 @@ static int bpf_object__init_user_btf_maps(struct bpf_object *obj, bool strict, return err; } + for (i = 0; i < obj->nr_maps; i++) { + struct bpf_map *map = &obj->maps[i]; + + if (map->def.type != BPF_MAP_TYPE_ARENA) + continue; + + if (obj->arena_map) { + pr_warn("map '%s': only single ARENA map is supported (map '%s' is also ARENA)\n", + map->name, obj->arena_map->name); + return -EINVAL; + } + obj->arena_map = map; + + if (obj->efile.arena_data) { + err = init_arena_map_data(obj, map, ARENA_SEC, obj->efile.arena_data_shndx, + obj->efile.arena_data->d_buf, + obj->efile.arena_data->d_size); + if (err) + return err; + } + } + if (obj->efile.arena_data && !obj->arena_map) { + pr_warn("elf: sec '%s': to use global __arena variables the ARENA map should be explicitly declared in SEC(\".maps\")\n", + ARENA_SEC); + return -ENOENT; + } + return 0; } @@ -2623,16 +3037,15 @@ static int bpf_object__init_maps(struct bpf_object *obj, { const char *pin_root_path; bool strict; - int err; + int err = 0; strict = !OPTS_GET(opts, relaxed_maps, false); pin_root_path = OPTS_GET(opts, pin_root_path, NULL); - err = bpf_object__init_user_maps(obj, strict); - err = err ?: bpf_object__init_user_btf_maps(obj, strict, pin_root_path); + err = bpf_object__init_user_btf_maps(obj, strict, pin_root_path); err = err ?: bpf_object__init_global_data_maps(obj); err = err ?: bpf_object__init_kconfig_map(obj); - err = err ?: bpf_object__init_struct_ops_maps(obj); + err = err ?: bpf_object_init_struct_ops(obj); return err; } @@ -2648,6 +3061,11 @@ static bool section_have_execinstr(struct bpf_object *obj, int idx) return sh->sh_flags & SHF_EXECINSTR; } +static bool starts_with_qmark(const char *s) +{ + return s && s[0] == '?'; +} + static bool btf_needs_sanitization(struct bpf_object *obj) { bool has_func_global = kernel_supports(obj, FEAT_BTF_GLOBAL_FUNC); @@ -2656,12 +3074,14 @@ static bool btf_needs_sanitization(struct bpf_object *obj) bool has_func = kernel_supports(obj, FEAT_BTF_FUNC); bool has_decl_tag = kernel_supports(obj, FEAT_BTF_DECL_TAG); bool has_type_tag = kernel_supports(obj, FEAT_BTF_TYPE_TAG); + bool has_enum64 = kernel_supports(obj, FEAT_BTF_ENUM64); + bool has_qmark_datasec = kernel_supports(obj, FEAT_BTF_QMARK_DATASEC); return !has_func || !has_datasec || !has_func_global || !has_float || - !has_decl_tag || !has_type_tag; + !has_decl_tag || !has_type_tag || !has_enum64 || !has_qmark_datasec; } -static void bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf) +static int bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf) { bool has_func_global = kernel_supports(obj, FEAT_BTF_GLOBAL_FUNC); bool has_datasec = kernel_supports(obj, FEAT_BTF_DATASEC); @@ -2669,6 +3089,9 @@ static void bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf) bool has_func = kernel_supports(obj, FEAT_BTF_FUNC); bool has_decl_tag = kernel_supports(obj, FEAT_BTF_DECL_TAG); bool has_type_tag = kernel_supports(obj, FEAT_BTF_TYPE_TAG); + bool has_enum64 = kernel_supports(obj, FEAT_BTF_ENUM64); + bool has_qmark_datasec = kernel_supports(obj, FEAT_BTF_QMARK_DATASEC); + int enum64_placeholder_id = 0; struct btf_type *t; int i, j, vlen; @@ -2694,7 +3117,7 @@ static void bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf) name = (char *)btf__name_by_offset(btf, t->name_off); while (*name) { - if (*name == '.') + if (*name == '.' || *name == '?') *name = '_'; name++; } @@ -2709,6 +3132,14 @@ static void bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf) vt = (void *)btf__type_by_id(btf, v->type); m->name_off = vt->name_off; } + } else if (!has_qmark_datasec && btf_is_datasec(t) && + starts_with_qmark(btf__name_by_offset(btf, t->name_off))) { + /* replace '?' prefix with '_' for DATASEC names */ + char *name; + + name = (char *)btf__name_by_offset(btf, t->name_off); + if (name[0] == '?') + name[0] = '_'; } else if (!has_func && btf_is_func_proto(t)) { /* replace FUNC_PROTO with ENUM */ vlen = btf_vlen(t); @@ -2731,20 +3162,44 @@ static void bpf_object__sanitize_btf(struct bpf_object *obj, struct btf *btf) /* replace TYPE_TAG with a CONST */ t->name_off = 0; t->info = BTF_INFO_ENC(BTF_KIND_CONST, 0, 0); + } else if (!has_enum64 && btf_is_enum(t)) { + /* clear the kflag */ + t->info = btf_type_info(btf_kind(t), btf_vlen(t), false); + } else if (!has_enum64 && btf_is_enum64(t)) { + /* replace ENUM64 with a union */ + struct btf_member *m; + + if (enum64_placeholder_id == 0) { + enum64_placeholder_id = btf__add_int(btf, "enum64_placeholder", 1, 0); + if (enum64_placeholder_id < 0) + return enum64_placeholder_id; + + t = (struct btf_type *)btf__type_by_id(btf, i); + } + + m = btf_members(t); + vlen = btf_vlen(t); + t->info = BTF_INFO_ENC(BTF_KIND_UNION, 0, vlen); + for (j = 0; j < vlen; j++, m++) { + m->type = enum64_placeholder_id; + m->offset = 0; + } } } + + return 0; } static bool libbpf_needs_btf(const struct bpf_object *obj) { return obj->efile.btf_maps_shndx >= 0 || - obj->efile.st_ops_shndx >= 0 || + obj->efile.has_st_ops || obj->nr_extern > 0; } static bool kernel_needs_btf(const struct bpf_object *obj) { - return obj->efile.st_ops_shndx >= 0; + return obj->efile.has_st_ops; } static int bpf_object__init_btf(struct bpf_object *obj, @@ -2838,57 +3293,89 @@ static int compare_vsi_off(const void *_a, const void *_b) static int btf_fixup_datasec(struct bpf_object *obj, struct btf *btf, struct btf_type *t) { - __u32 size = 0, off = 0, i, vars = btf_vlen(t); - const char *name = btf__name_by_offset(btf, t->name_off); - const struct btf_type *t_var; + __u32 size = 0, i, vars = btf_vlen(t); + const char *sec_name = btf__name_by_offset(btf, t->name_off); struct btf_var_secinfo *vsi; - const struct btf_var *var; - int ret; + bool fixup_offsets = false; + int err; - if (!name) { + if (!sec_name) { pr_debug("No name found in string section for DATASEC kind.\n"); return -ENOENT; } - /* .extern datasec size and var offsets were set correctly during - * extern collection step, so just skip straight to sorting variables + /* Extern-backing datasecs (.ksyms, .kconfig) have their size and + * variable offsets set at the previous step. Further, not every + * extern BTF VAR has corresponding ELF symbol preserved, so we skip + * all fixups altogether for such sections and go straight to sorting + * VARs within their DATASEC. */ - if (t->size) + if (strcmp(sec_name, KCONFIG_SEC) == 0 || strcmp(sec_name, KSYMS_SEC) == 0) goto sort_vars; - ret = find_elf_sec_sz(obj, name, &size); - if (ret || !size) { - pr_debug("Invalid size for section %s: %u bytes\n", name, size); - return -ENOENT; - } + /* Clang leaves DATASEC size and VAR offsets as zeroes, so we need to + * fix this up. But BPF static linker already fixes this up and fills + * all the sizes and offsets during static linking. So this step has + * to be optional. But the STV_HIDDEN handling is non-optional for any + * non-extern DATASEC, so the variable fixup loop below handles both + * functions at the same time, paying the cost of BTF VAR <-> ELF + * symbol matching just once. + */ + if (t->size == 0) { + err = find_elf_sec_sz(obj, sec_name, &size); + if (err || !size) { + pr_debug("sec '%s': failed to determine size from ELF: size %u, err %d\n", + sec_name, size, err); + return -ENOENT; + } - t->size = size; + t->size = size; + fixup_offsets = true; + } for (i = 0, vsi = btf_var_secinfos(t); i < vars; i++, vsi++) { + const struct btf_type *t_var; + struct btf_var *var; + const char *var_name; + Elf64_Sym *sym; + t_var = btf__type_by_id(btf, vsi->type); if (!t_var || !btf_is_var(t_var)) { - pr_debug("Non-VAR type seen in section %s\n", name); + pr_debug("sec '%s': unexpected non-VAR type found\n", sec_name); return -EINVAL; } var = btf_var(t_var); - if (var->linkage == BTF_VAR_STATIC) + if (var->linkage == BTF_VAR_STATIC || var->linkage == BTF_VAR_GLOBAL_EXTERN) continue; - name = btf__name_by_offset(btf, t_var->name_off); - if (!name) { - pr_debug("No name found in string section for VAR kind\n"); + var_name = btf__name_by_offset(btf, t_var->name_off); + if (!var_name) { + pr_debug("sec '%s': failed to find name of DATASEC's member #%d\n", + sec_name, i); return -ENOENT; } - ret = find_elf_var_offset(obj, name, &off); - if (ret) { - pr_debug("No offset found in symbol table for VAR %s\n", - name); + sym = find_elf_var_sym(obj, var_name); + if (IS_ERR(sym)) { + pr_debug("sec '%s': failed to find ELF symbol for VAR '%s'\n", + sec_name, var_name); return -ENOENT; } - vsi->offset = off; + if (fixup_offsets) + vsi->offset = sym->st_value; + + /* if variable is a global/weak symbol, but has restricted + * (STV_HIDDEN or STV_INTERNAL) visibility, mark its BTF VAR + * as static. This follows similar logic for functions (BPF + * subprogs) and influences libbpf's further decisions about + * whether to make global data BPF array maps as + * BPF_F_MMAPABLE. + */ + if (ELF64_ST_VISIBILITY(sym->st_other) == STV_HIDDEN + || ELF64_ST_VISIBILITY(sym->st_other) == STV_INTERNAL) + var->linkage = BTF_VAR_STATIC; } sort_vars: @@ -2896,13 +3383,16 @@ static int btf_fixup_datasec(struct bpf_object *obj, struct btf *btf, return 0; } -static int btf_finalize_data(struct bpf_object *obj, struct btf *btf) +static int bpf_object_fixup_btf(struct bpf_object *obj) { - int err = 0; - __u32 i, n = btf__type_cnt(btf); + int i, n, err = 0; + + if (!obj->btf) + return 0; + n = btf__type_cnt(obj->btf); for (i = 1; i < n; i++) { - struct btf_type *t = btf_type_by_id(btf, i); + struct btf_type *t = btf_type_by_id(obj->btf, i); /* Loader needs to fix up some of the things compiler * couldn't get its hands on while emitting BTF. This @@ -2910,33 +3400,12 @@ static int btf_finalize_data(struct bpf_object *obj, struct btf *btf) * the info from the ELF itself for this purpose. */ if (btf_is_datasec(t)) { - err = btf_fixup_datasec(obj, btf, t); + err = btf_fixup_datasec(obj, obj->btf, t); if (err) - break; + return err; } } - return libbpf_err(err); -} - -int btf__finalize_data(struct bpf_object *obj, struct btf *btf) -{ - return btf_finalize_data(obj, btf); -} - -static int bpf_object__finalize_btf(struct bpf_object *obj) -{ - int err; - - if (!obj->btf) - return 0; - - err = btf_finalize_data(obj, obj->btf); - if (err) { - pr_warn("Error finalizing %s: %d.\n", BTF_ELF_SEC, err); - return err; - } - return 0; } @@ -2955,9 +3424,15 @@ static bool prog_needs_vmlinux_btf(struct bpf_program *prog) return false; } +static bool map_needs_vmlinux_btf(struct bpf_map *map) +{ + return bpf_map__is_struct_ops(map); +} + static bool obj_needs_vmlinux_btf(const struct bpf_object *obj) { struct bpf_program *prog; + struct bpf_map *map; int i; /* CO-RE relocations need kernel BTF, only when btf_custom_path @@ -2982,6 +3457,11 @@ static bool obj_needs_vmlinux_btf(const struct bpf_object *obj) return true; } + bpf_object__for_each_map(map, obj) { + if (map_needs_vmlinux_btf(map)) + return true; + } + return false; } @@ -3070,7 +3550,9 @@ static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj) /* enforce 8-byte pointers for BPF-targeted BTFs */ btf__set_pointer_size(obj->btf, 8); - bpf_object__sanitize_btf(obj, kern_btf); + err = bpf_object__sanitize_btf(obj, kern_btf); + if (err) + return err; } if (obj->gen_loader) { @@ -3087,7 +3569,7 @@ static int bpf_object__sanitize_and_load_btf(struct bpf_object *obj) } else { /* currently BPF_BTF_LOAD only supports log_level 1 */ err = btf_load_into_kernel(kern_btf, obj->log_buf, obj->log_size, - obj->log_level ? 1 : 0); + obj->log_level ? 1 : 0, obj->token_fd); } if (sanitize) { if (!err) { @@ -3307,10 +3789,15 @@ static int bpf_object__elf_collect(struct bpf_object *obj) Elf64_Shdr *sh; /* ELF section indices are 0-based, but sec #0 is special "invalid" - * section. e_shnum does include sec #0, so e_shnum is the necessary - * size of an array to keep all the sections. + * section. Since section count retrieved by elf_getshdrnum() does + * include sec #0, it is already the necessary size of an array to keep + * all the sections. */ - obj->efile.sec_cnt = obj->efile.ehdr->e_shnum; + if (elf_getshdrnum(obj->efile.elf, &obj->efile.sec_cnt)) { + pr_warn("elf: failed to get the number of sections for %s: %s\n", + obj->path, elf_errmsg(-1)); + return -LIBBPF_ERRNO__FORMAT; + } obj->efile.secs = calloc(obj->efile.sec_cnt, sizeof(*obj->efile.secs)); if (!obj->efile.secs) return -ENOMEM; @@ -3382,7 +3869,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj) if (err) return err; } else if (strcmp(name, "maps") == 0) { - obj->efile.maps_shndx = idx; + pr_warn("elf: legacy map definitions in 'maps' section are not supported by libbpf v1.0+\n"); + return -ENOTSUP; } else if (strcmp(name, MAPS_ELF_SEC) == 0) { obj->efile.btf_maps_shndx = idx; } else if (strcmp(name, BTF_ELF_SEC) == 0) { @@ -3412,9 +3900,17 @@ static int bpf_object__elf_collect(struct bpf_object *obj) sec_desc->sec_type = SEC_RODATA; sec_desc->shdr = sh; sec_desc->data = data; - } else if (strcmp(name, STRUCT_OPS_SEC) == 0) { - obj->efile.st_ops_data = data; - obj->efile.st_ops_shndx = idx; + } else if (strcmp(name, STRUCT_OPS_SEC) == 0 || + strcmp(name, STRUCT_OPS_LINK_SEC) == 0 || + strcmp(name, "?" STRUCT_OPS_SEC) == 0 || + strcmp(name, "?" STRUCT_OPS_LINK_SEC) == 0) { + sec_desc->sec_type = SEC_ST_OPS; + sec_desc->shdr = sh; + sec_desc->data = data; + obj->efile.has_st_ops = true; + } else if (strcmp(name, ARENA_SEC) == 0) { + obj->efile.arena_data = data; + obj->efile.arena_data_shndx = idx; } else { pr_info("elf: skipping unrecognized data section(%d) %s\n", idx, name); @@ -3429,6 +3925,9 @@ static int bpf_object__elf_collect(struct bpf_object *obj) /* Only do relo for section with exec instructions */ if (!section_have_execinstr(obj, targ_sec_idx) && strcmp(name, ".rel" STRUCT_OPS_SEC) && + strcmp(name, ".rel" STRUCT_OPS_LINK_SEC) && + strcmp(name, ".rel?" STRUCT_OPS_SEC) && + strcmp(name, ".rel?" STRUCT_OPS_LINK_SEC) && strcmp(name, ".rel" MAPS_ELF_SEC)) { pr_info("elf: skipping relo section(%d) %s for section(%d) %s\n", idx, name, targ_sec_idx, @@ -3439,7 +3938,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj) sec_desc->sec_type = SEC_RELO; sec_desc->shdr = sh; sec_desc->data = data; - } else if (sh->sh_type == SHT_NOBITS && strcmp(name, BSS_SEC) == 0) { + } else if (sh->sh_type == SHT_NOBITS && (strcmp(name, BSS_SEC) == 0 || + str_has_pfx(name, BSS_SEC "."))) { sec_desc->sec_type = SEC_BSS; sec_desc->shdr = sh; sec_desc->data = data; @@ -3455,7 +3955,8 @@ static int bpf_object__elf_collect(struct bpf_object *obj) } /* sort BPF programs by section name and in-section instruction offset - * for faster search */ + * for faster search + */ if (obj->nr_programs) qsort(obj->programs, obj->nr_programs, sizeof(*obj->programs), cmp_progs); @@ -3577,6 +4078,10 @@ static enum kcfg_type find_kcfg_type(const struct btf *btf, int id, if (strcmp(name, "libbpf_tristate")) return KCFG_UNKNOWN; return KCFG_TRISTATE; + case BTF_KIND_ENUM64: + if (strcmp(name, "libbpf_tristate")) + return KCFG_UNKNOWN; + return KCFG_TRISTATE; case BTF_KIND_ARRAY: if (btf_array(t)->nelems == 0) return KCFG_UNKNOWN; @@ -3671,6 +4176,7 @@ static int bpf_object__collect_externs(struct bpf_object *obj) struct extern_desc *ext; int i, n, off, dummy_var_btf_id; const char *ext_name, *sec_name; + size_t ext_essent_len; Elf_Scn *scn; Elf64_Shdr *sh; @@ -3720,6 +4226,14 @@ static int bpf_object__collect_externs(struct bpf_object *obj) ext->sym_idx = i; ext->is_weak = ELF64_ST_BIND(sym->st_info) == STB_WEAK; + ext_essent_len = bpf_core_essential_name_len(ext->name); + ext->essent_name = NULL; + if (ext_essent_len != strlen(ext->name)) { + ext->essent_name = strndup(ext->name, ext_essent_len); + if (!ext->essent_name) + return -ENOMEM; + } + ext->sec_btf_id = find_extern_sec_btf_id(obj->btf, ext->btf_id); if (ext->sec_btf_id <= 0) { pr_warn("failed to find BTF for extern '%s' [%d] section: %d\n", @@ -3750,9 +4264,9 @@ static int bpf_object__collect_externs(struct bpf_object *obj) return -EINVAL; } ext->kcfg.type = find_kcfg_type(obj->btf, t->type, - &ext->kcfg.is_signed); + &ext->kcfg.is_signed); if (ext->kcfg.type == KCFG_UNKNOWN) { - pr_warn("extern (kcfg) '%s' type is unsupported\n", ext_name); + pr_warn("extern (kcfg) '%s': type is unsupported\n", ext_name); return -ENOTSUP; } } else if (strcmp(sec_name, KSYMS_SEC) == 0) { @@ -3874,41 +4388,8 @@ static int bpf_object__collect_externs(struct bpf_object *obj) return 0; } -struct bpf_program * -bpf_object__find_program_by_title(const struct bpf_object *obj, - const char *title) +static bool prog_is_subprog(const struct bpf_object *obj, const struct bpf_program *prog) { - struct bpf_program *pos; - - bpf_object__for_each_program(pos, obj) { - if (pos->sec_name && !strcmp(pos->sec_name, title)) - return pos; - } - return errno = ENOENT, NULL; -} - -static bool prog_is_subprog(const struct bpf_object *obj, - const struct bpf_program *prog) -{ - /* For legacy reasons, libbpf supports an entry-point BPF programs - * without SEC() attribute, i.e., those in the .text section. But if - * there are 2 or more such programs in the .text section, they all - * must be subprograms called from entry-point BPF programs in - * designated SEC()'tions, otherwise there is no way to distinguish - * which of those programs should be loaded vs which are a subprogram. - * Similarly, if there is a function/program in .text and at least one - * other BPF program with custom SEC() attribute, then we just assume - * .text programs are subprograms (even if they are not called from - * other programs), because libbpf never explicitly supported mixing - * SEC()-designated BPF programs and .text entry-point BPF programs. - * - * In libbpf 1.0 strict mode, we always consider .text - * programs to be subprograms. - */ - - if (libbpf_mode & LIBBPF_STRICT_SEC_NAME) - return prog->sec_idx == obj->efile.text_shndx; - return prog->sec_idx == obj->efile.text_shndx && obj->nr_programs > 1; } @@ -3943,8 +4424,7 @@ static bool bpf_object__shndx_is_data(const struct bpf_object *obj, static bool bpf_object__shndx_is_maps(const struct bpf_object *obj, int shndx) { - return shndx == obj->efile.maps_shndx || - shndx == obj->efile.btf_maps_shndx; + return shndx == obj->efile.btf_maps_shndx; } static enum libbpf_map_type @@ -4002,11 +4482,11 @@ static int bpf_program__record_reloc(struct bpf_program *prog, pr_debug("prog '%s': found extern #%d '%s' (sym %d) for insn #%u\n", prog->name, i, ext->name, ext->sym_idx, insn_idx); if (insn->code == (BPF_JMP | BPF_CALL)) - reloc_desc->type = RELO_EXTERN_FUNC; + reloc_desc->type = RELO_EXTERN_CALL; else - reloc_desc->type = RELO_EXTERN_VAR; + reloc_desc->type = RELO_EXTERN_LD64; reloc_desc->insn_idx = insn_idx; - reloc_desc->sym_off = i; /* sym_off stores extern index */ + reloc_desc->ext_idx = i; return 0; } @@ -4060,6 +4540,15 @@ static int bpf_program__record_reloc(struct bpf_program *prog, type = bpf_object__section_to_libbpf_map_type(obj, shdr_idx); sym_sec_name = elf_sec_name(obj, elf_sec_by_idx(obj, shdr_idx)); + /* arena data relocation */ + if (shdr_idx == obj->efile.arena_data_shndx) { + reloc_desc->type = RELO_DATA; + reloc_desc->insn_idx = insn_idx; + reloc_desc->map_idx = obj->arena_map - obj->maps; + reloc_desc->sym_off = sym->st_value; + return 0; + } + /* generic map reference relocation */ if (type == LIBBPF_MAP_UNSPEC) { if (!bpf_object__shndx_is_maps(obj, shdr_idx)) { @@ -4130,6 +4619,9 @@ static struct bpf_program *find_prog_by_sec_insn(const struct bpf_object *obj, int l = 0, r = obj->nr_programs - 1, m; struct bpf_program *prog; + if (!obj->nr_programs) + return NULL; + while (l < r) { m = l + (r - l + 1) / 2; prog = &obj->programs[m]; @@ -4169,6 +4661,8 @@ bpf_object__collect_prog_relos(struct bpf_object *obj, Elf64_Shdr *shdr, Elf_Dat scn = elf_sec_by_idx(obj, sec_idx); scn_data = elf_sec_data(obj, scn); + if (!scn_data) + return -LIBBPF_ERRNO__FORMAT; relo_sec_name = elf_sec_str(obj, shdr->sh_name); sec_name = elf_sec_name(obj, scn); @@ -4247,11 +4741,9 @@ bpf_object__collect_prog_relos(struct bpf_object *obj, Elf64_Shdr *shdr, Elf_Dat return 0; } -static int bpf_map_find_btf_info(struct bpf_object *obj, struct bpf_map *map) +static int map_fill_btf_type_info(struct bpf_object *obj, struct bpf_map *map) { - struct bpf_map_def *def = &map->def; - __u32 key_type_id = 0, value_type_id = 0; - int ret; + int id; if (!obj->btf) return -ENOENT; @@ -4260,31 +4752,22 @@ static int bpf_map_find_btf_info(struct bpf_object *obj, struct bpf_map *map) * For struct_ops map, it does not need btf_key_type_id and * btf_value_type_id. */ - if (map->sec_idx == obj->efile.btf_maps_shndx || - bpf_map__is_struct_ops(map)) + if (map->sec_idx == obj->efile.btf_maps_shndx || bpf_map__is_struct_ops(map)) return 0; - if (!bpf_map__is_internal(map)) { - pr_warn("Use of BPF_ANNOTATE_KV_PAIR is deprecated, use BTF-defined maps in .maps section instead\n"); -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" - ret = btf__get_map_kv_tids(obj->btf, map->name, def->key_size, - def->value_size, &key_type_id, - &value_type_id); -#pragma GCC diagnostic pop - } else { - /* - * LLVM annotates global data differently in BTF, that is, - * only as '.data', '.bss' or '.rodata'. - */ - ret = btf__find_by_name(obj->btf, map->real_name); - } - if (ret < 0) - return ret; + /* + * LLVM annotates global data differently in BTF, that is, + * only as '.data', '.bss' or '.rodata'. + */ + if (!bpf_map__is_internal(map)) + return -ENOENT; + + id = btf__find_by_name(obj->btf, map->real_name); + if (id < 0) + return id; - map->btf_key_type_id = key_type_id; - map->btf_value_type_id = bpf_map__is_internal(map) ? - ret : value_type_id; + map->btf_key_type_id = 0; + map->btf_value_type_id = id; return 0; } @@ -4298,7 +4781,7 @@ static int bpf_get_map_info_from_fdinfo(int fd, struct bpf_map_info *info) snprintf(file, sizeof(file), "/proc/%d/fdinfo/%d", getpid(), fd); memset(info, 0, sizeof(*info)); - fp = fopen(file, "r"); + fp = fopen(file, "re"); if (!fp) { err = -errno; pr_warn("failed to open %s: %d. No procfs support?\n", file, @@ -4324,43 +4807,74 @@ static int bpf_get_map_info_from_fdinfo(int fd, struct bpf_map_info *info) return 0; } +bool bpf_map__autocreate(const struct bpf_map *map) +{ + return map->autocreate; +} + +int bpf_map__set_autocreate(struct bpf_map *map, bool autocreate) +{ + if (map->obj->loaded) + return libbpf_err(-EBUSY); + + map->autocreate = autocreate; + return 0; +} + +int bpf_map__set_autoattach(struct bpf_map *map, bool autoattach) +{ + if (!bpf_map__is_struct_ops(map)) + return libbpf_err(-EINVAL); + + map->autoattach = autoattach; + return 0; +} + +bool bpf_map__autoattach(const struct bpf_map *map) +{ + return map->autoattach; +} + int bpf_map__reuse_fd(struct bpf_map *map, int fd) { - struct bpf_map_info info = {}; - __u32 len = sizeof(info); + struct bpf_map_info info; + __u32 len = sizeof(info), name_len; int new_fd, err; char *new_name; - err = bpf_obj_get_info_by_fd(fd, &info, &len); + memset(&info, 0, len); + err = bpf_map_get_info_by_fd(fd, &info, &len); if (err && errno == EINVAL) err = bpf_get_map_info_from_fdinfo(fd, &info); if (err) return libbpf_err(err); - new_name = strdup(info.name); + name_len = strlen(info.name); + if (name_len == BPF_OBJ_NAME_LEN - 1 && strncmp(map->name, info.name, name_len) == 0) + new_name = strdup(map->name); + else + new_name = strdup(info.name); + if (!new_name) return libbpf_err(-errno); - new_fd = open("/", O_RDONLY | O_CLOEXEC); + /* + * Like dup(), but make sure new FD is >= 3 and has O_CLOEXEC set. + * This is similar to what we do in ensure_good_fd(), but without + * closing original FD. + */ + new_fd = fcntl(fd, F_DUPFD_CLOEXEC, 3); if (new_fd < 0) { err = -errno; goto err_free_new_name; } - new_fd = dup3(fd, new_fd, O_CLOEXEC); - if (new_fd < 0) { - err = -errno; - goto err_close_new_fd; - } + err = reuse_fd(map->fd, new_fd); + if (err) + goto err_free_new_name; - err = zclose(map->fd); - if (err) { - err = -errno; - goto err_close_new_fd; - } free(map->name); - map->fd = new_fd; map->name = new_name; map->def.type = info.type; map->def.key_size = info.key_size; @@ -4374,8 +4888,6 @@ int bpf_map__reuse_fd(struct bpf_map *map, int fd) return 0; -err_close_new_fd: - close(new_fd); err_free_new_name: free(new_name); return libbpf_err(err); @@ -4396,18 +4908,68 @@ struct bpf_map *bpf_map__inner_map(struct bpf_map *map) int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries) { - if (map->fd >= 0) + if (map->obj->loaded) return libbpf_err(-EBUSY); + map->def.max_entries = max_entries; + + /* auto-adjust BPF ringbuf map max_entries to be a multiple of page size */ + if (map_is_ringbuf(map)) + map->def.max_entries = adjust_ringbuf_sz(map->def.max_entries); + return 0; } -int bpf_map__resize(struct bpf_map *map, __u32 max_entries) +static int bpf_object_prepare_token(struct bpf_object *obj) { - if (!map || !max_entries) - return libbpf_err(-EINVAL); + const char *bpffs_path; + int bpffs_fd = -1, token_fd, err; + bool mandatory; + enum libbpf_print_level level; - return bpf_map__set_max_entries(map, max_entries); + /* token is explicitly prevented */ + if (obj->token_path && obj->token_path[0] == '\0') { + pr_debug("object '%s': token is prevented, skipping...\n", obj->name); + return 0; + } + + mandatory = obj->token_path != NULL; + level = mandatory ? LIBBPF_WARN : LIBBPF_DEBUG; + + bpffs_path = obj->token_path ?: BPF_FS_DEFAULT_PATH; + bpffs_fd = open(bpffs_path, O_DIRECTORY, O_RDWR); + if (bpffs_fd < 0) { + err = -errno; + __pr(level, "object '%s': failed (%d) to open BPF FS mount at '%s'%s\n", + obj->name, err, bpffs_path, + mandatory ? "" : ", skipping optional step..."); + return mandatory ? err : 0; + } + + token_fd = bpf_token_create(bpffs_fd, 0); + close(bpffs_fd); + if (token_fd < 0) { + if (!mandatory && token_fd == -ENOENT) { + pr_debug("object '%s': BPF FS at '%s' doesn't have BPF token delegation set up, skipping...\n", + obj->name, bpffs_path); + return 0; + } + __pr(level, "object '%s': failed (%d) to create BPF token from '%s'%s\n", + obj->name, token_fd, bpffs_path, + mandatory ? "" : ", skipping optional step..."); + return mandatory ? token_fd : 0; + } + + obj->feat_cache = calloc(1, sizeof(*obj->feat_cache)); + if (!obj->feat_cache) { + close(token_fd); + return -ENOMEM; + } + + obj->token_fd = token_fd; + obj->feat_cache->token_fd = token_fd; + + return 0; } static int @@ -4419,6 +4981,10 @@ bpf_object__probe_loading(struct bpf_object *obj) BPF_EXIT_INSN(), }; int ret, insn_cnt = ARRAY_SIZE(insns); + LIBBPF_OPTS(bpf_prog_load_opts, opts, + .token_fd = obj->token_fd, + .prog_flags = obj->token_fd ? BPF_F_TOKEN_FD : 0, + ); if (obj->gen_loader) return 0; @@ -4428,9 +4994,9 @@ bpf_object__probe_loading(struct bpf_object *obj) pr_warn("Failed to bump RLIMIT_MEMLOCK (err = %d), you might need to do it explicitly!\n", ret); /* make sure basic loading works */ - ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, NULL); + ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, &opts); if (ret < 0) - ret = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, NULL); + ret = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, &opts); if (ret < 0) { ret = errno; cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); @@ -4445,410 +5011,29 @@ bpf_object__probe_loading(struct bpf_object *obj) return 0; } -static int probe_fd(int fd) -{ - if (fd >= 0) - close(fd); - return fd >= 0; -} - -static int probe_kern_prog_name(void) -{ - struct bpf_insn insns[] = { - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), - }; - int ret, insn_cnt = ARRAY_SIZE(insns); - - /* make sure loading with name works */ - ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, "test", "GPL", insns, insn_cnt, NULL); - return probe_fd(ret); -} - -static int probe_kern_global_data(void) -{ - char *cp, errmsg[STRERR_BUFSIZE]; - struct bpf_insn insns[] = { - BPF_LD_MAP_VALUE(BPF_REG_1, 0, 16), - BPF_ST_MEM(BPF_DW, BPF_REG_1, 0, 42), - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), - }; - int ret, map, insn_cnt = ARRAY_SIZE(insns); - - map = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(int), 32, 1, NULL); - if (map < 0) { - ret = -errno; - cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); - pr_warn("Error in %s():%s(%d). Couldn't create simple array map.\n", - __func__, cp, -ret); - return ret; - } - - insns[0].imm = map; - - ret = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, NULL); - close(map); - return probe_fd(ret); -} - -static int probe_kern_btf(void) -{ - static const char strs[] = "\0int"; - __u32 types[] = { - /* int */ - BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), - }; - - return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), - strs, sizeof(strs))); -} - -static int probe_kern_btf_func(void) -{ - static const char strs[] = "\0int\0x\0a"; - /* void x(int a) {} */ - __u32 types[] = { - /* int */ - BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ - /* FUNC_PROTO */ /* [2] */ - BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 0), - BTF_PARAM_ENC(7, 1), - /* FUNC x */ /* [3] */ - BTF_TYPE_ENC(5, BTF_INFO_ENC(BTF_KIND_FUNC, 0, 0), 2), - }; - - return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), - strs, sizeof(strs))); -} - -static int probe_kern_btf_func_global(void) -{ - static const char strs[] = "\0int\0x\0a"; - /* static void x(int a) {} */ - __u32 types[] = { - /* int */ - BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ - /* FUNC_PROTO */ /* [2] */ - BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_FUNC_PROTO, 0, 1), 0), - BTF_PARAM_ENC(7, 1), - /* FUNC x BTF_FUNC_GLOBAL */ /* [3] */ - BTF_TYPE_ENC(5, BTF_INFO_ENC(BTF_KIND_FUNC, 0, BTF_FUNC_GLOBAL), 2), - }; - - return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), - strs, sizeof(strs))); -} - -static int probe_kern_btf_datasec(void) -{ - static const char strs[] = "\0x\0.data"; - /* static int a; */ - __u32 types[] = { - /* int */ - BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ - /* VAR x */ /* [2] */ - BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1), - BTF_VAR_STATIC, - /* DATASEC val */ /* [3] */ - BTF_TYPE_ENC(3, BTF_INFO_ENC(BTF_KIND_DATASEC, 0, 1), 4), - BTF_VAR_SECINFO_ENC(2, 0, 4), - }; - - return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), - strs, sizeof(strs))); -} - -static int probe_kern_btf_float(void) -{ - static const char strs[] = "\0float"; - __u32 types[] = { - /* float */ - BTF_TYPE_FLOAT_ENC(1, 4), - }; - - return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), - strs, sizeof(strs))); -} - -static int probe_kern_btf_decl_tag(void) -{ - static const char strs[] = "\0tag"; - __u32 types[] = { - /* int */ - BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ - /* VAR x */ /* [2] */ - BTF_TYPE_ENC(1, BTF_INFO_ENC(BTF_KIND_VAR, 0, 0), 1), - BTF_VAR_STATIC, - /* attr */ - BTF_TYPE_DECL_TAG_ENC(1, 2, -1), - }; - - return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), - strs, sizeof(strs))); -} - -static int probe_kern_btf_type_tag(void) -{ - static const char strs[] = "\0tag"; - __u32 types[] = { - /* int */ - BTF_TYPE_INT_ENC(0, BTF_INT_SIGNED, 0, 32, 4), /* [1] */ - /* attr */ - BTF_TYPE_TYPE_TAG_ENC(1, 1), /* [2] */ - /* ptr */ - BTF_TYPE_ENC(0, BTF_INFO_ENC(BTF_KIND_PTR, 0, 0), 2), /* [3] */ - }; - - return probe_fd(libbpf__load_raw_btf((char *)types, sizeof(types), - strs, sizeof(strs))); -} - -static int probe_kern_array_mmap(void) -{ - LIBBPF_OPTS(bpf_map_create_opts, opts, .map_flags = BPF_F_MMAPABLE); - int fd; - - fd = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(int), sizeof(int), 1, &opts); - return probe_fd(fd); -} - -static int probe_kern_exp_attach_type(void) -{ - LIBBPF_OPTS(bpf_prog_load_opts, opts, .expected_attach_type = BPF_CGROUP_INET_SOCK_CREATE); - struct bpf_insn insns[] = { - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), - }; - int fd, insn_cnt = ARRAY_SIZE(insns); - - /* use any valid combination of program type and (optional) - * non-zero expected attach type (i.e., not a BPF_CGROUP_INET_INGRESS) - * to see if kernel supports expected_attach_type field for - * BPF_PROG_LOAD command - */ - fd = bpf_prog_load(BPF_PROG_TYPE_CGROUP_SOCK, NULL, "GPL", insns, insn_cnt, &opts); - return probe_fd(fd); -} - -static int probe_kern_probe_read_kernel(void) -{ - struct bpf_insn insns[] = { - BPF_MOV64_REG(BPF_REG_1, BPF_REG_10), /* r1 = r10 (fp) */ - BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -8), /* r1 += -8 */ - BPF_MOV64_IMM(BPF_REG_2, 8), /* r2 = 8 */ - BPF_MOV64_IMM(BPF_REG_3, 0), /* r3 = 0 */ - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_probe_read_kernel), - BPF_EXIT_INSN(), - }; - int fd, insn_cnt = ARRAY_SIZE(insns); - - fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", insns, insn_cnt, NULL); - return probe_fd(fd); -} - -static int probe_prog_bind_map(void) -{ - char *cp, errmsg[STRERR_BUFSIZE]; - struct bpf_insn insns[] = { - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), - }; - int ret, map, prog, insn_cnt = ARRAY_SIZE(insns); - - map = bpf_map_create(BPF_MAP_TYPE_ARRAY, NULL, sizeof(int), 32, 1, NULL); - if (map < 0) { - ret = -errno; - cp = libbpf_strerror_r(ret, errmsg, sizeof(errmsg)); - pr_warn("Error in %s():%s(%d). Couldn't create simple array map.\n", - __func__, cp, -ret); - return ret; - } - - prog = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, NULL, "GPL", insns, insn_cnt, NULL); - if (prog < 0) { - close(map); - return 0; - } - - ret = bpf_prog_bind_map(prog, map, NULL); - - close(map); - close(prog); - - return ret >= 0; -} - -static int probe_module_btf(void) -{ - static const char strs[] = "\0int"; - __u32 types[] = { - /* int */ - BTF_TYPE_INT_ENC(1, BTF_INT_SIGNED, 0, 32, 4), - }; - struct bpf_btf_info info; - __u32 len = sizeof(info); - char name[16]; - int fd, err; - - fd = libbpf__load_raw_btf((char *)types, sizeof(types), strs, sizeof(strs)); - if (fd < 0) - return 0; /* BTF not supported at all */ - - memset(&info, 0, sizeof(info)); - info.name = ptr_to_u64(name); - info.name_len = sizeof(name); - - /* check that BPF_OBJ_GET_INFO_BY_FD supports specifying name pointer; - * kernel's module BTF support coincides with support for - * name/name_len fields in struct bpf_btf_info. - */ - err = bpf_obj_get_info_by_fd(fd, &info, &len); - close(fd); - return !err; -} - -static int probe_perf_link(void) -{ - struct bpf_insn insns[] = { - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN(), - }; - int prog_fd, link_fd, err; - - prog_fd = bpf_prog_load(BPF_PROG_TYPE_TRACEPOINT, NULL, "GPL", - insns, ARRAY_SIZE(insns), NULL); - if (prog_fd < 0) - return -errno; - - /* use invalid perf_event FD to get EBADF, if link is supported; - * otherwise EINVAL should be returned - */ - link_fd = bpf_link_create(prog_fd, -1, BPF_PERF_EVENT, NULL); - err = -errno; /* close() can clobber errno */ - - if (link_fd >= 0) - close(link_fd); - close(prog_fd); - - return link_fd < 0 && err == -EBADF; -} - -static int probe_kern_bpf_cookie(void) -{ - struct bpf_insn insns[] = { - BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_get_attach_cookie), - BPF_EXIT_INSN(), - }; - int ret, insn_cnt = ARRAY_SIZE(insns); - - ret = bpf_prog_load(BPF_PROG_TYPE_KPROBE, NULL, "GPL", insns, insn_cnt, NULL); - return probe_fd(ret); -} - -enum kern_feature_result { - FEAT_UNKNOWN = 0, - FEAT_SUPPORTED = 1, - FEAT_MISSING = 2, -}; - -typedef int (*feature_probe_fn)(void); - -static struct kern_feature_desc { - const char *desc; - feature_probe_fn probe; - enum kern_feature_result res; -} feature_probes[__FEAT_CNT] = { - [FEAT_PROG_NAME] = { - "BPF program name", probe_kern_prog_name, - }, - [FEAT_GLOBAL_DATA] = { - "global variables", probe_kern_global_data, - }, - [FEAT_BTF] = { - "minimal BTF", probe_kern_btf, - }, - [FEAT_BTF_FUNC] = { - "BTF functions", probe_kern_btf_func, - }, - [FEAT_BTF_GLOBAL_FUNC] = { - "BTF global function", probe_kern_btf_func_global, - }, - [FEAT_BTF_DATASEC] = { - "BTF data section and variable", probe_kern_btf_datasec, - }, - [FEAT_ARRAY_MMAP] = { - "ARRAY map mmap()", probe_kern_array_mmap, - }, - [FEAT_EXP_ATTACH_TYPE] = { - "BPF_PROG_LOAD expected_attach_type attribute", - probe_kern_exp_attach_type, - }, - [FEAT_PROBE_READ_KERN] = { - "bpf_probe_read_kernel() helper", probe_kern_probe_read_kernel, - }, - [FEAT_PROG_BIND_MAP] = { - "BPF_PROG_BIND_MAP support", probe_prog_bind_map, - }, - [FEAT_MODULE_BTF] = { - "module BTF support", probe_module_btf, - }, - [FEAT_BTF_FLOAT] = { - "BTF_KIND_FLOAT support", probe_kern_btf_float, - }, - [FEAT_PERF_LINK] = { - "BPF perf link support", probe_perf_link, - }, - [FEAT_BTF_DECL_TAG] = { - "BTF_KIND_DECL_TAG support", probe_kern_btf_decl_tag, - }, - [FEAT_BTF_TYPE_TAG] = { - "BTF_KIND_TYPE_TAG support", probe_kern_btf_type_tag, - }, - [FEAT_MEMCG_ACCOUNT] = { - "memcg-based memory accounting", probe_memcg_account, - }, - [FEAT_BPF_COOKIE] = { - "BPF cookie support", probe_kern_bpf_cookie, - }, -}; - bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id) { - struct kern_feature_desc *feat = &feature_probes[feat_id]; - int ret; - - if (obj && obj->gen_loader) + if (obj->gen_loader) /* To generate loader program assume the latest kernel * to avoid doing extra prog_load, map_create syscalls. */ return true; - if (READ_ONCE(feat->res) == FEAT_UNKNOWN) { - ret = feat->probe(); - if (ret > 0) { - WRITE_ONCE(feat->res, FEAT_SUPPORTED); - } else if (ret == 0) { - WRITE_ONCE(feat->res, FEAT_MISSING); - } else { - pr_warn("Detection of kernel %s support failed: %d\n", feat->desc, ret); - WRITE_ONCE(feat->res, FEAT_MISSING); - } - } + if (obj->token_fd) + return feat_supported(obj->feat_cache, feat_id); - return READ_ONCE(feat->res) == FEAT_SUPPORTED; + return feat_supported(NULL, feat_id); } static bool map_is_reuse_compat(const struct bpf_map *map, int map_fd) { - struct bpf_map_info map_info = {}; + struct bpf_map_info map_info; char msg[STRERR_BUFSIZE]; - __u32 map_info_len; + __u32 map_info_len = sizeof(map_info); int err; - map_info_len = sizeof(map_info); - - err = bpf_obj_get_info_by_fd(map_fd, &map_info, &map_info_len); + memset(&map_info, 0, map_info_len); + err = bpf_map_get_info_by_fd(map_fd, &map_info, &map_info_len); if (err && errno == EINVAL) err = bpf_get_map_info_from_fdinfo(map_fd, &map_info); if (err) { @@ -4895,9 +5080,9 @@ bpf_object__reuse_map(struct bpf_map *map) err = bpf_map__reuse_fd(map, pin_fd); close(pin_fd); - if (err) { + if (err) return err; - } + map->pinned = true; pr_debug("reused pinned map at '%s'\n", map->pin_path); @@ -4918,6 +5103,7 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map) bpf_gen__map_freeze(obj->gen_loader, map - obj->maps); return 0; } + err = bpf_map_update_elem(map->fd, &zero, map->mmaped, 0); if (err) { err = -errno; @@ -4943,12 +5129,17 @@ bpf_object__populate_internal_map(struct bpf_object *obj, struct bpf_map *map) static void bpf_map__destroy(struct bpf_map *map); +static bool map_is_created(const struct bpf_map *map) +{ + return map->obj->loaded || map->reused; +} + static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, bool is_inner) { LIBBPF_OPTS(bpf_map_create_opts, create_attr); struct bpf_map_def *def = &map->def; const char *map_name = NULL; - int err = 0; + int err = 0, map_fd; if (kernel_supports(obj, FEAT_PROG_NAME)) map_name = map->name; @@ -4956,9 +5147,17 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b create_attr.map_flags = def->map_flags; create_attr.numa_node = map->numa_node; create_attr.map_extra = map->map_extra; + create_attr.token_fd = obj->token_fd; + if (obj->token_fd) + create_attr.map_flags |= BPF_F_TOKEN_FD; - if (bpf_map__is_struct_ops(map)) + if (bpf_map__is_struct_ops(map)) { create_attr.btf_vmlinux_value_type_id = map->btf_vmlinux_value_type_id; + if (map->mod_btf_fd >= 0) { + create_attr.value_type_btf_obj_fd = map->mod_btf_fd; + create_attr.map_flags |= BPF_F_VTYPE_BTF_OBJ_FD; + } + } if (obj->btf && btf__fd(obj->btf) >= 0) { create_attr.btf_fd = btf__fd(obj->btf); @@ -4968,13 +5167,16 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b if (bpf_map_type__is_map_in_map(def->type)) { if (map->inner_map) { + err = map_set_def_max_entries(map->inner_map); + if (err) + return err; err = bpf_object__create_map(obj, map->inner_map, true); if (err) { pr_warn("map '%s': failed to create inner map: %d\n", map->name, err); return err; } - map->inner_map_fd = bpf_map__fd(map->inner_map); + map->inner_map_fd = map->inner_map->fd; } if (map->inner_map_fd >= 0) create_attr.inner_map_fd = map->inner_map_fd; @@ -4994,12 +5196,16 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b case BPF_MAP_TYPE_SOCKHASH: case BPF_MAP_TYPE_QUEUE: case BPF_MAP_TYPE_STACK: - case BPF_MAP_TYPE_RINGBUF: + case BPF_MAP_TYPE_ARENA: create_attr.btf_fd = 0; create_attr.btf_key_type_id = 0; create_attr.btf_value_type_id = 0; map->btf_key_type_id = 0; map->btf_value_type_id = 0; + break; + case BPF_MAP_TYPE_STRUCT_OPS: + create_attr.btf_value_type_id = 0; + break; default: break; } @@ -5008,17 +5214,19 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b bpf_gen__map_create(obj->gen_loader, def->type, map_name, def->key_size, def->value_size, def->max_entries, &create_attr, is_inner ? -1 : map - obj->maps); - /* Pretend to have valid FD to pass various fd >= 0 checks. - * This fd == 0 will not be used with any syscall and will be reset to -1 eventually. + /* We keep pretenting we have valid FD to pass various fd >= 0 + * checks by just keeping original placeholder FDs in place. + * See bpf_object__add_map() comment. + * This placeholder fd will not be used with any syscall and + * will be reset to -1 eventually. */ - map->fd = 0; + map_fd = map->fd; } else { - map->fd = bpf_map_create(def->type, map_name, - def->key_size, def->value_size, - def->max_entries, &create_attr); + map_fd = bpf_map_create(def->type, map_name, + def->key_size, def->value_size, + def->max_entries, &create_attr); } - if (map->fd < 0 && (create_attr.btf_key_type_id || - create_attr.btf_value_type_id)) { + if (map_fd < 0 && (create_attr.btf_key_type_id || create_attr.btf_value_type_id)) { char *cp, errmsg[STRERR_BUFSIZE]; err = -errno; @@ -5030,13 +5238,11 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b create_attr.btf_value_type_id = 0; map->btf_key_type_id = 0; map->btf_value_type_id = 0; - map->fd = bpf_map_create(def->type, map_name, - def->key_size, def->value_size, - def->max_entries, &create_attr); + map_fd = bpf_map_create(def->type, map_name, + def->key_size, def->value_size, + def->max_entries, &create_attr); } - err = map->fd < 0 ? -errno : 0; - if (bpf_map_type__is_map_in_map(def->type) && map->inner_map) { if (obj->gen_loader) map->inner_map->fd = -1; @@ -5044,7 +5250,19 @@ static int bpf_object__create_map(struct bpf_object *obj, struct bpf_map *map, b zfree(&map->inner_map); } - return err; + if (map_fd < 0) + return map_fd; + + /* obj->gen_loader case, prevent reuse_fd() from closing map_fd */ + if (map->fd == map_fd) + return 0; + + /* Keep placeholder FD value but now point it to the BPF map object. + * This way everything that relied on this map's FD (e.g., relocated + * ldimm64 instructions) will stay valid and won't need adjustments. + * map->fd stays valid but now point to what map_fd points to. + */ + return reuse_fd(map->fd, map_fd); } static int init_map_in_map_slots(struct bpf_object *obj, struct bpf_map *map) @@ -5058,7 +5276,7 @@ static int init_map_in_map_slots(struct bpf_object *obj, struct bpf_map *map) continue; targ_map = map->init_slots[i]; - fd = bpf_map__fd(targ_map); + fd = targ_map->fd; if (obj->gen_loader) { bpf_gen__populate_outer_map(obj->gen_loader, @@ -5128,10 +5346,8 @@ static int bpf_object_init_prog_arrays(struct bpf_object *obj) continue; err = init_prog_array_slots(obj, map); - if (err < 0) { - zclose(map->fd); + if (err < 0) return err; - } } return 0; } @@ -5180,9 +5396,11 @@ bpf_object__create_maps(struct bpf_object *obj) * bpf_object loading will succeed just fine even on old * kernels. */ - if (bpf_map__is_internal(map) && - !kernel_supports(obj, FEAT_GLOBAL_DATA)) { - map->skipped = true; + if (bpf_map__is_internal(map) && !kernel_supports(obj, FEAT_GLOBAL_DATA)) + map->autocreate = false; + + if (!map->autocreate) { + pr_debug("map '%s': skipped auto-creating...\n", map->name); continue; } @@ -5207,7 +5425,7 @@ bpf_object__create_maps(struct bpf_object *obj) } } - if (map->fd >= 0) { + if (map->reused) { pr_debug("map '%s': skipping creation (preset fd=%d)\n", map->name, map->fd); } else { @@ -5220,25 +5438,36 @@ bpf_object__create_maps(struct bpf_object *obj) if (bpf_map__is_internal(map)) { err = bpf_object__populate_internal_map(obj, map); - if (err < 0) { - zclose(map->fd); + if (err < 0) goto err_out; + } + if (map->def.type == BPF_MAP_TYPE_ARENA) { + map->mmaped = mmap((void *)(long)map->map_extra, + bpf_map_mmap_sz(map), PROT_READ | PROT_WRITE, + map->map_extra ? MAP_SHARED | MAP_FIXED : MAP_SHARED, + map->fd, 0); + if (map->mmaped == MAP_FAILED) { + err = -errno; + map->mmaped = NULL; + pr_warn("map '%s': failed to mmap arena: %d\n", + map->name, err); + return err; + } + if (obj->arena_data) { + memcpy(map->mmaped, obj->arena_data, obj->arena_data_sz); + zfree(&obj->arena_data); } } - if (map->init_slots_sz && map->def.type != BPF_MAP_TYPE_PROG_ARRAY) { err = init_map_in_map_slots(obj, map); - if (err < 0) { - zclose(map->fd); + if (err < 0) goto err_out; - } } } if (map->pin_path && !map->pinned) { err = bpf_map__pin(map, NULL); if (err) { - zclose(map->fd); if (!retried && err == -EEXIST) { retried = true; goto retry; @@ -5313,7 +5542,7 @@ int bpf_core_add_cands(struct bpf_core_cand *local_cand, n = btf__type_cnt(targ_btf); for (i = targ_start_id; i < n; i++) { t = btf__type_by_id(targ_btf, i); - if (btf_kind(t) != btf_kind(local_t)) + if (!btf_kind_core_compat(t, local_t)) continue; targ_name = btf__name_by_offset(targ_btf, t->name_off); @@ -5372,6 +5601,10 @@ static int load_module_btfs(struct bpf_object *obj) err = bpf_btf_get_next_id(id, &id); if (err && errno == ENOENT) return 0; + if (err && errno == EPERM) { + pr_debug("skipping module BTFs loading, missing privileges\n"); + return 0; + } if (err) { err = -errno; pr_warn("failed to iterate BTF objects: %d\n", err); @@ -5392,7 +5625,7 @@ static int load_module_btfs(struct bpf_object *obj) info.name = ptr_to_u64(name); info.name_len = sizeof(name); - err = bpf_obj_get_info_by_fd(fd, &info, &len); + err = bpf_btf_get_info_by_fd(fd, &info, &len); if (err) { err = -errno; pr_warn("failed to get BTF object #%d info: %d\n", id, err); @@ -5414,7 +5647,7 @@ static int load_module_btfs(struct bpf_object *obj) } err = libbpf_ensure_mem((void **)&obj->btf_modules, &obj->btf_module_cap, - sizeof(*obj->btf_modules), obj->btf_module_cnt + 1); + sizeof(*obj->btf_modules), obj->btf_module_cnt + 1); if (err) goto err_out; @@ -5521,93 +5754,25 @@ bpf_core_find_cands(struct bpf_object *obj, const struct btf *local_btf, __u32 l int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id) { - const struct btf_type *local_type, *targ_type; - int depth = 32; /* max recursion depth */ - - /* caller made sure that names match (ignoring flavor suffix) */ - local_type = btf__type_by_id(local_btf, local_id); - targ_type = btf__type_by_id(targ_btf, targ_id); - if (btf_kind(local_type) != btf_kind(targ_type)) - return 0; + return __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id, 32); +} -recur: - depth--; - if (depth < 0) - return -EINVAL; +int bpf_core_types_match(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id) +{ + return __bpf_core_types_match(local_btf, local_id, targ_btf, targ_id, false, 32); +} - local_type = skip_mods_and_typedefs(local_btf, local_id, &local_id); - targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); - if (!local_type || !targ_type) - return -EINVAL; +static size_t bpf_core_hash_fn(const long key, void *ctx) +{ + return key; +} - if (btf_kind(local_type) != btf_kind(targ_type)) - return 0; - - switch (btf_kind(local_type)) { - case BTF_KIND_UNKN: - case BTF_KIND_STRUCT: - case BTF_KIND_UNION: - case BTF_KIND_ENUM: - case BTF_KIND_FWD: - return 1; - case BTF_KIND_INT: - /* just reject deprecated bitfield-like integers; all other - * integers are by default compatible between each other - */ - return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0; - case BTF_KIND_PTR: - local_id = local_type->type; - targ_id = targ_type->type; - goto recur; - case BTF_KIND_ARRAY: - local_id = btf_array(local_type)->type; - targ_id = btf_array(targ_type)->type; - goto recur; - case BTF_KIND_FUNC_PROTO: { - struct btf_param *local_p = btf_params(local_type); - struct btf_param *targ_p = btf_params(targ_type); - __u16 local_vlen = btf_vlen(local_type); - __u16 targ_vlen = btf_vlen(targ_type); - int i, err; - - if (local_vlen != targ_vlen) - return 0; - - for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { - skip_mods_and_typedefs(local_btf, local_p->type, &local_id); - skip_mods_and_typedefs(targ_btf, targ_p->type, &targ_id); - err = bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id); - if (err <= 0) - return err; - } - - /* tail recurse for return type check */ - skip_mods_and_typedefs(local_btf, local_type->type, &local_id); - skip_mods_and_typedefs(targ_btf, targ_type->type, &targ_id); - goto recur; - } - default: - pr_warn("unexpected kind %s relocated, local [%d], target [%d]\n", - btf_kind_str(local_type), local_id, targ_id); - return 0; - } -} - -static size_t bpf_core_hash_fn(const void *key, void *ctx) -{ - return (size_t)key; -} - -static bool bpf_core_equal_fn(const void *k1, const void *k2, void *ctx) +static bool bpf_core_equal_fn(const long k1, const long k2, void *ctx) { return k1 == k2; } -static void *u32_as_hash_key(__u32 x) -{ - return (void *)(uintptr_t)x; -} - static int record_relo_core(struct bpf_program *prog, const struct bpf_core_relo *core_relo, int insn_idx) { @@ -5650,7 +5815,6 @@ static int bpf_core_resolve_relo(struct bpf_program *prog, struct bpf_core_relo_res *targ_res) { struct bpf_core_spec specs_scratch[3] = {}; - const void *type_key = u32_as_hash_key(relo->type_id); struct bpf_core_cand_list *cands = NULL; const char *prog_name = prog->name; const struct btf_type *local_type; @@ -5667,7 +5831,7 @@ static int bpf_core_resolve_relo(struct bpf_program *prog, return -EINVAL; if (relo->kind != BPF_CORE_TYPE_ID_LOCAL && - !hashmap__find(cand_cache, type_key, (void **)&cands)) { + !hashmap__find(cand_cache, local_id, &cands)) { cands = bpf_core_find_cands(prog->obj, local_btf, local_id); if (IS_ERR(cands)) { pr_warn("prog '%s': relo #%d: target candidate search failed for [%d] %s %s: %ld\n", @@ -5675,7 +5839,7 @@ static int bpf_core_resolve_relo(struct bpf_program *prog, local_name, PTR_ERR(cands)); return PTR_ERR(cands); } - err = hashmap__set(cand_cache, type_key, cands, NULL, NULL); + err = hashmap__set(cand_cache, local_id, cands, NULL, NULL); if (err) { bpf_core_free_cands(cands); return err; @@ -5798,13 +5962,67 @@ bpf_object__relocate_core(struct bpf_object *obj, const char *targ_btf_path) if (!IS_ERR_OR_NULL(cand_cache)) { hashmap__for_each_entry(cand_cache, entry, i) { - bpf_core_free_cands(entry->value); + bpf_core_free_cands(entry->pvalue); } hashmap__free(cand_cache); } return err; } +/* base map load ldimm64 special constant, used also for log fixup logic */ +#define POISON_LDIMM64_MAP_BASE 2001000000 +#define POISON_LDIMM64_MAP_PFX "200100" + +static void poison_map_ldimm64(struct bpf_program *prog, int relo_idx, + int insn_idx, struct bpf_insn *insn, + int map_idx, const struct bpf_map *map) +{ + int i; + + pr_debug("prog '%s': relo #%d: poisoning insn #%d that loads map #%d '%s'\n", + prog->name, relo_idx, insn_idx, map_idx, map->name); + + /* we turn single ldimm64 into two identical invalid calls */ + for (i = 0; i < 2; i++) { + insn->code = BPF_JMP | BPF_CALL; + insn->dst_reg = 0; + insn->src_reg = 0; + insn->off = 0; + /* if this instruction is reachable (not a dead code), + * verifier will complain with something like: + * invalid func unknown#2001000123 + * where lower 123 is map index into obj->maps[] array + */ + insn->imm = POISON_LDIMM64_MAP_BASE + map_idx; + + insn++; + } +} + +/* unresolved kfunc call special constant, used also for log fixup logic */ +#define POISON_CALL_KFUNC_BASE 2002000000 +#define POISON_CALL_KFUNC_PFX "2002" + +static void poison_kfunc_call(struct bpf_program *prog, int relo_idx, + int insn_idx, struct bpf_insn *insn, + int ext_idx, const struct extern_desc *ext) +{ + pr_debug("prog '%s': relo #%d: poisoning insn #%d that calls kfunc '%s'\n", + prog->name, relo_idx, insn_idx, ext->name); + + /* we turn kfunc call into invalid helper call with identifiable constant */ + insn->code = BPF_JMP | BPF_CALL; + insn->dst_reg = 0; + insn->src_reg = 0; + insn->off = 0; + /* if this instruction is reachable (not a dead code), + * verifier will complain with something like: + * invalid func unknown#2001000123 + * where lower 123 is extern index into obj->externs[] array + */ + insn->imm = POISON_CALL_KFUNC_BASE + ext_idx; +} + /* Relocate data references within program code: * - map references; * - global variable references; @@ -5818,37 +6036,39 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) for (i = 0; i < prog->nr_reloc; i++) { struct reloc_desc *relo = &prog->reloc_desc[i]; struct bpf_insn *insn = &prog->insns[relo->insn_idx]; + const struct bpf_map *map; struct extern_desc *ext; switch (relo->type) { case RELO_LD64: + map = &obj->maps[relo->map_idx]; if (obj->gen_loader) { insn[0].src_reg = BPF_PSEUDO_MAP_IDX; insn[0].imm = relo->map_idx; - } else { + } else if (map->autocreate) { insn[0].src_reg = BPF_PSEUDO_MAP_FD; - insn[0].imm = obj->maps[relo->map_idx].fd; + insn[0].imm = map->fd; + } else { + poison_map_ldimm64(prog, i, relo->insn_idx, insn, + relo->map_idx, map); } break; case RELO_DATA: + map = &obj->maps[relo->map_idx]; insn[1].imm = insn[0].imm + relo->sym_off; if (obj->gen_loader) { insn[0].src_reg = BPF_PSEUDO_MAP_IDX_VALUE; insn[0].imm = relo->map_idx; - } else { - const struct bpf_map *map = &obj->maps[relo->map_idx]; - - if (map->skipped) { - pr_warn("prog '%s': relo #%d: kernel doesn't support global data\n", - prog->name, i); - return -ENOTSUP; - } + } else if (map->autocreate) { insn[0].src_reg = BPF_PSEUDO_MAP_VALUE; - insn[0].imm = obj->maps[relo->map_idx].fd; + insn[0].imm = map->fd; + } else { + poison_map_ldimm64(prog, i, relo->insn_idx, insn, + relo->map_idx, map); } break; - case RELO_EXTERN_VAR: - ext = &obj->externs[relo->sym_off]; + case RELO_EXTERN_LD64: + ext = &obj->externs[relo->ext_idx]; if (ext->type == EXT_KCFG) { if (obj->gen_loader) { insn[0].src_reg = BPF_PSEUDO_MAP_IDX_VALUE; @@ -5869,15 +6089,15 @@ bpf_object__relocate_data(struct bpf_object *obj, struct bpf_program *prog) } } break; - case RELO_EXTERN_FUNC: - ext = &obj->externs[relo->sym_off]; + case RELO_EXTERN_CALL: + ext = &obj->externs[relo->ext_idx]; insn[0].src_reg = BPF_PSEUDO_KFUNC_CALL; if (ext->is_set) { insn[0].imm = ext->ksym.kernel_btf_id; insn[0].off = ext->ksym.btf_fd_idx; - } else { /* unresolved weak kfunc */ - insn[0].imm = 0; - insn[0].off = 0; + } else { /* unresolved weak kfunc call */ + poison_kfunc_call(prog, i, relo->insn_idx, insn, + relo->ext_idx, ext); } break; case RELO_SUBPROG_ADDR: @@ -5980,7 +6200,7 @@ reloc_prog_func_and_line_info(const struct bpf_object *obj, int err; /* no .BTF.ext relocation if .BTF.ext is missing or kernel doesn't - * supprot func/line info + * support func/line info */ if (!obj->btf_ext || !kernel_supports(obj, FEAT_BTF_FUNC)) return 0; @@ -6071,7 +6291,11 @@ static int append_subprog_relos(struct bpf_program *main_prog, struct bpf_progra if (main_prog == subprog) return 0; relos = libbpf_reallocarray(main_prog->reloc_desc, new_cnt, sizeof(*relos)); - if (!relos) + /* if new count is zero, reallocarray can return a valid NULL result; + * in this case the previous pointer will be freed, so we *have to* + * reassign old pointer to the new value (even if it's NULL) + */ + if (!relos && new_cnt) return -ENOMEM; if (subprog->nr_reloc) memcpy(relos + main_prog->nr_reloc, subprog->reloc_desc, @@ -6087,14 +6311,46 @@ static int append_subprog_relos(struct bpf_program *main_prog, struct bpf_progra return 0; } +static int +bpf_object__append_subprog_code(struct bpf_object *obj, struct bpf_program *main_prog, + struct bpf_program *subprog) +{ + struct bpf_insn *insns; + size_t new_cnt; + int err; + + subprog->sub_insn_off = main_prog->insns_cnt; + + new_cnt = main_prog->insns_cnt + subprog->insns_cnt; + insns = libbpf_reallocarray(main_prog->insns, new_cnt, sizeof(*insns)); + if (!insns) { + pr_warn("prog '%s': failed to realloc prog code\n", main_prog->name); + return -ENOMEM; + } + main_prog->insns = insns; + main_prog->insns_cnt = new_cnt; + + memcpy(main_prog->insns + subprog->sub_insn_off, subprog->insns, + subprog->insns_cnt * sizeof(*insns)); + + pr_debug("prog '%s': added %zu insns from sub-prog '%s'\n", + main_prog->name, subprog->insns_cnt, subprog->name); + + /* The subprog insns are now appended. Append its relos too. */ + err = append_subprog_relos(main_prog, subprog); + if (err) + return err; + return 0; +} + static int bpf_object__reloc_code(struct bpf_object *obj, struct bpf_program *main_prog, struct bpf_program *prog) { - size_t sub_insn_idx, insn_idx, new_cnt; + size_t sub_insn_idx, insn_idx; struct bpf_program *subprog; - struct bpf_insn *insns, *insn; struct reloc_desc *relo; + struct bpf_insn *insn; int err; err = reloc_prog_func_and_line_info(obj, main_prog, prog); @@ -6107,7 +6363,7 @@ bpf_object__reloc_code(struct bpf_object *obj, struct bpf_program *main_prog, continue; relo = find_prog_insn_relo(prog, insn_idx); - if (relo && relo->type == RELO_EXTERN_FUNC) + if (relo && relo->type == RELO_EXTERN_CALL) /* kfunc relocations will be handled later * in bpf_object__relocate_data() */ @@ -6169,25 +6425,7 @@ bpf_object__reloc_code(struct bpf_object *obj, struct bpf_program *main_prog, * and relocate. */ if (subprog->sub_insn_off == 0) { - subprog->sub_insn_off = main_prog->insns_cnt; - - new_cnt = main_prog->insns_cnt + subprog->insns_cnt; - insns = libbpf_reallocarray(main_prog->insns, new_cnt, sizeof(*insns)); - if (!insns) { - pr_warn("prog '%s': failed to realloc prog code\n", main_prog->name); - return -ENOMEM; - } - main_prog->insns = insns; - main_prog->insns_cnt = new_cnt; - - memcpy(main_prog->insns + subprog->sub_insn_off, subprog->insns, - subprog->insns_cnt * sizeof(*insns)); - - pr_debug("prog '%s': added %zu insns from sub-prog '%s'\n", - main_prog->name, subprog->insns_cnt, subprog->name); - - /* The subprog insns are now appended. Append its relos too. */ - err = append_subprog_relos(main_prog, subprog); + err = bpf_object__append_subprog_code(obj, main_prog, subprog); if (err) return err; err = bpf_object__reloc_code(obj, main_prog, subprog); @@ -6203,7 +6441,8 @@ bpf_object__reloc_code(struct bpf_object *obj, struct bpf_program *main_prog, * prog; each main prog can have a different set of * subprograms appended (potentially in different order as * well), so position of any subprog can be different for - * different main programs */ + * different main programs + */ insn->imm = subprog->sub_insn_off - (prog->sub_insn_off + insn_idx) - 1; pr_debug("prog '%s': insn #%zu relocated, imm %d points to subprog '%s' (now at %zu offset)\n", @@ -6361,8 +6600,408 @@ static void bpf_object__sort_relos(struct bpf_object *obj) } } -static int -bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path) +static int bpf_prog_assign_exc_cb(struct bpf_object *obj, struct bpf_program *prog) +{ + const char *str = "exception_callback:"; + size_t pfx_len = strlen(str); + int i, j, n; + + if (!obj->btf || !kernel_supports(obj, FEAT_BTF_DECL_TAG)) + return 0; + + n = btf__type_cnt(obj->btf); + for (i = 1; i < n; i++) { + const char *name; + struct btf_type *t; + + t = btf_type_by_id(obj->btf, i); + if (!btf_is_decl_tag(t) || btf_decl_tag(t)->component_idx != -1) + continue; + + name = btf__str_by_offset(obj->btf, t->name_off); + if (strncmp(name, str, pfx_len) != 0) + continue; + + t = btf_type_by_id(obj->btf, t->type); + if (!btf_is_func(t) || btf_func_linkage(t) != BTF_FUNC_GLOBAL) { + pr_warn("prog '%s': exception_callback: decl tag not applied to the main program\n", + prog->name); + return -EINVAL; + } + if (strcmp(prog->name, btf__str_by_offset(obj->btf, t->name_off)) != 0) + continue; + /* Multiple callbacks are specified for the same prog, + * the verifier will eventually return an error for this + * case, hence simply skip appending a subprog. + */ + if (prog->exception_cb_idx >= 0) { + prog->exception_cb_idx = -1; + break; + } + + name += pfx_len; + if (str_is_empty(name)) { + pr_warn("prog '%s': exception_callback: decl tag contains empty value\n", + prog->name); + return -EINVAL; + } + + for (j = 0; j < obj->nr_programs; j++) { + struct bpf_program *subprog = &obj->programs[j]; + + if (!prog_is_subprog(obj, subprog)) + continue; + if (strcmp(name, subprog->name) != 0) + continue; + /* Enforce non-hidden, as from verifier point of + * view it expects global functions, whereas the + * mark_btf_static fixes up linkage as static. + */ + if (!subprog->sym_global || subprog->mark_btf_static) { + pr_warn("prog '%s': exception callback %s must be a global non-hidden function\n", + prog->name, subprog->name); + return -EINVAL; + } + /* Let's see if we already saw a static exception callback with the same name */ + if (prog->exception_cb_idx >= 0) { + pr_warn("prog '%s': multiple subprogs with same name as exception callback '%s'\n", + prog->name, subprog->name); + return -EINVAL; + } + prog->exception_cb_idx = j; + break; + } + + if (prog->exception_cb_idx >= 0) + continue; + + pr_warn("prog '%s': cannot find exception callback '%s'\n", prog->name, name); + return -ENOENT; + } + + return 0; +} + +static struct { + enum bpf_prog_type prog_type; + const char *ctx_name; +} global_ctx_map[] = { + { BPF_PROG_TYPE_CGROUP_DEVICE, "bpf_cgroup_dev_ctx" }, + { BPF_PROG_TYPE_CGROUP_SKB, "__sk_buff" }, + { BPF_PROG_TYPE_CGROUP_SOCK, "bpf_sock" }, + { BPF_PROG_TYPE_CGROUP_SOCK_ADDR, "bpf_sock_addr" }, + { BPF_PROG_TYPE_CGROUP_SOCKOPT, "bpf_sockopt" }, + { BPF_PROG_TYPE_CGROUP_SYSCTL, "bpf_sysctl" }, + { BPF_PROG_TYPE_FLOW_DISSECTOR, "__sk_buff" }, + { BPF_PROG_TYPE_KPROBE, "bpf_user_pt_regs_t" }, + { BPF_PROG_TYPE_LWT_IN, "__sk_buff" }, + { BPF_PROG_TYPE_LWT_OUT, "__sk_buff" }, + { BPF_PROG_TYPE_LWT_SEG6LOCAL, "__sk_buff" }, + { BPF_PROG_TYPE_LWT_XMIT, "__sk_buff" }, + { BPF_PROG_TYPE_NETFILTER, "bpf_nf_ctx" }, + { BPF_PROG_TYPE_PERF_EVENT, "bpf_perf_event_data" }, + { BPF_PROG_TYPE_RAW_TRACEPOINT, "bpf_raw_tracepoint_args" }, + { BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE, "bpf_raw_tracepoint_args" }, + { BPF_PROG_TYPE_SCHED_ACT, "__sk_buff" }, + { BPF_PROG_TYPE_SCHED_CLS, "__sk_buff" }, + { BPF_PROG_TYPE_SK_LOOKUP, "bpf_sk_lookup" }, + { BPF_PROG_TYPE_SK_MSG, "sk_msg_md" }, + { BPF_PROG_TYPE_SK_REUSEPORT, "sk_reuseport_md" }, + { BPF_PROG_TYPE_SK_SKB, "__sk_buff" }, + { BPF_PROG_TYPE_SOCK_OPS, "bpf_sock_ops" }, + { BPF_PROG_TYPE_SOCKET_FILTER, "__sk_buff" }, + { BPF_PROG_TYPE_XDP, "xdp_md" }, + /* all other program types don't have "named" context structs */ +}; + +/* forward declarations for arch-specific underlying types of bpf_user_pt_regs_t typedef, + * for below __builtin_types_compatible_p() checks; + * with this approach we don't need any extra arch-specific #ifdef guards + */ +struct pt_regs; +struct user_pt_regs; +struct user_regs_struct; + +static bool need_func_arg_type_fixup(const struct btf *btf, const struct bpf_program *prog, + const char *subprog_name, int arg_idx, + int arg_type_id, const char *ctx_name) +{ + const struct btf_type *t; + const char *tname; + + /* check if existing parameter already matches verifier expectations */ + t = skip_mods_and_typedefs(btf, arg_type_id, NULL); + if (!btf_is_ptr(t)) + goto out_warn; + + /* typedef bpf_user_pt_regs_t is a special PITA case, valid for kprobe + * and perf_event programs, so check this case early on and forget + * about it for subsequent checks + */ + while (btf_is_mod(t)) + t = btf__type_by_id(btf, t->type); + if (btf_is_typedef(t) && + (prog->type == BPF_PROG_TYPE_KPROBE || prog->type == BPF_PROG_TYPE_PERF_EVENT)) { + tname = btf__str_by_offset(btf, t->name_off) ?: ""; + if (strcmp(tname, "bpf_user_pt_regs_t") == 0) + return false; /* canonical type for kprobe/perf_event */ + } + + /* now we can ignore typedefs moving forward */ + t = skip_mods_and_typedefs(btf, t->type, NULL); + + /* if it's `void *`, definitely fix up BTF info */ + if (btf_is_void(t)) + return true; + + /* if it's already proper canonical type, no need to fix up */ + tname = btf__str_by_offset(btf, t->name_off) ?: ""; + if (btf_is_struct(t) && strcmp(tname, ctx_name) == 0) + return false; + + /* special cases */ + switch (prog->type) { + case BPF_PROG_TYPE_KPROBE: + /* `struct pt_regs *` is expected, but we need to fix up */ + if (btf_is_struct(t) && strcmp(tname, "pt_regs") == 0) + return true; + break; + case BPF_PROG_TYPE_PERF_EVENT: + if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct pt_regs) && + btf_is_struct(t) && strcmp(tname, "pt_regs") == 0) + return true; + if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_pt_regs) && + btf_is_struct(t) && strcmp(tname, "user_pt_regs") == 0) + return true; + if (__builtin_types_compatible_p(bpf_user_pt_regs_t, struct user_regs_struct) && + btf_is_struct(t) && strcmp(tname, "user_regs_struct") == 0) + return true; + break; + case BPF_PROG_TYPE_RAW_TRACEPOINT: + case BPF_PROG_TYPE_RAW_TRACEPOINT_WRITABLE: + /* allow u64* as ctx */ + if (btf_is_int(t) && t->size == 8) + return true; + break; + default: + break; + } + +out_warn: + pr_warn("prog '%s': subprog '%s' arg#%d is expected to be of `struct %s *` type\n", + prog->name, subprog_name, arg_idx, ctx_name); + return false; +} + +static int clone_func_btf_info(struct btf *btf, int orig_fn_id, struct bpf_program *prog) +{ + int fn_id, fn_proto_id, ret_type_id, orig_proto_id; + int i, err, arg_cnt, fn_name_off, linkage; + struct btf_type *fn_t, *fn_proto_t, *t; + struct btf_param *p; + + /* caller already validated FUNC -> FUNC_PROTO validity */ + fn_t = btf_type_by_id(btf, orig_fn_id); + fn_proto_t = btf_type_by_id(btf, fn_t->type); + + /* Note that each btf__add_xxx() operation invalidates + * all btf_type and string pointers, so we need to be + * very careful when cloning BTF types. BTF type + * pointers have to be always refetched. And to avoid + * problems with invalidated string pointers, we + * add empty strings initially, then just fix up + * name_off offsets in place. Offsets are stable for + * existing strings, so that works out. + */ + fn_name_off = fn_t->name_off; /* we are about to invalidate fn_t */ + linkage = btf_func_linkage(fn_t); + orig_proto_id = fn_t->type; /* original FUNC_PROTO ID */ + ret_type_id = fn_proto_t->type; /* fn_proto_t will be invalidated */ + arg_cnt = btf_vlen(fn_proto_t); + + /* clone FUNC_PROTO and its params */ + fn_proto_id = btf__add_func_proto(btf, ret_type_id); + if (fn_proto_id < 0) + return -EINVAL; + + for (i = 0; i < arg_cnt; i++) { + int name_off; + + /* copy original parameter data */ + t = btf_type_by_id(btf, orig_proto_id); + p = &btf_params(t)[i]; + name_off = p->name_off; + + err = btf__add_func_param(btf, "", p->type); + if (err) + return err; + + fn_proto_t = btf_type_by_id(btf, fn_proto_id); + p = &btf_params(fn_proto_t)[i]; + p->name_off = name_off; /* use remembered str offset */ + } + + /* clone FUNC now, btf__add_func() enforces non-empty name, so use + * entry program's name as a placeholder, which we replace immediately + * with original name_off + */ + fn_id = btf__add_func(btf, prog->name, linkage, fn_proto_id); + if (fn_id < 0) + return -EINVAL; + + fn_t = btf_type_by_id(btf, fn_id); + fn_t->name_off = fn_name_off; /* reuse original string */ + + return fn_id; +} + +/* Check if main program or global subprog's function prototype has `arg:ctx` + * argument tags, and, if necessary, substitute correct type to match what BPF + * verifier would expect, taking into account specific program type. This + * allows to support __arg_ctx tag transparently on old kernels that don't yet + * have a native support for it in the verifier, making user's life much + * easier. + */ +static int bpf_program_fixup_func_info(struct bpf_object *obj, struct bpf_program *prog) +{ + const char *ctx_name = NULL, *ctx_tag = "arg:ctx", *fn_name; + struct bpf_func_info_min *func_rec; + struct btf_type *fn_t, *fn_proto_t; + struct btf *btf = obj->btf; + const struct btf_type *t; + struct btf_param *p; + int ptr_id = 0, struct_id, tag_id, orig_fn_id; + int i, n, arg_idx, arg_cnt, err, rec_idx; + int *orig_ids; + + /* no .BTF.ext, no problem */ + if (!obj->btf_ext || !prog->func_info) + return 0; + + /* don't do any fix ups if kernel natively supports __arg_ctx */ + if (kernel_supports(obj, FEAT_ARG_CTX_TAG)) + return 0; + + /* some BPF program types just don't have named context structs, so + * this fallback mechanism doesn't work for them + */ + for (i = 0; i < ARRAY_SIZE(global_ctx_map); i++) { + if (global_ctx_map[i].prog_type != prog->type) + continue; + ctx_name = global_ctx_map[i].ctx_name; + break; + } + if (!ctx_name) + return 0; + + /* remember original func BTF IDs to detect if we already cloned them */ + orig_ids = calloc(prog->func_info_cnt, sizeof(*orig_ids)); + if (!orig_ids) + return -ENOMEM; + for (i = 0; i < prog->func_info_cnt; i++) { + func_rec = prog->func_info + prog->func_info_rec_size * i; + orig_ids[i] = func_rec->type_id; + } + + /* go through each DECL_TAG with "arg:ctx" and see if it points to one + * of our subprogs; if yes and subprog is global and needs adjustment, + * clone and adjust FUNC -> FUNC_PROTO combo + */ + for (i = 1, n = btf__type_cnt(btf); i < n; i++) { + /* only DECL_TAG with "arg:ctx" value are interesting */ + t = btf__type_by_id(btf, i); + if (!btf_is_decl_tag(t)) + continue; + if (strcmp(btf__str_by_offset(btf, t->name_off), ctx_tag) != 0) + continue; + + /* only global funcs need adjustment, if at all */ + orig_fn_id = t->type; + fn_t = btf_type_by_id(btf, orig_fn_id); + if (!btf_is_func(fn_t) || btf_func_linkage(fn_t) != BTF_FUNC_GLOBAL) + continue; + + /* sanity check FUNC -> FUNC_PROTO chain, just in case */ + fn_proto_t = btf_type_by_id(btf, fn_t->type); + if (!fn_proto_t || !btf_is_func_proto(fn_proto_t)) + continue; + + /* find corresponding func_info record */ + func_rec = NULL; + for (rec_idx = 0; rec_idx < prog->func_info_cnt; rec_idx++) { + if (orig_ids[rec_idx] == t->type) { + func_rec = prog->func_info + prog->func_info_rec_size * rec_idx; + break; + } + } + /* current main program doesn't call into this subprog */ + if (!func_rec) + continue; + + /* some more sanity checking of DECL_TAG */ + arg_cnt = btf_vlen(fn_proto_t); + arg_idx = btf_decl_tag(t)->component_idx; + if (arg_idx < 0 || arg_idx >= arg_cnt) + continue; + + /* check if we should fix up argument type */ + p = &btf_params(fn_proto_t)[arg_idx]; + fn_name = btf__str_by_offset(btf, fn_t->name_off) ?: ""; + if (!need_func_arg_type_fixup(btf, prog, fn_name, arg_idx, p->type, ctx_name)) + continue; + + /* clone fn/fn_proto, unless we already did it for another arg */ + if (func_rec->type_id == orig_fn_id) { + int fn_id; + + fn_id = clone_func_btf_info(btf, orig_fn_id, prog); + if (fn_id < 0) { + err = fn_id; + goto err_out; + } + + /* point func_info record to a cloned FUNC type */ + func_rec->type_id = fn_id; + } + + /* create PTR -> STRUCT type chain to mark PTR_TO_CTX argument; + * we do it just once per main BPF program, as all global + * funcs share the same program type, so need only PTR -> + * STRUCT type chain + */ + if (ptr_id == 0) { + struct_id = btf__add_struct(btf, ctx_name, 0); + ptr_id = btf__add_ptr(btf, struct_id); + if (ptr_id < 0 || struct_id < 0) { + err = -EINVAL; + goto err_out; + } + } + + /* for completeness, clone DECL_TAG and point it to cloned param */ + tag_id = btf__add_decl_tag(btf, ctx_tag, func_rec->type_id, arg_idx); + if (tag_id < 0) { + err = -EINVAL; + goto err_out; + } + + /* all the BTF manipulations invalidated pointers, refetch them */ + fn_t = btf_type_by_id(btf, func_rec->type_id); + fn_proto_t = btf_type_by_id(btf, fn_t->type); + + /* fix up type ID pointed to by param */ + p = &btf_params(fn_proto_t)[arg_idx]; + p->type = ptr_id; + } + + free(orig_ids); + return 0; +err_out: + free(orig_ids); + return err; +} + +static int bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path) { struct bpf_program *prog; size_t i, j; @@ -6420,20 +7059,51 @@ bpf_object__relocate(struct bpf_object *obj, const char *targ_btf_path) prog->name, err); return err; } + + err = bpf_prog_assign_exc_cb(obj, prog); + if (err) + return err; + /* Now, also append exception callback if it has not been done already. */ + if (prog->exception_cb_idx >= 0) { + struct bpf_program *subprog = &obj->programs[prog->exception_cb_idx]; + + /* Calling exception callback directly is disallowed, which the + * verifier will reject later. In case it was processed already, + * we can skip this step, otherwise for all other valid cases we + * have to append exception callback now. + */ + if (subprog->sub_insn_off == 0) { + err = bpf_object__append_subprog_code(obj, prog, subprog); + if (err) + return err; + err = bpf_object__reloc_code(obj, prog, subprog); + if (err) + return err; + } + } } - /* Process data relos for main programs */ for (i = 0; i < obj->nr_programs; i++) { prog = &obj->programs[i]; if (prog_is_subprog(obj, prog)) continue; if (!prog->autoload) continue; + + /* Process data relos for main programs */ err = bpf_object__relocate_data(obj, prog); if (err) { pr_warn("prog '%s': failed to relocate data references: %d\n", prog->name, err); return err; } + + /* Fix up .BTF.ext information, if necessary */ + err = bpf_program_fixup_func_info(obj, prog); + if (err) { + pr_warn("prog '%s': failed to perform .BTF.ext fix ups: %d\n", + prog->name, err); + return err; + } } return 0; @@ -6596,12 +7266,12 @@ static int bpf_object__collect_relos(struct bpf_object *obj) data = sec_desc->data; idx = shdr->sh_info; - if (shdr->sh_type != SHT_REL) { + if (shdr->sh_type != SHT_REL || idx < 0 || idx >= obj->efile.sec_cnt) { pr_warn("internal error at %d\n", __LINE__); return -LIBBPF_ERRNO__INTERNAL; } - if (idx == obj->efile.st_ops_shndx) + if (obj->efile.secs[idx].sec_type == SEC_ST_OPS) err = bpf_object__collect_st_ops_relos(obj, shdr, data); else if (idx == obj->efile.btf_maps_shndx) err = bpf_object__collect_map_relos(obj, shdr, data); @@ -6682,17 +7352,31 @@ static int libbpf_prepare_prog_load(struct bpf_program *prog, if (prog->type == BPF_PROG_TYPE_XDP && (def & SEC_XDP_FRAGS)) opts->prog_flags |= BPF_F_XDP_HAS_FRAGS; - if (def & SEC_DEPRECATED) - pr_warn("SEC(\"%s\") is deprecated, please see https://github.com/libbpf/libbpf/wiki/Libbpf-1.0-migration-guide#bpf-program-sec-annotation-deprecations for details\n", - prog->sec_name); + /* special check for usdt to use uprobe_multi link */ + if ((def & SEC_USDT) && kernel_supports(prog->obj, FEAT_UPROBE_MULTI_LINK)) + prog->expected_attach_type = BPF_TRACE_UPROBE_MULTI; - if ((prog->type == BPF_PROG_TYPE_TRACING || - prog->type == BPF_PROG_TYPE_LSM || - prog->type == BPF_PROG_TYPE_EXT) && !prog->attach_btf_id) { + if ((def & SEC_ATTACH_BTF) && !prog->attach_btf_id) { int btf_obj_fd = 0, btf_type_id = 0, err; const char *attach_name; - attach_name = strchr(prog->sec_name, '/') + 1; + attach_name = strchr(prog->sec_name, '/'); + if (!attach_name) { + /* if BPF program is annotated with just SEC("fentry") + * (or similar) without declaratively specifying + * target, then it is expected that target will be + * specified with bpf_program__set_attach_target() at + * runtime before BPF object load step. If not, then + * there is nothing to load into the kernel as BPF + * verifier won't be able to validate BPF program + * correctness anyways. + */ + pr_warn("prog '%s': no BTF-based attach target is specified, use bpf_program__set_attach_target()\n", + prog->name); + return -EINVAL; + } + attach_name++; /* skip over / */ + err = libbpf_find_attach_btf_id(prog, attach_name, &btf_obj_fd, &btf_type_id); if (err) return err; @@ -6714,21 +7398,24 @@ static int libbpf_prepare_prog_load(struct bpf_program *prog, static void fixup_verifier_log(struct bpf_program *prog, char *buf, size_t buf_sz); -static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_program *prog, - struct bpf_insn *insns, int insns_cnt, - const char *license, __u32 kern_version, - int *prog_fd) +static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog, + struct bpf_insn *insns, int insns_cnt, + const char *license, __u32 kern_version, int *prog_fd) { LIBBPF_OPTS(bpf_prog_load_opts, load_attr); const char *prog_name = NULL; char *cp, errmsg[STRERR_BUFSIZE]; size_t log_buf_size = 0; char *log_buf = NULL, *tmp; - int btf_fd, ret, err; bool own_log_buf = true; __u32 log_level = prog->log_level; + int ret, err; - if (prog->type == BPF_PROG_TYPE_UNSPEC) { + /* Be more helpful by rejecting programs that can't be validated early + * with more meaningful and actionable error message. + */ + switch (prog->type) { + case BPF_PROG_TYPE_UNSPEC: /* * The program type must be set. Most likely we couldn't find a proper * section definition at load time, and thus we didn't infer the type. @@ -6736,12 +7423,20 @@ static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_prog pr_warn("prog '%s': missing BPF prog type, check ELF section name '%s'\n", prog->name, prog->sec_name); return -EINVAL; + case BPF_PROG_TYPE_STRUCT_OPS: + if (prog->attach_btf_id == 0) { + pr_warn("prog '%s': SEC(\"struct_ops\") program isn't referenced anywhere, did you forget to use it?\n", + prog->name); + return -EINVAL; + } + break; + default: + break; } if (!insns || !insns_cnt) return -EINVAL; - load_attr.expected_attach_type = prog->expected_attach_type; if (kernel_supports(obj, FEAT_PROG_NAME)) prog_name = prog->name; load_attr.attach_prog_fd = prog->attach_prog_fd; @@ -6751,9 +7446,8 @@ static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_prog load_attr.prog_ifindex = prog->prog_ifindex; /* specify func_info/line_info only if kernel supports them */ - btf_fd = bpf_object__btf_fd(obj); - if (btf_fd >= 0 && kernel_supports(obj, FEAT_BTF_FUNC)) { - load_attr.prog_btf_fd = btf_fd; + if (obj->btf && btf__fd(obj->btf) >= 0 && kernel_supports(obj, FEAT_BTF_FUNC)) { + load_attr.prog_btf_fd = btf__fd(obj->btf); load_attr.func_info = prog->func_info; load_attr.func_info_rec_size = prog->func_info_rec_size; load_attr.func_info_cnt = prog->func_info_cnt; @@ -6765,6 +7459,10 @@ static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_prog load_attr.prog_flags = prog->prog_flags; load_attr.fd_array = obj->fd_array; + load_attr.token_fd = obj->token_fd; + if (obj->token_fd) + load_attr.prog_flags |= BPF_F_TOKEN_FD; + /* adjust load_attr if sec_def provides custom preload callback */ if (prog->sec_def && prog->sec_def->prog_prepare_load_fn) { err = prog->sec_def->prog_prepare_load_fn(prog, &load_attr, prog->sec_def->cookie); @@ -6773,8 +7471,13 @@ static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_prog prog->name, err); return err; } + insns = prog->insns; + insns_cnt = prog->insns_cnt; } + /* allow prog_prepare_load_fn to change expected_attach_type */ + load_attr.expected_attach_type = prog->expected_attach_type; + if (obj->gen_loader) { bpf_gen__prog_load(obj->gen_loader, prog->type, prog->name, license, insns, insns_cnt, &load_attr, @@ -6784,7 +7487,7 @@ static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_prog } retry_load: - /* if log_level is zero, we don't request logs initiallly even if + /* if log_level is zero, we don't request logs initially even if * custom log_buf is specified; if the program load fails, then we'll * bump log_level to 1 and use either custom log_buf or we'll allocate * our own and retry the load to get details on what failed @@ -6831,7 +7534,7 @@ static int bpf_object_load_prog_instance(struct bpf_object *obj, struct bpf_prog if (map->libbpf_type != LIBBPF_MAP_RODATA) continue; - if (bpf_prog_bind_map(ret, bpf_map__fd(map), NULL)) { + if (bpf_prog_bind_map(ret, map->fd, NULL)) { cp = libbpf_strerror_r(errno, errmsg, sizeof(errmsg)); pr_warn("prog '%s': failed to bind map '%s': %s\n", prog->name, map->real_name, cp); @@ -6947,7 +7650,7 @@ static void fixup_log_failed_core_relo(struct bpf_program *prog, const struct bpf_core_relo *relo; struct bpf_core_spec spec; char patch[512], spec_buf[256]; - int insn_idx, err; + int insn_idx, err, spec_len; if (sscanf(line1, "%d: (%*d) call unknown#195896080\n", &insn_idx) != 1) return; @@ -6960,16 +7663,82 @@ static void fixup_log_failed_core_relo(struct bpf_program *prog, if (err) return; - bpf_core_format_spec(spec_buf, sizeof(spec_buf), &spec); + spec_len = bpf_core_format_spec(spec_buf, sizeof(spec_buf), &spec); snprintf(patch, sizeof(patch), "%d: \n" - "failed to resolve CO-RE relocation %s\n", - insn_idx, spec_buf); + "failed to resolve CO-RE relocation %s%s\n", + insn_idx, spec_buf, spec_len >= sizeof(spec_buf) ? "..." : ""); patch_log(buf, buf_sz, log_sz, line1, line3 - line1, patch); } -static void fixup_verifier_log(struct bpf_program *prog, char *buf, size_t buf_sz) +static void fixup_log_missing_map_load(struct bpf_program *prog, + char *buf, size_t buf_sz, size_t log_sz, + char *line1, char *line2, char *line3) +{ + /* Expected log for failed and not properly guarded map reference: + * line1 -> 123: (85) call unknown#2001000345 + * line2 -> invalid func unknown#2001000345 + * line3 -> + * + * "123" is the index of the instruction that was poisoned. + * "345" in "2001000345" is a map index in obj->maps to fetch map name. + */ + struct bpf_object *obj = prog->obj; + const struct bpf_map *map; + int insn_idx, map_idx; + char patch[128]; + + if (sscanf(line1, "%d: (%*d) call unknown#%d\n", &insn_idx, &map_idx) != 2) + return; + + map_idx -= POISON_LDIMM64_MAP_BASE; + if (map_idx < 0 || map_idx >= obj->nr_maps) + return; + map = &obj->maps[map_idx]; + + snprintf(patch, sizeof(patch), + "%d: \n" + "BPF map '%s' is referenced but wasn't created\n", + insn_idx, map->name); + + patch_log(buf, buf_sz, log_sz, line1, line3 - line1, patch); +} + +static void fixup_log_missing_kfunc_call(struct bpf_program *prog, + char *buf, size_t buf_sz, size_t log_sz, + char *line1, char *line2, char *line3) +{ + /* Expected log for failed and not properly guarded kfunc call: + * line1 -> 123: (85) call unknown#2002000345 + * line2 -> invalid func unknown#2002000345 + * line3 -> + * + * "123" is the index of the instruction that was poisoned. + * "345" in "2002000345" is an extern index in obj->externs to fetch kfunc name. + */ + struct bpf_object *obj = prog->obj; + const struct extern_desc *ext; + int insn_idx, ext_idx; + char patch[128]; + + if (sscanf(line1, "%d: (%*d) call unknown#%d\n", &insn_idx, &ext_idx) != 2) + return; + + ext_idx -= POISON_CALL_KFUNC_BASE; + if (ext_idx < 0 || ext_idx >= obj->nr_extern) + return; + ext = &obj->externs[ext_idx]; + + snprintf(patch, sizeof(patch), + "%d: \n" + "kfunc '%s' is referenced but wasn't resolved\n", + insn_idx, ext->name); + + patch_log(buf, buf_sz, log_sz, line1, line3 - line1, patch); +} + +static void fixup_verifier_log(struct bpf_program *prog, char *buf, size_t buf_sz) { /* look for familiar error patterns in last N lines of the log */ const size_t max_last_line_cnt = 10; @@ -6988,15 +7757,33 @@ static void fixup_verifier_log(struct bpf_program *prog, char *buf, size_t buf_s if (!cur_line) return; - /* failed CO-RE relocation case */ if (str_has_pfx(cur_line, "invalid func unknown#195896080\n")) { prev_line = find_prev_line(buf, cur_line); if (!prev_line) continue; + /* failed CO-RE relocation case */ fixup_log_failed_core_relo(prog, buf, buf_sz, log_sz, prev_line, cur_line, next_line); return; + } else if (str_has_pfx(cur_line, "invalid func unknown#"POISON_LDIMM64_MAP_PFX)) { + prev_line = find_prev_line(buf, cur_line); + if (!prev_line) + continue; + + /* reference to uncreated BPF map */ + fixup_log_missing_map_load(prog, buf, buf_sz, log_sz, + prev_line, cur_line, next_line); + return; + } else if (str_has_pfx(cur_line, "invalid func unknown#"POISON_CALL_KFUNC_PFX)) { + prev_line = find_prev_line(buf, cur_line); + if (!prev_line) + continue; + + /* reference to unresolved kfunc */ + fixup_log_missing_kfunc_call(prog, buf, buf_sz, log_sz, + prev_line, cur_line, next_line); + return; } } } @@ -7008,19 +7795,22 @@ static int bpf_program_record_relos(struct bpf_program *prog) for (i = 0; i < prog->nr_reloc; i++) { struct reloc_desc *relo = &prog->reloc_desc[i]; - struct extern_desc *ext = &obj->externs[relo->sym_off]; + struct extern_desc *ext = &obj->externs[relo->ext_idx]; + int kind; switch (relo->type) { - case RELO_EXTERN_VAR: + case RELO_EXTERN_LD64: if (ext->type != EXT_KSYM) continue; + kind = btf_is_var(btf__type_by_id(obj->btf, ext->btf_id)) ? + BTF_KIND_VAR : BTF_KIND_FUNC; bpf_gen__record_extern(obj->gen_loader, ext->name, ext->is_weak, !ext->ksym.type_id, - BTF_KIND_VAR, relo->insn_idx); + true, kind, relo->insn_idx); break; - case RELO_EXTERN_FUNC: + case RELO_EXTERN_CALL: bpf_gen__record_extern(obj->gen_loader, ext->name, - ext->is_weak, false, BTF_KIND_FUNC, + ext->is_weak, false, false, BTF_KIND_FUNC, relo->insn_idx); break; case RELO_CORE: { @@ -7041,93 +7831,6 @@ static int bpf_program_record_relos(struct bpf_program *prog) return 0; } -static int bpf_object_load_prog(struct bpf_object *obj, struct bpf_program *prog, - const char *license, __u32 kern_ver) -{ - int err = 0, fd, i; - - if (obj->loaded) { - pr_warn("prog '%s': can't load after object was loaded\n", prog->name); - return libbpf_err(-EINVAL); - } - - if (prog->instances.nr < 0 || !prog->instances.fds) { - if (prog->preprocessor) { - pr_warn("Internal error: can't load program '%s'\n", - prog->name); - return libbpf_err(-LIBBPF_ERRNO__INTERNAL); - } - - prog->instances.fds = malloc(sizeof(int)); - if (!prog->instances.fds) { - pr_warn("Not enough memory for BPF fds\n"); - return libbpf_err(-ENOMEM); - } - prog->instances.nr = 1; - prog->instances.fds[0] = -1; - } - - if (!prog->preprocessor) { - if (prog->instances.nr != 1) { - pr_warn("prog '%s': inconsistent nr(%d) != 1\n", - prog->name, prog->instances.nr); - } - if (obj->gen_loader) - bpf_program_record_relos(prog); - err = bpf_object_load_prog_instance(obj, prog, - prog->insns, prog->insns_cnt, - license, kern_ver, &fd); - if (!err) - prog->instances.fds[0] = fd; - goto out; - } - - for (i = 0; i < prog->instances.nr; i++) { - struct bpf_prog_prep_result result; - bpf_program_prep_t preprocessor = prog->preprocessor; - - memset(&result, 0, sizeof(result)); - err = preprocessor(prog, i, prog->insns, - prog->insns_cnt, &result); - if (err) { - pr_warn("Preprocessing the %dth instance of program '%s' failed\n", - i, prog->name); - goto out; - } - - if (!result.new_insn_ptr || !result.new_insn_cnt) { - pr_debug("Skip loading the %dth instance of program '%s'\n", - i, prog->name); - prog->instances.fds[i] = -1; - if (result.pfd) - *result.pfd = -1; - continue; - } - - err = bpf_object_load_prog_instance(obj, prog, - result.new_insn_ptr, result.new_insn_cnt, - license, kern_ver, &fd); - if (err) { - pr_warn("Loading the %dth instance of program '%s' failed\n", - i, prog->name); - goto out; - } - - if (result.pfd) - *result.pfd = fd; - prog->instances.fds[i] = fd; - } -out: - if (err) - pr_warn("failed to load program '%s'\n", prog->name); - return libbpf_err(err); -} - -int bpf_program__load(struct bpf_program *prog, const char *license, __u32 kern_ver) -{ - return bpf_object_load_prog(prog->obj, prog, license, kern_ver); -} - static int bpf_object__load_progs(struct bpf_object *obj, int log_level) { @@ -7151,9 +7854,16 @@ bpf_object__load_progs(struct bpf_object *obj, int log_level) continue; } prog->log_level |= log_level; - err = bpf_object_load_prog(obj, prog, obj->license, obj->kern_version); - if (err) + + if (obj->gen_loader) + bpf_program_record_relos(prog); + + err = bpf_object_load_prog(obj, prog, prog->insns, prog->insns_cnt, + obj->license, obj->kern_version, &prog->fd); + if (err) { + pr_warn("prog '%s': failed to load: %d\n", prog->name, err); return err; + } } bpf_object__free_relocs(obj); @@ -7179,13 +7889,6 @@ static int bpf_object_init_progs(struct bpf_object *obj, const struct bpf_object prog->type = prog->sec_def->prog_type; prog->expected_attach_type = prog->sec_def->expected_attach_type; -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" - if (prog->sec_def->prog_type == BPF_PROG_TYPE_TRACING || - prog->sec_def->prog_type == BPF_PROG_TYPE_EXT) - prog->attach_prog_fd = OPTS_GET(opts, attach_prog_fd, 0); -#pragma GCC diagnostic pop - /* sec_def can have custom callback which should be called * after bpf_program is initialized to adjust its properties */ @@ -7205,7 +7908,7 @@ static int bpf_object_init_progs(struct bpf_object *obj, const struct bpf_object static struct bpf_object *bpf_object_open(const char *path, const void *obj_buf, size_t obj_buf_sz, const struct bpf_object_open_opts *opts) { - const char *obj_name, *kconfig, *btf_tmp_path; + const char *obj_name, *kconfig, *btf_tmp_path, *token_path; struct bpf_object *obj; char tmp_name[64]; int err; @@ -7242,6 +7945,16 @@ static struct bpf_object *bpf_object_open(const char *path, const void *obj_buf, if (log_size && !log_buf) return ERR_PTR(-EINVAL); + token_path = OPTS_GET(opts, bpf_token_path, NULL); + /* if user didn't specify bpf_token_path explicitly, check if + * LIBBPF_BPF_TOKEN_PATH envvar was set and treat it as bpf_token_path + * option + */ + if (!token_path) + token_path = getenv("LIBBPF_BPF_TOKEN_PATH"); + if (token_path && strlen(token_path) >= PATH_MAX) + return ERR_PTR(-ENAMETOOLONG); + obj = bpf_object__new(path, obj_buf, obj_buf_sz, obj_name); if (IS_ERR(obj)) return obj; @@ -7250,6 +7963,14 @@ static struct bpf_object *bpf_object_open(const char *path, const void *obj_buf, obj->log_size = log_size; obj->log_level = log_level; + if (token_path) { + obj->token_path = strdup(token_path); + if (!obj->token_path) { + err = -ENOMEM; + goto out; + } + } + btf_tmp_path = OPTS_GET(opts, btf_custom_path, NULL); if (btf_tmp_path) { if (strlen(btf_tmp_path) >= PATH_MAX) { @@ -7276,7 +7997,7 @@ static struct bpf_object *bpf_object_open(const char *path, const void *obj_buf, err = err ? : bpf_object__check_endianness(obj); err = err ? : bpf_object__elf_collect(obj); err = err ? : bpf_object__collect_externs(obj); - err = err ? : bpf_object__finalize_btf(obj); + err = err ? : bpf_object_fixup_btf(obj); err = err ? : bpf_object__init_maps(obj, opts); err = err ? : bpf_object_init_progs(obj, opts); err = err ? : bpf_object__collect_relos(obj); @@ -7291,36 +8012,6 @@ static struct bpf_object *bpf_object_open(const char *path, const void *obj_buf, return ERR_PTR(err); } -static struct bpf_object * -__bpf_object__open_xattr(struct bpf_object_open_attr *attr, int flags) -{ - DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, - .relaxed_maps = flags & MAPS_RELAX_COMPAT, - ); - - /* param validation */ - if (!attr->file) - return NULL; - - pr_debug("loading %s\n", attr->file); - return bpf_object_open(attr->file, NULL, 0, &opts); -} - -struct bpf_object *bpf_object__open_xattr(struct bpf_object_open_attr *attr) -{ - return libbpf_ptr(__bpf_object__open_xattr(attr, 0)); -} - -struct bpf_object *bpf_object__open(const char *path) -{ - struct bpf_object_open_attr attr = { - .file = path, - .prog_type = BPF_PROG_TYPE_UNSPEC, - }; - - return libbpf_ptr(__bpf_object__open_xattr(&attr, 0)); -} - struct bpf_object * bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts) { @@ -7332,6 +8023,11 @@ bpf_object__open_file(const char *path, const struct bpf_object_open_opts *opts) return libbpf_ptr(bpf_object_open(path, NULL, 0, opts)); } +struct bpf_object *bpf_object__open(const char *path) +{ + return bpf_object__open_file(path, NULL); +} + struct bpf_object * bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, const struct bpf_object_open_opts *opts) @@ -7342,23 +8038,6 @@ bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, return libbpf_ptr(bpf_object_open(NULL, obj_buf, obj_buf_sz, opts)); } -struct bpf_object * -bpf_object__open_buffer(const void *obj_buf, size_t obj_buf_sz, - const char *name) -{ - DECLARE_LIBBPF_OPTS(bpf_object_open_opts, opts, - .object_name = name, - /* wrong default, but backwards-compatible */ - .relaxed_maps = true, - ); - - /* returning NULL is wrong, but backwards-compatible */ - if (!obj_buf || obj_buf_sz == 0) - return errno = EINVAL, NULL; - - return libbpf_ptr(bpf_object_open(NULL, obj_buf, obj_buf_sz, &opts)); -} - static int bpf_object_unload(struct bpf_object *obj) { size_t i; @@ -7378,8 +8057,6 @@ static int bpf_object_unload(struct bpf_object *obj) return 0; } -int bpf_object__unload(struct bpf_object *obj) __attribute__((alias("bpf_object_unload"))); - static int bpf_object__sanitize_maps(struct bpf_object *obj) { struct bpf_map *m; @@ -7388,20 +8065,23 @@ static int bpf_object__sanitize_maps(struct bpf_object *obj) if (!bpf_map__is_internal(m)) continue; if (!kernel_supports(obj, FEAT_ARRAY_MMAP)) - m->def.map_flags ^= BPF_F_MMAPABLE; + m->def.map_flags &= ~BPF_F_MMAPABLE; } return 0; } -int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *ctx) +typedef int (*kallsyms_cb_t)(unsigned long long sym_addr, char sym_type, + const char *sym_name, void *ctx); + +static int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *ctx) { char sym_type, sym_name[500]; unsigned long long sym_addr; int ret, err = 0; FILE *f; - f = fopen("/proc/kallsyms", "r"); + f = fopen("/proc/kallsyms", "re"); if (!f) { err = -errno; pr_warn("failed to open /proc/kallsyms: %d\n", err); @@ -7434,8 +8114,13 @@ static int kallsyms_cb(unsigned long long sym_addr, char sym_type, struct bpf_object *obj = ctx; const struct btf_type *t; struct extern_desc *ext; + char *res; - ext = find_extern_by_name(obj, sym_name); + res = strstr(sym_name, ".llvm."); + if (sym_type == 'd' && res) + ext = find_extern_by_name_with_len(obj, sym_name, res - sym_name); + else + ext = find_extern_by_name(obj, sym_name); if (!ext || ext->type != EXT_KSYM) return 0; @@ -7444,14 +8129,14 @@ static int kallsyms_cb(unsigned long long sym_addr, char sym_type, return 0; if (ext->is_set && ext->ksym.addr != sym_addr) { - pr_warn("extern (ksym) '%s' resolution is ambiguous: 0x%llx or 0x%llx\n", + pr_warn("extern (ksym) '%s': resolution is ambiguous: 0x%llx or 0x%llx\n", sym_name, ext->ksym.addr, sym_addr); return -EINVAL; } if (!ext->is_set) { ext->is_set = true; ext->ksym.addr = sym_addr; - pr_debug("extern (ksym) %s=0x%llx\n", sym_name, sym_addr); + pr_debug("extern (ksym) '%s': set to 0x%llx\n", sym_name, sym_addr); } return 0; } @@ -7559,7 +8244,8 @@ static int bpf_object__resolve_ksym_func_btf_id(struct bpf_object *obj, local_func_proto_id = ext->ksym.type_id; - kfunc_id = find_ksym_btf_id(obj, ext->name, BTF_KIND_FUNC, &kern_btf, &mod_btf); + kfunc_id = find_ksym_btf_id(obj, ext->essent_name ?: ext->name, BTF_KIND_FUNC, &kern_btf, + &mod_btf); if (kfunc_id < 0) { if (kfunc_id == -ESRCH && ext->is_weak) return 0; @@ -7574,8 +8260,12 @@ static int bpf_object__resolve_ksym_func_btf_id(struct bpf_object *obj, ret = bpf_core_types_are_compat(obj->btf, local_func_proto_id, kern_btf, kfunc_proto_id); if (ret <= 0) { - pr_warn("extern (func ksym) '%s': func_proto [%d] incompatible with kernel [%d]\n", - ext->name, local_func_proto_id, kfunc_proto_id); + if (ext->is_weak) + return 0; + + pr_warn("extern (func ksym) '%s': func_proto [%d] incompatible with %s [%d]\n", + ext->name, local_func_proto_id, + mod_btf ? mod_btf->name : "vmlinux", kfunc_proto_id); return -EINVAL; } @@ -7603,8 +8293,14 @@ static int bpf_object__resolve_ksym_func_btf_id(struct bpf_object *obj, ext->is_set = true; ext->ksym.kernel_btf_id = kfunc_id; ext->ksym.btf_fd_idx = mod_btf ? mod_btf->fd_array_idx : 0; - pr_debug("extern (func ksym) '%s': resolved to kernel [%d]\n", - ext->name, kfunc_id); + /* Also set kernel_btf_obj_fd to make sure that bpf_object__relocate_data() + * populates FD into ld_imm64 insn when it's used to point to kfunc. + * {kernel_btf_id, btf_fd_idx} -> fixup bpf_call. + * {kernel_btf_id, kernel_btf_obj_fd} -> fixup ld_imm64. + */ + ext->ksym.kernel_btf_obj_fd = mod_btf ? mod_btf->fd : 0; + pr_debug("extern (func ksym) '%s': resolved to %s [%d]\n", + ext->name, mod_btf ? mod_btf->name : "vmlinux", kfunc_id); return 0; } @@ -7655,28 +8351,52 @@ static int bpf_object__resolve_externs(struct bpf_object *obj, for (i = 0; i < obj->nr_extern; i++) { ext = &obj->externs[i]; - if (ext->type == EXT_KCFG && - strcmp(ext->name, "LINUX_KERNEL_VERSION") == 0) { - void *ext_val = kcfg_data + ext->kcfg.data_off; - __u32 kver = get_kernel_version(); + if (ext->type == EXT_KSYM) { + if (ext->ksym.type_id) + need_vmlinux_btf = true; + else + need_kallsyms = true; + continue; + } else if (ext->type == EXT_KCFG) { + void *ext_ptr = kcfg_data + ext->kcfg.data_off; + __u64 value = 0; + + /* Kconfig externs need actual /proc/config.gz */ + if (str_has_pfx(ext->name, "CONFIG_")) { + need_config = true; + continue; + } - if (!kver) { - pr_warn("failed to get kernel version\n"); + /* Virtual kcfg externs are customly handled by libbpf */ + if (strcmp(ext->name, "LINUX_KERNEL_VERSION") == 0) { + value = get_kernel_version(); + if (!value) { + pr_warn("extern (kcfg) '%s': failed to get kernel version\n", ext->name); + return -EINVAL; + } + } else if (strcmp(ext->name, "LINUX_HAS_BPF_COOKIE") == 0) { + value = kernel_supports(obj, FEAT_BPF_COOKIE); + } else if (strcmp(ext->name, "LINUX_HAS_SYSCALL_WRAPPER") == 0) { + value = kernel_supports(obj, FEAT_SYSCALL_WRAPPER); + } else if (!str_has_pfx(ext->name, "LINUX_") || !ext->is_weak) { + /* Currently libbpf supports only CONFIG_ and LINUX_ prefixed + * __kconfig externs, where LINUX_ ones are virtual and filled out + * customly by libbpf (their values don't come from Kconfig). + * If LINUX_xxx variable is not recognized by libbpf, but is marked + * __weak, it defaults to zero value, just like for CONFIG_xxx + * externs. + */ + pr_warn("extern (kcfg) '%s': unrecognized virtual extern\n", ext->name); return -EINVAL; } - err = set_kcfg_value_num(ext, ext_val, kver); + + err = set_kcfg_value_num(ext, ext_ptr, value); if (err) return err; - pr_debug("extern (kcfg) %s=0x%x\n", ext->name, kver); - } else if (ext->type == EXT_KCFG && str_has_pfx(ext->name, "CONFIG_")) { - need_config = true; - } else if (ext->type == EXT_KSYM) { - if (ext->ksym.type_id) - need_vmlinux_btf = true; - else - need_kallsyms = true; + pr_debug("extern (kcfg) '%s': set to 0x%llx\n", + ext->name, (long long)value); } else { - pr_warn("unrecognized extern '%s'\n", ext->name); + pr_warn("extern '%s': unrecognized extern kind\n", ext->name); return -EINVAL; } } @@ -7712,10 +8432,10 @@ static int bpf_object__resolve_externs(struct bpf_object *obj, ext = &obj->externs[i]; if (!ext->is_set && !ext->is_weak) { - pr_warn("extern %s (strong) not resolved\n", ext->name); + pr_warn("extern '%s' (strong): not resolved\n", ext->name); return -ESRCH; } else if (!ext->is_set) { - pr_debug("extern %s (weak) not resolved, defaulting to zero\n", + pr_debug("extern '%s' (weak): not resolved, defaulting to zero\n", ext->name); } } @@ -7723,6 +8443,46 @@ static int bpf_object__resolve_externs(struct bpf_object *obj, return 0; } +static void bpf_map_prepare_vdata(const struct bpf_map *map) +{ + struct bpf_struct_ops *st_ops; + __u32 i; + + st_ops = map->st_ops; + for (i = 0; i < btf_vlen(st_ops->type); i++) { + struct bpf_program *prog = st_ops->progs[i]; + void *kern_data; + int prog_fd; + + if (!prog) + continue; + + prog_fd = bpf_program__fd(prog); + kern_data = st_ops->kern_vdata + st_ops->kern_func_off[i]; + *(unsigned long *)kern_data = prog_fd; + } +} + +static int bpf_object_prepare_struct_ops(struct bpf_object *obj) +{ + struct bpf_map *map; + int i; + + for (i = 0; i < obj->nr_maps; i++) { + map = &obj->maps[i]; + + if (!bpf_map__is_struct_ops(map)) + continue; + + if (!map->autocreate) + continue; + + bpf_map_prepare_vdata(map); + } + + return 0; +} + static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const char *target_btf_path) { int err, i; @@ -7738,23 +8498,24 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch if (obj->gen_loader) bpf_gen__init(obj->gen_loader, extra_log_level, obj->nr_programs, obj->nr_maps); - err = bpf_object__probe_loading(obj); + err = bpf_object_prepare_token(obj); + err = err ? : bpf_object__probe_loading(obj); err = err ? : bpf_object__load_vmlinux_btf(obj, false); err = err ? : bpf_object__resolve_externs(obj, obj->kconfig); - err = err ? : bpf_object__sanitize_and_load_btf(obj); err = err ? : bpf_object__sanitize_maps(obj); err = err ? : bpf_object__init_kern_struct_ops_maps(obj); - err = err ? : bpf_object__create_maps(obj); + err = err ? : bpf_object_adjust_struct_ops_autoload(obj); err = err ? : bpf_object__relocate(obj, obj->btf_custom_path ? : target_btf_path); + err = err ? : bpf_object__sanitize_and_load_btf(obj); + err = err ? : bpf_object__create_maps(obj); err = err ? : bpf_object__load_progs(obj, extra_log_level); err = err ? : bpf_object_init_prog_arrays(obj); + err = err ? : bpf_object_prepare_struct_ops(obj); if (obj->gen_loader) { /* reset FDs */ if (obj->btf) btf__set_fd(obj->btf, -1); - for (i = 0; i < obj->nr_maps; i++) - obj->maps[i].fd = -1; if (!err) err = bpf_gen__finish(obj->gen_loader, obj->nr_programs, obj->nr_maps); } @@ -7791,11 +8552,6 @@ static int bpf_object_load(struct bpf_object *obj, int extra_log_level, const ch return libbpf_err(err); } -int bpf_object__load_xattr(struct bpf_object_load_attr *attr) -{ - return bpf_object_load(attr->obj, attr->log_level, attr->target_btf_path); -} - int bpf_object__load(struct bpf_object *obj) { return bpf_object_load(obj, 0, NULL); @@ -7853,11 +8609,16 @@ static int check_path(const char *path) return err; } -static int bpf_program_pin_instance(struct bpf_program *prog, const char *path, int instance) +int bpf_program__pin(struct bpf_program *prog, const char *path) { char *cp, errmsg[STRERR_BUFSIZE]; int err; + if (prog->fd < 0) { + pr_warn("prog '%s': can't pin program that wasn't loaded\n", prog->name); + return libbpf_err(-EINVAL); + } + err = make_parent_dir(path); if (err) return libbpf_err(err); @@ -7866,192 +8627,62 @@ static int bpf_program_pin_instance(struct bpf_program *prog, const char *path, if (err) return libbpf_err(err); - if (prog == NULL) { - pr_warn("invalid program pointer\n"); - return libbpf_err(-EINVAL); - } - - if (instance < 0 || instance >= prog->instances.nr) { - pr_warn("invalid prog instance %d of prog %s (max %d)\n", - instance, prog->name, prog->instances.nr); - return libbpf_err(-EINVAL); - } - - if (bpf_obj_pin(prog->instances.fds[instance], path)) { + if (bpf_obj_pin(prog->fd, path)) { err = -errno; cp = libbpf_strerror_r(err, errmsg, sizeof(errmsg)); - pr_warn("failed to pin program: %s\n", cp); + pr_warn("prog '%s': failed to pin at '%s': %s\n", prog->name, path, cp); return libbpf_err(err); } - pr_debug("pinned program '%s'\n", path); + pr_debug("prog '%s': pinned at '%s'\n", prog->name, path); return 0; } -static int bpf_program_unpin_instance(struct bpf_program *prog, const char *path, int instance) +int bpf_program__unpin(struct bpf_program *prog, const char *path) { int err; - err = check_path(path); - if (err) - return libbpf_err(err); - - if (prog == NULL) { - pr_warn("invalid program pointer\n"); + if (prog->fd < 0) { + pr_warn("prog '%s': can't unpin program that wasn't loaded\n", prog->name); return libbpf_err(-EINVAL); } - if (instance < 0 || instance >= prog->instances.nr) { - pr_warn("invalid prog instance %d of prog %s (max %d)\n", - instance, prog->name, prog->instances.nr); - return libbpf_err(-EINVAL); - } + err = check_path(path); + if (err) + return libbpf_err(err); err = unlink(path); - if (err != 0) + if (err) return libbpf_err(-errno); - pr_debug("unpinned program '%s'\n", path); - + pr_debug("prog '%s': unpinned from '%s'\n", prog->name, path); return 0; } -__attribute__((alias("bpf_program_pin_instance"))) -int bpf_object__pin_instance(struct bpf_program *prog, const char *path, int instance); - -__attribute__((alias("bpf_program_unpin_instance"))) -int bpf_program__unpin_instance(struct bpf_program *prog, const char *path, int instance); - -int bpf_program__pin(struct bpf_program *prog, const char *path) +int bpf_map__pin(struct bpf_map *map, const char *path) { - int i, err; - - err = make_parent_dir(path); - if (err) - return libbpf_err(err); - - err = check_path(path); - if (err) - return libbpf_err(err); + char *cp, errmsg[STRERR_BUFSIZE]; + int err; - if (prog == NULL) { - pr_warn("invalid program pointer\n"); + if (map == NULL) { + pr_warn("invalid map pointer\n"); return libbpf_err(-EINVAL); } - if (prog->instances.nr <= 0) { - pr_warn("no instances of prog %s to pin\n", prog->name); + if (map->fd < 0) { + pr_warn("map '%s': can't pin BPF map without FD (was it created?)\n", map->name); return libbpf_err(-EINVAL); } - if (prog->instances.nr == 1) { - /* don't create subdirs when pinning single instance */ - return bpf_program_pin_instance(prog, path, 0); - } - - for (i = 0; i < prog->instances.nr; i++) { - char buf[PATH_MAX]; - int len; - - len = snprintf(buf, PATH_MAX, "%s/%d", path, i); - if (len < 0) { - err = -EINVAL; - goto err_unpin; - } else if (len >= PATH_MAX) { - err = -ENAMETOOLONG; - goto err_unpin; - } - - err = bpf_program_pin_instance(prog, buf, i); - if (err) - goto err_unpin; - } - - return 0; - -err_unpin: - for (i = i - 1; i >= 0; i--) { - char buf[PATH_MAX]; - int len; - - len = snprintf(buf, PATH_MAX, "%s/%d", path, i); - if (len < 0) - continue; - else if (len >= PATH_MAX) - continue; - - bpf_program_unpin_instance(prog, buf, i); - } - - rmdir(path); - - return libbpf_err(err); -} - -int bpf_program__unpin(struct bpf_program *prog, const char *path) -{ - int i, err; - - err = check_path(path); - if (err) - return libbpf_err(err); - - if (prog == NULL) { - pr_warn("invalid program pointer\n"); - return libbpf_err(-EINVAL); - } - - if (prog->instances.nr <= 0) { - pr_warn("no instances of prog %s to pin\n", prog->name); - return libbpf_err(-EINVAL); - } - - if (prog->instances.nr == 1) { - /* don't create subdirs when pinning single instance */ - return bpf_program_unpin_instance(prog, path, 0); - } - - for (i = 0; i < prog->instances.nr; i++) { - char buf[PATH_MAX]; - int len; - - len = snprintf(buf, PATH_MAX, "%s/%d", path, i); - if (len < 0) - return libbpf_err(-EINVAL); - else if (len >= PATH_MAX) - return libbpf_err(-ENAMETOOLONG); - - err = bpf_program_unpin_instance(prog, buf, i); - if (err) - return err; - } - - err = rmdir(path); - if (err) - return libbpf_err(-errno); - - return 0; -} - -int bpf_map__pin(struct bpf_map *map, const char *path) -{ - char *cp, errmsg[STRERR_BUFSIZE]; - int err; - - if (map == NULL) { - pr_warn("invalid map pointer\n"); - return libbpf_err(-EINVAL); - } - - if (map->pin_path) { - if (path && strcmp(path, map->pin_path)) { - pr_warn("map '%s' already has pin path '%s' different from '%s'\n", - bpf_map__name(map), map->pin_path, path); - return libbpf_err(-EINVAL); - } else if (map->pinned) { - pr_debug("map '%s' already pinned at '%s'; not re-pinning\n", - bpf_map__name(map), map->pin_path); - return 0; + if (map->pin_path) { + if (path && strcmp(path, map->pin_path)) { + pr_warn("map '%s' already has pin path '%s' different from '%s'\n", + bpf_map__name(map), map->pin_path, path); + return libbpf_err(-EINVAL); + } else if (map->pinned) { + pr_debug("map '%s' already pinned at '%s'; not re-pinning\n", + bpf_map__name(map), map->pin_path); + return 0; } } else { if (!path) { @@ -8185,21 +8816,13 @@ int bpf_object__pin_maps(struct bpf_object *obj, const char *path) char *pin_path = NULL; char buf[PATH_MAX]; - if (map->skipped) + if (!map->autocreate) continue; if (path) { - int len; - - len = snprintf(buf, PATH_MAX, "%s/%s", path, - bpf_map__name(map)); - if (len < 0) { - err = -EINVAL; - goto err_unpin_maps; - } else if (len >= PATH_MAX) { - err = -ENAMETOOLONG; + err = pathname_concat(buf, sizeof(buf), path, bpf_map__name(map)); + if (err) goto err_unpin_maps; - } sanitize_pin_path(buf); pin_path = buf; } else if (!map->pin_path) { @@ -8237,14 +8860,9 @@ int bpf_object__unpin_maps(struct bpf_object *obj, const char *path) char buf[PATH_MAX]; if (path) { - int len; - - len = snprintf(buf, PATH_MAX, "%s/%s", path, - bpf_map__name(map)); - if (len < 0) - return libbpf_err(-EINVAL); - else if (len >= PATH_MAX) - return libbpf_err(-ENAMETOOLONG); + err = pathname_concat(buf, sizeof(buf), path, bpf_map__name(map)); + if (err) + return libbpf_err(err); sanitize_pin_path(buf); pin_path = buf; } else if (!map->pin_path) { @@ -8262,6 +8880,7 @@ int bpf_object__unpin_maps(struct bpf_object *obj, const char *path) int bpf_object__pin_programs(struct bpf_object *obj, const char *path) { struct bpf_program *prog; + char buf[PATH_MAX]; int err; if (!obj) @@ -8273,18 +8892,9 @@ int bpf_object__pin_programs(struct bpf_object *obj, const char *path) } bpf_object__for_each_program(prog, obj) { - char buf[PATH_MAX]; - int len; - - len = snprintf(buf, PATH_MAX, "%s/%s", path, - prog->pin_name); - if (len < 0) { - err = -EINVAL; - goto err_unpin_programs; - } else if (len >= PATH_MAX) { - err = -ENAMETOOLONG; + err = pathname_concat(buf, sizeof(buf), path, prog->name); + if (err) goto err_unpin_programs; - } err = bpf_program__pin(prog, buf); if (err) @@ -8295,14 +8905,7 @@ int bpf_object__pin_programs(struct bpf_object *obj, const char *path) err_unpin_programs: while ((prog = bpf_object__prev_program(obj, prog))) { - char buf[PATH_MAX]; - int len; - - len = snprintf(buf, PATH_MAX, "%s/%s", path, - prog->pin_name); - if (len < 0) - continue; - else if (len >= PATH_MAX) + if (pathname_concat(buf, sizeof(buf), path, prog->name)) continue; bpf_program__unpin(prog, buf); @@ -8321,14 +8924,10 @@ int bpf_object__unpin_programs(struct bpf_object *obj, const char *path) bpf_object__for_each_program(prog, obj) { char buf[PATH_MAX]; - int len; - len = snprintf(buf, PATH_MAX, "%s/%s", path, - prog->pin_name); - if (len < 0) - return libbpf_err(-EINVAL); - else if (len >= PATH_MAX) - return libbpf_err(-ENAMETOOLONG); + err = pathname_concat(buf, sizeof(buf), path, prog->name); + if (err) + return libbpf_err(err); err = bpf_program__unpin(prog, buf); if (err) @@ -8355,13 +8954,23 @@ int bpf_object__pin(struct bpf_object *obj, const char *path) return 0; } -static void bpf_map__destroy(struct bpf_map *map) +int bpf_object__unpin(struct bpf_object *obj, const char *path) { - if (map->clear_priv) - map->clear_priv(map, map->priv); - map->priv = NULL; - map->clear_priv = NULL; + int err; + + err = bpf_object__unpin_programs(obj, path); + if (err) + return libbpf_err(err); + + err = bpf_object__unpin_maps(obj, path); + if (err) + return libbpf_err(err); + return 0; +} + +static void bpf_map__destroy(struct bpf_map *map) +{ if (map->inner_map) { bpf_map__destroy(map->inner_map); zfree(&map->inner_map); @@ -8370,10 +8979,9 @@ static void bpf_map__destroy(struct bpf_map *map) zfree(&map->init_slots); map->init_slots_sz = 0; - if (map->mmaped) { + if (map->mmaped && map->mmaped != map->obj->arena_data) munmap(map->mmaped, bpf_map_mmap_sz(map)); - map->mmaped = NULL; - } + map->mmaped = NULL; if (map->st_ops) { zfree(&map->st_ops->data); @@ -8397,9 +9005,6 @@ void bpf_object__close(struct bpf_object *obj) if (IS_ERR_OR_NULL(obj)) return; - if (obj->clear_priv) - obj->clear_priv(obj, obj->priv); - usdt_manager_free(obj->usdt_man); obj->usdt_man = NULL; @@ -8407,6 +9012,7 @@ void bpf_object__close(struct bpf_object *obj) bpf_object__elf_finish(obj); bpf_object_unload(obj); btf__free(obj->btf); + btf__free(obj->btf_vmlinux); btf_ext__free(obj->btf_ext); for (i = 0; i < obj->nr_maps; i++) @@ -8414,6 +9020,10 @@ void bpf_object__close(struct bpf_object *obj) zfree(&obj->btf_custom_path); zfree(&obj->kconfig); + + for (i = 0; i < obj->nr_extern; i++) + zfree(&obj->externs[i].essent_name); + zfree(&obj->externs); obj->nr_extern = 0; @@ -8426,31 +9036,14 @@ void bpf_object__close(struct bpf_object *obj) } zfree(&obj->programs); - list_del(&obj->list); - free(obj); -} - -struct bpf_object * -bpf_object__next(struct bpf_object *prev) -{ - struct bpf_object *next; - bool strict = (libbpf_mode & LIBBPF_STRICT_NO_OBJECT_LIST); - - if (strict) - return NULL; - - if (!prev) - next = list_first_entry(&bpf_objects_list, - struct bpf_object, - list); - else - next = list_next_entry(prev, list); + zfree(&obj->feat_cache); + zfree(&obj->token_path); + if (obj->token_fd > 0) + close(obj->token_fd); - /* Empty list is noticed here so don't need checking on entry. */ - if (&next->list == &bpf_objects_list) - return NULL; + zfree(&obj->arena_data); - return next; + free(obj); } const char *bpf_object__name(const struct bpf_object *obj) @@ -8483,22 +9076,6 @@ int bpf_object__set_kversion(struct bpf_object *obj, __u32 kern_version) return 0; } -int bpf_object__set_priv(struct bpf_object *obj, void *priv, - bpf_object_clear_priv_t clear_priv) -{ - if (obj->priv && obj->clear_priv) - obj->clear_priv(obj, obj->priv); - - obj->priv = priv; - obj->clear_priv = clear_priv; - return 0; -} - -void *bpf_object__priv(const struct bpf_object *obj) -{ - return obj ? obj->priv : libbpf_err_ptr(-EINVAL); -} - int bpf_object__gen_loader(struct bpf_object *obj, struct gen_loader_opts *opts) { struct bpf_gen *gen; @@ -8541,12 +9118,6 @@ __bpf_program__iter(const struct bpf_program *p, const struct bpf_object *obj, return &obj->programs[idx]; } -struct bpf_program * -bpf_program__next(struct bpf_program *prev, const struct bpf_object *obj) -{ - return bpf_object__next_program(obj, prev); -} - struct bpf_program * bpf_object__next_program(const struct bpf_object *obj, struct bpf_program *prev) { @@ -8559,12 +9130,6 @@ bpf_object__next_program(const struct bpf_object *obj, struct bpf_program *prev) return prog; } -struct bpf_program * -bpf_program__prev(struct bpf_program *next, const struct bpf_object *obj) -{ - return bpf_object__prev_program(obj, next); -} - struct bpf_program * bpf_object__prev_program(const struct bpf_object *obj, struct bpf_program *next) { @@ -8577,22 +9142,6 @@ bpf_object__prev_program(const struct bpf_object *obj, struct bpf_program *next) return prog; } -int bpf_program__set_priv(struct bpf_program *prog, void *priv, - bpf_program_clear_priv_t clear_priv) -{ - if (prog->priv && prog->clear_priv) - prog->clear_priv(prog, prog->priv); - - prog->priv = priv; - prog->clear_priv = clear_priv; - return 0; -} - -void *bpf_program__priv(const struct bpf_program *prog) -{ - return prog ? prog->priv : libbpf_err_ptr(-EINVAL); -} - void bpf_program__set_ifindex(struct bpf_program *prog, __u32 ifindex) { prog->prog_ifindex = ifindex; @@ -8608,22 +9157,6 @@ const char *bpf_program__section_name(const struct bpf_program *prog) return prog->sec_name; } -const char *bpf_program__title(const struct bpf_program *prog, bool needs_copy) -{ - const char *title; - - title = prog->sec_name; - if (needs_copy) { - title = strdup(title); - if (!title) { - pr_warn("failed to strdup program title\n"); - return libbpf_err_ptr(-ENOMEM); - } - } - - return title; -} - bool bpf_program__autoload(const struct bpf_program *prog) { return prog->autoload; @@ -8638,16 +9171,14 @@ int bpf_program__set_autoload(struct bpf_program *prog, bool autoload) return 0; } -static int bpf_program_nth_fd(const struct bpf_program *prog, int n); - -int bpf_program__fd(const struct bpf_program *prog) +bool bpf_program__autoattach(const struct bpf_program *prog) { - return bpf_program_nth_fd(prog, 0); + return prog->autoattach; } -size_t bpf_program__size(const struct bpf_program *prog) +void bpf_program__set_autoattach(struct bpf_program *prog, bool autoattach) { - return prog->insns_cnt * BPF_INSN_SZ; + prog->autoattach = autoattach; } const struct bpf_insn *bpf_program__insns(const struct bpf_program *prog) @@ -8660,58 +9191,36 @@ size_t bpf_program__insn_cnt(const struct bpf_program *prog) return prog->insns_cnt; } -int bpf_program__set_prep(struct bpf_program *prog, int nr_instances, - bpf_program_prep_t prep) +int bpf_program__set_insns(struct bpf_program *prog, + struct bpf_insn *new_insns, size_t new_insn_cnt) { - int *instances_fds; - - if (nr_instances <= 0 || !prep) - return libbpf_err(-EINVAL); + struct bpf_insn *insns; - if (prog->instances.nr > 0 || prog->instances.fds) { - pr_warn("Can't set pre-processor after loading\n"); - return libbpf_err(-EINVAL); - } + if (prog->obj->loaded) + return -EBUSY; - instances_fds = malloc(sizeof(int) * nr_instances); - if (!instances_fds) { - pr_warn("alloc memory failed for fds\n"); - return libbpf_err(-ENOMEM); + insns = libbpf_reallocarray(prog->insns, new_insn_cnt, sizeof(*insns)); + /* NULL is a valid return from reallocarray if the new count is zero */ + if (!insns && new_insn_cnt) { + pr_warn("prog '%s': failed to realloc prog code\n", prog->name); + return -ENOMEM; } + memcpy(insns, new_insns, new_insn_cnt * sizeof(*insns)); - /* fill all fd with -1 */ - memset(instances_fds, -1, sizeof(int) * nr_instances); - - prog->instances.nr = nr_instances; - prog->instances.fds = instances_fds; - prog->preprocessor = prep; + prog->insns = insns; + prog->insns_cnt = new_insn_cnt; return 0; } -__attribute__((alias("bpf_program_nth_fd"))) -int bpf_program__nth_fd(const struct bpf_program *prog, int n); - -static int bpf_program_nth_fd(const struct bpf_program *prog, int n) +int bpf_program__fd(const struct bpf_program *prog) { - int fd; - if (!prog) return libbpf_err(-EINVAL); - if (n >= prog->instances.nr || n < 0) { - pr_warn("Can't get the %dth fd from program %s: only %d instances\n", - n, prog->name, prog->instances.nr); - return libbpf_err(-EINVAL); - } - - fd = prog->instances.fds[n]; - if (fd < 0) { - pr_warn("%dth instance of program '%s' is invalid\n", - n, prog->name); + if (prog->fd < 0) return libbpf_err(-ENOENT); - } - return fd; + return prog->fd; } __alias(bpf_program__type) @@ -8722,48 +9231,34 @@ enum bpf_prog_type bpf_program__type(const struct bpf_program *prog) return prog->type; } +static size_t custom_sec_def_cnt; +static struct bpf_sec_def *custom_sec_defs; +static struct bpf_sec_def custom_fallback_def; +static bool has_custom_fallback_def; +static int last_custom_sec_def_handler_id; + int bpf_program__set_type(struct bpf_program *prog, enum bpf_prog_type type) { if (prog->obj->loaded) return libbpf_err(-EBUSY); + /* if type is not changed, do nothing */ + if (prog->type == type) + return 0; + prog->type = type; + + /* If a program type was changed, we need to reset associated SEC() + * handler, as it will be invalid now. The only exception is a generic + * fallback handler, which by definition is program type-agnostic and + * is a catch-all custom handler, optionally set by the application, + * so should be able to handle any type of BPF program. + */ + if (prog->sec_def != &custom_fallback_def) + prog->sec_def = NULL; return 0; } -static bool bpf_program__is_type(const struct bpf_program *prog, - enum bpf_prog_type type) -{ - return prog ? (prog->type == type) : false; -} - -#define BPF_PROG_TYPE_FNS(NAME, TYPE) \ -int bpf_program__set_##NAME(struct bpf_program *prog) \ -{ \ - if (!prog) \ - return libbpf_err(-EINVAL); \ - return bpf_program__set_type(prog, TYPE); \ -} \ - \ -bool bpf_program__is_##NAME(const struct bpf_program *prog) \ -{ \ - return bpf_program__is_type(prog, TYPE); \ -} \ - -BPF_PROG_TYPE_FNS(socket_filter, BPF_PROG_TYPE_SOCKET_FILTER); -BPF_PROG_TYPE_FNS(lsm, BPF_PROG_TYPE_LSM); -BPF_PROG_TYPE_FNS(kprobe, BPF_PROG_TYPE_KPROBE); -BPF_PROG_TYPE_FNS(sched_cls, BPF_PROG_TYPE_SCHED_CLS); -BPF_PROG_TYPE_FNS(sched_act, BPF_PROG_TYPE_SCHED_ACT); -BPF_PROG_TYPE_FNS(tracepoint, BPF_PROG_TYPE_TRACEPOINT); -BPF_PROG_TYPE_FNS(raw_tracepoint, BPF_PROG_TYPE_RAW_TRACEPOINT); -BPF_PROG_TYPE_FNS(xdp, BPF_PROG_TYPE_XDP); -BPF_PROG_TYPE_FNS(perf_event, BPF_PROG_TYPE_PERF_EVENT); -BPF_PROG_TYPE_FNS(tracing, BPF_PROG_TYPE_TRACING); -BPF_PROG_TYPE_FNS(struct_ops, BPF_PROG_TYPE_STRUCT_OPS); -BPF_PROG_TYPE_FNS(extension, BPF_PROG_TYPE_EXT); -BPF_PROG_TYPE_FNS(sk_lookup, BPF_PROG_TYPE_SK_LOOKUP); - __alias(bpf_program__expected_attach_type) enum bpf_attach_type bpf_program__get_expected_attach_type(const struct bpf_program *prog); @@ -8841,102 +9336,121 @@ int bpf_program__set_log_buf(struct bpf_program *prog, char *log_buf, size_t log static int attach_kprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_uprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_ksyscall(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_usdt(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_trace(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_kprobe_session(const struct bpf_program *prog, long cookie, struct bpf_link **link); +static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_link **link); static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_link **link); static const struct bpf_sec_def section_defs[] = { - SEC_DEF("socket", SOCKET_FILTER, 0, SEC_NONE | SEC_SLOPPY_PFX), - SEC_DEF("sk_reuseport/migrate", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("sk_reuseport", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("kprobe/", KPROBE, 0, SEC_NONE, attach_kprobe), + SEC_DEF("socket", SOCKET_FILTER, 0, SEC_NONE), + SEC_DEF("sk_reuseport/migrate", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT_OR_MIGRATE, SEC_ATTACHABLE), + SEC_DEF("sk_reuseport", SK_REUSEPORT, BPF_SK_REUSEPORT_SELECT, SEC_ATTACHABLE), + SEC_DEF("kprobe+", KPROBE, 0, SEC_NONE, attach_kprobe), SEC_DEF("uprobe+", KPROBE, 0, SEC_NONE, attach_uprobe), - SEC_DEF("kretprobe/", KPROBE, 0, SEC_NONE, attach_kprobe), + SEC_DEF("uprobe.s+", KPROBE, 0, SEC_SLEEPABLE, attach_uprobe), + SEC_DEF("kretprobe+", KPROBE, 0, SEC_NONE, attach_kprobe), SEC_DEF("uretprobe+", KPROBE, 0, SEC_NONE, attach_uprobe), - SEC_DEF("kprobe.multi/", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), - SEC_DEF("kretprobe.multi/", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), - SEC_DEF("usdt+", KPROBE, 0, SEC_NONE, attach_usdt), - SEC_DEF("tc", SCHED_CLS, 0, SEC_NONE), - SEC_DEF("classifier", SCHED_CLS, 0, SEC_NONE | SEC_SLOPPY_PFX | SEC_DEPRECATED), - SEC_DEF("action", SCHED_ACT, 0, SEC_NONE | SEC_SLOPPY_PFX), - SEC_DEF("tracepoint/", TRACEPOINT, 0, SEC_NONE, attach_tp), - SEC_DEF("tp/", TRACEPOINT, 0, SEC_NONE, attach_tp), - SEC_DEF("raw_tracepoint/", RAW_TRACEPOINT, 0, SEC_NONE, attach_raw_tp), - SEC_DEF("raw_tp/", RAW_TRACEPOINT, 0, SEC_NONE, attach_raw_tp), - SEC_DEF("raw_tracepoint.w/", RAW_TRACEPOINT_WRITABLE, 0, SEC_NONE, attach_raw_tp), - SEC_DEF("raw_tp.w/", RAW_TRACEPOINT_WRITABLE, 0, SEC_NONE, attach_raw_tp), - SEC_DEF("tp_btf/", TRACING, BPF_TRACE_RAW_TP, SEC_ATTACH_BTF, attach_trace), - SEC_DEF("fentry/", TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF, attach_trace), - SEC_DEF("fmod_ret/", TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF, attach_trace), - SEC_DEF("fexit/", TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF, attach_trace), - SEC_DEF("fentry.s/", TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), - SEC_DEF("fmod_ret.s/", TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), - SEC_DEF("fexit.s/", TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), - SEC_DEF("freplace/", EXT, 0, SEC_ATTACH_BTF, attach_trace), - SEC_DEF("lsm/", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF, attach_lsm), - SEC_DEF("lsm.s/", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_lsm), - SEC_DEF("iter/", TRACING, BPF_TRACE_ITER, SEC_ATTACH_BTF, attach_iter), - SEC_DEF("iter.s/", TRACING, BPF_TRACE_ITER, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_iter), + SEC_DEF("uretprobe.s+", KPROBE, 0, SEC_SLEEPABLE, attach_uprobe), + SEC_DEF("kprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), + SEC_DEF("kretprobe.multi+", KPROBE, BPF_TRACE_KPROBE_MULTI, SEC_NONE, attach_kprobe_multi), + SEC_DEF("kprobe.session+", KPROBE, BPF_TRACE_KPROBE_SESSION, SEC_NONE, attach_kprobe_session), + SEC_DEF("uprobe.multi+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_NONE, attach_uprobe_multi), + SEC_DEF("uretprobe.multi+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_NONE, attach_uprobe_multi), + SEC_DEF("uprobe.multi.s+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_SLEEPABLE, attach_uprobe_multi), + SEC_DEF("uretprobe.multi.s+", KPROBE, BPF_TRACE_UPROBE_MULTI, SEC_SLEEPABLE, attach_uprobe_multi), + SEC_DEF("ksyscall+", KPROBE, 0, SEC_NONE, attach_ksyscall), + SEC_DEF("kretsyscall+", KPROBE, 0, SEC_NONE, attach_ksyscall), + SEC_DEF("usdt+", KPROBE, 0, SEC_USDT, attach_usdt), + SEC_DEF("usdt.s+", KPROBE, 0, SEC_USDT | SEC_SLEEPABLE, attach_usdt), + SEC_DEF("tc/ingress", SCHED_CLS, BPF_TCX_INGRESS, SEC_NONE), /* alias for tcx */ + SEC_DEF("tc/egress", SCHED_CLS, BPF_TCX_EGRESS, SEC_NONE), /* alias for tcx */ + SEC_DEF("tcx/ingress", SCHED_CLS, BPF_TCX_INGRESS, SEC_NONE), + SEC_DEF("tcx/egress", SCHED_CLS, BPF_TCX_EGRESS, SEC_NONE), + SEC_DEF("tc", SCHED_CLS, 0, SEC_NONE), /* deprecated / legacy, use tcx */ + SEC_DEF("classifier", SCHED_CLS, 0, SEC_NONE), /* deprecated / legacy, use tcx */ + SEC_DEF("action", SCHED_ACT, 0, SEC_NONE), /* deprecated / legacy, use tcx */ + SEC_DEF("netkit/primary", SCHED_CLS, BPF_NETKIT_PRIMARY, SEC_NONE), + SEC_DEF("netkit/peer", SCHED_CLS, BPF_NETKIT_PEER, SEC_NONE), + SEC_DEF("tracepoint+", TRACEPOINT, 0, SEC_NONE, attach_tp), + SEC_DEF("tp+", TRACEPOINT, 0, SEC_NONE, attach_tp), + SEC_DEF("raw_tracepoint+", RAW_TRACEPOINT, 0, SEC_NONE, attach_raw_tp), + SEC_DEF("raw_tp+", RAW_TRACEPOINT, 0, SEC_NONE, attach_raw_tp), + SEC_DEF("raw_tracepoint.w+", RAW_TRACEPOINT_WRITABLE, 0, SEC_NONE, attach_raw_tp), + SEC_DEF("raw_tp.w+", RAW_TRACEPOINT_WRITABLE, 0, SEC_NONE, attach_raw_tp), + SEC_DEF("tp_btf+", TRACING, BPF_TRACE_RAW_TP, SEC_ATTACH_BTF, attach_trace), + SEC_DEF("fentry+", TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF, attach_trace), + SEC_DEF("fmod_ret+", TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF, attach_trace), + SEC_DEF("fexit+", TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF, attach_trace), + SEC_DEF("fentry.s+", TRACING, BPF_TRACE_FENTRY, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), + SEC_DEF("fmod_ret.s+", TRACING, BPF_MODIFY_RETURN, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), + SEC_DEF("fexit.s+", TRACING, BPF_TRACE_FEXIT, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_trace), + SEC_DEF("freplace+", EXT, 0, SEC_ATTACH_BTF, attach_trace), + SEC_DEF("lsm+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF, attach_lsm), + SEC_DEF("lsm.s+", LSM, BPF_LSM_MAC, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_lsm), + SEC_DEF("lsm_cgroup+", LSM, BPF_LSM_CGROUP, SEC_ATTACH_BTF), + SEC_DEF("iter+", TRACING, BPF_TRACE_ITER, SEC_ATTACH_BTF, attach_iter), + SEC_DEF("iter.s+", TRACING, BPF_TRACE_ITER, SEC_ATTACH_BTF | SEC_SLEEPABLE, attach_iter), SEC_DEF("syscall", SYSCALL, 0, SEC_SLEEPABLE), SEC_DEF("xdp.frags/devmap", XDP, BPF_XDP_DEVMAP, SEC_XDP_FRAGS), SEC_DEF("xdp/devmap", XDP, BPF_XDP_DEVMAP, SEC_ATTACHABLE), - SEC_DEF("xdp_devmap/", XDP, BPF_XDP_DEVMAP, SEC_ATTACHABLE | SEC_DEPRECATED), SEC_DEF("xdp.frags/cpumap", XDP, BPF_XDP_CPUMAP, SEC_XDP_FRAGS), SEC_DEF("xdp/cpumap", XDP, BPF_XDP_CPUMAP, SEC_ATTACHABLE), - SEC_DEF("xdp_cpumap/", XDP, BPF_XDP_CPUMAP, SEC_ATTACHABLE | SEC_DEPRECATED), SEC_DEF("xdp.frags", XDP, BPF_XDP, SEC_XDP_FRAGS), - SEC_DEF("xdp", XDP, BPF_XDP, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("perf_event", PERF_EVENT, 0, SEC_NONE | SEC_SLOPPY_PFX), - SEC_DEF("lwt_in", LWT_IN, 0, SEC_NONE | SEC_SLOPPY_PFX), - SEC_DEF("lwt_out", LWT_OUT, 0, SEC_NONE | SEC_SLOPPY_PFX), - SEC_DEF("lwt_xmit", LWT_XMIT, 0, SEC_NONE | SEC_SLOPPY_PFX), - SEC_DEF("lwt_seg6local", LWT_SEG6LOCAL, 0, SEC_NONE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup_skb/ingress", CGROUP_SKB, BPF_CGROUP_INET_INGRESS, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("cgroup_skb/egress", CGROUP_SKB, BPF_CGROUP_INET_EGRESS, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/skb", CGROUP_SKB, 0, SEC_NONE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/sock_create", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_CREATE, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/sock_release", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_RELEASE, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/sock", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_CREATE, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/post_bind4", CGROUP_SOCK, BPF_CGROUP_INET4_POST_BIND, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/post_bind6", CGROUP_SOCK, BPF_CGROUP_INET6_POST_BIND, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/dev", CGROUP_DEVICE, BPF_CGROUP_DEVICE, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("sockops", SOCK_OPS, BPF_CGROUP_SOCK_OPS, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("sk_skb/stream_parser", SK_SKB, BPF_SK_SKB_STREAM_PARSER, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("sk_skb/stream_verdict",SK_SKB, BPF_SK_SKB_STREAM_VERDICT, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("sk_skb", SK_SKB, 0, SEC_NONE | SEC_SLOPPY_PFX), - SEC_DEF("sk_msg", SK_MSG, BPF_SK_MSG_VERDICT, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("lirc_mode2", LIRC_MODE2, BPF_LIRC_MODE2, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("flow_dissector", FLOW_DISSECTOR, BPF_FLOW_DISSECTOR, SEC_ATTACHABLE_OPT | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/bind4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_BIND, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/bind6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_BIND, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/connect4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_CONNECT, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/connect6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_CONNECT, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/sendmsg4", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP4_SENDMSG, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/sendmsg6", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_SENDMSG, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/recvmsg4", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP4_RECVMSG, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/recvmsg6", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_RECVMSG, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/getpeername4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_GETPEERNAME, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/getpeername6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_GETPEERNAME, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/getsockname4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_GETSOCKNAME, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/getsockname6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_GETSOCKNAME, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/sysctl", CGROUP_SYSCTL, BPF_CGROUP_SYSCTL, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/getsockopt", CGROUP_SOCKOPT, BPF_CGROUP_GETSOCKOPT, SEC_ATTACHABLE | SEC_SLOPPY_PFX), - SEC_DEF("cgroup/setsockopt", CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT, SEC_ATTACHABLE | SEC_SLOPPY_PFX), + SEC_DEF("xdp", XDP, BPF_XDP, SEC_ATTACHABLE_OPT), + SEC_DEF("perf_event", PERF_EVENT, 0, SEC_NONE), + SEC_DEF("lwt_in", LWT_IN, 0, SEC_NONE), + SEC_DEF("lwt_out", LWT_OUT, 0, SEC_NONE), + SEC_DEF("lwt_xmit", LWT_XMIT, 0, SEC_NONE), + SEC_DEF("lwt_seg6local", LWT_SEG6LOCAL, 0, SEC_NONE), + SEC_DEF("sockops", SOCK_OPS, BPF_CGROUP_SOCK_OPS, SEC_ATTACHABLE_OPT), + SEC_DEF("sk_skb/stream_parser", SK_SKB, BPF_SK_SKB_STREAM_PARSER, SEC_ATTACHABLE_OPT), + SEC_DEF("sk_skb/stream_verdict",SK_SKB, BPF_SK_SKB_STREAM_VERDICT, SEC_ATTACHABLE_OPT), + SEC_DEF("sk_skb/verdict", SK_SKB, BPF_SK_SKB_VERDICT, SEC_ATTACHABLE_OPT), + SEC_DEF("sk_skb", SK_SKB, 0, SEC_NONE), + SEC_DEF("sk_msg", SK_MSG, BPF_SK_MSG_VERDICT, SEC_ATTACHABLE_OPT), + SEC_DEF("lirc_mode2", LIRC_MODE2, BPF_LIRC_MODE2, SEC_ATTACHABLE_OPT), + SEC_DEF("flow_dissector", FLOW_DISSECTOR, BPF_FLOW_DISSECTOR, SEC_ATTACHABLE_OPT), + SEC_DEF("cgroup_skb/ingress", CGROUP_SKB, BPF_CGROUP_INET_INGRESS, SEC_ATTACHABLE_OPT), + SEC_DEF("cgroup_skb/egress", CGROUP_SKB, BPF_CGROUP_INET_EGRESS, SEC_ATTACHABLE_OPT), + SEC_DEF("cgroup/skb", CGROUP_SKB, 0, SEC_NONE), + SEC_DEF("cgroup/sock_create", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_CREATE, SEC_ATTACHABLE), + SEC_DEF("cgroup/sock_release", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_RELEASE, SEC_ATTACHABLE), + SEC_DEF("cgroup/sock", CGROUP_SOCK, BPF_CGROUP_INET_SOCK_CREATE, SEC_ATTACHABLE_OPT), + SEC_DEF("cgroup/post_bind4", CGROUP_SOCK, BPF_CGROUP_INET4_POST_BIND, SEC_ATTACHABLE), + SEC_DEF("cgroup/post_bind6", CGROUP_SOCK, BPF_CGROUP_INET6_POST_BIND, SEC_ATTACHABLE), + SEC_DEF("cgroup/bind4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_BIND, SEC_ATTACHABLE), + SEC_DEF("cgroup/bind6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_BIND, SEC_ATTACHABLE), + SEC_DEF("cgroup/connect4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_CONNECT, SEC_ATTACHABLE), + SEC_DEF("cgroup/connect6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_CONNECT, SEC_ATTACHABLE), + SEC_DEF("cgroup/connect_unix", CGROUP_SOCK_ADDR, BPF_CGROUP_UNIX_CONNECT, SEC_ATTACHABLE), + SEC_DEF("cgroup/sendmsg4", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP4_SENDMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/sendmsg6", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_SENDMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/sendmsg_unix", CGROUP_SOCK_ADDR, BPF_CGROUP_UNIX_SENDMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/recvmsg4", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP4_RECVMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/recvmsg6", CGROUP_SOCK_ADDR, BPF_CGROUP_UDP6_RECVMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/recvmsg_unix", CGROUP_SOCK_ADDR, BPF_CGROUP_UNIX_RECVMSG, SEC_ATTACHABLE), + SEC_DEF("cgroup/getpeername4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_GETPEERNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/getpeername6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_GETPEERNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/getpeername_unix", CGROUP_SOCK_ADDR, BPF_CGROUP_UNIX_GETPEERNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/getsockname4", CGROUP_SOCK_ADDR, BPF_CGROUP_INET4_GETSOCKNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/getsockname6", CGROUP_SOCK_ADDR, BPF_CGROUP_INET6_GETSOCKNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/getsockname_unix", CGROUP_SOCK_ADDR, BPF_CGROUP_UNIX_GETSOCKNAME, SEC_ATTACHABLE), + SEC_DEF("cgroup/sysctl", CGROUP_SYSCTL, BPF_CGROUP_SYSCTL, SEC_ATTACHABLE), + SEC_DEF("cgroup/getsockopt", CGROUP_SOCKOPT, BPF_CGROUP_GETSOCKOPT, SEC_ATTACHABLE), + SEC_DEF("cgroup/setsockopt", CGROUP_SOCKOPT, BPF_CGROUP_SETSOCKOPT, SEC_ATTACHABLE), + SEC_DEF("cgroup/dev", CGROUP_DEVICE, BPF_CGROUP_DEVICE, SEC_ATTACHABLE_OPT), SEC_DEF("struct_ops+", STRUCT_OPS, 0, SEC_NONE), - SEC_DEF("sk_lookup", SK_LOOKUP, BPF_SK_LOOKUP, SEC_ATTACHABLE | SEC_SLOPPY_PFX), + SEC_DEF("struct_ops.s+", STRUCT_OPS, 0, SEC_SLEEPABLE), + SEC_DEF("sk_lookup", SK_LOOKUP, BPF_SK_LOOKUP, SEC_ATTACHABLE), + SEC_DEF("netfilter", NETFILTER, BPF_NETFILTER, SEC_NONE), }; -static size_t custom_sec_def_cnt; -static struct bpf_sec_def *custom_sec_defs; -static struct bpf_sec_def custom_fallback_def; -static bool has_custom_fallback_def; - -static int last_custom_sec_def_handler_id; - int libbpf_register_prog_handler(const char *sec, enum bpf_prog_type prog_type, enum bpf_attach_type exp_attach_type, @@ -9016,14 +9530,17 @@ int libbpf_unregister_prog_handler(int handler_id) /* try to shrink the array, but it's ok if we couldn't */ sec_defs = libbpf_reallocarray(custom_sec_defs, custom_sec_def_cnt, sizeof(*sec_defs)); - if (sec_defs) + /* if new count is zero, reallocarray can return a valid NULL result; + * in this case the previous pointer will be freed, so we *have to* + * reassign old pointer to the new value (even if it's NULL) + */ + if (sec_defs || custom_sec_def_cnt == 0) custom_sec_defs = sec_defs; return 0; } -static bool sec_def_matches(const struct bpf_sec_def *sec_def, const char *sec_name, - bool allow_sloppy) +static bool sec_def_matches(const struct bpf_sec_def *sec_def, const char *sec_name) { size_t len = strlen(sec_def->sec); @@ -9048,17 +9565,6 @@ static bool sec_def_matches(const struct bpf_sec_def *sec_def, const char *sec_n return false; } - /* SEC_SLOPPY_PFX definitions are allowed to be just prefix - * matches, unless strict section name mode - * (LIBBPF_STRICT_SEC_NAME) is enabled, in which case the - * match has to be exact. - */ - if (allow_sloppy && str_has_pfx(sec_name, sec_def->sec)) - return true; - - /* Definitions not marked SEC_SLOPPY_PFX (e.g., - * SEC("syscall")) are exact matches in both modes. - */ return strcmp(sec_name, sec_def->sec) == 0; } @@ -9066,20 +9572,18 @@ static const struct bpf_sec_def *find_sec_def(const char *sec_name) { const struct bpf_sec_def *sec_def; int i, n; - bool strict = libbpf_mode & LIBBPF_STRICT_SEC_NAME, allow_sloppy; n = custom_sec_def_cnt; for (i = 0; i < n; i++) { sec_def = &custom_sec_defs[i]; - if (sec_def_matches(sec_def, sec_name, false)) + if (sec_def_matches(sec_def, sec_name)) return sec_def; } n = ARRAY_SIZE(section_defs); for (i = 0; i < n; i++) { sec_def = §ion_defs[i]; - allow_sloppy = (sec_def->cookie & SEC_SLOPPY_PFX) && !strict; - if (sec_def_matches(sec_def, sec_name, allow_sloppy)) + if (sec_def_matches(sec_def, sec_name)) return sec_def; } @@ -9150,7 +9654,40 @@ int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, return libbpf_err(-ESRCH); } +const char *libbpf_bpf_attach_type_str(enum bpf_attach_type t) +{ + if (t < 0 || t >= ARRAY_SIZE(attach_type_name)) + return NULL; + + return attach_type_name[t]; +} + +const char *libbpf_bpf_link_type_str(enum bpf_link_type t) +{ + if (t < 0 || t >= ARRAY_SIZE(link_type_name)) + return NULL; + + return link_type_name[t]; +} + +const char *libbpf_bpf_map_type_str(enum bpf_map_type t) +{ + if (t < 0 || t >= ARRAY_SIZE(map_type_name)) + return NULL; + + return map_type_name[t]; +} + +const char *libbpf_bpf_prog_type_str(enum bpf_prog_type t) +{ + if (t < 0 || t >= ARRAY_SIZE(prog_type_name)) + return NULL; + + return prog_type_name[t]; +} + static struct bpf_map *find_struct_ops_map_by_offset(struct bpf_object *obj, + int sec_idx, size_t offset) { struct bpf_map *map; @@ -9160,7 +9697,8 @@ static struct bpf_map *find_struct_ops_map_by_offset(struct bpf_object *obj, map = &obj->maps[i]; if (!bpf_map__is_struct_ops(map)) continue; - if (map->sec_offset <= offset && + if (map->sec_idx == sec_idx && + map->sec_offset <= offset && offset - map->sec_offset < map->def.value_size) return map; } @@ -9168,7 +9706,9 @@ static struct bpf_map *find_struct_ops_map_by_offset(struct bpf_object *obj, return NULL; } -/* Collect the reloc from ELF and populate the st_ops->progs[] */ +/* Collect the reloc from ELF, populate the st_ops->progs[], and update + * st_ops->data for shadow type. + */ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, Elf64_Shdr *shdr, Elf_Data *data) { @@ -9202,7 +9742,7 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, } name = elf_sym_str(obj, sym->st_name) ?: ""; - map = find_struct_ops_map_by_offset(obj, rel->r_offset); + map = find_struct_ops_map_by_offset(obj, shdr->sh_info, rel->r_offset); if (!map) { pr_warn("struct_ops reloc: cannot find map at rel->r_offset %zu\n", (size_t)rel->r_offset); @@ -9260,27 +9800,15 @@ static int bpf_object__collect_st_ops_relos(struct bpf_object *obj, return -EINVAL; } - /* if we haven't yet processed this BPF program, record proper - * attach_btf_id and member_idx - */ - if (!prog->attach_btf_id) { - prog->attach_btf_id = st_ops->type_id; - prog->expected_attach_type = member_idx; - } + st_ops->progs[member_idx] = prog; - /* struct_ops BPF prog can be re-used between multiple - * .struct_ops as long as it's the same struct_ops struct - * definition and the same function pointer field + /* st_ops->data will be exposed to users, being returned by + * bpf_map__initial_value() as a pointer to the shadow + * type. All function pointers in the original struct type + * should be converted to a pointer to struct bpf_program + * in the shadow type. */ - if (prog->attach_btf_id != st_ops->type_id || - prog->expected_attach_type != member_idx) { - pr_warn("struct_ops reloc %s: cannot use prog %s in sec %s with type %u attach_btf_id %u expected_attach_type %u for func ptr %s\n", - map->name, prog->name, prog->sec_name, prog->type, - prog->attach_btf_id, prog->expected_attach_type, name); - return -EINVAL; - } - - st_ops->progs[member_idx] = prog; + *((struct bpf_program **)(st_ops->data + moff)) = prog; } return 0; @@ -9300,6 +9828,7 @@ void btf_get_kernel_prefix_kind(enum bpf_attach_type attach_type, *kind = BTF_KIND_TYPEDEF; break; case BPF_LSM_MAC: + case BPF_LSM_CGROUP: *prefix = BTF_LSM_PREFIX; *kind = BTF_KIND_FUNC; break; @@ -9363,14 +9892,15 @@ int libbpf_find_vmlinux_btf_id(const char *name, static int libbpf_find_prog_btf_id(const char *name, __u32 attach_prog_fd) { - struct bpf_prog_info info = {}; + struct bpf_prog_info info; __u32 info_len = sizeof(info); struct btf *btf; int err; - err = bpf_obj_get_info_by_fd(attach_prog_fd, &info, &info_len); + memset(&info, 0, info_len); + err = bpf_prog_get_info_by_fd(attach_prog_fd, &info, &info_len); if (err) { - pr_warn("failed bpf_obj_get_info_by_fd for FD %d: %d\n", + pr_warn("failed bpf_prog_get_info_by_fd for FD %d: %d\n", attach_prog_fd, err); return err; } @@ -9400,16 +9930,28 @@ static int find_kernel_btf_id(struct bpf_object *obj, const char *attach_name, enum bpf_attach_type attach_type, int *btf_obj_fd, int *btf_type_id) { - int ret, i; + int ret, i, mod_len; + const char *fn_name, *mod_name = NULL; - ret = find_attach_btf_id(obj->btf_vmlinux, attach_name, attach_type); - if (ret > 0) { - *btf_obj_fd = 0; /* vmlinux BTF */ - *btf_type_id = ret; - return 0; + fn_name = strchr(attach_name, ':'); + if (fn_name) { + mod_name = attach_name; + mod_len = fn_name - mod_name; + fn_name++; + } + + if (!mod_name || strncmp(mod_name, "vmlinux", mod_len) == 0) { + ret = find_attach_btf_id(obj->btf_vmlinux, + mod_name ? fn_name : attach_name, + attach_type); + if (ret > 0) { + *btf_obj_fd = 0; /* vmlinux BTF */ + *btf_type_id = ret; + return 0; + } + if (ret != -ENOENT) + return ret; } - if (ret != -ENOENT) - return ret; ret = load_module_btfs(obj); if (ret) @@ -9418,7 +9960,12 @@ static int find_kernel_btf_id(struct bpf_object *obj, const char *attach_name, for (i = 0; i < obj->btf_module_cnt; i++) { const struct module_btf *mod = &obj->btf_modules[i]; - ret = find_attach_btf_id(mod->btf, attach_name, attach_type); + if (mod_name && strncmp(mod->name, mod_name, mod_len) != 0) + continue; + + ret = find_attach_btf_id(mod->btf, + mod_name ? fn_name : attach_name, + attach_type); if (ret > 0) { *btf_obj_fd = mod->fd; *btf_type_id = ret; @@ -9441,11 +9988,15 @@ static int libbpf_find_attach_btf_id(struct bpf_program *prog, const char *attac int err = 0; /* BPF program's BTF ID */ - if (attach_prog_fd) { + if (prog->type == BPF_PROG_TYPE_EXT || attach_prog_fd) { + if (!attach_prog_fd) { + pr_warn("prog '%s': attach program FD is not set\n", prog->name); + return -EINVAL; + } err = libbpf_find_prog_btf_id(attach_name, attach_prog_fd); if (err < 0) { - pr_warn("failed to find BPF program (FD %d) BTF ID for '%s': %d\n", - attach_prog_fd, attach_name, err); + pr_warn("prog '%s': failed to find BPF program (FD %d) BTF ID for '%s': %d\n", + prog->name, attach_prog_fd, attach_name, err); return err; } *btf_obj_fd = 0; @@ -9459,10 +10010,13 @@ static int libbpf_find_attach_btf_id(struct bpf_program *prog, const char *attac *btf_obj_fd = 0; *btf_type_id = 1; } else { - err = find_kernel_btf_id(prog->obj, attach_name, attach_type, btf_obj_fd, btf_type_id); + err = find_kernel_btf_id(prog->obj, attach_name, + attach_type, btf_obj_fd, + btf_type_id); } if (err) { - pr_warn("failed to find kernel BTF type ID of '%s': %d\n", attach_name, err); + pr_warn("prog '%s': failed to find kernel BTF type ID of '%s': %d\n", + prog->name, attach_name, err); return err; } return 0; @@ -9500,12 +10054,11 @@ int libbpf_attach_type_by_name(const char *name, int bpf_map__fd(const struct bpf_map *map) { - return map ? map->fd : libbpf_err(-EINVAL); -} - -const struct bpf_map_def *bpf_map__def(const struct bpf_map *map) -{ - return map ? &map->def : libbpf_err_ptr(-EINVAL); + if (!map) + return libbpf_err(-EINVAL); + if (!map_is_created(map)) + return -1; + return map->fd; } static bool map_uses_real_name(const struct bpf_map *map) @@ -9541,7 +10094,7 @@ enum bpf_map_type bpf_map__type(const struct bpf_map *map) int bpf_map__set_type(struct bpf_map *map, enum bpf_map_type type) { - if (map->fd >= 0) + if (map_is_created(map)) return libbpf_err(-EBUSY); map->def.type = type; return 0; @@ -9554,7 +10107,7 @@ __u32 bpf_map__map_flags(const struct bpf_map *map) int bpf_map__set_map_flags(struct bpf_map *map, __u32 flags) { - if (map->fd >= 0) + if (map_is_created(map)) return libbpf_err(-EBUSY); map->def.map_flags = flags; return 0; @@ -9567,7 +10120,7 @@ __u64 bpf_map__map_extra(const struct bpf_map *map) int bpf_map__set_map_extra(struct bpf_map *map, __u64 map_extra) { - if (map->fd >= 0) + if (map_is_created(map)) return libbpf_err(-EBUSY); map->map_extra = map_extra; return 0; @@ -9580,7 +10133,7 @@ __u32 bpf_map__numa_node(const struct bpf_map *map) int bpf_map__set_numa_node(struct bpf_map *map, __u32 numa_node) { - if (map->fd >= 0) + if (map_is_created(map)) return libbpf_err(-EBUSY); map->numa_node = numa_node; return 0; @@ -9593,7 +10146,7 @@ __u32 bpf_map__key_size(const struct bpf_map *map) int bpf_map__set_key_size(struct bpf_map *map, __u32 size) { - if (map->fd >= 0) + if (map_is_created(map)) return libbpf_err(-EBUSY); map->def.key_size = size; return 0; @@ -9604,67 +10157,159 @@ __u32 bpf_map__value_size(const struct bpf_map *map) return map->def.value_size; } -int bpf_map__set_value_size(struct bpf_map *map, __u32 size) +static int map_btf_datasec_resize(struct bpf_map *map, __u32 size) { - if (map->fd >= 0) - return libbpf_err(-EBUSY); - map->def.value_size = size; - return 0; -} + struct btf *btf; + struct btf_type *datasec_type, *var_type; + struct btf_var_secinfo *var; + const struct btf_type *array_type; + const struct btf_array *array; + int vlen, element_sz, new_array_id; + __u32 nr_elements; -__u32 bpf_map__btf_key_type_id(const struct bpf_map *map) -{ - return map ? map->btf_key_type_id : 0; -} + /* check btf existence */ + btf = bpf_object__btf(map->obj); + if (!btf) + return -ENOENT; -__u32 bpf_map__btf_value_type_id(const struct bpf_map *map) -{ - return map ? map->btf_value_type_id : 0; + /* verify map is datasec */ + datasec_type = btf_type_by_id(btf, bpf_map__btf_value_type_id(map)); + if (!btf_is_datasec(datasec_type)) { + pr_warn("map '%s': cannot be resized, map value type is not a datasec\n", + bpf_map__name(map)); + return -EINVAL; + } + + /* verify datasec has at least one var */ + vlen = btf_vlen(datasec_type); + if (vlen == 0) { + pr_warn("map '%s': cannot be resized, map value datasec is empty\n", + bpf_map__name(map)); + return -EINVAL; + } + + /* verify last var in the datasec is an array */ + var = &btf_var_secinfos(datasec_type)[vlen - 1]; + var_type = btf_type_by_id(btf, var->type); + array_type = skip_mods_and_typedefs(btf, var_type->type, NULL); + if (!btf_is_array(array_type)) { + pr_warn("map '%s': cannot be resized, last var must be an array\n", + bpf_map__name(map)); + return -EINVAL; + } + + /* verify request size aligns with array */ + array = btf_array(array_type); + element_sz = btf__resolve_size(btf, array->type); + if (element_sz <= 0 || (size - var->offset) % element_sz != 0) { + pr_warn("map '%s': cannot be resized, element size (%d) doesn't align with new total size (%u)\n", + bpf_map__name(map), element_sz, size); + return -EINVAL; + } + + /* create a new array based on the existing array, but with new length */ + nr_elements = (size - var->offset) / element_sz; + new_array_id = btf__add_array(btf, array->index_type, array->type, nr_elements); + if (new_array_id < 0) + return new_array_id; + + /* adding a new btf type invalidates existing pointers to btf objects, + * so refresh pointers before proceeding + */ + datasec_type = btf_type_by_id(btf, map->btf_value_type_id); + var = &btf_var_secinfos(datasec_type)[vlen - 1]; + var_type = btf_type_by_id(btf, var->type); + + /* finally update btf info */ + datasec_type->size = size; + var->size = size - var->offset; + var_type->type = new_array_id; + + return 0; } -int bpf_map__set_priv(struct bpf_map *map, void *priv, - bpf_map_clear_priv_t clear_priv) +int bpf_map__set_value_size(struct bpf_map *map, __u32 size) { - if (!map) - return libbpf_err(-EINVAL); + if (map->obj->loaded || map->reused) + return libbpf_err(-EBUSY); + + if (map->mmaped) { + size_t mmap_old_sz, mmap_new_sz; + int err; - if (map->priv) { - if (map->clear_priv) - map->clear_priv(map, map->priv); + if (map->def.type != BPF_MAP_TYPE_ARRAY) + return -EOPNOTSUPP; + + mmap_old_sz = bpf_map_mmap_sz(map); + mmap_new_sz = array_map_mmap_sz(size, map->def.max_entries); + err = bpf_map_mmap_resize(map, mmap_old_sz, mmap_new_sz); + if (err) { + pr_warn("map '%s': failed to resize memory-mapped region: %d\n", + bpf_map__name(map), err); + return err; + } + err = map_btf_datasec_resize(map, size); + if (err && err != -ENOENT) { + pr_warn("map '%s': failed to adjust resized BTF, clearing BTF key/value info: %d\n", + bpf_map__name(map), err); + map->btf_value_type_id = 0; + map->btf_key_type_id = 0; + } } - map->priv = priv; - map->clear_priv = clear_priv; + map->def.value_size = size; return 0; } -void *bpf_map__priv(const struct bpf_map *map) +__u32 bpf_map__btf_key_type_id(const struct bpf_map *map) +{ + return map ? map->btf_key_type_id : 0; +} + +__u32 bpf_map__btf_value_type_id(const struct bpf_map *map) { - return map ? map->priv : libbpf_err_ptr(-EINVAL); + return map ? map->btf_value_type_id : 0; } int bpf_map__set_initial_value(struct bpf_map *map, const void *data, size_t size) { - if (!map->mmaped || map->libbpf_type == LIBBPF_MAP_KCONFIG || - size != map->def.value_size || map->fd >= 0) + size_t actual_sz; + + if (map->obj->loaded || map->reused) + return libbpf_err(-EBUSY); + + if (!map->mmaped || map->libbpf_type == LIBBPF_MAP_KCONFIG) + return libbpf_err(-EINVAL); + + if (map->def.type == BPF_MAP_TYPE_ARENA) + actual_sz = map->obj->arena_data_sz; + else + actual_sz = map->def.value_size; + if (size != actual_sz) return libbpf_err(-EINVAL); memcpy(map->mmaped, data, size); return 0; } -const void *bpf_map__initial_value(struct bpf_map *map, size_t *psize) +void *bpf_map__initial_value(const struct bpf_map *map, size_t *psize) { + if (bpf_map__is_struct_ops(map)) { + if (psize) + *psize = map->def.value_size; + return map->st_ops->data; + } + if (!map->mmaped) return NULL; - *psize = map->def.value_size; - return map->mmaped; -} -bool bpf_map__is_offload_neutral(const struct bpf_map *map) -{ - return map->def.type == BPF_MAP_TYPE_PERF_EVENT_ARRAY; + if (map->def.type == BPF_MAP_TYPE_ARENA) + *psize = map->obj->arena_data_sz; + else + *psize = map->def.value_size; + + return map->mmaped; } bool bpf_map__is_internal(const struct bpf_map *map) @@ -9679,7 +10324,7 @@ __u32 bpf_map__ifindex(const struct bpf_map *map) int bpf_map__set_ifindex(struct bpf_map *map, __u32 ifindex) { - if (map->fd >= 0) + if (map_is_created(map)) return libbpf_err(-EBUSY); map->map_ifindex = ifindex; return 0; @@ -9727,31 +10372,19 @@ __bpf_map__iter(const struct bpf_map *m, const struct bpf_object *obj, int i) return &obj->maps[idx]; } -struct bpf_map * -bpf_map__next(const struct bpf_map *prev, const struct bpf_object *obj) -{ - return bpf_object__next_map(obj, prev); -} - struct bpf_map * bpf_object__next_map(const struct bpf_object *obj, const struct bpf_map *prev) { - if (prev == NULL) + if (prev == NULL && obj != NULL) return obj->maps; return __bpf_map__iter(prev, obj, 1); } -struct bpf_map * -bpf_map__prev(const struct bpf_map *next, const struct bpf_object *obj) -{ - return bpf_object__prev_map(obj, next); -} - struct bpf_map * bpf_object__prev_map(const struct bpf_object *obj, const struct bpf_map *next) { - if (next == NULL) { + if (next == NULL && obj != NULL) { if (!obj->nr_maps) return NULL; return obj->maps + obj->nr_maps - 1; @@ -9793,123 +10426,144 @@ bpf_object__find_map_fd_by_name(const struct bpf_object *obj, const char *name) return bpf_map__fd(bpf_object__find_map_by_name(obj, name)); } -struct bpf_map * -bpf_object__find_map_by_offset(struct bpf_object *obj, size_t offset) +static int validate_map_op(const struct bpf_map *map, size_t key_sz, + size_t value_sz, bool check_value_sz) { - return libbpf_err_ptr(-ENOTSUP); + if (!map_is_created(map)) /* map is not yet created */ + return -ENOENT; + + if (map->def.key_size != key_sz) { + pr_warn("map '%s': unexpected key size %zu provided, expected %u\n", + map->name, key_sz, map->def.key_size); + return -EINVAL; + } + + if (map->fd < 0) { + pr_warn("map '%s': can't use BPF map without FD (was it created?)\n", map->name); + return -EINVAL; + } + + if (!check_value_sz) + return 0; + + switch (map->def.type) { + case BPF_MAP_TYPE_PERCPU_ARRAY: + case BPF_MAP_TYPE_PERCPU_HASH: + case BPF_MAP_TYPE_LRU_PERCPU_HASH: + case BPF_MAP_TYPE_PERCPU_CGROUP_STORAGE: { + int num_cpu = libbpf_num_possible_cpus(); + size_t elem_sz = roundup(map->def.value_size, 8); + + if (value_sz != num_cpu * elem_sz) { + pr_warn("map '%s': unexpected value size %zu provided for per-CPU map, expected %d * %zu = %zd\n", + map->name, value_sz, num_cpu, elem_sz, num_cpu * elem_sz); + return -EINVAL; + } + break; + } + default: + if (map->def.value_size != value_sz) { + pr_warn("map '%s': unexpected value size %zu provided, expected %u\n", + map->name, value_sz, map->def.value_size); + return -EINVAL; + } + break; + } + return 0; } -long libbpf_get_error(const void *ptr) +int bpf_map__lookup_elem(const struct bpf_map *map, + const void *key, size_t key_sz, + void *value, size_t value_sz, __u64 flags) { - if (!IS_ERR_OR_NULL(ptr)) - return 0; + int err; - if (IS_ERR(ptr)) - errno = -PTR_ERR(ptr); + err = validate_map_op(map, key_sz, value_sz, true); + if (err) + return libbpf_err(err); - /* If ptr == NULL, then errno should be already set by the failing - * API, because libbpf never returns NULL on success and it now always - * sets errno on error. So no extra errno handling for ptr == NULL - * case. - */ - return -errno; + return bpf_map_lookup_elem_flags(map->fd, key, value, flags); } -__attribute__((alias("bpf_prog_load_xattr2"))) -int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, - struct bpf_object **pobj, int *prog_fd); - -static int bpf_prog_load_xattr2(const struct bpf_prog_load_attr *attr, - struct bpf_object **pobj, int *prog_fd) +int bpf_map__update_elem(const struct bpf_map *map, + const void *key, size_t key_sz, + const void *value, size_t value_sz, __u64 flags) { - struct bpf_object_open_attr open_attr = {}; - struct bpf_program *prog, *first_prog = NULL; - struct bpf_object *obj; - struct bpf_map *map; int err; - if (!attr) - return libbpf_err(-EINVAL); - if (!attr->file) - return libbpf_err(-EINVAL); + err = validate_map_op(map, key_sz, value_sz, true); + if (err) + return libbpf_err(err); + + return bpf_map_update_elem(map->fd, key, value, flags); +} - open_attr.file = attr->file; - open_attr.prog_type = attr->prog_type; +int bpf_map__delete_elem(const struct bpf_map *map, + const void *key, size_t key_sz, __u64 flags) +{ + int err; - obj = __bpf_object__open_xattr(&open_attr, 0); - err = libbpf_get_error(obj); + err = validate_map_op(map, key_sz, 0, false /* check_value_sz */); if (err) - return libbpf_err(-ENOENT); + return libbpf_err(err); - bpf_object__for_each_program(prog, obj) { - enum bpf_attach_type attach_type = attr->expected_attach_type; - /* - * to preserve backwards compatibility, bpf_prog_load treats - * attr->prog_type, if specified, as an override to whatever - * bpf_object__open guessed - */ - if (attr->prog_type != BPF_PROG_TYPE_UNSPEC) { - prog->type = attr->prog_type; - prog->expected_attach_type = attach_type; - } - if (bpf_program__type(prog) == BPF_PROG_TYPE_UNSPEC) { - /* - * we haven't guessed from section name and user - * didn't provide a fallback type, too bad... - */ - bpf_object__close(obj); - return libbpf_err(-EINVAL); - } + return bpf_map_delete_elem_flags(map->fd, key, flags); +} - prog->prog_ifindex = attr->ifindex; - prog->log_level = attr->log_level; - prog->prog_flags |= attr->prog_flags; - if (!first_prog) - first_prog = prog; - } +int bpf_map__lookup_and_delete_elem(const struct bpf_map *map, + const void *key, size_t key_sz, + void *value, size_t value_sz, __u64 flags) +{ + int err; - bpf_object__for_each_map(map, obj) { - if (map->def.type != BPF_MAP_TYPE_PERF_EVENT_ARRAY) - map->map_ifindex = attr->ifindex; - } + err = validate_map_op(map, key_sz, value_sz, true); + if (err) + return libbpf_err(err); - if (!first_prog) { - pr_warn("object file doesn't contain bpf program\n"); - bpf_object__close(obj); - return libbpf_err(-ENOENT); - } + return bpf_map_lookup_and_delete_elem_flags(map->fd, key, value, flags); +} - err = bpf_object__load(obj); - if (err) { - bpf_object__close(obj); +int bpf_map__get_next_key(const struct bpf_map *map, + const void *cur_key, void *next_key, size_t key_sz) +{ + int err; + + err = validate_map_op(map, key_sz, 0, false /* check_value_sz */); + if (err) return libbpf_err(err); - } - *pobj = obj; - *prog_fd = bpf_program__fd(first_prog); - return 0; + return bpf_map_get_next_key(map->fd, cur_key, next_key); } -COMPAT_VERSION(bpf_prog_load_deprecated, bpf_prog_load, LIBBPF_0.0.1) -int bpf_prog_load_deprecated(const char *file, enum bpf_prog_type type, - struct bpf_object **pobj, int *prog_fd) +long libbpf_get_error(const void *ptr) { - struct bpf_prog_load_attr attr; + if (!IS_ERR_OR_NULL(ptr)) + return 0; - memset(&attr, 0, sizeof(struct bpf_prog_load_attr)); - attr.file = file; - attr.prog_type = type; - attr.expected_attach_type = 0; + if (IS_ERR(ptr)) + errno = -PTR_ERR(ptr); - return bpf_prog_load_xattr2(&attr, pobj, prog_fd); + /* If ptr == NULL, then errno should be already set by the failing + * API, because libbpf never returns NULL on success and it now always + * sets errno on error. So no extra errno handling for ptr == NULL + * case. + */ + return -errno; } /* Replace link's underlying BPF program with the new one */ int bpf_link__update_program(struct bpf_link *link, struct bpf_program *prog) { int ret; + int prog_fd = bpf_program__fd(prog); + + if (prog_fd < 0) { + pr_warn("prog '%s': can't use BPF program without FD (was it loaded?)\n", + prog->name); + return libbpf_err(-EINVAL); + } - ret = bpf_link_update(bpf_link__fd(link), bpf_program__fd(prog), NULL); + ret = bpf_link_update(bpf_link__fd(link), prog_fd, NULL); return libbpf_err_errno(ret); } @@ -10091,6 +10745,7 @@ struct bpf_link *bpf_program__attach_perf_event_opts(const struct bpf_program *p char errmsg[STRERR_BUFSIZE]; struct bpf_link_perf *link; int prog_fd, link_fd = -1, err; + bool force_ioctl_attach; if (!OPTS_VALID(opts, bpf_perf_event_opts)) return libbpf_err_ptr(-EINVAL); @@ -10102,7 +10757,7 @@ struct bpf_link *bpf_program__attach_perf_event_opts(const struct bpf_program *p } prog_fd = bpf_program__fd(prog); if (prog_fd < 0) { - pr_warn("prog '%s': can't attach BPF program w/o FD (did you load it?)\n", + pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n", prog->name); return libbpf_err_ptr(-EINVAL); } @@ -10114,7 +10769,8 @@ struct bpf_link *bpf_program__attach_perf_event_opts(const struct bpf_program *p link->link.dealloc = &bpf_link_perf_dealloc; link->perf_event_fd = pfd; - if (kernel_supports(prog->obj, FEAT_PERF_LINK)) { + force_ioctl_attach = OPTS_GET(opts, force_ioctl_attach, false); + if (kernel_supports(prog->obj, FEAT_PERF_LINK) && !force_ioctl_attach) { DECLARE_LIBBPF_OPTS(bpf_link_create_opts, link_opts, .perf_event.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0)); @@ -10176,7 +10832,7 @@ static int parse_uint_from_file(const char *file, const char *fmt) int err, ret; FILE *f; - f = fopen(file, "r"); + f = fopen(file, "re"); if (!f) { err = -errno; pr_debug("failed to open '%s': %s\n", file, @@ -10229,13 +10885,16 @@ static int determine_uprobe_retprobe_bit(void) static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name, uint64_t offset, int pid, size_t ref_ctr_off) { - struct perf_event_attr attr = {}; + const size_t attr_sz = sizeof(struct perf_event_attr); + struct perf_event_attr attr; char errmsg[STRERR_BUFSIZE]; - int type, pfd, err; + int type, pfd; - if (ref_ctr_off >= (1ULL << PERF_UPROBE_REF_CTR_OFFSET_BITS)) + if ((__u64)ref_ctr_off >= (1ULL << PERF_UPROBE_REF_CTR_OFFSET_BITS)) return -EINVAL; + memset(&attr, 0, attr_sz); + type = uprobe ? determine_uprobe_perf_type() : determine_kprobe_perf_type(); if (type < 0) { @@ -10256,7 +10915,7 @@ static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name, } attr.config |= 1 << bit; } - attr.size = sizeof(attr); + attr.size = attr_sz; attr.type = type; attr.config |= (__u64)ref_ctr_off << PERF_UPROBE_REF_CTR_OFFSET_SHIFT; attr.config1 = ptr_to_u64(name); /* kprobe_func or uprobe_path */ @@ -10267,51 +10926,93 @@ static int perf_event_open_probe(bool uprobe, bool retprobe, const char *name, pid < 0 ? -1 : pid /* pid */, pid == -1 ? 0 : -1 /* cpu */, -1 /* group_fd */, PERF_FLAG_FD_CLOEXEC); - if (pfd < 0) { - err = -errno; - pr_warn("%s perf_event_open() failed: %s\n", - uprobe ? "uprobe" : "kprobe", - libbpf_strerror_r(err, errmsg, sizeof(errmsg))); - return err; - } - return pfd; + return pfd >= 0 ? pfd : -errno; } static int append_to_file(const char *file, const char *fmt, ...) { int fd, n, err = 0; va_list ap; + char buf[1024]; + + va_start(ap, fmt); + n = vsnprintf(buf, sizeof(buf), fmt, ap); + va_end(ap); + + if (n < 0 || n >= sizeof(buf)) + return -EINVAL; fd = open(file, O_WRONLY | O_APPEND | O_CLOEXEC, 0); if (fd < 0) return -errno; - va_start(ap, fmt); - n = vdprintf(fd, fmt, ap); - va_end(ap); - - if (n < 0) + if (write(fd, buf, n) < 0) err = -errno; close(fd); return err; } +#define DEBUGFS "/sys/kernel/debug/tracing" +#define TRACEFS "/sys/kernel/tracing" + +static bool use_debugfs(void) +{ + static int has_debugfs = -1; + + if (has_debugfs < 0) + has_debugfs = faccessat(AT_FDCWD, DEBUGFS, F_OK, AT_EACCESS) == 0; + + return has_debugfs == 1; +} + +static const char *tracefs_path(void) +{ + return use_debugfs() ? DEBUGFS : TRACEFS; +} + +static const char *tracefs_kprobe_events(void) +{ + return use_debugfs() ? DEBUGFS"/kprobe_events" : TRACEFS"/kprobe_events"; +} + +static const char *tracefs_uprobe_events(void) +{ + return use_debugfs() ? DEBUGFS"/uprobe_events" : TRACEFS"/uprobe_events"; +} + +static const char *tracefs_available_filter_functions(void) +{ + return use_debugfs() ? DEBUGFS"/available_filter_functions" + : TRACEFS"/available_filter_functions"; +} + +static const char *tracefs_available_filter_functions_addrs(void) +{ + return use_debugfs() ? DEBUGFS"/available_filter_functions_addrs" + : TRACEFS"/available_filter_functions_addrs"; +} + static void gen_kprobe_legacy_event_name(char *buf, size_t buf_sz, const char *kfunc_name, size_t offset) { static int index = 0; + int i; snprintf(buf, buf_sz, "libbpf_%u_%s_0x%zx_%d", getpid(), kfunc_name, offset, __sync_fetch_and_add(&index, 1)); + + /* sanitize binary_path in the probe name */ + for (i = 0; buf[i]; i++) { + if (!isalnum(buf[i])) + buf[i] = '_'; + } } static int add_kprobe_event_legacy(const char *probe_name, bool retprobe, const char *kfunc_name, size_t offset) { - const char *file = "/sys/kernel/debug/tracing/kprobe_events"; - - return append_to_file(file, "%c:%s/%s %s+0x%zx", + return append_to_file(tracefs_kprobe_events(), "%c:%s/%s %s+0x%zx", retprobe ? 'r' : 'p', retprobe ? "kretprobes" : "kprobes", probe_name, kfunc_name, offset); @@ -10319,18 +11020,16 @@ static int add_kprobe_event_legacy(const char *probe_name, bool retprobe, static int remove_kprobe_event_legacy(const char *probe_name, bool retprobe) { - const char *file = "/sys/kernel/debug/tracing/kprobe_events"; - - return append_to_file(file, "-:%s/%s", retprobe ? "kretprobes" : "kprobes", probe_name); + return append_to_file(tracefs_kprobe_events(), "-:%s/%s", + retprobe ? "kretprobes" : "kprobes", probe_name); } static int determine_kprobe_perf_type_legacy(const char *probe_name, bool retprobe) { char file[256]; - snprintf(file, sizeof(file), - "/sys/kernel/debug/tracing/events/%s/%s/id", - retprobe ? "kretprobes" : "kprobes", probe_name); + snprintf(file, sizeof(file), "%s/events/%s/%s/id", + tracefs_path(), retprobe ? "kretprobes" : "kprobes", probe_name); return parse_uint_from_file(file, "%d\n"); } @@ -10338,7 +11037,8 @@ static int determine_kprobe_perf_type_legacy(const char *probe_name, bool retpro static int perf_event_kprobe_open_legacy(const char *probe_name, bool retprobe, const char *kfunc_name, size_t offset, int pid) { - struct perf_event_attr attr = {}; + const size_t attr_sz = sizeof(struct perf_event_attr); + struct perf_event_attr attr; char errmsg[STRERR_BUFSIZE]; int type, pfd, err; @@ -10351,12 +11051,15 @@ static int perf_event_kprobe_open_legacy(const char *probe_name, bool retprobe, } type = determine_kprobe_perf_type_legacy(probe_name, retprobe); if (type < 0) { + err = type; pr_warn("failed to determine legacy kprobe event id for '%s+0x%zx': %s\n", kfunc_name, offset, - libbpf_strerror_r(type, errmsg, sizeof(errmsg))); - return type; + libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto err_clean_legacy; } - attr.size = sizeof(attr); + + memset(&attr, 0, attr_sz); + attr.size = attr_sz; attr.config = type; attr.type = PERF_TYPE_TRACEPOINT; @@ -10368,9 +11071,72 @@ static int perf_event_kprobe_open_legacy(const char *probe_name, bool retprobe, err = -errno; pr_warn("legacy kprobe perf_event_open() failed: %s\n", libbpf_strerror_r(err, errmsg, sizeof(errmsg))); - return err; + goto err_clean_legacy; } return pfd; + +err_clean_legacy: + /* Clear the newly added legacy kprobe_event */ + remove_kprobe_event_legacy(probe_name, retprobe); + return err; +} + +static const char *arch_specific_syscall_pfx(void) +{ +#if defined(__x86_64__) + return "x64"; +#elif defined(__i386__) + return "ia32"; +#elif defined(__s390x__) + return "s390x"; +#elif defined(__s390__) + return "s390"; +#elif defined(__arm__) + return "arm"; +#elif defined(__aarch64__) + return "arm64"; +#elif defined(__mips__) + return "mips"; +#elif defined(__riscv) + return "riscv"; +#elif defined(__powerpc__) + return "powerpc"; +#elif defined(__powerpc64__) + return "powerpc64"; +#else + return NULL; +#endif +} + +int probe_kern_syscall_wrapper(int token_fd) +{ + char syscall_name[64]; + const char *ksys_pfx; + + ksys_pfx = arch_specific_syscall_pfx(); + if (!ksys_pfx) + return 0; + + snprintf(syscall_name, sizeof(syscall_name), "__%s_sys_bpf", ksys_pfx); + + if (determine_kprobe_perf_type() >= 0) { + int pfd; + + pfd = perf_event_open_probe(false, false, syscall_name, 0, getpid(), 0); + if (pfd >= 0) + close(pfd); + + return pfd >= 0 ? 1 : 0; + } else { /* legacy mode */ + char probe_name[128]; + + gen_kprobe_legacy_event_name(probe_name, sizeof(probe_name), syscall_name, 0); + if (add_kprobe_event_legacy(probe_name, false, syscall_name, 0) < 0) + return 0; + + (void)remove_kprobe_event_legacy(probe_name, false); + return 1; + } } struct bpf_link * @@ -10379,6 +11145,7 @@ bpf_program__attach_kprobe_opts(const struct bpf_program *prog, const struct bpf_kprobe_opts *opts) { DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts); + enum probe_attach_mode attach_mode; char errmsg[STRERR_BUFSIZE]; char *legacy_probe = NULL; struct bpf_link *link; @@ -10389,11 +11156,32 @@ bpf_program__attach_kprobe_opts(const struct bpf_program *prog, if (!OPTS_VALID(opts, bpf_kprobe_opts)) return libbpf_err_ptr(-EINVAL); + attach_mode = OPTS_GET(opts, attach_mode, PROBE_ATTACH_MODE_DEFAULT); retprobe = OPTS_GET(opts, retprobe, false); offset = OPTS_GET(opts, offset, 0); pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0); legacy = determine_kprobe_perf_type() < 0; + switch (attach_mode) { + case PROBE_ATTACH_MODE_LEGACY: + legacy = true; + pe_opts.force_ioctl_attach = true; + break; + case PROBE_ATTACH_MODE_PERF: + if (legacy) + return libbpf_err_ptr(-ENOTSUP); + pe_opts.force_ioctl_attach = true; + break; + case PROBE_ATTACH_MODE_LINK: + if (legacy || !kernel_supports(prog->obj, FEAT_PERF_LINK)) + return libbpf_err_ptr(-ENOTSUP); + break; + case PROBE_ATTACH_MODE_DEFAULT: + break; + default: + return libbpf_err_ptr(-EINVAL); + } + if (!legacy) { pfd = perf_event_open_probe(false /* uprobe */, retprobe, func_name, offset, @@ -10427,7 +11215,7 @@ bpf_program__attach_kprobe_opts(const struct bpf_program *prog, prog->name, retprobe ? "kretprobe" : "kprobe", func_name, offset, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); - goto err_out; + goto err_clean_legacy; } if (legacy) { struct bpf_link_perf *perf_link = container_of(link, struct bpf_link_perf, link); @@ -10438,6 +11226,10 @@ bpf_program__attach_kprobe_opts(const struct bpf_program *prog, } return link; + +err_clean_legacy: + if (legacy) + remove_kprobe_event_legacy(legacy_probe, retprobe); err_out: free(legacy_probe); return libbpf_err_ptr(err); @@ -10454,13 +11246,41 @@ struct bpf_link *bpf_program__attach_kprobe(const struct bpf_program *prog, return bpf_program__attach_kprobe_opts(prog, func_name, &opts); } -/* Adapted from perf/util/string.c */ -static bool glob_match(const char *str, const char *pat) +struct bpf_link *bpf_program__attach_ksyscall(const struct bpf_program *prog, + const char *syscall_name, + const struct bpf_ksyscall_opts *opts) { - while (*str && *pat && *pat != '*') { - if (*pat == '?') { /* Matches any single character */ - str++; - pat++; + LIBBPF_OPTS(bpf_kprobe_opts, kprobe_opts); + char func_name[128]; + + if (!OPTS_VALID(opts, bpf_ksyscall_opts)) + return libbpf_err_ptr(-EINVAL); + + if (kernel_supports(prog->obj, FEAT_SYSCALL_WRAPPER)) { + /* arch_specific_syscall_pfx() should never return NULL here + * because it is guarded by kernel_supports(). However, since + * compiler does not know that we have an explicit conditional + * as well. + */ + snprintf(func_name, sizeof(func_name), "__%s_sys_%s", + arch_specific_syscall_pfx() ? : "", syscall_name); + } else { + snprintf(func_name, sizeof(func_name), "__se_sys_%s", syscall_name); + } + + kprobe_opts.retprobe = OPTS_GET(opts, retprobe, false); + kprobe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0); + + return bpf_program__attach_kprobe_opts(prog, func_name, &kprobe_opts); +} + +/* Adapted from perf/util/string.c */ +bool glob_match(const char *str, const char *pat) +{ + while (*str && *pat && *pat != '*') { + if (*pat == '?') { /* Matches any single character */ + str++; + pat++; continue; } if (*str != *pat) @@ -10488,25 +11308,158 @@ struct kprobe_multi_resolve { size_t cnt; }; -static int -resolve_kprobe_multi_cb(unsigned long long sym_addr, char sym_type, - const char *sym_name, void *ctx) +struct avail_kallsyms_data { + char **syms; + size_t cnt; + struct kprobe_multi_resolve *res; +}; + +static int avail_func_cmp(const void *a, const void *b) { - struct kprobe_multi_resolve *res = ctx; + return strcmp(*(const char **)a, *(const char **)b); +} + +static int avail_kallsyms_cb(unsigned long long sym_addr, char sym_type, + const char *sym_name, void *ctx) +{ + struct avail_kallsyms_data *data = ctx; + struct kprobe_multi_resolve *res = data->res; int err; - if (!glob_match(sym_name, res->pattern)) + if (!bsearch(&sym_name, data->syms, data->cnt, sizeof(*data->syms), avail_func_cmp)) return 0; - err = libbpf_ensure_mem((void **) &res->addrs, &res->cap, sizeof(unsigned long), - res->cnt + 1); + err = libbpf_ensure_mem((void **)&res->addrs, &res->cap, sizeof(*res->addrs), res->cnt + 1); if (err) return err; - res->addrs[res->cnt++] = (unsigned long) sym_addr; + res->addrs[res->cnt++] = (unsigned long)sym_addr; return 0; } +static int libbpf_available_kallsyms_parse(struct kprobe_multi_resolve *res) +{ + const char *available_functions_file = tracefs_available_filter_functions(); + struct avail_kallsyms_data data; + char sym_name[500]; + FILE *f; + int err = 0, ret, i; + char **syms = NULL; + size_t cap = 0, cnt = 0; + + f = fopen(available_functions_file, "re"); + if (!f) { + err = -errno; + pr_warn("failed to open %s: %d\n", available_functions_file, err); + return err; + } + + while (true) { + char *name; + + ret = fscanf(f, "%499s%*[^\n]\n", sym_name); + if (ret == EOF && feof(f)) + break; + + if (ret != 1) { + pr_warn("failed to parse available_filter_functions entry: %d\n", ret); + err = -EINVAL; + goto cleanup; + } + + if (!glob_match(sym_name, res->pattern)) + continue; + + err = libbpf_ensure_mem((void **)&syms, &cap, sizeof(*syms), cnt + 1); + if (err) + goto cleanup; + + name = strdup(sym_name); + if (!name) { + err = -errno; + goto cleanup; + } + + syms[cnt++] = name; + } + + /* no entries found, bail out */ + if (cnt == 0) { + err = -ENOENT; + goto cleanup; + } + + /* sort available functions */ + qsort(syms, cnt, sizeof(*syms), avail_func_cmp); + + data.syms = syms; + data.res = res; + data.cnt = cnt; + libbpf_kallsyms_parse(avail_kallsyms_cb, &data); + + if (res->cnt == 0) + err = -ENOENT; + +cleanup: + for (i = 0; i < cnt; i++) + free((char *)syms[i]); + free(syms); + + fclose(f); + return err; +} + +static bool has_available_filter_functions_addrs(void) +{ + return access(tracefs_available_filter_functions_addrs(), R_OK) != -1; +} + +static int libbpf_available_kprobes_parse(struct kprobe_multi_resolve *res) +{ + const char *available_path = tracefs_available_filter_functions_addrs(); + char sym_name[500]; + FILE *f; + int ret, err = 0; + unsigned long long sym_addr; + + f = fopen(available_path, "re"); + if (!f) { + err = -errno; + pr_warn("failed to open %s: %d\n", available_path, err); + return err; + } + + while (true) { + ret = fscanf(f, "%llx %499s%*[^\n]\n", &sym_addr, sym_name); + if (ret == EOF && feof(f)) + break; + + if (ret != 2) { + pr_warn("failed to parse available_filter_functions_addrs entry: %d\n", + ret); + err = -EINVAL; + goto cleanup; + } + + if (!glob_match(sym_name, res->pattern)) + continue; + + err = libbpf_ensure_mem((void **)&res->addrs, &res->cap, + sizeof(*res->addrs), res->cnt + 1); + if (err) + goto cleanup; + + res->addrs[res->cnt++] = (unsigned long)sym_addr; + } + + if (res->cnt == 0) + err = -ENOENT; + +cleanup: + fclose(f); + return err; +} + struct bpf_link * bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, const char *pattern, @@ -10516,18 +11469,26 @@ bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, struct kprobe_multi_resolve res = { .pattern = pattern, }; + enum bpf_attach_type attach_type; struct bpf_link *link = NULL; char errmsg[STRERR_BUFSIZE]; const unsigned long *addrs; int err, link_fd, prog_fd; + bool retprobe, session; const __u64 *cookies; const char **syms; - bool retprobe; size_t cnt; if (!OPTS_VALID(opts, bpf_kprobe_multi_opts)) return libbpf_err_ptr(-EINVAL); + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + syms = OPTS_GET(opts, syms, false); addrs = OPTS_GET(opts, addrs, false); cnt = OPTS_GET(opts, cnt, false); @@ -10543,18 +11504,23 @@ bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, return libbpf_err_ptr(-EINVAL); if (pattern) { - err = libbpf_kallsyms_parse(resolve_kprobe_multi_cb, &res); + if (has_available_filter_functions_addrs()) + err = libbpf_available_kprobes_parse(&res); + else + err = libbpf_available_kallsyms_parse(&res); if (err) goto error; - if (!res.cnt) { - err = -ENOENT; - goto error; - } addrs = res.addrs; cnt = res.cnt; } retprobe = OPTS_GET(opts, retprobe, false); + session = OPTS_GET(opts, session, false); + + if (retprobe && session) + return libbpf_err_ptr(-EINVAL); + + attach_type = session ? BPF_TRACE_KPROBE_SESSION : BPF_TRACE_KPROBE_MULTI; lopts.kprobe_multi.syms = syms; lopts.kprobe_multi.addrs = addrs; @@ -10569,8 +11535,7 @@ bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, } link->detach = &bpf_link__detach_fd; - prog_fd = bpf_program__fd(prog); - link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_KPROBE_MULTI, &lopts); + link_fd = bpf_link_create(prog_fd, 0, attach_type, &lopts); if (link_fd < 0) { err = -errno; pr_warn("prog '%s': failed to attach: %s\n", @@ -10595,6 +11560,12 @@ static int attach_kprobe(const struct bpf_program *prog, long cookie, struct bpf char *func; int n; + *link = NULL; + + /* no auto-attach for SEC("kprobe") and SEC("kretprobe") */ + if (strcmp(prog->sec_name, "kprobe") == 0 || strcmp(prog->sec_name, "kretprobe") == 0) + return 0; + opts.retprobe = str_has_pfx(prog->sec_name, "kretprobe/"); if (opts.retprobe) func_name = prog->sec_name + sizeof("kretprobe/") - 1; @@ -10618,6 +11589,27 @@ static int attach_kprobe(const struct bpf_program *prog, long cookie, struct bpf return libbpf_get_error(*link); } +static int attach_ksyscall(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + LIBBPF_OPTS(bpf_ksyscall_opts, opts); + const char *syscall_name; + + *link = NULL; + + /* no auto-attach for SEC("ksyscall") and SEC("kretsyscall") */ + if (strcmp(prog->sec_name, "ksyscall") == 0 || strcmp(prog->sec_name, "kretsyscall") == 0) + return 0; + + opts.retprobe = str_has_pfx(prog->sec_name, "kretsyscall/"); + if (opts.retprobe) + syscall_name = prog->sec_name + sizeof("kretsyscall/") - 1; + else + syscall_name = prog->sec_name + sizeof("ksyscall/") - 1; + + *link = bpf_program__attach_ksyscall(prog, syscall_name, &opts); + return *link ? 0 : -errno; +} + static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link) { LIBBPF_OPTS(bpf_kprobe_multi_opts, opts); @@ -10625,6 +11617,13 @@ static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, stru char *pattern; int n; + *link = NULL; + + /* no auto-attach for SEC("kprobe.multi") and SEC("kretprobe.multi") */ + if (strcmp(prog->sec_name, "kprobe.multi") == 0 || + strcmp(prog->sec_name, "kretprobe.multi") == 0) + return 0; + opts.retprobe = str_has_pfx(prog->sec_name, "kretprobe.multi/"); if (opts.retprobe) spec = prog->sec_name + sizeof("kretprobe.multi/") - 1; @@ -10633,7 +11632,7 @@ static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, stru n = sscanf(spec, "%m[a-zA-Z0-9_.*?]", &pattern); if (n < 1) { - pr_warn("kprobe multi pattern is invalid: %s\n", pattern); + pr_warn("kprobe multi pattern is invalid: %s\n", spec); return -EINVAL; } @@ -10642,6 +11641,63 @@ static int attach_kprobe_multi(const struct bpf_program *prog, long cookie, stru return libbpf_get_error(*link); } +static int attach_kprobe_session(const struct bpf_program *prog, long cookie, + struct bpf_link **link) +{ + LIBBPF_OPTS(bpf_kprobe_multi_opts, opts, .session = true); + const char *spec; + char *pattern; + int n; + + *link = NULL; + + /* no auto-attach for SEC("kprobe.session") */ + if (strcmp(prog->sec_name, "kprobe.session") == 0) + return 0; + + spec = prog->sec_name + sizeof("kprobe.session/") - 1; + n = sscanf(spec, "%m[a-zA-Z0-9_.*?]", &pattern); + if (n < 1) { + pr_warn("kprobe session pattern is invalid: %s\n", spec); + return -EINVAL; + } + + *link = bpf_program__attach_kprobe_multi_opts(prog, pattern, &opts); + free(pattern); + return *link ? 0 : -errno; +} + +static int attach_uprobe_multi(const struct bpf_program *prog, long cookie, struct bpf_link **link) +{ + char *probe_type = NULL, *binary_path = NULL, *func_name = NULL; + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts); + int n, ret = -EINVAL; + + *link = NULL; + + n = sscanf(prog->sec_name, "%m[^/]/%m[^:]:%m[^\n]", + &probe_type, &binary_path, &func_name); + switch (n) { + case 1: + /* handle SEC("u[ret]probe") - format is valid, but auto-attach is impossible. */ + ret = 0; + break; + case 3: + opts.retprobe = strcmp(probe_type, "uretprobe.multi") == 0; + *link = bpf_program__attach_uprobe_multi(prog, -1, binary_path, func_name, &opts); + ret = libbpf_get_error(*link); + break; + default: + pr_warn("prog '%s': invalid format of section definition '%s'\n", prog->name, + prog->sec_name); + break; + } + free(probe_type); + free(binary_path); + free(func_name); + return ret; +} + static void gen_uprobe_legacy_event_name(char *buf, size_t buf_sz, const char *binary_path, uint64_t offset) { @@ -10659,9 +11715,7 @@ static void gen_uprobe_legacy_event_name(char *buf, size_t buf_sz, static inline int add_uprobe_event_legacy(const char *probe_name, bool retprobe, const char *binary_path, size_t offset) { - const char *file = "/sys/kernel/debug/tracing/uprobe_events"; - - return append_to_file(file, "%c:%s/%s %s:0x%zx", + return append_to_file(tracefs_uprobe_events(), "%c:%s/%s %s:0x%zx", retprobe ? 'r' : 'p', retprobe ? "uretprobes" : "uprobes", probe_name, binary_path, offset); @@ -10669,18 +11723,16 @@ static inline int add_uprobe_event_legacy(const char *probe_name, bool retprobe, static inline int remove_uprobe_event_legacy(const char *probe_name, bool retprobe) { - const char *file = "/sys/kernel/debug/tracing/uprobe_events"; - - return append_to_file(file, "-:%s/%s", retprobe ? "uretprobes" : "uprobes", probe_name); + return append_to_file(tracefs_uprobe_events(), "-:%s/%s", + retprobe ? "uretprobes" : "uprobes", probe_name); } static int determine_uprobe_perf_type_legacy(const char *probe_name, bool retprobe) { char file[512]; - snprintf(file, sizeof(file), - "/sys/kernel/debug/tracing/events/%s/%s/id", - retprobe ? "uretprobes" : "uprobes", probe_name); + snprintf(file, sizeof(file), "%s/events/%s/%s/id", + tracefs_path(), retprobe ? "uretprobes" : "uprobes", probe_name); return parse_uint_from_file(file, "%d\n"); } @@ -10688,6 +11740,7 @@ static int determine_uprobe_perf_type_legacy(const char *probe_name, bool retpro static int perf_event_uprobe_open_legacy(const char *probe_name, bool retprobe, const char *binary_path, size_t offset, int pid) { + const size_t attr_sz = sizeof(struct perf_event_attr); struct perf_event_attr attr; int type, pfd, err; @@ -10699,13 +11752,14 @@ static int perf_event_uprobe_open_legacy(const char *probe_name, bool retprobe, } type = determine_uprobe_perf_type_legacy(probe_name, retprobe); if (type < 0) { + err = type; pr_warn("failed to determine legacy uprobe event id for %s:0x%zx: %d\n", binary_path, offset, err); - return type; + goto err_clean_legacy; } - memset(&attr, 0, sizeof(attr)); - attr.size = sizeof(attr); + memset(&attr, 0, attr_sz); + attr.size = attr_sz; attr.config = type; attr.type = PERF_TYPE_TRACEPOINT; @@ -10716,197 +11770,75 @@ static int perf_event_uprobe_open_legacy(const char *probe_name, bool retprobe, if (pfd < 0) { err = -errno; pr_warn("legacy uprobe perf_event_open() failed: %d\n", err); - return err; + goto err_clean_legacy; } return pfd; -} -/* uprobes deal in relative offsets; subtract the base address associated with - * the mapped binary. See Documentation/trace/uprobetracer.rst for more - * details. - */ -static long elf_find_relative_offset(const char *filename, Elf *elf, long addr) -{ - size_t n; - int i; - - if (elf_getphdrnum(elf, &n)) { - pr_warn("elf: failed to find program headers for '%s': %s\n", filename, - elf_errmsg(-1)); - return -ENOENT; - } - - for (i = 0; i < n; i++) { - int seg_start, seg_end, seg_offset; - GElf_Phdr phdr; - - if (!gelf_getphdr(elf, i, &phdr)) { - pr_warn("elf: failed to get program header %d from '%s': %s\n", i, filename, - elf_errmsg(-1)); - return -ENOENT; - } - if (phdr.p_type != PT_LOAD || !(phdr.p_flags & PF_X)) - continue; - - seg_start = phdr.p_vaddr; - seg_end = seg_start + phdr.p_memsz; - seg_offset = phdr.p_offset; - if (addr >= seg_start && addr < seg_end) - return addr - seg_start + seg_offset; - } - pr_warn("elf: failed to find prog header containing 0x%lx in '%s'\n", addr, filename); - return -ENOENT; -} - -/* Return next ELF section of sh_type after scn, or first of that type if scn is NULL. */ -static Elf_Scn *elf_find_next_scn_by_type(Elf *elf, int sh_type, Elf_Scn *scn) -{ - while ((scn = elf_nextscn(elf, scn)) != NULL) { - GElf_Shdr sh; - - if (!gelf_getshdr(scn, &sh)) - continue; - if (sh.sh_type == sh_type) - return scn; - } - return NULL; +err_clean_legacy: + /* Clear the newly added legacy uprobe_event */ + remove_uprobe_event_legacy(probe_name, retprobe); + return err; } -/* Find offset of function name in object specified by path. "name" matches - * symbol name or name@@LIB for library functions. +/* Find offset of function name in archive specified by path. Currently + * supported are .zip files that do not compress their contents, as used on + * Android in the form of APKs, for example. "file_name" is the name of the ELF + * file inside the archive. "func_name" matches symbol name or name@@LIB for + * library functions. + * + * An overview of the APK format specifically provided here: + * https://en.wikipedia.org/w/index.php?title=Apk_(file_format)&oldid=1139099120#Package_contents */ -static long elf_find_func_offset(const char *binary_path, const char *name) +static long elf_find_func_offset_from_archive(const char *archive_path, const char *file_name, + const char *func_name) { - int fd, i, sh_types[2] = { SHT_DYNSYM, SHT_SYMTAB }; - bool is_shared_lib, is_name_qualified; - char errmsg[STRERR_BUFSIZE]; - long ret = -ENOENT; - size_t name_len; - GElf_Ehdr ehdr; + struct zip_archive *archive; + struct zip_entry entry; + long ret; Elf *elf; - fd = open(binary_path, O_RDONLY | O_CLOEXEC); - if (fd < 0) { - ret = -errno; - pr_warn("failed to open %s: %s\n", binary_path, - libbpf_strerror_r(ret, errmsg, sizeof(errmsg))); + archive = zip_archive_open(archive_path); + if (IS_ERR(archive)) { + ret = PTR_ERR(archive); + pr_warn("zip: failed to open %s: %ld\n", archive_path, ret); return ret; } - elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); - if (!elf) { - pr_warn("elf: could not read elf from %s: %s\n", binary_path, elf_errmsg(-1)); - close(fd); - return -LIBBPF_ERRNO__FORMAT; + + ret = zip_archive_find_entry(archive, file_name, &entry); + if (ret) { + pr_warn("zip: could not find archive member %s in %s: %ld\n", file_name, + archive_path, ret); + goto out; } - if (!gelf_getehdr(elf, &ehdr)) { - pr_warn("elf: failed to get ehdr from %s: %s\n", binary_path, elf_errmsg(-1)); + pr_debug("zip: found entry for %s in %s at 0x%lx\n", file_name, archive_path, + (unsigned long)entry.data_offset); + + if (entry.compression) { + pr_warn("zip: entry %s of %s is compressed and cannot be handled\n", file_name, + archive_path); ret = -LIBBPF_ERRNO__FORMAT; goto out; } - /* for shared lib case, we do not need to calculate relative offset */ - is_shared_lib = ehdr.e_type == ET_DYN; - - name_len = strlen(name); - /* Does name specify "@@LIB"? */ - is_name_qualified = strstr(name, "@@") != NULL; - - /* Search SHT_DYNSYM, SHT_SYMTAB for symbol. This search order is used because if - * a binary is stripped, it may only have SHT_DYNSYM, and a fully-statically - * linked binary may not have SHT_DYMSYM, so absence of a section should not be - * reported as a warning/error. - */ - for (i = 0; i < ARRAY_SIZE(sh_types); i++) { - size_t nr_syms, strtabidx, idx; - Elf_Data *symbols = NULL; - Elf_Scn *scn = NULL; - int last_bind = -1; - const char *sname; - GElf_Shdr sh; - - scn = elf_find_next_scn_by_type(elf, sh_types[i], NULL); - if (!scn) { - pr_debug("elf: failed to find symbol table ELF sections in '%s'\n", - binary_path); - continue; - } - if (!gelf_getshdr(scn, &sh)) - continue; - strtabidx = sh.sh_link; - symbols = elf_getdata(scn, 0); - if (!symbols) { - pr_warn("elf: failed to get symbols for symtab section in '%s': %s\n", - binary_path, elf_errmsg(-1)); - ret = -LIBBPF_ERRNO__FORMAT; - goto out; - } - nr_syms = symbols->d_size / sh.sh_entsize; - - for (idx = 0; idx < nr_syms; idx++) { - int curr_bind; - GElf_Sym sym; - - if (!gelf_getsym(symbols, idx, &sym)) - continue; - - if (GELF_ST_TYPE(sym.st_info) != STT_FUNC) - continue; - - sname = elf_strptr(elf, strtabidx, sym.st_name); - if (!sname) - continue; - - curr_bind = GELF_ST_BIND(sym.st_info); - /* User can specify func, func@@LIB or func@@LIB_VERSION. */ - if (strncmp(sname, name, name_len) != 0) - continue; - /* ...but we don't want a search for "foo" to match 'foo2" also, so any - * additional characters in sname should be of the form "@@LIB". - */ - if (!is_name_qualified && sname[name_len] != '\0' && sname[name_len] != '@') - continue; - - if (ret >= 0) { - /* handle multiple matches */ - if (last_bind != STB_WEAK && curr_bind != STB_WEAK) { - /* Only accept one non-weak bind. */ - pr_warn("elf: ambiguous match for '%s', '%s' in '%s'\n", - sname, name, binary_path); - ret = -LIBBPF_ERRNO__FORMAT; - goto out; - } else if (curr_bind == STB_WEAK) { - /* already have a non-weak bind, and - * this is a weak bind, so ignore. - */ - continue; - } - } - ret = sym.st_value; - last_bind = curr_bind; - } - /* For binaries that are not shared libraries, we need relative offset */ - if (ret > 0 && !is_shared_lib) - ret = elf_find_relative_offset(binary_path, elf, ret); - if (ret > 0) - break; + elf = elf_memory((void *)entry.data, entry.data_length); + if (!elf) { + pr_warn("elf: could not read elf file %s from %s: %s\n", file_name, archive_path, + elf_errmsg(-1)); + ret = -LIBBPF_ERRNO__LIBELF; + goto out; } + ret = elf_find_func_offset(elf, file_name, func_name); if (ret > 0) { - pr_debug("elf: symbol address match for '%s' in '%s': 0x%lx\n", name, binary_path, - ret); - } else { - if (ret == 0) { - pr_warn("elf: '%s' is 0 in symtab for '%s': %s\n", name, binary_path, - is_shared_lib ? "should not be 0 in a shared library" : - "try using shared library path instead"); - ret = -ENOENT; - } else { - pr_warn("elf: failed to find symbol '%s' in '%s'\n", name, binary_path); - } + pr_debug("elf: symbol address match for %s of %s in %s: 0x%x + 0x%lx = 0x%lx\n", + func_name, file_name, archive_path, entry.data_offset, ret, + ret + entry.data_offset); + ret += entry.data_offset; } -out: elf_end(elf); - close(fd); + +out: + zip_archive_close(archive); return ret; } @@ -10951,15 +11883,17 @@ static const char *arch_specific_lib_paths(void) static int resolve_full_path(const char *file, char *result, size_t result_sz) { const char *search_paths[3] = {}; - int i; + int i, perm; if (str_has_sfx(file, ".so") || strstr(file, ".so.")) { search_paths[0] = getenv("LD_LIBRARY_PATH"); search_paths[1] = "/usr/lib64:/usr/lib"; search_paths[2] = arch_specific_lib_paths(); + perm = R_OK; } else { search_paths[0] = getenv("PATH"); search_paths[1] = "/usr/bin:/usr/sbin"; + perm = R_OK | X_OK; } for (i = 0; i < ARRAY_SIZE(search_paths); i++) { @@ -10978,8 +11912,8 @@ static int resolve_full_path(const char *file, char *result, size_t result_sz) if (!seg_len) continue; snprintf(result, result_sz, "%.*s/%s", seg_len, s, file); - /* ensure it is an executable file/link */ - if (access(result, R_OK | X_OK) < 0) + /* ensure it has required permissions */ + if (faccessat(AT_FDCWD, result, perm, AT_EACCESS) < 0) continue; pr_debug("resolved '%s' to '%s'\n", file, result); return 0; @@ -10988,14 +11922,136 @@ static int resolve_full_path(const char *file, char *result, size_t result_sz) return -ENOENT; } +struct bpf_link * +bpf_program__attach_uprobe_multi(const struct bpf_program *prog, + pid_t pid, + const char *path, + const char *func_pattern, + const struct bpf_uprobe_multi_opts *opts) +{ + const unsigned long *ref_ctr_offsets = NULL, *offsets = NULL; + LIBBPF_OPTS(bpf_link_create_opts, lopts); + unsigned long *resolved_offsets = NULL; + int err = 0, link_fd, prog_fd; + struct bpf_link *link = NULL; + char errmsg[STRERR_BUFSIZE]; + char full_path[PATH_MAX]; + const __u64 *cookies; + const char **syms; + size_t cnt; + + if (!OPTS_VALID(opts, bpf_uprobe_multi_opts)) + return libbpf_err_ptr(-EINVAL); + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + + syms = OPTS_GET(opts, syms, NULL); + offsets = OPTS_GET(opts, offsets, NULL); + ref_ctr_offsets = OPTS_GET(opts, ref_ctr_offsets, NULL); + cookies = OPTS_GET(opts, cookies, NULL); + cnt = OPTS_GET(opts, cnt, 0); + + /* + * User can specify 2 mutually exclusive set of inputs: + * + * 1) use only path/func_pattern/pid arguments + * + * 2) use path/pid with allowed combinations of: + * syms/offsets/ref_ctr_offsets/cookies/cnt + * + * - syms and offsets are mutually exclusive + * - ref_ctr_offsets and cookies are optional + * + * Any other usage results in error. + */ + + if (!path) + return libbpf_err_ptr(-EINVAL); + if (!func_pattern && cnt == 0) + return libbpf_err_ptr(-EINVAL); + + if (func_pattern) { + if (syms || offsets || ref_ctr_offsets || cookies || cnt) + return libbpf_err_ptr(-EINVAL); + } else { + if (!!syms == !!offsets) + return libbpf_err_ptr(-EINVAL); + } + + if (func_pattern) { + if (!strchr(path, '/')) { + err = resolve_full_path(path, full_path, sizeof(full_path)); + if (err) { + pr_warn("prog '%s': failed to resolve full path for '%s': %d\n", + prog->name, path, err); + return libbpf_err_ptr(err); + } + path = full_path; + } + + err = elf_resolve_pattern_offsets(path, func_pattern, + &resolved_offsets, &cnt); + if (err < 0) + return libbpf_err_ptr(err); + offsets = resolved_offsets; + } else if (syms) { + err = elf_resolve_syms_offsets(path, cnt, syms, &resolved_offsets, STT_FUNC); + if (err < 0) + return libbpf_err_ptr(err); + offsets = resolved_offsets; + } + + lopts.uprobe_multi.path = path; + lopts.uprobe_multi.offsets = offsets; + lopts.uprobe_multi.ref_ctr_offsets = ref_ctr_offsets; + lopts.uprobe_multi.cookies = cookies; + lopts.uprobe_multi.cnt = cnt; + lopts.uprobe_multi.flags = OPTS_GET(opts, retprobe, false) ? BPF_F_UPROBE_MULTI_RETURN : 0; + + if (pid == 0) + pid = getpid(); + if (pid > 0) + lopts.uprobe_multi.pid = pid; + + link = calloc(1, sizeof(*link)); + if (!link) { + err = -ENOMEM; + goto error; + } + link->detach = &bpf_link__detach_fd; + + link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_UPROBE_MULTI, &lopts); + if (link_fd < 0) { + err = -errno; + pr_warn("prog '%s': failed to attach multi-uprobe: %s\n", + prog->name, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); + goto error; + } + link->fd = link_fd; + free(resolved_offsets); + return link; + +error: + free(resolved_offsets); + free(link); + return libbpf_err_ptr(err); +} + LIBBPF_API struct bpf_link * bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, const char *binary_path, size_t func_offset, const struct bpf_uprobe_opts *opts) { - DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts); + const char *archive_path = NULL, *archive_sep = NULL; char errmsg[STRERR_BUFSIZE], *legacy_probe = NULL; - char full_binary_path[PATH_MAX]; + DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts); + enum probe_attach_mode attach_mode; + char full_path[PATH_MAX]; struct bpf_link *link; size_t ref_ctr_off; int pfd, err; @@ -11005,36 +12061,68 @@ bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, if (!OPTS_VALID(opts, bpf_uprobe_opts)) return libbpf_err_ptr(-EINVAL); + attach_mode = OPTS_GET(opts, attach_mode, PROBE_ATTACH_MODE_DEFAULT); retprobe = OPTS_GET(opts, retprobe, false); ref_ctr_off = OPTS_GET(opts, ref_ctr_offset, 0); pe_opts.bpf_cookie = OPTS_GET(opts, bpf_cookie, 0); - if (binary_path && !strchr(binary_path, '/')) { - err = resolve_full_path(binary_path, full_binary_path, - sizeof(full_binary_path)); + if (!binary_path) + return libbpf_err_ptr(-EINVAL); + + /* Check if "binary_path" refers to an archive. */ + archive_sep = strstr(binary_path, "!/"); + if (archive_sep) { + full_path[0] = '\0'; + libbpf_strlcpy(full_path, binary_path, + min(sizeof(full_path), (size_t)(archive_sep - binary_path + 1))); + archive_path = full_path; + binary_path = archive_sep + 2; + } else if (!strchr(binary_path, '/')) { + err = resolve_full_path(binary_path, full_path, sizeof(full_path)); if (err) { pr_warn("prog '%s': failed to resolve full path for '%s': %d\n", prog->name, binary_path, err); return libbpf_err_ptr(err); } - binary_path = full_binary_path; + binary_path = full_path; } func_name = OPTS_GET(opts, func_name, NULL); if (func_name) { long sym_off; - if (!binary_path) { - pr_warn("prog '%s': name-based attach requires binary_path\n", - prog->name); - return libbpf_err_ptr(-EINVAL); + if (archive_path) { + sym_off = elf_find_func_offset_from_archive(archive_path, binary_path, + func_name); + binary_path = archive_path; + } else { + sym_off = elf_find_func_offset_from_file(binary_path, func_name); } - sym_off = elf_find_func_offset(binary_path, func_name); if (sym_off < 0) return libbpf_err_ptr(sym_off); func_offset += sym_off; } legacy = determine_uprobe_perf_type() < 0; + switch (attach_mode) { + case PROBE_ATTACH_MODE_LEGACY: + legacy = true; + pe_opts.force_ioctl_attach = true; + break; + case PROBE_ATTACH_MODE_PERF: + if (legacy) + return libbpf_err_ptr(-ENOTSUP); + pe_opts.force_ioctl_attach = true; + break; + case PROBE_ATTACH_MODE_LINK: + if (legacy || !kernel_supports(prog->obj, FEAT_PERF_LINK)) + return libbpf_err_ptr(-ENOTSUP); + break; + case PROBE_ATTACH_MODE_DEFAULT: + break; + default: + return libbpf_err_ptr(-EINVAL); + } + if (!legacy) { pfd = perf_event_open_probe(true /* uprobe */, retprobe, binary_path, func_offset, pid, ref_ctr_off); @@ -11071,7 +12159,7 @@ bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, prog->name, retprobe ? "uretprobe" : "uprobe", binary_path, func_offset, libbpf_strerror_r(err, errmsg, sizeof(errmsg))); - goto err_out; + goto err_clean_legacy; } if (legacy) { struct bpf_link_perf *perf_link = container_of(link, struct bpf_link_perf, link); @@ -11081,10 +12169,13 @@ bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, perf_link->legacy_is_retprobe = retprobe; } return link; + +err_clean_legacy: + if (legacy) + remove_uprobe_event_legacy(legacy_probe, retprobe); err_out: free(legacy_probe); return libbpf_err_ptr(err); - } /* Format of u[ret]probe section definition supporting auto-attach: @@ -11100,14 +12191,14 @@ bpf_program__attach_uprobe_opts(const struct bpf_program *prog, pid_t pid, static int attach_uprobe(const struct bpf_program *prog, long cookie, struct bpf_link **link) { DECLARE_LIBBPF_OPTS(bpf_uprobe_opts, opts); - char *probe_type = NULL, *binary_path = NULL, *func_name = NULL; - int n, ret = -EINVAL; + char *probe_type = NULL, *binary_path = NULL, *func_name = NULL, *func_off; + int n, c, ret = -EINVAL; long offset = 0; *link = NULL; - n = sscanf(prog->sec_name, "%m[^/]/%m[^:]:%m[a-zA-Z0-9_.]+%li", - &probe_type, &binary_path, &func_name, &offset); + n = sscanf(prog->sec_name, "%m[^/]/%m[^:]:%m[^\n]", + &probe_type, &binary_path, &func_name); switch (n) { case 1: /* handle SEC("u[ret]probe") - format is valid, but auto-attach is impossible. */ @@ -11118,8 +12209,19 @@ static int attach_uprobe(const struct bpf_program *prog, long cookie, struct bpf prog->name, prog->sec_name); break; case 3: - case 4: - opts.retprobe = strcmp(probe_type, "uretprobe") == 0; + /* check if user specifies `+offset`, if yes, this should be + * the last part of the string, make sure sscanf read to EOL + */ + func_off = strrchr(func_name, '+'); + if (func_off) { + n = sscanf(func_off, "+%li%n", &offset, &c); + if (n == 1 && *(func_off + c) == '\0') + func_off[0] = '\0'; + else + offset = 0; + } + opts.retprobe = strcmp(probe_type, "uretprobe") == 0 || + strcmp(probe_type, "uretprobe.s") == 0; if (opts.retprobe && offset != 0) { pr_warn("prog '%s': uretprobes do not support offset specification\n", prog->name); @@ -11166,11 +12268,14 @@ struct bpf_link *bpf_program__attach_usdt(const struct bpf_program *prog, return libbpf_err_ptr(-EINVAL); if (bpf_program__fd(prog) < 0) { - pr_warn("prog '%s': can't attach BPF program w/o FD (did you load it?)\n", + pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n", prog->name); return libbpf_err_ptr(-EINVAL); } + if (!binary_path) + return libbpf_err_ptr(-EINVAL); + if (!strchr(binary_path, '/')) { err = resolve_full_path(binary_path, resolved_path, sizeof(resolved_path)); if (err) { @@ -11194,7 +12299,7 @@ struct bpf_link *bpf_program__attach_usdt(const struct bpf_program *prog, usdt_cookie = OPTS_GET(opts, usdt_cookie, 0); link = usdt_manager_attach_usdt(obj->usdt_man, prog, pid, binary_path, - usdt_provider, usdt_name, usdt_cookie); + usdt_provider, usdt_name, usdt_cookie); err = libbpf_get_error(link); if (err) return libbpf_err_ptr(err); @@ -11236,9 +12341,8 @@ static int determine_tracepoint_id(const char *tp_category, char file[PATH_MAX]; int ret; - ret = snprintf(file, sizeof(file), - "/sys/kernel/debug/tracing/events/%s/%s/id", - tp_category, tp_name); + ret = snprintf(file, sizeof(file), "%s/events/%s/%s/id", + tracefs_path(), tp_category, tp_name); if (ret < 0) return -errno; if (ret >= sizeof(file)) { @@ -11252,7 +12356,8 @@ static int determine_tracepoint_id(const char *tp_category, static int perf_event_open_tracepoint(const char *tp_category, const char *tp_name) { - struct perf_event_attr attr = {}; + const size_t attr_sz = sizeof(struct perf_event_attr); + struct perf_event_attr attr; char errmsg[STRERR_BUFSIZE]; int tp_id, pfd, err; @@ -11264,8 +12369,9 @@ static int perf_event_open_tracepoint(const char *tp_category, return tp_id; } + memset(&attr, 0, attr_sz); attr.type = PERF_TYPE_TRACEPOINT; - attr.size = sizeof(attr); + attr.size = attr_sz; attr.config = tp_id; pfd = syscall(__NR_perf_event_open, &attr, -1 /* pid */, 0 /* cpu */, @@ -11325,6 +12431,12 @@ static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_lin { char *sec_name, *tp_cat, *tp_name; + *link = NULL; + + /* no auto-attach for SEC("tp") or SEC("tracepoint") */ + if (strcmp(prog->sec_name, "tp") == 0 || strcmp(prog->sec_name, "tracepoint") == 0) + return 0; + sec_name = strdup(prog->sec_name); if (!sec_name) return -ENOMEM; @@ -11347,13 +12459,19 @@ static int attach_tp(const struct bpf_program *prog, long cookie, struct bpf_lin return libbpf_get_error(*link); } -struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *prog, - const char *tp_name) +struct bpf_link * +bpf_program__attach_raw_tracepoint_opts(const struct bpf_program *prog, + const char *tp_name, + struct bpf_raw_tracepoint_opts *opts) { + LIBBPF_OPTS(bpf_raw_tp_opts, raw_opts); char errmsg[STRERR_BUFSIZE]; struct bpf_link *link; int prog_fd, pfd; + if (!OPTS_VALID(opts, bpf_raw_tracepoint_opts)) + return libbpf_err_ptr(-EINVAL); + prog_fd = bpf_program__fd(prog); if (prog_fd < 0) { pr_warn("prog '%s': can't attach before loaded\n", prog->name); @@ -11365,7 +12483,9 @@ struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *pr return libbpf_err_ptr(-ENOMEM); link->detach = &bpf_link__detach_fd; - pfd = bpf_raw_tracepoint_open(tp_name, prog_fd); + raw_opts.tp_name = tp_name; + raw_opts.cookie = OPTS_GET(opts, cookie, 0); + pfd = bpf_raw_tracepoint_open_opts(prog_fd, &raw_opts); if (pfd < 0) { pfd = -errno; free(link); @@ -11377,23 +12497,43 @@ struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *pr return link; } +struct bpf_link *bpf_program__attach_raw_tracepoint(const struct bpf_program *prog, + const char *tp_name) +{ + return bpf_program__attach_raw_tracepoint_opts(prog, tp_name, NULL); +} + static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf_link **link) { static const char *const prefixes[] = { - "raw_tp/", - "raw_tracepoint/", - "raw_tp.w/", - "raw_tracepoint.w/", + "raw_tp", + "raw_tracepoint", + "raw_tp.w", + "raw_tracepoint.w", }; size_t i; const char *tp_name = NULL; + *link = NULL; + for (i = 0; i < ARRAY_SIZE(prefixes); i++) { - if (str_has_pfx(prog->sec_name, prefixes[i])) { - tp_name = prog->sec_name + strlen(prefixes[i]); - break; - } + size_t pfx_len; + + if (!str_has_pfx(prog->sec_name, prefixes[i])) + continue; + + pfx_len = strlen(prefixes[i]); + /* no auto-attach case of, e.g., SEC("raw_tp") */ + if (prog->sec_name[pfx_len] == '\0') + return 0; + + if (prog->sec_name[pfx_len] != '/') + continue; + + tp_name = prog->sec_name + pfx_len + 1; + break; } + if (!tp_name) { pr_warn("prog '%s': invalid section name '%s'\n", prog->name, prog->sec_name); @@ -11401,16 +12541,21 @@ static int attach_raw_tp(const struct bpf_program *prog, long cookie, struct bpf } *link = bpf_program__attach_raw_tracepoint(prog, tp_name); - return libbpf_get_error(link); + return libbpf_get_error(*link); } /* Common logic for all BPF program types that attach to a btf_id */ -static struct bpf_link *bpf_program__attach_btf_id(const struct bpf_program *prog) +static struct bpf_link *bpf_program__attach_btf_id(const struct bpf_program *prog, + const struct bpf_trace_opts *opts) { + LIBBPF_OPTS(bpf_link_create_opts, link_opts); char errmsg[STRERR_BUFSIZE]; struct bpf_link *link; int prog_fd, pfd; + if (!OPTS_VALID(opts, bpf_trace_opts)) + return libbpf_err_ptr(-EINVAL); + prog_fd = bpf_program__fd(prog); if (prog_fd < 0) { pr_warn("prog '%s': can't attach before loaded\n", prog->name); @@ -11423,7 +12568,8 @@ static struct bpf_link *bpf_program__attach_btf_id(const struct bpf_program *pro link->detach = &bpf_link__detach_fd; /* libbpf is smart enough to redirect to BPF_RAW_TRACEPOINT_OPEN on old kernels */ - pfd = bpf_link_create(prog_fd, 0, bpf_program__expected_attach_type(prog), NULL); + link_opts.tracing.cookie = OPTS_GET(opts, cookie, 0); + pfd = bpf_link_create(prog_fd, 0, bpf_program__expected_attach_type(prog), &link_opts); if (pfd < 0) { pfd = -errno; free(link); @@ -11437,12 +12583,18 @@ static struct bpf_link *bpf_program__attach_btf_id(const struct bpf_program *pro struct bpf_link *bpf_program__attach_trace(const struct bpf_program *prog) { - return bpf_program__attach_btf_id(prog); + return bpf_program__attach_btf_id(prog, NULL); +} + +struct bpf_link *bpf_program__attach_trace_opts(const struct bpf_program *prog, + const struct bpf_trace_opts *opts) +{ + return bpf_program__attach_btf_id(prog, opts); } struct bpf_link *bpf_program__attach_lsm(const struct bpf_program *prog) { - return bpf_program__attach_btf_id(prog); + return bpf_program__attach_btf_id(prog, NULL); } static int attach_trace(const struct bpf_program *prog, long cookie, struct bpf_link **link) @@ -11458,11 +12610,10 @@ static int attach_lsm(const struct bpf_program *prog, long cookie, struct bpf_li } static struct bpf_link * -bpf_program__attach_fd(const struct bpf_program *prog, int target_fd, int btf_id, - const char *target_name) +bpf_program_attach_fd(const struct bpf_program *prog, + int target_fd, const char *target_name, + const struct bpf_link_create_opts *opts) { - DECLARE_LIBBPF_OPTS(bpf_link_create_opts, opts, - .target_btf_id = btf_id); enum bpf_attach_type attach_type; char errmsg[STRERR_BUFSIZE]; struct bpf_link *link; @@ -11480,7 +12631,7 @@ bpf_program__attach_fd(const struct bpf_program *prog, int target_fd, int btf_id link->detach = &bpf_link__detach_fd; attach_type = bpf_program__expected_attach_type(prog); - link_fd = bpf_link_create(prog_fd, target_fd, attach_type, &opts); + link_fd = bpf_link_create(prog_fd, target_fd, attach_type, opts); if (link_fd < 0) { link_fd = -errno; free(link); @@ -11489,26 +12640,101 @@ bpf_program__attach_fd(const struct bpf_program *prog, int target_fd, int btf_id libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg))); return libbpf_err_ptr(link_fd); } - link->fd = link_fd; - return link; -} + link->fd = link_fd; + return link; +} + +struct bpf_link * +bpf_program__attach_cgroup(const struct bpf_program *prog, int cgroup_fd) +{ + return bpf_program_attach_fd(prog, cgroup_fd, "cgroup", NULL); +} + +struct bpf_link * +bpf_program__attach_netns(const struct bpf_program *prog, int netns_fd) +{ + return bpf_program_attach_fd(prog, netns_fd, "netns", NULL); +} + +struct bpf_link * +bpf_program__attach_sockmap(const struct bpf_program *prog, int map_fd) +{ + return bpf_program_attach_fd(prog, map_fd, "sockmap", NULL); +} + +struct bpf_link *bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex) +{ + /* target_fd/target_ifindex use the same field in LINK_CREATE */ + return bpf_program_attach_fd(prog, ifindex, "xdp", NULL); +} + +struct bpf_link * +bpf_program__attach_tcx(const struct bpf_program *prog, int ifindex, + const struct bpf_tcx_opts *opts) +{ + LIBBPF_OPTS(bpf_link_create_opts, link_create_opts); + __u32 relative_id; + int relative_fd; + + if (!OPTS_VALID(opts, bpf_tcx_opts)) + return libbpf_err_ptr(-EINVAL); + + relative_id = OPTS_GET(opts, relative_id, 0); + relative_fd = OPTS_GET(opts, relative_fd, 0); + + /* validate we don't have unexpected combinations of non-zero fields */ + if (!ifindex) { + pr_warn("prog '%s': target netdevice ifindex cannot be zero\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + if (relative_fd && relative_id) { + pr_warn("prog '%s': relative_fd and relative_id cannot be set at the same time\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } -struct bpf_link * -bpf_program__attach_cgroup(const struct bpf_program *prog, int cgroup_fd) -{ - return bpf_program__attach_fd(prog, cgroup_fd, 0, "cgroup"); + link_create_opts.tcx.expected_revision = OPTS_GET(opts, expected_revision, 0); + link_create_opts.tcx.relative_fd = relative_fd; + link_create_opts.tcx.relative_id = relative_id; + link_create_opts.flags = OPTS_GET(opts, flags, 0); + + /* target_fd/target_ifindex use the same field in LINK_CREATE */ + return bpf_program_attach_fd(prog, ifindex, "tcx", &link_create_opts); } struct bpf_link * -bpf_program__attach_netns(const struct bpf_program *prog, int netns_fd) +bpf_program__attach_netkit(const struct bpf_program *prog, int ifindex, + const struct bpf_netkit_opts *opts) { - return bpf_program__attach_fd(prog, netns_fd, 0, "netns"); -} + LIBBPF_OPTS(bpf_link_create_opts, link_create_opts); + __u32 relative_id; + int relative_fd; -struct bpf_link *bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex) -{ - /* target_fd/target_ifindex use the same field in LINK_CREATE */ - return bpf_program__attach_fd(prog, ifindex, 0, "xdp"); + if (!OPTS_VALID(opts, bpf_netkit_opts)) + return libbpf_err_ptr(-EINVAL); + + relative_id = OPTS_GET(opts, relative_id, 0); + relative_fd = OPTS_GET(opts, relative_fd, 0); + + /* validate we don't have unexpected combinations of non-zero fields */ + if (!ifindex) { + pr_warn("prog '%s': target netdevice ifindex cannot be zero\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + if (relative_fd && relative_id) { + pr_warn("prog '%s': relative_fd and relative_id cannot be set at the same time\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + + link_create_opts.netkit.expected_revision = OPTS_GET(opts, expected_revision, 0); + link_create_opts.netkit.relative_fd = relative_fd; + link_create_opts.netkit.relative_id = relative_id; + link_create_opts.flags = OPTS_GET(opts, flags, 0); + + return bpf_program_attach_fd(prog, ifindex, "netkit", &link_create_opts); } struct bpf_link *bpf_program__attach_freplace(const struct bpf_program *prog, @@ -11530,11 +12756,16 @@ struct bpf_link *bpf_program__attach_freplace(const struct bpf_program *prog, } if (target_fd) { + LIBBPF_OPTS(bpf_link_create_opts, target_opts); + btf_id = libbpf_find_prog_btf_id(attach_func_name, target_fd); if (btf_id < 0) return libbpf_err_ptr(btf_id); - return bpf_program__attach_fd(prog, target_fd, btf_id, "freplace"); + target_opts.target_btf_id = btf_id; + + return bpf_program_attach_fd(prog, target_fd, "freplace", + &target_opts); } else { /* no target, so use raw_tracepoint_open for compatibility * with old kernels @@ -11589,6 +12820,48 @@ static int attach_iter(const struct bpf_program *prog, long cookie, struct bpf_l return libbpf_get_error(*link); } +struct bpf_link *bpf_program__attach_netfilter(const struct bpf_program *prog, + const struct bpf_netfilter_opts *opts) +{ + LIBBPF_OPTS(bpf_link_create_opts, lopts); + struct bpf_link *link; + int prog_fd, link_fd; + + if (!OPTS_VALID(opts, bpf_netfilter_opts)) + return libbpf_err_ptr(-EINVAL); + + prog_fd = bpf_program__fd(prog); + if (prog_fd < 0) { + pr_warn("prog '%s': can't attach before loaded\n", prog->name); + return libbpf_err_ptr(-EINVAL); + } + + link = calloc(1, sizeof(*link)); + if (!link) + return libbpf_err_ptr(-ENOMEM); + + link->detach = &bpf_link__detach_fd; + + lopts.netfilter.pf = OPTS_GET(opts, pf, 0); + lopts.netfilter.hooknum = OPTS_GET(opts, hooknum, 0); + lopts.netfilter.priority = OPTS_GET(opts, priority, 0); + lopts.netfilter.flags = OPTS_GET(opts, flags, 0); + + link_fd = bpf_link_create(prog_fd, 0, BPF_NETFILTER, &lopts); + if (link_fd < 0) { + char errmsg[STRERR_BUFSIZE]; + + link_fd = -errno; + free(link); + pr_warn("prog '%s': failed to attach to netfilter: %s\n", + prog->name, libbpf_strerror_r(link_fd, errmsg, sizeof(errmsg))); + return libbpf_err_ptr(link_fd); + } + link->fd = link_fd; + + return link; +} + struct bpf_link *bpf_program__attach(const struct bpf_program *prog) { struct bpf_link *link = NULL; @@ -11597,6 +12870,12 @@ struct bpf_link *bpf_program__attach(const struct bpf_program *prog) if (!prog->sec_def || !prog->sec_def->prog_attach_fn) return libbpf_err_ptr(-EOPNOTSUPP); + if (bpf_program__fd(prog) < 0) { + pr_warn("prog '%s': can't attach BPF program without FD (was it loaded?)\n", + prog->name); + return libbpf_err_ptr(-EINVAL); + } + err = prog->sec_def->prog_attach_fn(prog, prog->sec_def->cookie, &link); if (err) return libbpf_err_ptr(err); @@ -11612,57 +12891,121 @@ struct bpf_link *bpf_program__attach(const struct bpf_program *prog) return link; } +struct bpf_link_struct_ops { + struct bpf_link link; + int map_fd; +}; + static int bpf_link__detach_struct_ops(struct bpf_link *link) { + struct bpf_link_struct_ops *st_link; __u32 zero = 0; - if (bpf_map_delete_elem(link->fd, &zero)) - return -errno; + st_link = container_of(link, struct bpf_link_struct_ops, link); - return 0; + if (st_link->map_fd < 0) + /* w/o a real link */ + return bpf_map_delete_elem(link->fd, &zero); + + return close(link->fd); } struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map) { - struct bpf_struct_ops *st_ops; - struct bpf_link *link; - __u32 i, zero = 0; - int err; + struct bpf_link_struct_ops *link; + __u32 zero = 0; + int err, fd; + + if (!bpf_map__is_struct_ops(map)) { + pr_warn("map '%s': can't attach non-struct_ops map\n", map->name); + return libbpf_err_ptr(-EINVAL); + } - if (!bpf_map__is_struct_ops(map) || map->fd == -1) + if (map->fd < 0) { + pr_warn("map '%s': can't attach BPF map without FD (was it created?)\n", map->name); return libbpf_err_ptr(-EINVAL); + } link = calloc(1, sizeof(*link)); if (!link) return libbpf_err_ptr(-EINVAL); - st_ops = map->st_ops; - for (i = 0; i < btf_vlen(st_ops->type); i++) { - struct bpf_program *prog = st_ops->progs[i]; - void *kern_data; - int prog_fd; + /* kern_vdata should be prepared during the loading phase. */ + err = bpf_map_update_elem(map->fd, &zero, map->st_ops->kern_vdata, 0); + /* It can be EBUSY if the map has been used to create or + * update a link before. We don't allow updating the value of + * a struct_ops once it is set. That ensures that the value + * never changed. So, it is safe to skip EBUSY. + */ + if (err && (!(map->def.map_flags & BPF_F_LINK) || err != -EBUSY)) { + free(link); + return libbpf_err_ptr(err); + } - if (!prog) - continue; + link->link.detach = bpf_link__detach_struct_ops; - prog_fd = bpf_program__fd(prog); - kern_data = st_ops->kern_vdata + st_ops->kern_func_off[i]; - *(unsigned long *)kern_data = prog_fd; + if (!(map->def.map_flags & BPF_F_LINK)) { + /* w/o a real link */ + link->link.fd = map->fd; + link->map_fd = -1; + return &link->link; } - err = bpf_map_update_elem(map->fd, &zero, st_ops->kern_vdata, 0); - if (err) { - err = -errno; + fd = bpf_link_create(map->fd, 0, BPF_STRUCT_OPS, NULL); + if (fd < 0) { free(link); - return libbpf_err_ptr(err); + return libbpf_err_ptr(fd); } - link->detach = bpf_link__detach_struct_ops; - link->fd = map->fd; + link->link.fd = fd; + link->map_fd = map->fd; - return link; + return &link->link; +} + +/* + * Swap the back struct_ops of a link with a new struct_ops map. + */ +int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map) +{ + struct bpf_link_struct_ops *st_ops_link; + __u32 zero = 0; + int err; + + if (!bpf_map__is_struct_ops(map)) + return -EINVAL; + + if (map->fd < 0) { + pr_warn("map '%s': can't use BPF map without FD (was it created?)\n", map->name); + return -EINVAL; + } + + st_ops_link = container_of(link, struct bpf_link_struct_ops, link); + /* Ensure the type of a link is correct */ + if (st_ops_link->map_fd < 0) + return -EINVAL; + + err = bpf_map_update_elem(map->fd, &zero, map->st_ops->kern_vdata, 0); + /* It can be EBUSY if the map has been used to create or + * update a link before. We don't allow updating the value of + * a struct_ops once it is set. That ensures that the value + * never changed. So, it is safe to skip EBUSY. + */ + if (err && err != -EBUSY) + return err; + + err = bpf_link_update(link->fd, map->fd, NULL); + if (err < 0) + return err; + + st_ops_link->map_fd = map->fd; + + return 0; } +typedef enum bpf_perf_event_ret (*bpf_perf_event_print_t)(struct perf_event_header *hdr, + void *private_data); + static enum bpf_perf_event_ret perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, void **copy_mem, size_t *copy_size, @@ -11711,12 +13054,6 @@ perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, return libbpf_err(ret); } -__attribute__((alias("perf_event_read_simple"))) -enum bpf_perf_event_ret -bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, - void **copy_mem, size_t *copy_size, - bpf_perf_event_print_t fn, void *private_data); - struct perf_buffer; struct perf_buffer_params { @@ -11850,24 +13187,31 @@ perf_buffer__open_cpu_buf(struct perf_buffer *pb, struct perf_event_attr *attr, static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt, struct perf_buffer_params *p); -DEFAULT_VERSION(perf_buffer__new_v0_6_0, perf_buffer__new, LIBBPF_0.6.0) -struct perf_buffer *perf_buffer__new_v0_6_0(int map_fd, size_t page_cnt, - perf_buffer_sample_fn sample_cb, - perf_buffer_lost_fn lost_cb, - void *ctx, - const struct perf_buffer_opts *opts) +struct perf_buffer *perf_buffer__new(int map_fd, size_t page_cnt, + perf_buffer_sample_fn sample_cb, + perf_buffer_lost_fn lost_cb, + void *ctx, + const struct perf_buffer_opts *opts) { + const size_t attr_sz = sizeof(struct perf_event_attr); struct perf_buffer_params p = {}; - struct perf_event_attr attr = {}; + struct perf_event_attr attr; + __u32 sample_period; if (!OPTS_VALID(opts, perf_buffer_opts)) return libbpf_err_ptr(-EINVAL); + sample_period = OPTS_GET(opts, sample_period, 1); + if (!sample_period) + sample_period = 1; + + memset(&attr, 0, attr_sz); + attr.size = attr_sz; attr.config = PERF_COUNT_SW_BPF_OUTPUT; attr.type = PERF_TYPE_SOFTWARE; attr.sample_type = PERF_SAMPLE_RAW; - attr.sample_period = 1; - attr.wakeup_events = 1; + attr.sample_period = sample_period; + attr.wakeup_events = sample_period; p.attr = &attr; p.sample_cb = sample_cb; @@ -11877,22 +13221,10 @@ struct perf_buffer *perf_buffer__new_v0_6_0(int map_fd, size_t page_cnt, return libbpf_ptr(__perf_buffer__new(map_fd, page_cnt, &p)); } -COMPAT_VERSION(perf_buffer__new_deprecated, perf_buffer__new, LIBBPF_0.0.4) -struct perf_buffer *perf_buffer__new_deprecated(int map_fd, size_t page_cnt, - const struct perf_buffer_opts *opts) -{ - return perf_buffer__new_v0_6_0(map_fd, page_cnt, - opts ? opts->sample_cb : NULL, - opts ? opts->lost_cb : NULL, - opts ? opts->ctx : NULL, - NULL); -} - -DEFAULT_VERSION(perf_buffer__new_raw_v0_6_0, perf_buffer__new_raw, LIBBPF_0.6.0) -struct perf_buffer *perf_buffer__new_raw_v0_6_0(int map_fd, size_t page_cnt, - struct perf_event_attr *attr, - perf_buffer_event_fn event_cb, void *ctx, - const struct perf_buffer_raw_opts *opts) +struct perf_buffer *perf_buffer__new_raw(int map_fd, size_t page_cnt, + struct perf_event_attr *attr, + perf_buffer_event_fn event_cb, void *ctx, + const struct perf_buffer_raw_opts *opts) { struct perf_buffer_params p = {}; @@ -11912,20 +13244,6 @@ struct perf_buffer *perf_buffer__new_raw_v0_6_0(int map_fd, size_t page_cnt, return libbpf_ptr(__perf_buffer__new(map_fd, page_cnt, &p)); } -COMPAT_VERSION(perf_buffer__new_raw_deprecated, perf_buffer__new_raw, LIBBPF_0.0.4) -struct perf_buffer *perf_buffer__new_raw_deprecated(int map_fd, size_t page_cnt, - const struct perf_buffer_raw_opts *opts) -{ - LIBBPF_OPTS(perf_buffer_raw_opts, inner_opts, - .cpu_cnt = opts->cpu_cnt, - .cpus = opts->cpus, - .map_keys = opts->map_keys, - ); - - return perf_buffer__new_raw_v0_6_0(map_fd, page_cnt, opts->attr, - opts->event_cb, opts->ctx, &inner_opts); -} - static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt, struct perf_buffer_params *p) { @@ -11946,7 +13264,7 @@ static struct perf_buffer *__perf_buffer__new(int map_fd, size_t page_cnt, /* best-effort sanity checks */ memset(&map, 0, sizeof(map)); map_info_len = sizeof(map); - err = bpf_obj_get_info_by_fd(map_fd, &map, &map_info_len); + err = bpf_map_get_info_by_fd(map_fd, &map, &map_info_len); if (err) { err = -errno; /* if BPF_OBJ_GET_INFO_BY_FD is supported, will return @@ -12186,6 +13504,22 @@ int perf_buffer__buffer_fd(const struct perf_buffer *pb, size_t buf_idx) return cpu_buf->fd; } +int perf_buffer__buffer(struct perf_buffer *pb, int buf_idx, void **buf, size_t *buf_size) +{ + struct perf_cpu_buf *cpu_buf; + + if (buf_idx >= pb->cpu_cnt) + return libbpf_err(-EINVAL); + + cpu_buf = pb->cpu_bufs[buf_idx]; + if (!cpu_buf) + return libbpf_err(-ENOENT); + + *buf = cpu_buf->base; + *buf_size = pb->mmap_size; + return 0; +} + /* * Consume data from perf ring buffer corresponding to slot *buf_idx* in * PERF_EVENT_ARRAY BPF map without waiting/polling. If there is no data to @@ -12227,254 +13561,6 @@ int perf_buffer__consume(struct perf_buffer *pb) return 0; } -struct bpf_prog_info_array_desc { - int array_offset; /* e.g. offset of jited_prog_insns */ - int count_offset; /* e.g. offset of jited_prog_len */ - int size_offset; /* > 0: offset of rec size, - * < 0: fix size of -size_offset - */ -}; - -static struct bpf_prog_info_array_desc bpf_prog_info_array_desc[] = { - [BPF_PROG_INFO_JITED_INSNS] = { - offsetof(struct bpf_prog_info, jited_prog_insns), - offsetof(struct bpf_prog_info, jited_prog_len), - -1, - }, - [BPF_PROG_INFO_XLATED_INSNS] = { - offsetof(struct bpf_prog_info, xlated_prog_insns), - offsetof(struct bpf_prog_info, xlated_prog_len), - -1, - }, - [BPF_PROG_INFO_MAP_IDS] = { - offsetof(struct bpf_prog_info, map_ids), - offsetof(struct bpf_prog_info, nr_map_ids), - -(int)sizeof(__u32), - }, - [BPF_PROG_INFO_JITED_KSYMS] = { - offsetof(struct bpf_prog_info, jited_ksyms), - offsetof(struct bpf_prog_info, nr_jited_ksyms), - -(int)sizeof(__u64), - }, - [BPF_PROG_INFO_JITED_FUNC_LENS] = { - offsetof(struct bpf_prog_info, jited_func_lens), - offsetof(struct bpf_prog_info, nr_jited_func_lens), - -(int)sizeof(__u32), - }, - [BPF_PROG_INFO_FUNC_INFO] = { - offsetof(struct bpf_prog_info, func_info), - offsetof(struct bpf_prog_info, nr_func_info), - offsetof(struct bpf_prog_info, func_info_rec_size), - }, - [BPF_PROG_INFO_LINE_INFO] = { - offsetof(struct bpf_prog_info, line_info), - offsetof(struct bpf_prog_info, nr_line_info), - offsetof(struct bpf_prog_info, line_info_rec_size), - }, - [BPF_PROG_INFO_JITED_LINE_INFO] = { - offsetof(struct bpf_prog_info, jited_line_info), - offsetof(struct bpf_prog_info, nr_jited_line_info), - offsetof(struct bpf_prog_info, jited_line_info_rec_size), - }, - [BPF_PROG_INFO_PROG_TAGS] = { - offsetof(struct bpf_prog_info, prog_tags), - offsetof(struct bpf_prog_info, nr_prog_tags), - -(int)sizeof(__u8) * BPF_TAG_SIZE, - }, - -}; - -static __u32 bpf_prog_info_read_offset_u32(struct bpf_prog_info *info, - int offset) -{ - __u32 *array = (__u32 *)info; - - if (offset >= 0) - return array[offset / sizeof(__u32)]; - return -(int)offset; -} - -static __u64 bpf_prog_info_read_offset_u64(struct bpf_prog_info *info, - int offset) -{ - __u64 *array = (__u64 *)info; - - if (offset >= 0) - return array[offset / sizeof(__u64)]; - return -(int)offset; -} - -static void bpf_prog_info_set_offset_u32(struct bpf_prog_info *info, int offset, - __u32 val) -{ - __u32 *array = (__u32 *)info; - - if (offset >= 0) - array[offset / sizeof(__u32)] = val; -} - -static void bpf_prog_info_set_offset_u64(struct bpf_prog_info *info, int offset, - __u64 val) -{ - __u64 *array = (__u64 *)info; - - if (offset >= 0) - array[offset / sizeof(__u64)] = val; -} - -struct bpf_prog_info_linear * -bpf_program__get_prog_info_linear(int fd, __u64 arrays) -{ - struct bpf_prog_info_linear *info_linear; - struct bpf_prog_info info = {}; - __u32 info_len = sizeof(info); - __u32 data_len = 0; - int i, err; - void *ptr; - - if (arrays >> BPF_PROG_INFO_LAST_ARRAY) - return libbpf_err_ptr(-EINVAL); - - /* step 1: get array dimensions */ - err = bpf_obj_get_info_by_fd(fd, &info, &info_len); - if (err) { - pr_debug("can't get prog info: %s", strerror(errno)); - return libbpf_err_ptr(-EFAULT); - } - - /* step 2: calculate total size of all arrays */ - for (i = BPF_PROG_INFO_FIRST_ARRAY; i < BPF_PROG_INFO_LAST_ARRAY; ++i) { - bool include_array = (arrays & (1UL << i)) > 0; - struct bpf_prog_info_array_desc *desc; - __u32 count, size; - - desc = bpf_prog_info_array_desc + i; - - /* kernel is too old to support this field */ - if (info_len < desc->array_offset + sizeof(__u32) || - info_len < desc->count_offset + sizeof(__u32) || - (desc->size_offset > 0 && info_len < desc->size_offset)) - include_array = false; - - if (!include_array) { - arrays &= ~(1UL << i); /* clear the bit */ - continue; - } - - count = bpf_prog_info_read_offset_u32(&info, desc->count_offset); - size = bpf_prog_info_read_offset_u32(&info, desc->size_offset); - - data_len += count * size; - } - - /* step 3: allocate continuous memory */ - data_len = roundup(data_len, sizeof(__u64)); - info_linear = malloc(sizeof(struct bpf_prog_info_linear) + data_len); - if (!info_linear) - return libbpf_err_ptr(-ENOMEM); - - /* step 4: fill data to info_linear->info */ - info_linear->arrays = arrays; - memset(&info_linear->info, 0, sizeof(info)); - ptr = info_linear->data; - - for (i = BPF_PROG_INFO_FIRST_ARRAY; i < BPF_PROG_INFO_LAST_ARRAY; ++i) { - struct bpf_prog_info_array_desc *desc; - __u32 count, size; - - if ((arrays & (1UL << i)) == 0) - continue; - - desc = bpf_prog_info_array_desc + i; - count = bpf_prog_info_read_offset_u32(&info, desc->count_offset); - size = bpf_prog_info_read_offset_u32(&info, desc->size_offset); - bpf_prog_info_set_offset_u32(&info_linear->info, - desc->count_offset, count); - bpf_prog_info_set_offset_u32(&info_linear->info, - desc->size_offset, size); - bpf_prog_info_set_offset_u64(&info_linear->info, - desc->array_offset, - ptr_to_u64(ptr)); - ptr += count * size; - } - - /* step 5: call syscall again to get required arrays */ - err = bpf_obj_get_info_by_fd(fd, &info_linear->info, &info_len); - if (err) { - pr_debug("can't get prog info: %s", strerror(errno)); - free(info_linear); - return libbpf_err_ptr(-EFAULT); - } - - /* step 6: verify the data */ - for (i = BPF_PROG_INFO_FIRST_ARRAY; i < BPF_PROG_INFO_LAST_ARRAY; ++i) { - struct bpf_prog_info_array_desc *desc; - __u32 v1, v2; - - if ((arrays & (1UL << i)) == 0) - continue; - - desc = bpf_prog_info_array_desc + i; - v1 = bpf_prog_info_read_offset_u32(&info, desc->count_offset); - v2 = bpf_prog_info_read_offset_u32(&info_linear->info, - desc->count_offset); - if (v1 != v2) - pr_warn("%s: mismatch in element count\n", __func__); - - v1 = bpf_prog_info_read_offset_u32(&info, desc->size_offset); - v2 = bpf_prog_info_read_offset_u32(&info_linear->info, - desc->size_offset); - if (v1 != v2) - pr_warn("%s: mismatch in rec size\n", __func__); - } - - /* step 7: update info_len and data_len */ - info_linear->info_len = sizeof(struct bpf_prog_info); - info_linear->data_len = data_len; - - return info_linear; -} - -void bpf_program__bpil_addr_to_offs(struct bpf_prog_info_linear *info_linear) -{ - int i; - - for (i = BPF_PROG_INFO_FIRST_ARRAY; i < BPF_PROG_INFO_LAST_ARRAY; ++i) { - struct bpf_prog_info_array_desc *desc; - __u64 addr, offs; - - if ((info_linear->arrays & (1UL << i)) == 0) - continue; - - desc = bpf_prog_info_array_desc + i; - addr = bpf_prog_info_read_offset_u64(&info_linear->info, - desc->array_offset); - offs = addr - ptr_to_u64(info_linear->data); - bpf_prog_info_set_offset_u64(&info_linear->info, - desc->array_offset, offs); - } -} - -void bpf_program__bpil_offs_to_addr(struct bpf_prog_info_linear *info_linear) -{ - int i; - - for (i = BPF_PROG_INFO_FIRST_ARRAY; i < BPF_PROG_INFO_LAST_ARRAY; ++i) { - struct bpf_prog_info_array_desc *desc; - __u64 addr, offs; - - if ((info_linear->arrays & (1UL << i)) == 0) - continue; - - desc = bpf_prog_info_array_desc + i; - offs = bpf_prog_info_read_offset_u64(&info_linear->info, - desc->array_offset); - addr = offs + ptr_to_u64(info_linear->data); - bpf_prog_info_set_offset_u64(&info_linear->info, - desc->array_offset, addr); - } -} - int bpf_program__set_attach_target(struct bpf_program *prog, int attach_prog_fd, const char *attach_func_name) @@ -12626,14 +13712,15 @@ int libbpf_num_possible_cpus(void) static int populate_skeleton_maps(const struct bpf_object *obj, struct bpf_map_skeleton *maps, - size_t map_cnt) + size_t map_cnt, size_t map_skel_sz) { int i; for (i = 0; i < map_cnt; i++) { - struct bpf_map **map = maps[i].map; - const char *name = maps[i].name; - void **mmaped = maps[i].mmaped; + struct bpf_map_skeleton *map_skel = (void *)maps + i * map_skel_sz; + struct bpf_map **map = map_skel->map; + const char *name = map_skel->name; + void **mmaped = map_skel->mmaped; *map = bpf_object__find_map_by_name(obj, name); if (!*map) { @@ -12650,13 +13737,14 @@ static int populate_skeleton_maps(const struct bpf_object *obj, static int populate_skeleton_progs(const struct bpf_object *obj, struct bpf_prog_skeleton *progs, - size_t prog_cnt) + size_t prog_cnt, size_t prog_skel_sz) { int i; for (i = 0; i < prog_cnt; i++) { - struct bpf_program **prog = progs[i].prog; - const char *name = progs[i].name; + struct bpf_prog_skeleton *prog_skel = (void *)progs + i * prog_skel_sz; + struct bpf_program **prog = prog_skel->prog; + const char *name = prog_skel->name; *prog = bpf_object__find_program_by_name(obj, name); if (!*prog) { @@ -12697,13 +13785,13 @@ int bpf_object__open_skeleton(struct bpf_object_skeleton *s, } *s->obj = obj; - err = populate_skeleton_maps(obj, s->maps, s->map_cnt); + err = populate_skeleton_maps(obj, s->maps, s->map_cnt, s->map_skel_sz); if (err) { pr_warn("failed to populate skeleton maps for '%s': %d\n", s->name, err); return libbpf_err(err); } - err = populate_skeleton_progs(obj, s->progs, s->prog_cnt); + err = populate_skeleton_progs(obj, s->progs, s->prog_cnt, s->prog_skel_sz); if (err) { pr_warn("failed to populate skeleton progs for '%s': %d\n", s->name, err); return libbpf_err(err); @@ -12729,24 +13817,24 @@ int bpf_object__open_subskeleton(struct bpf_object_subskeleton *s) btf = bpf_object__btf(s->obj); if (!btf) { pr_warn("subskeletons require BTF at runtime (object %s)\n", - bpf_object__name(s->obj)); + bpf_object__name(s->obj)); return libbpf_err(-errno); } - err = populate_skeleton_maps(s->obj, s->maps, s->map_cnt); + err = populate_skeleton_maps(s->obj, s->maps, s->map_cnt, s->map_skel_sz); if (err) { pr_warn("failed to populate subskeleton maps: %d\n", err); return libbpf_err(err); } - err = populate_skeleton_progs(s->obj, s->progs, s->prog_cnt); + err = populate_skeleton_progs(s->obj, s->progs, s->prog_cnt, s->prog_skel_sz); if (err) { pr_warn("failed to populate subskeleton maps: %d\n", err); return libbpf_err(err); } for (var_idx = 0; var_idx < s->var_cnt; var_idx++) { - var_skel = &s->vars[var_idx]; + var_skel = (void *)s->vars + var_idx * s->var_skel_sz; map = *var_skel->map; map_type_id = bpf_map__btf_value_type_id(map); map_type = btf__type_by_id(btf, map_type_id); @@ -12793,10 +13881,11 @@ int bpf_object__load_skeleton(struct bpf_object_skeleton *s) } for (i = 0; i < s->map_cnt; i++) { - struct bpf_map *map = *s->maps[i].map; + struct bpf_map_skeleton *map_skel = (void *)s->maps + i * s->map_skel_sz; + struct bpf_map *map = *map_skel->map; size_t mmap_sz = bpf_map_mmap_sz(map); - int prot, map_fd = bpf_map__fd(map); - void **mmaped = s->maps[i].mmaped; + int prot, map_fd = map->fd; + void **mmaped = map_skel->mmaped; if (!mmaped) continue; @@ -12806,6 +13895,11 @@ int bpf_object__load_skeleton(struct bpf_object_skeleton *s) continue; } + if (map->def.type == BPF_MAP_TYPE_ARENA) { + *mmaped = map->mmaped; + continue; + } + if (map->def.map_flags & BPF_F_RDONLY_PROG) prot = PROT_READ; else @@ -12821,8 +13915,7 @@ int bpf_object__load_skeleton(struct bpf_object_skeleton *s) * as per normal clean up procedure, so we don't need to worry * about it from skeleton's clean up perspective. */ - *mmaped = mmap(map->mmaped, mmap_sz, prot, - MAP_SHARED | MAP_FIXED, map_fd, 0); + *mmaped = mmap(map->mmaped, mmap_sz, prot, MAP_SHARED | MAP_FIXED, map_fd, 0); if (*mmaped == MAP_FAILED) { err = -errno; *mmaped = NULL; @@ -12840,10 +13933,11 @@ int bpf_object__attach_skeleton(struct bpf_object_skeleton *s) int i, err; for (i = 0; i < s->prog_cnt; i++) { - struct bpf_program *prog = *s->progs[i].prog; - struct bpf_link **link = s->progs[i].link; + struct bpf_prog_skeleton *prog_skel = (void *)s->progs + i * s->prog_skel_sz; + struct bpf_program *prog = *prog_skel->prog; + struct bpf_link **link = prog_skel->link; - if (!prog->autoload) + if (!prog->autoload || !prog->autoattach) continue; /* auto-attaching not supported for this program */ @@ -12873,6 +13967,38 @@ int bpf_object__attach_skeleton(struct bpf_object_skeleton *s) */ } + + for (i = 0; i < s->map_cnt; i++) { + struct bpf_map_skeleton *map_skel = (void *)s->maps + i * s->map_skel_sz; + struct bpf_map *map = *map_skel->map; + struct bpf_link **link; + + if (!map->autocreate || !map->autoattach) + continue; + + /* only struct_ops maps can be attached */ + if (!bpf_map__is_struct_ops(map)) + continue; + + /* skeleton is created with earlier version of bpftool, notify user */ + if (s->map_skel_sz < offsetofend(struct bpf_map_skeleton, link)) { + pr_warn("map '%s': BPF skeleton version is old, skipping map auto-attachment...\n", + bpf_map__name(map)); + continue; + } + + link = map_skel->link; + if (*link) + continue; + + *link = bpf_map__attach_struct_ops(map); + if (!*link) { + err = -errno; + pr_warn("map '%s': failed to auto-attach: %d\n", bpf_map__name(map), err); + return libbpf_err(err); + } + } + return 0; } @@ -12881,11 +14007,25 @@ void bpf_object__detach_skeleton(struct bpf_object_skeleton *s) int i; for (i = 0; i < s->prog_cnt; i++) { - struct bpf_link **link = s->progs[i].link; + struct bpf_prog_skeleton *prog_skel = (void *)s->progs + i * s->prog_skel_sz; + struct bpf_link **link = prog_skel->link; bpf_link__destroy(*link); *link = NULL; } + + if (s->map_skel_sz < sizeof(struct bpf_map_skeleton)) + return; + + for (i = 0; i < s->map_cnt; i++) { + struct bpf_map_skeleton *map_skel = (void *)s->maps + i * s->map_skel_sz; + struct bpf_link **link = map_skel->link; + + if (link) { + bpf_link__destroy(*link); + *link = NULL; + } + } } void bpf_object__destroy_skeleton(struct bpf_object_skeleton *s) @@ -12893,8 +14033,7 @@ void bpf_object__destroy_skeleton(struct bpf_object_skeleton *s) if (!s) return; - if (s->progs) - bpf_object__detach_skeleton(s); + bpf_object__detach_skeleton(s); if (s->obj) bpf_object__close(*s->obj); free(s->maps); diff --git a/pkg/plugin/lib/common/libbpf/_src/libbpf.h b/pkg/plugin/lib/common/libbpf/_src/libbpf.h index cdbfee60ea..64a6a3d323 100644 --- a/pkg/plugin/lib/common/libbpf/_src/libbpf.h +++ b/pkg/plugin/lib/common/libbpf/_src/libbpf.h @@ -51,6 +51,42 @@ enum libbpf_errno { LIBBPF_API int libbpf_strerror(int err, char *buf, size_t size); +/** + * @brief **libbpf_bpf_attach_type_str()** converts the provided attach type + * value into a textual representation. + * @param t The attach type. + * @return Pointer to a static string identifying the attach type. NULL is + * returned for unknown **bpf_attach_type** values. + */ +LIBBPF_API const char *libbpf_bpf_attach_type_str(enum bpf_attach_type t); + +/** + * @brief **libbpf_bpf_link_type_str()** converts the provided link type value + * into a textual representation. + * @param t The link type. + * @return Pointer to a static string identifying the link type. NULL is + * returned for unknown **bpf_link_type** values. + */ +LIBBPF_API const char *libbpf_bpf_link_type_str(enum bpf_link_type t); + +/** + * @brief **libbpf_bpf_map_type_str()** converts the provided map type value + * into a textual representation. + * @param t The map type. + * @return Pointer to a static string identifying the map type. NULL is + * returned for unknown **bpf_map_type** values. + */ +LIBBPF_API const char *libbpf_bpf_map_type_str(enum bpf_map_type t); + +/** + * @brief **libbpf_bpf_prog_type_str()** converts the provided program type + * value into a textual representation. + * @param t The program type. + * @return Pointer to a static string identifying the program type. NULL is + * returned for unknown **bpf_prog_type** values. + */ +LIBBPF_API const char *libbpf_bpf_prog_type_str(enum bpf_prog_type t); + enum libbpf_print_level { LIBBPF_WARN, LIBBPF_INFO, @@ -60,18 +96,24 @@ enum libbpf_print_level { typedef int (*libbpf_print_fn_t)(enum libbpf_print_level level, const char *, va_list ap); +/** + * @brief **libbpf_set_print()** sets user-provided log callback function to + * be used for libbpf warnings and informational messages. If the user callback + * is not set, messages are logged to stderr by default. The verbosity of these + * messages can be controlled by setting the environment variable + * LIBBPF_LOG_LEVEL to either warn, info, or debug. + * @param fn The log print function. If NULL, libbpf won't print anything. + * @return Pointer to old print function. + * + * This function is thread-safe. + */ LIBBPF_API libbpf_print_fn_t libbpf_set_print(libbpf_print_fn_t fn); /* Hide internal to user */ struct bpf_object; -struct bpf_object_open_attr { - const char *file; - enum bpf_prog_type prog_type; -}; - struct bpf_object_open_opts { - /* size of this struct, for forward/backward compatiblity */ + /* size of this struct, for forward/backward compatibility */ size_t sz; /* object name override, if provided: * - for object open from file, this will override setting object @@ -82,21 +124,14 @@ struct bpf_object_open_opts { const char *object_name; /* parse map definitions non-strictly, allowing extra attributes/data */ bool relaxed_maps; - /* DEPRECATED: handle CO-RE relocations non-strictly, allowing failures. - * Value is ignored. Relocations always are processed non-strictly. - * Non-relocatable instructions are replaced with invalid ones to - * prevent accidental errors. - * */ - LIBBPF_DEPRECATED_SINCE(0, 6, "field has no effect") - bool relaxed_core_relocs; /* maps that set the 'pinning' attribute in their definition will have * their pin_path attribute set to a file in this directory, and be * auto-pinned to that path on load; defaults to "/sys/fs/bpf". */ const char *pin_root_path; - LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_program__set_attach_target() on each individual bpf_program") - __u32 attach_prog_fd; + __u32 :32; /* stub out now removed attach_prog_fd */ + /* Additional kernel config content that augments and overrides * system Kconfig for CONFIG_xxx externs. */ @@ -145,11 +180,38 @@ struct bpf_object_open_opts { * logs through its print callback. */ __u32 kernel_log_level; + /* Path to BPF FS mount point to derive BPF token from. + * + * Created BPF token will be used for all bpf() syscall operations + * that accept BPF token (e.g., map creation, BTF and program loads, + * etc) automatically within instantiated BPF object. + * + * If bpf_token_path is not specified, libbpf will consult + * LIBBPF_BPF_TOKEN_PATH environment variable. If set, it will be + * taken as a value of bpf_token_path option and will force libbpf to + * either create BPF token from provided custom BPF FS path, or will + * disable implicit BPF token creation, if envvar value is an empty + * string. bpf_token_path overrides LIBBPF_BPF_TOKEN_PATH, if both are + * set at the same time. + * + * Setting bpf_token_path option to empty string disables libbpf's + * automatic attempt to create BPF token from default BPF FS mount + * point (/sys/fs/bpf), in case this default behavior is undesirable. + */ + const char *bpf_token_path; size_t :0; }; -#define bpf_object_open_opts__last_field kernel_log_level +#define bpf_object_open_opts__last_field bpf_token_path +/** + * @brief **bpf_object__open()** creates a bpf_object by opening + * the BPF ELF object file pointed to by the passed path and loading it + * into memory. + * @param path BPF object file path. + * @return pointer to the new bpf_object; or NULL is returned on error, + * error code is stored in errno + */ LIBBPF_API struct bpf_object *bpf_object__open(const char *path); /** @@ -179,26 +241,46 @@ LIBBPF_API struct bpf_object * bpf_object__open_mem(const void *obj_buf, size_t obj_buf_sz, const struct bpf_object_open_opts *opts); -/* deprecated bpf_object__open variants */ -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_object__open_mem() instead") -LIBBPF_API struct bpf_object * -bpf_object__open_buffer(const void *obj_buf, size_t obj_buf_sz, - const char *name); -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__open_file() instead") -LIBBPF_API struct bpf_object * -bpf_object__open_xattr(struct bpf_object_open_attr *attr); +/** + * @brief **bpf_object__load()** loads BPF object into kernel. + * @param obj Pointer to a valid BPF object instance returned by + * **bpf_object__open*()** APIs + * @return 0, on success; negative error code, otherwise, error code is + * stored in errno + */ +LIBBPF_API int bpf_object__load(struct bpf_object *obj); -enum libbpf_pin_type { - LIBBPF_PIN_NONE, - /* PIN_BY_NAME: pin maps by name (in /sys/fs/bpf by default) */ - LIBBPF_PIN_BY_NAME, -}; +/** + * @brief **bpf_object__close()** closes a BPF object and releases all + * resources. + * @param obj Pointer to a valid BPF object + */ +LIBBPF_API void bpf_object__close(struct bpf_object *obj); -/* pin_maps and unpin_maps can both be called with a NULL path, in which case - * they will use the pin_path attribute of each map (and ignore all maps that - * don't have a pin_path set). +/** + * @brief **bpf_object__pin_maps()** pins each map contained within + * the BPF object at the passed directory. + * @param obj Pointer to a valid BPF object + * @param path A directory where maps should be pinned. + * @return 0, on success; negative error code, otherwise + * + * If `path` is NULL `bpf_map__pin` (which is being used on each map) + * will use the pin_path attribute of each map. In this case, maps that + * don't have a pin_path set will be ignored. */ LIBBPF_API int bpf_object__pin_maps(struct bpf_object *obj, const char *path); + +/** + * @brief **bpf_object__unpin_maps()** unpins each map contained within + * the BPF object found in the passed directory. + * @param obj Pointer to a valid BPF object + * @param path A directory where pinned maps should be searched for. + * @return 0, on success; negative error code, otherwise + * + * If `path` is NULL `bpf_map__unpin` (which is being used on each map) + * will use the pin_path attribute of each map. In this case, maps that + * don't have a pin_path set will be ignored. + */ LIBBPF_API int bpf_object__unpin_maps(struct bpf_object *obj, const char *path); LIBBPF_API int bpf_object__pin_programs(struct bpf_object *obj, @@ -206,20 +288,7 @@ LIBBPF_API int bpf_object__pin_programs(struct bpf_object *obj, LIBBPF_API int bpf_object__unpin_programs(struct bpf_object *obj, const char *path); LIBBPF_API int bpf_object__pin(struct bpf_object *object, const char *path); -LIBBPF_API void bpf_object__close(struct bpf_object *object); - -struct bpf_object_load_attr { - struct bpf_object *obj; - int log_level; - const char *target_btf_path; -}; - -/* Load/unload object into/from kernel */ -LIBBPF_API int bpf_object__load(struct bpf_object *obj); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_object__load() instead") -LIBBPF_API int bpf_object__load_xattr(struct bpf_object_load_attr *attr); -LIBBPF_DEPRECATED_SINCE(0, 6, "bpf_object__unload() is deprecated, use bpf_object__close() instead") -LIBBPF_API int bpf_object__unload(struct bpf_object *obj); +LIBBPF_API int bpf_object__unpin(struct bpf_object *object, const char *path); LIBBPF_API const char *bpf_object__name(const struct bpf_object *obj); LIBBPF_API unsigned int bpf_object__kversion(const struct bpf_object *obj); @@ -229,29 +298,10 @@ struct btf; LIBBPF_API struct btf *bpf_object__btf(const struct bpf_object *obj); LIBBPF_API int bpf_object__btf_fd(const struct bpf_object *obj); -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__find_program_by_name() instead") -LIBBPF_API struct bpf_program * -bpf_object__find_program_by_title(const struct bpf_object *obj, - const char *title); LIBBPF_API struct bpf_program * bpf_object__find_program_by_name(const struct bpf_object *obj, const char *name); -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "track bpf_objects in application code instead") -struct bpf_object *bpf_object__next(struct bpf_object *prev); -#define bpf_object__for_each_safe(pos, tmp) \ - for ((pos) = bpf_object__next(NULL), \ - (tmp) = bpf_object__next(pos); \ - (pos) != NULL; \ - (pos) = (tmp), (tmp) = bpf_object__next(tmp)) - -typedef void (*bpf_object_clear_priv_t)(struct bpf_object *, void *); -LIBBPF_DEPRECATED_SINCE(0, 7, "storage via set_priv/priv is deprecated") -LIBBPF_API int bpf_object__set_priv(struct bpf_object *obj, void *priv, - bpf_object_clear_priv_t clear_priv); -LIBBPF_DEPRECATED_SINCE(0, 7, "storage via set_priv/priv is deprecated") -LIBBPF_API void *bpf_object__priv(const struct bpf_object *prog); - LIBBPF_API int libbpf_prog_type_by_name(const char *name, enum bpf_prog_type *prog_type, enum bpf_attach_type *expected_attach_type); @@ -262,9 +312,7 @@ LIBBPF_API int libbpf_find_vmlinux_btf_id(const char *name, /* Accessors of bpf_program */ struct bpf_program; -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__next_program() instead") -struct bpf_program *bpf_program__next(struct bpf_program *prog, - const struct bpf_object *obj); + LIBBPF_API struct bpf_program * bpf_object__next_program(const struct bpf_object *obj, struct bpf_program *prog); @@ -273,32 +321,18 @@ bpf_object__next_program(const struct bpf_object *obj, struct bpf_program *prog) (pos) != NULL; \ (pos) = bpf_object__next_program((obj), (pos))) -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__prev_program() instead") -struct bpf_program *bpf_program__prev(struct bpf_program *prog, - const struct bpf_object *obj); LIBBPF_API struct bpf_program * bpf_object__prev_program(const struct bpf_object *obj, struct bpf_program *prog); -typedef void (*bpf_program_clear_priv_t)(struct bpf_program *, void *); - -LIBBPF_DEPRECATED_SINCE(0, 7, "storage via set_priv/priv is deprecated") -LIBBPF_API int bpf_program__set_priv(struct bpf_program *prog, void *priv, - bpf_program_clear_priv_t clear_priv); -LIBBPF_DEPRECATED_SINCE(0, 7, "storage via set_priv/priv is deprecated") -LIBBPF_API void *bpf_program__priv(const struct bpf_program *prog); LIBBPF_API void bpf_program__set_ifindex(struct bpf_program *prog, __u32 ifindex); LIBBPF_API const char *bpf_program__name(const struct bpf_program *prog); LIBBPF_API const char *bpf_program__section_name(const struct bpf_program *prog); -LIBBPF_API LIBBPF_DEPRECATED("BPF program title is confusing term; please use bpf_program__section_name() instead") -const char *bpf_program__title(const struct bpf_program *prog, bool needs_copy); LIBBPF_API bool bpf_program__autoload(const struct bpf_program *prog); LIBBPF_API int bpf_program__set_autoload(struct bpf_program *prog, bool autoload); - -/* returns program size in bytes */ -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_program__insn_cnt() instead") -LIBBPF_API size_t bpf_program__size(const struct bpf_program *prog); +LIBBPF_API bool bpf_program__autoattach(const struct bpf_program *prog); +LIBBPF_API void bpf_program__set_autoattach(struct bpf_program *prog, bool autoattach); struct bpf_insn; @@ -323,6 +357,24 @@ struct bpf_insn; * different. */ LIBBPF_API const struct bpf_insn *bpf_program__insns(const struct bpf_program *prog); + +/** + * @brief **bpf_program__set_insns()** can set BPF program's underlying + * BPF instructions. + * + * WARNING: This is a very advanced libbpf API and users need to know + * what they are doing. This should be used from prog_prepare_load_fn + * callback only. + * + * @param prog BPF program for which to return instructions + * @param new_insns a pointer to an array of BPF instructions + * @param new_insn_cnt number of `struct bpf_insn`'s that form + * specified BPF program + * @return 0, on success; negative error code, otherwise + */ +LIBBPF_API int bpf_program__set_insns(struct bpf_program *prog, + struct bpf_insn *new_insns, size_t new_insn_cnt); + /** * @brief **bpf_program__insn_cnt()** returns number of `struct bpf_insn`'s * that form specified BPF program. @@ -334,17 +386,7 @@ LIBBPF_API const struct bpf_insn *bpf_program__insns(const struct bpf_program *p */ LIBBPF_API size_t bpf_program__insn_cnt(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 6, "use bpf_object__load() instead") -LIBBPF_API int bpf_program__load(struct bpf_program *prog, const char *license, __u32 kern_version); LIBBPF_API int bpf_program__fd(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 7, "multi-instance bpf_program support is deprecated") -LIBBPF_API int bpf_program__pin_instance(struct bpf_program *prog, - const char *path, - int instance); -LIBBPF_DEPRECATED_SINCE(0, 7, "multi-instance bpf_program support is deprecated") -LIBBPF_API int bpf_program__unpin_instance(struct bpf_program *prog, - const char *path, - int instance); /** * @brief **bpf_program__pin()** pins the BPF program to a file @@ -430,12 +472,15 @@ LIBBPF_API struct bpf_link * bpf_program__attach(const struct bpf_program *prog); struct bpf_perf_event_opts { - /* size of this struct, for forward/backward compatiblity */ + /* size of this struct, for forward/backward compatibility */ size_t sz; /* custom user-provided value fetchable through bpf_get_attach_cookie() */ __u64 bpf_cookie; + /* don't use BPF link when attach BPF program */ + bool force_ioctl_attach; + size_t :0; }; -#define bpf_perf_event_opts__last_field bpf_cookie +#define bpf_perf_event_opts__last_field force_ioctl_attach LIBBPF_API struct bpf_link * bpf_program__attach_perf_event(const struct bpf_program *prog, int pfd); @@ -444,8 +489,25 @@ LIBBPF_API struct bpf_link * bpf_program__attach_perf_event_opts(const struct bpf_program *prog, int pfd, const struct bpf_perf_event_opts *opts); +/** + * enum probe_attach_mode - the mode to attach kprobe/uprobe + * + * force libbpf to attach kprobe/uprobe in specific mode, -ENOTSUP will + * be returned if it is not supported by the kernel. + */ +enum probe_attach_mode { + /* attach probe in latest supported mode by kernel */ + PROBE_ATTACH_MODE_DEFAULT = 0, + /* attach probe in legacy mode, using debugfs/tracefs */ + PROBE_ATTACH_MODE_LEGACY, + /* create perf event with perf_event_open() syscall */ + PROBE_ATTACH_MODE_PERF, + /* attach probe with BPF link */ + PROBE_ATTACH_MODE_LINK, +}; + struct bpf_kprobe_opts { - /* size of this struct, for forward/backward compatiblity */ + /* size of this struct, for forward/backward compatibility */ size_t sz; /* custom user-provided value fetchable through bpf_get_attach_cookie() */ __u64 bpf_cookie; @@ -453,9 +515,11 @@ struct bpf_kprobe_opts { size_t offset; /* kprobe is return probe */ bool retprobe; + /* kprobe attach mode */ + enum probe_attach_mode attach_mode; size_t :0; }; -#define bpf_kprobe_opts__last_field retprobe +#define bpf_kprobe_opts__last_field attach_mode LIBBPF_API struct bpf_link * bpf_program__attach_kprobe(const struct bpf_program *prog, bool retprobe, @@ -478,18 +542,117 @@ struct bpf_kprobe_multi_opts { size_t cnt; /* create return kprobes */ bool retprobe; + /* create session kprobes */ + bool session; size_t :0; }; -#define bpf_kprobe_multi_opts__last_field retprobe +#define bpf_kprobe_multi_opts__last_field session LIBBPF_API struct bpf_link * bpf_program__attach_kprobe_multi_opts(const struct bpf_program *prog, const char *pattern, const struct bpf_kprobe_multi_opts *opts); +struct bpf_uprobe_multi_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + /* array of function symbols to attach to */ + const char **syms; + /* array of function addresses to attach to */ + const unsigned long *offsets; + /* optional, array of associated ref counter offsets */ + const unsigned long *ref_ctr_offsets; + /* optional, array of associated BPF cookies */ + const __u64 *cookies; + /* number of elements in syms/addrs/cookies arrays */ + size_t cnt; + /* create return uprobes */ + bool retprobe; + size_t :0; +}; + +#define bpf_uprobe_multi_opts__last_field retprobe + +/** + * @brief **bpf_program__attach_uprobe_multi()** attaches a BPF program + * to multiple uprobes with uprobe_multi link. + * + * User can specify 2 mutually exclusive set of inputs: + * + * 1) use only path/func_pattern/pid arguments + * + * 2) use path/pid with allowed combinations of + * syms/offsets/ref_ctr_offsets/cookies/cnt + * + * - syms and offsets are mutually exclusive + * - ref_ctr_offsets and cookies are optional + * + * + * @param prog BPF program to attach + * @param pid Process ID to attach the uprobe to, 0 for self (own process), + * -1 for all processes + * @param binary_path Path to binary + * @param func_pattern Regular expression to specify functions to attach + * BPF program to + * @param opts Additional options (see **struct bpf_uprobe_multi_opts**) + * @return 0, on success; negative error code, otherwise + */ +LIBBPF_API struct bpf_link * +bpf_program__attach_uprobe_multi(const struct bpf_program *prog, + pid_t pid, + const char *binary_path, + const char *func_pattern, + const struct bpf_uprobe_multi_opts *opts); + +struct bpf_ksyscall_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + /* custom user-provided value fetchable through bpf_get_attach_cookie() */ + __u64 bpf_cookie; + /* attach as return probe? */ + bool retprobe; + size_t :0; +}; +#define bpf_ksyscall_opts__last_field retprobe + +/** + * @brief **bpf_program__attach_ksyscall()** attaches a BPF program + * to kernel syscall handler of a specified syscall. Optionally it's possible + * to request to install retprobe that will be triggered at syscall exit. It's + * also possible to associate BPF cookie (though options). + * + * Libbpf automatically will determine correct full kernel function name, + * which depending on system architecture and kernel version/configuration + * could be of the form ___sys_ or __se_sys_, and will + * attach specified program using kprobe/kretprobe mechanism. + * + * **bpf_program__attach_ksyscall()** is an API counterpart of declarative + * **SEC("ksyscall/")** annotation of BPF programs. + * + * At the moment **SEC("ksyscall")** and **bpf_program__attach_ksyscall()** do + * not handle all the calling convention quirks for mmap(), clone() and compat + * syscalls. It also only attaches to "native" syscall interfaces. If host + * system supports compat syscalls or defines 32-bit syscalls in 64-bit + * kernel, such syscall interfaces won't be attached to by libbpf. + * + * These limitations may or may not change in the future. Therefore it is + * recommended to use SEC("kprobe") for these syscalls or if working with + * compat and 32-bit interfaces is required. + * + * @param prog BPF program to attach + * @param syscall_name Symbolic name of the syscall (e.g., "bpf") + * @param opts Additional options (see **struct bpf_ksyscall_opts**) + * @return Reference to the newly created BPF link; or NULL is returned on + * error, error code is stored in errno + */ +LIBBPF_API struct bpf_link * +bpf_program__attach_ksyscall(const struct bpf_program *prog, + const char *syscall_name, + const struct bpf_ksyscall_opts *opts); + struct bpf_uprobe_opts { - /* size of this struct, for forward/backward compatiblity */ + /* size of this struct, for forward/backward compatibility */ size_t sz; /* offset of kernel reference counted USDT semaphore, added in * a6ca88b241d5 ("trace_uprobe: support reference counter in fd-based uprobe") @@ -507,9 +670,11 @@ struct bpf_uprobe_opts { * binary_path. */ const char *func_name; + /* uprobe attach mode */ + enum probe_attach_mode attach_mode; size_t :0; }; -#define bpf_uprobe_opts__last_field func_name +#define bpf_uprobe_opts__last_field attach_mode /** * @brief **bpf_program__attach_uprobe()** attaches a BPF program @@ -583,7 +748,7 @@ bpf_program__attach_usdt(const struct bpf_program *prog, const struct bpf_usdt_opts *opts); struct bpf_tracepoint_opts { - /* size of this struct, for forward/backward compatiblity */ + /* size of this struct, for forward/backward compatibility */ size_t sz; /* custom user-provided value fetchable through bpf_get_attach_cookie() */ __u64 bpf_cookie; @@ -600,11 +765,34 @@ bpf_program__attach_tracepoint_opts(const struct bpf_program *prog, const char *tp_name, const struct bpf_tracepoint_opts *opts); +struct bpf_raw_tracepoint_opts { + size_t sz; /* size of this struct for forward/backward compatibility */ + __u64 cookie; + size_t :0; +}; +#define bpf_raw_tracepoint_opts__last_field cookie + LIBBPF_API struct bpf_link * bpf_program__attach_raw_tracepoint(const struct bpf_program *prog, const char *tp_name); +LIBBPF_API struct bpf_link * +bpf_program__attach_raw_tracepoint_opts(const struct bpf_program *prog, + const char *tp_name, + struct bpf_raw_tracepoint_opts *opts); + +struct bpf_trace_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + /* custom user-provided value fetchable through bpf_get_attach_cookie() */ + __u64 cookie; +}; +#define bpf_trace_opts__last_field cookie + LIBBPF_API struct bpf_link * bpf_program__attach_trace(const struct bpf_program *prog); +LIBBPF_API struct bpf_link * +bpf_program__attach_trace_opts(const struct bpf_program *prog, const struct bpf_trace_opts *opts); + LIBBPF_API struct bpf_link * bpf_program__attach_lsm(const struct bpf_program *prog); LIBBPF_API struct bpf_link * @@ -612,14 +800,62 @@ bpf_program__attach_cgroup(const struct bpf_program *prog, int cgroup_fd); LIBBPF_API struct bpf_link * bpf_program__attach_netns(const struct bpf_program *prog, int netns_fd); LIBBPF_API struct bpf_link * +bpf_program__attach_sockmap(const struct bpf_program *prog, int map_fd); +LIBBPF_API struct bpf_link * bpf_program__attach_xdp(const struct bpf_program *prog, int ifindex); LIBBPF_API struct bpf_link * bpf_program__attach_freplace(const struct bpf_program *prog, int target_fd, const char *attach_func_name); +struct bpf_netfilter_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + + __u32 pf; + __u32 hooknum; + __s32 priority; + __u32 flags; +}; +#define bpf_netfilter_opts__last_field flags + +LIBBPF_API struct bpf_link * +bpf_program__attach_netfilter(const struct bpf_program *prog, + const struct bpf_netfilter_opts *opts); + +struct bpf_tcx_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + __u32 flags; + __u32 relative_fd; + __u32 relative_id; + __u64 expected_revision; + size_t :0; +}; +#define bpf_tcx_opts__last_field expected_revision + +LIBBPF_API struct bpf_link * +bpf_program__attach_tcx(const struct bpf_program *prog, int ifindex, + const struct bpf_tcx_opts *opts); + +struct bpf_netkit_opts { + /* size of this struct, for forward/backward compatibility */ + size_t sz; + __u32 flags; + __u32 relative_fd; + __u32 relative_id; + __u64 expected_revision; + size_t :0; +}; +#define bpf_netkit_opts__last_field expected_revision + +LIBBPF_API struct bpf_link * +bpf_program__attach_netkit(const struct bpf_program *prog, int ifindex, + const struct bpf_netkit_opts *opts); + struct bpf_map; LIBBPF_API struct bpf_link *bpf_map__attach_struct_ops(const struct bpf_map *map); +LIBBPF_API int bpf_link__update_map(struct bpf_link *link, const struct bpf_map *map); struct bpf_iter_attach_opts { size_t sz; /* size of this struct for forward/backward compatibility */ @@ -632,99 +868,6 @@ LIBBPF_API struct bpf_link * bpf_program__attach_iter(const struct bpf_program *prog, const struct bpf_iter_attach_opts *opts); -/* - * Libbpf allows callers to adjust BPF programs before being loaded - * into kernel. One program in an object file can be transformed into - * multiple variants to be attached to different hooks. - * - * bpf_program_prep_t, bpf_program__set_prep and bpf_program__nth_fd - * form an API for this purpose. - * - * - bpf_program_prep_t: - * Defines a 'preprocessor', which is a caller defined function - * passed to libbpf through bpf_program__set_prep(), and will be - * called before program is loaded. The processor should adjust - * the program one time for each instance according to the instance id - * passed to it. - * - * - bpf_program__set_prep: - * Attaches a preprocessor to a BPF program. The number of instances - * that should be created is also passed through this function. - * - * - bpf_program__nth_fd: - * After the program is loaded, get resulting FD of a given instance - * of the BPF program. - * - * If bpf_program__set_prep() is not used, the program would be loaded - * without adjustment during bpf_object__load(). The program has only - * one instance. In this case bpf_program__fd(prog) is equal to - * bpf_program__nth_fd(prog, 0). - */ -struct bpf_prog_prep_result { - /* - * If not NULL, load new instruction array. - * If set to NULL, don't load this instance. - */ - struct bpf_insn *new_insn_ptr; - int new_insn_cnt; - - /* If not NULL, result FD is written to it. */ - int *pfd; -}; - -/* - * Parameters of bpf_program_prep_t: - * - prog: The bpf_program being loaded. - * - n: Index of instance being generated. - * - insns: BPF instructions array. - * - insns_cnt:Number of instructions in insns. - * - res: Output parameter, result of transformation. - * - * Return value: - * - Zero: pre-processing success. - * - Non-zero: pre-processing error, stop loading. - */ -typedef int (*bpf_program_prep_t)(struct bpf_program *prog, int n, - struct bpf_insn *insns, int insns_cnt, - struct bpf_prog_prep_result *res); - -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_program__insns() for getting bpf_program instructions") -LIBBPF_API int bpf_program__set_prep(struct bpf_program *prog, int nr_instance, - bpf_program_prep_t prep); - -LIBBPF_DEPRECATED_SINCE(0, 7, "multi-instance bpf_program support is deprecated") -LIBBPF_API int bpf_program__nth_fd(const struct bpf_program *prog, int n); - -/* - * Adjust type of BPF program. Default is kprobe. - */ -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_socket_filter(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_tracepoint(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_raw_tracepoint(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_kprobe(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_lsm(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_sched_cls(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_sched_act(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_xdp(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_perf_event(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_tracing(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_struct_ops(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_extension(struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__set_type() instead") -LIBBPF_API int bpf_program__set_sk_lookup(struct bpf_program *prog); - LIBBPF_API enum bpf_prog_type bpf_program__type(const struct bpf_program *prog); /** @@ -787,47 +930,6 @@ LIBBPF_API int bpf_program__set_attach_target(struct bpf_program *prog, int attach_prog_fd, const char *attach_func_name); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_socket_filter(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_tracepoint(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_raw_tracepoint(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_kprobe(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_lsm(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_sched_cls(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_sched_act(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_xdp(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_perf_event(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_tracing(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_struct_ops(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_extension(const struct bpf_program *prog); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_program__type() instead") -LIBBPF_API bool bpf_program__is_sk_lookup(const struct bpf_program *prog); - -/* - * No need for __attribute__((packed)), all members of 'bpf_map_def' - * are all aligned. In addition, using __attribute__((packed)) - * would trigger a -Wpacked warning message, and lead to an error - * if -Werror is set. - */ -struct bpf_map_def { - unsigned int type; - unsigned int key_size; - unsigned int value_size; - unsigned int max_entries; - unsigned int map_flags; -}; - /** * @brief **bpf_object__find_map_by_name()** returns BPF map of * the given name, if it exists within the passed BPF object @@ -842,16 +944,6 @@ bpf_object__find_map_by_name(const struct bpf_object *obj, const char *name); LIBBPF_API int bpf_object__find_map_fd_by_name(const struct bpf_object *obj, const char *name); -/* - * Get bpf_map through the offset of corresponding struct bpf_map_def - * in the BPF object file. - */ -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_object__find_map_by_name() instead") -struct bpf_map * -bpf_object__find_map_by_offset(struct bpf_object *obj, size_t offset); - -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__next_map() instead") -struct bpf_map *bpf_map__next(const struct bpf_map *map, const struct bpf_object *obj); LIBBPF_API struct bpf_map * bpf_object__next_map(const struct bpf_object *obj, const struct bpf_map *map); @@ -861,11 +953,48 @@ bpf_object__next_map(const struct bpf_object *obj, const struct bpf_map *map); (pos) = bpf_object__next_map((obj), (pos))) #define bpf_map__for_each bpf_object__for_each_map -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__prev_map() instead") -struct bpf_map *bpf_map__prev(const struct bpf_map *map, const struct bpf_object *obj); LIBBPF_API struct bpf_map * bpf_object__prev_map(const struct bpf_object *obj, const struct bpf_map *map); +/** + * @brief **bpf_map__set_autocreate()** sets whether libbpf has to auto-create + * BPF map during BPF object load phase. + * @param map the BPF map instance + * @param autocreate whether to create BPF map during BPF object load + * @return 0 on success; -EBUSY if BPF object was already loaded + * + * **bpf_map__set_autocreate()** allows to opt-out from libbpf auto-creating + * BPF map. By default, libbpf will attempt to create every single BPF map + * defined in BPF object file using BPF_MAP_CREATE command of bpf() syscall + * and fill in map FD in BPF instructions. + * + * This API allows to opt-out of this process for specific map instance. This + * can be useful if host kernel doesn't support such BPF map type or used + * combination of flags and user application wants to avoid creating such + * a map in the first place. User is still responsible to make sure that their + * BPF-side code that expects to use such missing BPF map is recognized by BPF + * verifier as dead code, otherwise BPF verifier will reject such BPF program. + */ +LIBBPF_API int bpf_map__set_autocreate(struct bpf_map *map, bool autocreate); +LIBBPF_API bool bpf_map__autocreate(const struct bpf_map *map); + +/** + * @brief **bpf_map__set_autoattach()** sets whether libbpf has to auto-attach + * map during BPF skeleton attach phase. + * @param map the BPF map instance + * @param autoattach whether to attach map during BPF skeleton attach phase + * @return 0 on success; negative error code, otherwise + */ +LIBBPF_API int bpf_map__set_autoattach(struct bpf_map *map, bool autoattach); + +/** + * @brief **bpf_map__autoattach()** returns whether BPF map is configured to + * auto-attach during BPF skeleton attach phase. + * @param map the BPF map instance + * @return true if map is set to auto-attach during skeleton attach phase; false, otherwise + */ +LIBBPF_API bool bpf_map__autoattach(const struct bpf_map *map); + /** * @brief **bpf_map__fd()** gets the file descriptor of the passed * BPF map @@ -874,9 +1003,6 @@ bpf_object__prev_map(const struct bpf_object *obj, const struct bpf_map *map); */ LIBBPF_API int bpf_map__fd(const struct bpf_map *map); LIBBPF_API int bpf_map__reuse_fd(struct bpf_map *map, int fd); -/* get map definition */ -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 8, "use appropriate getters or setters instead") -const struct bpf_map_def *bpf_map__def(const struct bpf_map *map); /* get map name */ LIBBPF_API const char *bpf_map__name(const struct bpf_map *map); /* get/set map type */ @@ -885,8 +1011,6 @@ LIBBPF_API int bpf_map__set_type(struct bpf_map *map, enum bpf_map_type type); /* get/set map size (max_entries) */ LIBBPF_API __u32 bpf_map__max_entries(const struct bpf_map *map); LIBBPF_API int bpf_map__set_max_entries(struct bpf_map *map, __u32 max_entries); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_map__set_max_entries() instead") -LIBBPF_API int bpf_map__resize(struct bpf_map *map, __u32 max_entries); /* get/set map flags */ LIBBPF_API __u32 bpf_map__map_flags(const struct bpf_map *map); LIBBPF_API int bpf_map__set_map_flags(struct bpf_map *map, __u32 flags); @@ -896,8 +1020,22 @@ LIBBPF_API int bpf_map__set_numa_node(struct bpf_map *map, __u32 numa_node); /* get/set map key size */ LIBBPF_API __u32 bpf_map__key_size(const struct bpf_map *map); LIBBPF_API int bpf_map__set_key_size(struct bpf_map *map, __u32 size); -/* get/set map value size */ +/* get map value size */ LIBBPF_API __u32 bpf_map__value_size(const struct bpf_map *map); +/** + * @brief **bpf_map__set_value_size()** sets map value size. + * @param map the BPF map instance + * @return 0, on success; negative error, otherwise + * + * There is a special case for maps with associated memory-mapped regions, like + * the global data section maps (bss, data, rodata). When this function is used + * on such a map, the mapped region is resized. Afterward, an attempt is made to + * adjust the corresponding BTF info. This attempt is best-effort and can only + * succeed if the last variable of the data section map is an array. The array + * BTF type is replaced by a new BTF array type with a different length. + * Any previously existing pointers returned from bpf_map__initial_value() or + * corresponding data section skeleton pointer must be reinitialized. + */ LIBBPF_API int bpf_map__set_value_size(struct bpf_map *map, __u32 size); /* get map key/value BTF type IDs */ LIBBPF_API __u32 bpf_map__btf_key_type_id(const struct bpf_map *map); @@ -909,17 +1047,9 @@ LIBBPF_API int bpf_map__set_ifindex(struct bpf_map *map, __u32 ifindex); LIBBPF_API __u64 bpf_map__map_extra(const struct bpf_map *map); LIBBPF_API int bpf_map__set_map_extra(struct bpf_map *map, __u64 map_extra); -typedef void (*bpf_map_clear_priv_t)(struct bpf_map *, void *); -LIBBPF_DEPRECATED_SINCE(0, 7, "storage via set_priv/priv is deprecated") -LIBBPF_API int bpf_map__set_priv(struct bpf_map *map, void *priv, - bpf_map_clear_priv_t clear_priv); -LIBBPF_DEPRECATED_SINCE(0, 7, "storage via set_priv/priv is deprecated") -LIBBPF_API void *bpf_map__priv(const struct bpf_map *map); LIBBPF_API int bpf_map__set_initial_value(struct bpf_map *map, const void *data, size_t size); -LIBBPF_API const void *bpf_map__initial_value(struct bpf_map *map, size_t *psize); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_map__type() instead") -LIBBPF_API bool bpf_map__is_offload_neutral(const struct bpf_map *map); +LIBBPF_API void *bpf_map__initial_value(const struct bpf_map *map, size_t *psize); /** * @brief **bpf_map__is_internal()** tells the caller whether or not the @@ -929,73 +1059,165 @@ LIBBPF_API bool bpf_map__is_offload_neutral(const struct bpf_map *map); * @return true, if the map is an internal map; false, otherwise */ LIBBPF_API bool bpf_map__is_internal(const struct bpf_map *map); + +/** + * @brief **bpf_map__set_pin_path()** sets the path attribute that tells where the + * BPF map should be pinned. This does not actually create the 'pin'. + * @param map The bpf_map + * @param path The path + * @return 0, on success; negative error, otherwise + */ LIBBPF_API int bpf_map__set_pin_path(struct bpf_map *map, const char *path); + +/** + * @brief **bpf_map__pin_path()** gets the path attribute that tells where the + * BPF map should be pinned. + * @param map The bpf_map + * @return The path string; which can be NULL + */ LIBBPF_API const char *bpf_map__pin_path(const struct bpf_map *map); + +/** + * @brief **bpf_map__is_pinned()** tells the caller whether or not the + * passed map has been pinned via a 'pin' file. + * @param map The bpf_map + * @return true, if the map is pinned; false, otherwise + */ LIBBPF_API bool bpf_map__is_pinned(const struct bpf_map *map); + +/** + * @brief **bpf_map__pin()** creates a file that serves as a 'pin' + * for the BPF map. This increments the reference count on the + * BPF map which will keep the BPF map loaded even after the + * userspace process which loaded it has exited. + * @param map The bpf_map to pin + * @param path A file path for the 'pin' + * @return 0, on success; negative error, otherwise + * + * If `path` is NULL the maps `pin_path` attribute will be used. If this is + * also NULL, an error will be returned and the map will not be pinned. + */ LIBBPF_API int bpf_map__pin(struct bpf_map *map, const char *path); + +/** + * @brief **bpf_map__unpin()** removes the file that serves as a + * 'pin' for the BPF map. + * @param map The bpf_map to unpin + * @param path A file path for the 'pin' + * @return 0, on success; negative error, otherwise + * + * The `path` parameter can be NULL, in which case the `pin_path` + * map attribute is unpinned. If both the `path` parameter and + * `pin_path` map attribute are set, they must be equal. + */ LIBBPF_API int bpf_map__unpin(struct bpf_map *map, const char *path); LIBBPF_API int bpf_map__set_inner_map_fd(struct bpf_map *map, int fd); LIBBPF_API struct bpf_map *bpf_map__inner_map(struct bpf_map *map); /** - * @brief **libbpf_get_error()** extracts the error code from the passed - * pointer - * @param ptr pointer returned from libbpf API function - * @return error code; or 0 if no error occured - * - * Many libbpf API functions which return pointers have logic to encode error - * codes as pointers, and do not return NULL. Meaning **libbpf_get_error()** - * should be used on the return value from these functions immediately after - * calling the API function, with no intervening calls that could clobber the - * `errno` variable. Consult the individual functions documentation to verify - * if this logic applies should be used. + * @brief **bpf_map__lookup_elem()** allows to lookup BPF map value + * corresponding to provided key. + * @param map BPF map to lookup element in + * @param key pointer to memory containing bytes of the key used for lookup + * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** + * @param value pointer to memory in which looked up value will be stored + * @param value_sz size in byte of value data memory; it has to match BPF map + * definition's **value_size**. For per-CPU BPF maps value size has to be + * a product of BPF map value size and number of possible CPUs in the system + * (could be fetched with **libbpf_num_possible_cpus()**). Note also that for + * per-CPU values value size has to be aligned up to closest 8 bytes for + * alignment reasons, so expected size is: `round_up(value_size, 8) + * * libbpf_num_possible_cpus()`. + * @flags extra flags passed to kernel for this operation + * @return 0, on success; negative error, otherwise * - * For these API functions, if `libbpf_set_strict_mode(LIBBPF_STRICT_CLEAN_PTRS)` - * is enabled, NULL is returned on error instead. - * - * If ptr is NULL, then errno should be already set by the failing - * API, because libbpf never returns NULL on success and it now always - * sets errno on error. - * - * Example usage: - * - * struct perf_buffer *pb; + * **bpf_map__lookup_elem()** is high-level equivalent of + * **bpf_map_lookup_elem()** API with added check for key and value size. + */ +LIBBPF_API int bpf_map__lookup_elem(const struct bpf_map *map, + const void *key, size_t key_sz, + void *value, size_t value_sz, __u64 flags); + +/** + * @brief **bpf_map__update_elem()** allows to insert or update value in BPF + * map that corresponds to provided key. + * @param map BPF map to insert to or update element in + * @param key pointer to memory containing bytes of the key + * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** + * @param value pointer to memory containing bytes of the value + * @param value_sz size in byte of value data memory; it has to match BPF map + * definition's **value_size**. For per-CPU BPF maps value size has to be + * a product of BPF map value size and number of possible CPUs in the system + * (could be fetched with **libbpf_num_possible_cpus()**). Note also that for + * per-CPU values value size has to be aligned up to closest 8 bytes for + * alignment reasons, so expected size is: `round_up(value_size, 8) + * * libbpf_num_possible_cpus()`. + * @flags extra flags passed to kernel for this operation + * @return 0, on success; negative error, otherwise * - * pb = perf_buffer__new(bpf_map__fd(obj->maps.events), PERF_BUFFER_PAGES, &opts); - * err = libbpf_get_error(pb); - * if (err) { - * pb = NULL; - * fprintf(stderr, "failed to open perf buffer: %d\n", err); - * goto cleanup; - * } + * **bpf_map__update_elem()** is high-level equivalent of + * **bpf_map_update_elem()** API with added check for key and value size. */ -LIBBPF_API long libbpf_get_error(const void *ptr); +LIBBPF_API int bpf_map__update_elem(const struct bpf_map *map, + const void *key, size_t key_sz, + const void *value, size_t value_sz, __u64 flags); -struct bpf_prog_load_attr { - const char *file; - enum bpf_prog_type prog_type; - enum bpf_attach_type expected_attach_type; - int ifindex; - int log_level; - int prog_flags; -}; +/** + * @brief **bpf_map__delete_elem()** allows to delete element in BPF map that + * corresponds to provided key. + * @param map BPF map to delete element from + * @param key pointer to memory containing bytes of the key + * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** + * @flags extra flags passed to kernel for this operation + * @return 0, on success; negative error, otherwise + * + * **bpf_map__delete_elem()** is high-level equivalent of + * **bpf_map_delete_elem()** API with added check for key size. + */ +LIBBPF_API int bpf_map__delete_elem(const struct bpf_map *map, + const void *key, size_t key_sz, __u64 flags); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_object__open() and bpf_object__load() instead") -LIBBPF_API int bpf_prog_load_xattr(const struct bpf_prog_load_attr *attr, - struct bpf_object **pobj, int *prog_fd); -LIBBPF_DEPRECATED_SINCE(0, 7, "use bpf_object__open() and bpf_object__load() instead") -LIBBPF_API int bpf_prog_load_deprecated(const char *file, enum bpf_prog_type type, - struct bpf_object **pobj, int *prog_fd); +/** + * @brief **bpf_map__lookup_and_delete_elem()** allows to lookup BPF map value + * corresponding to provided key and atomically delete it afterwards. + * @param map BPF map to lookup element in + * @param key pointer to memory containing bytes of the key used for lookup + * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** + * @param value pointer to memory in which looked up value will be stored + * @param value_sz size in byte of value data memory; it has to match BPF map + * definition's **value_size**. For per-CPU BPF maps value size has to be + * a product of BPF map value size and number of possible CPUs in the system + * (could be fetched with **libbpf_num_possible_cpus()**). Note also that for + * per-CPU values value size has to be aligned up to closest 8 bytes for + * alignment reasons, so expected size is: `round_up(value_size, 8) + * * libbpf_num_possible_cpus()`. + * @flags extra flags passed to kernel for this operation + * @return 0, on success; negative error, otherwise + * + * **bpf_map__lookup_and_delete_elem()** is high-level equivalent of + * **bpf_map_lookup_and_delete_elem()** API with added check for key and value size. + */ +LIBBPF_API int bpf_map__lookup_and_delete_elem(const struct bpf_map *map, + const void *key, size_t key_sz, + void *value, size_t value_sz, __u64 flags); -/* XDP related API */ -struct xdp_link_info { - __u32 prog_id; - __u32 drv_prog_id; - __u32 hw_prog_id; - __u32 skb_prog_id; - __u8 attach_mode; -}; +/** + * @brief **bpf_map__get_next_key()** allows to iterate BPF map keys by + * fetching next key that follows current key. + * @param map BPF map to fetch next key from + * @param cur_key pointer to memory containing bytes of current key or NULL to + * fetch the first key + * @param next_key pointer to memory to write next key into + * @param key_sz size in bytes of key data, needs to match BPF map definition's **key_size** + * @return 0, on success; -ENOENT if **cur_key** is the last key in BPF map; + * negative error, otherwise + * + * **bpf_map__get_next_key()** is high-level equivalent of + * **bpf_map_get_next_key()** API with added check for key size. + */ +LIBBPF_API int bpf_map__get_next_key(const struct bpf_map *map, + const void *cur_key, void *next_key, size_t key_sz); struct bpf_xdp_set_link_opts { size_t sz; @@ -1004,17 +1226,6 @@ struct bpf_xdp_set_link_opts { }; #define bpf_xdp_set_link_opts__last_field old_fd -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_xdp_attach() instead") -LIBBPF_API int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_xdp_attach() instead") -LIBBPF_API int bpf_set_link_xdp_fd_opts(int ifindex, int fd, __u32 flags, - const struct bpf_xdp_set_link_opts *opts); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_xdp_query_id() instead") -LIBBPF_API int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags); -LIBBPF_DEPRECATED_SINCE(0, 8, "use bpf_xdp_query() instead") -LIBBPF_API int bpf_get_link_xdp_info(int ifindex, struct xdp_link_info *info, - size_t info_size, __u32 flags); - struct bpf_xdp_attach_opts { size_t sz; int old_prog_fd; @@ -1029,9 +1240,11 @@ struct bpf_xdp_query_opts { __u32 hw_prog_id; /* output */ __u32 skb_prog_id; /* output */ __u8 attach_mode; /* output */ + __u64 feature_flags; /* output */ + __u32 xdp_zc_max_segs; /* output */ size_t :0; }; -#define bpf_xdp_query_opts__last_field attach_mode +#define bpf_xdp_query_opts__last_field xdp_zc_max_segs LIBBPF_API int bpf_xdp_attach(int ifindex, int prog_fd, __u32 flags, const struct bpf_xdp_attach_opts *opts); @@ -1085,11 +1298,13 @@ LIBBPF_API int bpf_tc_query(const struct bpf_tc_hook *hook, /* Ring buffer APIs */ struct ring_buffer; +struct ring; +struct user_ring_buffer; typedef int (*ring_buffer_sample_fn)(void *ctx, void *data, size_t size); struct ring_buffer_opts { - size_t sz; /* size of this struct, for forward/backward compatiblity */ + size_t sz; /* size of this struct, for forward/backward compatibility */ }; #define ring_buffer_opts__last_field sz @@ -1102,8 +1317,204 @@ LIBBPF_API int ring_buffer__add(struct ring_buffer *rb, int map_fd, ring_buffer_sample_fn sample_cb, void *ctx); LIBBPF_API int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms); LIBBPF_API int ring_buffer__consume(struct ring_buffer *rb); +LIBBPF_API int ring_buffer__consume_n(struct ring_buffer *rb, size_t n); LIBBPF_API int ring_buffer__epoll_fd(const struct ring_buffer *rb); +/** + * @brief **ring_buffer__ring()** returns the ringbuffer object inside a given + * ringbuffer manager representing a single BPF_MAP_TYPE_RINGBUF map instance. + * + * @param rb A ringbuffer manager object. + * @param idx An index into the ringbuffers contained within the ringbuffer + * manager object. The index is 0-based and corresponds to the order in which + * ring_buffer__add was called. + * @return A ringbuffer object on success; NULL and errno set if the index is + * invalid. + */ +LIBBPF_API struct ring *ring_buffer__ring(struct ring_buffer *rb, + unsigned int idx); + +/** + * @brief **ring__consumer_pos()** returns the current consumer position in the + * given ringbuffer. + * + * @param r A ringbuffer object. + * @return The current consumer position. + */ +LIBBPF_API unsigned long ring__consumer_pos(const struct ring *r); + +/** + * @brief **ring__producer_pos()** returns the current producer position in the + * given ringbuffer. + * + * @param r A ringbuffer object. + * @return The current producer position. + */ +LIBBPF_API unsigned long ring__producer_pos(const struct ring *r); + +/** + * @brief **ring__avail_data_size()** returns the number of bytes in the + * ringbuffer not yet consumed. This has no locking associated with it, so it + * can be inaccurate if operations are ongoing while this is called. However, it + * should still show the correct trend over the long-term. + * + * @param r A ringbuffer object. + * @return The number of bytes not yet consumed. + */ +LIBBPF_API size_t ring__avail_data_size(const struct ring *r); + +/** + * @brief **ring__size()** returns the total size of the ringbuffer's map data + * area (excluding special producer/consumer pages). Effectively this gives the + * amount of usable bytes of data inside the ringbuffer. + * + * @param r A ringbuffer object. + * @return The total size of the ringbuffer map data area. + */ +LIBBPF_API size_t ring__size(const struct ring *r); + +/** + * @brief **ring__map_fd()** returns the file descriptor underlying the given + * ringbuffer. + * + * @param r A ringbuffer object. + * @return The underlying ringbuffer file descriptor + */ +LIBBPF_API int ring__map_fd(const struct ring *r); + +/** + * @brief **ring__consume()** consumes available ringbuffer data without event + * polling. + * + * @param r A ringbuffer object. + * @return The number of records consumed (or INT_MAX, whichever is less), or + * a negative number if any of the callbacks return an error. + */ +LIBBPF_API int ring__consume(struct ring *r); + +/** + * @brief **ring__consume_n()** consumes up to a requested amount of items from + * a ringbuffer without event polling. + * + * @param r A ringbuffer object. + * @param n Maximum amount of items to consume. + * @return The number of items consumed, or a negative number if any of the + * callbacks return an error. + */ +LIBBPF_API int ring__consume_n(struct ring *r, size_t n); + +struct user_ring_buffer_opts { + size_t sz; /* size of this struct, for forward/backward compatibility */ +}; + +#define user_ring_buffer_opts__last_field sz + +/** + * @brief **user_ring_buffer__new()** creates a new instance of a user ring + * buffer. + * + * @param map_fd A file descriptor to a BPF_MAP_TYPE_USER_RINGBUF map. + * @param opts Options for how the ring buffer should be created. + * @return A user ring buffer on success; NULL and errno being set on a + * failure. + */ +LIBBPF_API struct user_ring_buffer * +user_ring_buffer__new(int map_fd, const struct user_ring_buffer_opts *opts); + +/** + * @brief **user_ring_buffer__reserve()** reserves a pointer to a sample in the + * user ring buffer. + * @param rb A pointer to a user ring buffer. + * @param size The size of the sample, in bytes. + * @return A pointer to an 8-byte aligned reserved region of the user ring + * buffer; NULL, and errno being set if a sample could not be reserved. + * + * This function is *not* thread safe, and callers must synchronize accessing + * this function if there are multiple producers. If a size is requested that + * is larger than the size of the entire ring buffer, errno will be set to + * E2BIG and NULL is returned. If the ring buffer could accommodate the size, + * but currently does not have enough space, errno is set to ENOSPC and NULL is + * returned. + * + * After initializing the sample, callers must invoke + * **user_ring_buffer__submit()** to post the sample to the kernel. Otherwise, + * the sample must be freed with **user_ring_buffer__discard()**. + */ +LIBBPF_API void *user_ring_buffer__reserve(struct user_ring_buffer *rb, __u32 size); + +/** + * @brief **user_ring_buffer__reserve_blocking()** reserves a record in the + * ring buffer, possibly blocking for up to @timeout_ms until a sample becomes + * available. + * @param rb The user ring buffer. + * @param size The size of the sample, in bytes. + * @param timeout_ms The amount of time, in milliseconds, for which the caller + * should block when waiting for a sample. -1 causes the caller to block + * indefinitely. + * @return A pointer to an 8-byte aligned reserved region of the user ring + * buffer; NULL, and errno being set if a sample could not be reserved. + * + * This function is *not* thread safe, and callers must synchronize + * accessing this function if there are multiple producers + * + * If **timeout_ms** is -1, the function will block indefinitely until a sample + * becomes available. Otherwise, **timeout_ms** must be non-negative, or errno + * is set to EINVAL, and NULL is returned. If **timeout_ms** is 0, no blocking + * will occur and the function will return immediately after attempting to + * reserve a sample. + * + * If **size** is larger than the size of the entire ring buffer, errno is set + * to E2BIG and NULL is returned. If the ring buffer could accommodate + * **size**, but currently does not have enough space, the caller will block + * until at most **timeout_ms** has elapsed. If insufficient space is available + * at that time, errno is set to ENOSPC, and NULL is returned. + * + * The kernel guarantees that it will wake up this thread to check if + * sufficient space is available in the ring buffer at least once per + * invocation of the **bpf_ringbuf_drain()** helper function, provided that at + * least one sample is consumed, and the BPF program did not invoke the + * function with BPF_RB_NO_WAKEUP. A wakeup may occur sooner than that, but the + * kernel does not guarantee this. If the helper function is invoked with + * BPF_RB_FORCE_WAKEUP, a wakeup event will be sent even if no sample is + * consumed. + * + * When a sample of size **size** is found within **timeout_ms**, a pointer to + * the sample is returned. After initializing the sample, callers must invoke + * **user_ring_buffer__submit()** to post the sample to the ring buffer. + * Otherwise, the sample must be freed with **user_ring_buffer__discard()**. + */ +LIBBPF_API void *user_ring_buffer__reserve_blocking(struct user_ring_buffer *rb, + __u32 size, + int timeout_ms); + +/** + * @brief **user_ring_buffer__submit()** submits a previously reserved sample + * into the ring buffer. + * @param rb The user ring buffer. + * @param sample A reserved sample. + * + * It is not necessary to synchronize amongst multiple producers when invoking + * this function. + */ +LIBBPF_API void user_ring_buffer__submit(struct user_ring_buffer *rb, void *sample); + +/** + * @brief **user_ring_buffer__discard()** discards a previously reserved sample. + * @param rb The user ring buffer. + * @param sample A reserved sample. + * + * It is not necessary to synchronize amongst multiple producers when invoking + * this function. + */ +LIBBPF_API void user_ring_buffer__discard(struct user_ring_buffer *rb, void *sample); + +/** + * @brief **user_ring_buffer__free()** frees a ring buffer that was previously + * created with **user_ring_buffer__new()**. + * @param rb The user ring buffer being freed. + */ +LIBBPF_API void user_ring_buffer__free(struct user_ring_buffer *rb); + /* Perf buffer APIs */ struct perf_buffer; @@ -1113,19 +1524,11 @@ typedef void (*perf_buffer_lost_fn)(void *ctx, int cpu, __u64 cnt); /* common use perf buffer options */ struct perf_buffer_opts { - union { - size_t sz; - struct { /* DEPRECATED: will be removed in v1.0 */ - /* if specified, sample_cb is called for each sample */ - perf_buffer_sample_fn sample_cb; - /* if specified, lost_cb is called for each batch of lost samples */ - perf_buffer_lost_fn lost_cb; - /* ctx is provided to sample_cb and lost_cb */ - void *ctx; - }; - }; + size_t sz; + __u32 sample_period; + size_t :0; }; -#define perf_buffer_opts__last_field sz +#define perf_buffer_opts__last_field sample_period /** * @brief **perf_buffer__new()** creates BPF perfbuf manager for a specified @@ -1144,21 +1547,6 @@ perf_buffer__new(int map_fd, size_t page_cnt, perf_buffer_sample_fn sample_cb, perf_buffer_lost_fn lost_cb, void *ctx, const struct perf_buffer_opts *opts); -LIBBPF_API struct perf_buffer * -perf_buffer__new_v0_6_0(int map_fd, size_t page_cnt, - perf_buffer_sample_fn sample_cb, perf_buffer_lost_fn lost_cb, void *ctx, - const struct perf_buffer_opts *opts); - -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "use new variant of perf_buffer__new() instead") -struct perf_buffer *perf_buffer__new_deprecated(int map_fd, size_t page_cnt, - const struct perf_buffer_opts *opts); - -#define perf_buffer__new(...) ___libbpf_overload(___perf_buffer_new, __VA_ARGS__) -#define ___perf_buffer_new6(map_fd, page_cnt, sample_cb, lost_cb, ctx, opts) \ - perf_buffer__new(map_fd, page_cnt, sample_cb, lost_cb, ctx, opts) -#define ___perf_buffer_new3(map_fd, page_cnt, opts) \ - perf_buffer__new_deprecated(map_fd, page_cnt, opts) - enum bpf_perf_event_ret { LIBBPF_PERF_EVENT_DONE = 0, LIBBPF_PERF_EVENT_ERROR = -1, @@ -1172,21 +1560,9 @@ typedef enum bpf_perf_event_ret /* raw perf buffer options, giving most power and control */ struct perf_buffer_raw_opts { - union { - struct { - size_t sz; - long :0; - long :0; - }; - struct { /* DEPRECATED: will be removed in v1.0 */ - /* perf event attrs passed directly into perf_event_open() */ - struct perf_event_attr *attr; - /* raw event callback */ - perf_buffer_event_fn event_cb; - /* ctx is provided to event_cb */ - void *ctx; - }; - }; + size_t sz; + long :0; + long :0; /* if cpu_cnt == 0, open all on all possible CPUs (up to the number of * max_entries of given PERF_EVENT_ARRAY map) */ @@ -1198,26 +1574,13 @@ struct perf_buffer_raw_opts { }; #define perf_buffer_raw_opts__last_field map_keys +struct perf_event_attr; + LIBBPF_API struct perf_buffer * perf_buffer__new_raw(int map_fd, size_t page_cnt, struct perf_event_attr *attr, perf_buffer_event_fn event_cb, void *ctx, const struct perf_buffer_raw_opts *opts); -LIBBPF_API struct perf_buffer * -perf_buffer__new_raw_v0_6_0(int map_fd, size_t page_cnt, struct perf_event_attr *attr, - perf_buffer_event_fn event_cb, void *ctx, - const struct perf_buffer_raw_opts *opts); - -LIBBPF_API LIBBPF_DEPRECATED_SINCE(0, 7, "use new variant of perf_buffer__new_raw() instead") -struct perf_buffer *perf_buffer__new_raw_deprecated(int map_fd, size_t page_cnt, - const struct perf_buffer_raw_opts *opts); - -#define perf_buffer__new_raw(...) ___libbpf_overload(___perf_buffer_new_raw, __VA_ARGS__) -#define ___perf_buffer_new_raw6(map_fd, page_cnt, attr, event_cb, ctx, opts) \ - perf_buffer__new_raw(map_fd, page_cnt, attr, event_cb, ctx, opts) -#define ___perf_buffer_new_raw3(map_fd, page_cnt, opts) \ - perf_buffer__new_raw_deprecated(map_fd, page_cnt, opts) - LIBBPF_API void perf_buffer__free(struct perf_buffer *pb); LIBBPF_API int perf_buffer__epoll_fd(const struct perf_buffer *pb); LIBBPF_API int perf_buffer__poll(struct perf_buffer *pb, int timeout_ms); @@ -1225,15 +1588,22 @@ LIBBPF_API int perf_buffer__consume(struct perf_buffer *pb); LIBBPF_API int perf_buffer__consume_buffer(struct perf_buffer *pb, size_t buf_idx); LIBBPF_API size_t perf_buffer__buffer_cnt(const struct perf_buffer *pb); LIBBPF_API int perf_buffer__buffer_fd(const struct perf_buffer *pb, size_t buf_idx); - -typedef enum bpf_perf_event_ret - (*bpf_perf_event_print_t)(struct perf_event_header *hdr, - void *private_data); -LIBBPF_DEPRECATED_SINCE(0, 8, "use perf_buffer__poll() or perf_buffer__consume() instead") -LIBBPF_API enum bpf_perf_event_ret -bpf_perf_event_read_simple(void *mmap_mem, size_t mmap_size, size_t page_size, - void **copy_mem, size_t *copy_size, - bpf_perf_event_print_t fn, void *private_data); +/** + * @brief **perf_buffer__buffer()** returns the per-cpu raw mmap()'ed underlying + * memory region of the ring buffer. + * This ring buffer can be used to implement a custom events consumer. + * The ring buffer starts with the *struct perf_event_mmap_page*, which + * holds the ring buffer managment fields, when accessing the header + * structure it's important to be SMP aware. + * You can refer to *perf_event_read_simple* for a simple example. + * @param pb the perf buffer structure + * @param buf_idx the buffer index to retreive + * @param buf (out) gets the base pointer of the mmap()'ed memory + * @param buf_size (out) gets the size of the mmap()'ed region + * @return 0 on success, negative error code for failure + */ +LIBBPF_API int perf_buffer__buffer(struct perf_buffer *pb, int buf_idx, void **buf, + size_t *buf_size); struct bpf_prog_linfo; struct bpf_prog_info; @@ -1256,14 +1626,6 @@ bpf_prog_linfo__lfind(const struct bpf_prog_linfo *prog_linfo, * user, causing subsequent probes to fail. In this case, the caller may want * to adjust that limit with setrlimit(). */ -LIBBPF_DEPRECATED_SINCE(0, 8, "use libbpf_probe_bpf_prog_type() instead") -LIBBPF_API bool bpf_probe_prog_type(enum bpf_prog_type prog_type, __u32 ifindex); -LIBBPF_DEPRECATED_SINCE(0, 8, "use libbpf_probe_bpf_map_type() instead") -LIBBPF_API bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex); -LIBBPF_DEPRECATED_SINCE(0, 8, "use libbpf_probe_bpf_helper() instead") -LIBBPF_API bool bpf_probe_helper(enum bpf_func_id id, enum bpf_prog_type prog_type, __u32 ifindex); -LIBBPF_DEPRECATED_SINCE(0, 8, "implement your own or use bpftool for feature detection") -LIBBPF_API bool bpf_probe_large_insn_limit(__u32 ifindex); /** * @brief **libbpf_probe_bpf_prog_type()** detects if host kernel supports @@ -1307,72 +1669,6 @@ LIBBPF_API int libbpf_probe_bpf_map_type(enum bpf_map_type map_type, const void LIBBPF_API int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helper_id, const void *opts); -/* - * Get bpf_prog_info in continuous memory - * - * struct bpf_prog_info has multiple arrays. The user has option to choose - * arrays to fetch from kernel. The following APIs provide an uniform way to - * fetch these data. All arrays in bpf_prog_info are stored in a single - * continuous memory region. This makes it easy to store the info in a - * file. - * - * Before writing bpf_prog_info_linear to files, it is necessary to - * translate pointers in bpf_prog_info to offsets. Helper functions - * bpf_program__bpil_addr_to_offs() and bpf_program__bpil_offs_to_addr() - * are introduced to switch between pointers and offsets. - * - * Examples: - * # To fetch map_ids and prog_tags: - * __u64 arrays = (1UL << BPF_PROG_INFO_MAP_IDS) | - * (1UL << BPF_PROG_INFO_PROG_TAGS); - * struct bpf_prog_info_linear *info_linear = - * bpf_program__get_prog_info_linear(fd, arrays); - * - * # To save data in file - * bpf_program__bpil_addr_to_offs(info_linear); - * write(f, info_linear, sizeof(*info_linear) + info_linear->data_len); - * - * # To read data from file - * read(f, info_linear, ); - * bpf_program__bpil_offs_to_addr(info_linear); - */ -enum bpf_prog_info_array { - BPF_PROG_INFO_FIRST_ARRAY = 0, - BPF_PROG_INFO_JITED_INSNS = 0, - BPF_PROG_INFO_XLATED_INSNS, - BPF_PROG_INFO_MAP_IDS, - BPF_PROG_INFO_JITED_KSYMS, - BPF_PROG_INFO_JITED_FUNC_LENS, - BPF_PROG_INFO_FUNC_INFO, - BPF_PROG_INFO_LINE_INFO, - BPF_PROG_INFO_JITED_LINE_INFO, - BPF_PROG_INFO_PROG_TAGS, - BPF_PROG_INFO_LAST_ARRAY, -}; - -struct bpf_prog_info_linear { - /* size of struct bpf_prog_info, when the tool is compiled */ - __u32 info_len; - /* total bytes allocated for data, round up to 8 bytes */ - __u32 data_len; - /* which arrays are included in data */ - __u64 arrays; - struct bpf_prog_info info; - __u8 data[]; -}; - -LIBBPF_DEPRECATED_SINCE(0, 6, "use a custom linear prog_info wrapper") -LIBBPF_API struct bpf_prog_info_linear * -bpf_program__get_prog_info_linear(int fd, __u64 arrays); - -LIBBPF_DEPRECATED_SINCE(0, 6, "use a custom linear prog_info wrapper") -LIBBPF_API void -bpf_program__bpil_addr_to_offs(struct bpf_prog_info_linear *info_linear); - -LIBBPF_DEPRECATED_SINCE(0, 6, "use a custom linear prog_info wrapper") -LIBBPF_API void -bpf_program__bpil_offs_to_addr(struct bpf_prog_info_linear *info_linear); - /** * @brief **libbpf_num_possible_cpus()** is a helper function to get the * number of possible CPUs that the host kernel supports and expects. @@ -1393,6 +1689,7 @@ struct bpf_map_skeleton { const char *name; struct bpf_map **map; void **mmaped; + struct bpf_link **link; }; struct bpf_prog_skeleton { @@ -1457,7 +1754,7 @@ LIBBPF_API void bpf_object__destroy_subskeleton(struct bpf_object_subskeleton *s); struct gen_loader_opts { - size_t sz; /* size of this struct, for forward/backward compatiblity */ + size_t sz; /* size of this struct, for forward/backward compatibility */ const char *data; const char *insns; __u32 data_sz; @@ -1475,13 +1772,13 @@ enum libbpf_tristate { }; struct bpf_linker_opts { - /* size of this struct, for forward/backward compatiblity */ + /* size of this struct, for forward/backward compatibility */ size_t sz; }; #define bpf_linker_opts__last_field sz struct bpf_linker_file_opts { - /* size of this struct, for forward/backward compatiblity */ + /* size of this struct, for forward/backward compatibility */ size_t sz; }; #define bpf_linker_file_opts__last_field sz @@ -1524,7 +1821,7 @@ typedef int (*libbpf_prog_attach_fn_t)(const struct bpf_program *prog, long cook struct bpf_link **link); struct libbpf_prog_handler_opts { - /* size of this struct, for forward/backward compatiblity */ + /* size of this struct, for forward/backward compatibility */ size_t sz; /* User-provided value that is passed to prog_setup_fn, * prog_prepare_load_fn, and prog_attach_fn callbacks. Allows user to diff --git a/pkg/plugin/lib/common/libbpf/_src/libbpf_common.h b/pkg/plugin/lib/common/libbpf/_src/libbpf_common.h index 000e37798f..8fe248e14e 100644 --- a/pkg/plugin/lib/common/libbpf/_src/libbpf_common.h +++ b/pkg/plugin/lib/common/libbpf/_src/libbpf_common.h @@ -30,20 +30,10 @@ /* Add checks for other versions below when planning deprecation of API symbols * with the LIBBPF_DEPRECATED_SINCE macro. */ -#if __LIBBPF_CURRENT_VERSION_GEQ(0, 6) -#define __LIBBPF_MARK_DEPRECATED_0_6(X) X +#if __LIBBPF_CURRENT_VERSION_GEQ(1, 0) +#define __LIBBPF_MARK_DEPRECATED_1_0(X) X #else -#define __LIBBPF_MARK_DEPRECATED_0_6(X) -#endif -#if __LIBBPF_CURRENT_VERSION_GEQ(0, 7) -#define __LIBBPF_MARK_DEPRECATED_0_7(X) X -#else -#define __LIBBPF_MARK_DEPRECATED_0_7(X) -#endif -#if __LIBBPF_CURRENT_VERSION_GEQ(0, 8) -#define __LIBBPF_MARK_DEPRECATED_0_8(X) X -#else -#define __LIBBPF_MARK_DEPRECATED_0_8(X) +#define __LIBBPF_MARK_DEPRECATED_1_0(X) #endif /* This set of internal macros allows to do "function overloading" based on @@ -80,4 +70,23 @@ }; \ }) +/* Helper macro to clear and optionally reinitialize libbpf options struct + * + * Small helper macro to reset all fields and to reinitialize the common + * structure size member. Values provided by users in struct initializer- + * syntax as varargs can be provided as well to reinitialize options struct + * specific members. + */ +#define LIBBPF_OPTS_RESET(NAME, ...) \ + do { \ + typeof(NAME) ___##NAME = ({ \ + memset(&___##NAME, 0, sizeof(NAME)); \ + (typeof(NAME)) { \ + .sz = sizeof(NAME), \ + __VA_ARGS__ \ + }; \ + }); \ + memcpy(&NAME, &___##NAME, sizeof(NAME)); \ + } while (0) + #endif /* __LIBBPF_LIBBPF_COMMON_H */ diff --git a/pkg/plugin/lib/common/libbpf/_src/libbpf_errno.c b/pkg/plugin/lib/common/libbpf/_src/libbpf_errno.c index 96f67a772a..6b180172ec 100644 --- a/pkg/plugin/lib/common/libbpf/_src/libbpf_errno.c +++ b/pkg/plugin/lib/common/libbpf/_src/libbpf_errno.c @@ -39,14 +39,14 @@ static const char *libbpf_strerror_table[NR_ERRNO] = { int libbpf_strerror(int err, char *buf, size_t size) { + int ret; + if (!buf || !size) return libbpf_err(-EINVAL); err = err > 0 ? err : -err; if (err < __LIBBPF_ERRNO__START) { - int ret; - ret = strerror_r(err, buf, size); buf[size - 1] = '\0'; return libbpf_err_errno(ret); @@ -56,12 +56,20 @@ int libbpf_strerror(int err, char *buf, size_t size) const char *msg; msg = libbpf_strerror_table[ERRNO_OFFSET(err)]; - snprintf(buf, size, "%s", msg); + ret = snprintf(buf, size, "%s", msg); buf[size - 1] = '\0'; + /* The length of the buf and msg is positive. + * A negative number may be returned only when the + * size exceeds INT_MAX. Not likely to appear. + */ + if (ret >= size) + return libbpf_err(-ERANGE); return 0; } - snprintf(buf, size, "Unknown libbpf error %d", err); + ret = snprintf(buf, size, "Unknown libbpf error %d", err); buf[size - 1] = '\0'; + if (ret >= size) + return libbpf_err(-ERANGE); return libbpf_err(-ENOENT); } diff --git a/pkg/plugin/lib/common/libbpf/_src/libbpf_internal.h b/pkg/plugin/lib/common/libbpf/_src/libbpf_internal.h index 4abdbe2fea..408df59e07 100644 --- a/pkg/plugin/lib/common/libbpf/_src/libbpf_internal.h +++ b/pkg/plugin/lib/common/libbpf/_src/libbpf_internal.h @@ -15,9 +15,24 @@ #include #include #include -#include "libbpf_legacy.h" +#include +#include #include "relo_core.h" +/* Android's libc doesn't support AT_EACCESS in faccessat() implementation + * ([0]), and just returns -EINVAL even if file exists and is accessible. + * See [1] for issues caused by this. + * + * So just redefine it to 0 on Android. + * + * [0] https://android.googlesource.com/platform/bionic/+/refs/heads/android13-release/libc/bionic/faccessat.cpp#50 + * [1] https://github.com/libbpf/libbpf-bootstrap/issues/250#issuecomment-1911324250 + */ +#ifdef __ANDROID__ +#undef AT_EACCESS +#define AT_EACCESS 0 +#endif + /* make sure libbpf doesn't use kernel-only integer typedefs */ #pragma GCC poison u8 u16 u32 u64 s8 s16 s32 s64 @@ -109,9 +124,9 @@ static inline bool str_has_sfx(const char *str, const char *sfx) size_t str_len = strlen(str); size_t sfx_len = strlen(sfx); - if (sfx_len <= str_len) - return strcmp(str + str_len - sfx_len, sfx); - return false; + if (sfx_len > str_len) + return false; + return strcmp(str + str_len - sfx_len, sfx) == 0; } /* Symbol versioning is different between static and shared library. @@ -219,6 +234,9 @@ struct btf_type; struct btf_type *btf_type_by_id(const struct btf *btf, __u32 type_id); const char *btf_kind_str(const struct btf_type *t); const struct btf_type *skip_mods_and_typedefs(const struct btf *btf, __u32 id, __u32 *res_id); +const struct btf_header *btf_header(const struct btf *btf); +void btf_set_base_btf(struct btf *btf, const struct btf *base_btf); +int btf_relocate(struct btf *btf, const struct btf *base_btf, __u32 **id_map); static inline enum btf_func_linkage btf_func_linkage(const struct btf_type *t) { @@ -351,18 +369,45 @@ enum kern_feature_id { FEAT_MEMCG_ACCOUNT, /* BPF cookie (bpf_get_attach_cookie() BPF helper) support */ FEAT_BPF_COOKIE, + /* BTF_KIND_ENUM64 support and BTF_KIND_ENUM kflag support */ + FEAT_BTF_ENUM64, + /* Kernel uses syscall wrapper (CONFIG_ARCH_HAS_SYSCALL_WRAPPER) */ + FEAT_SYSCALL_WRAPPER, + /* BPF multi-uprobe link support */ + FEAT_UPROBE_MULTI_LINK, + /* Kernel supports arg:ctx tag (__arg_ctx) for global subprogs natively */ + FEAT_ARG_CTX_TAG, + /* Kernel supports '?' at the front of datasec names */ + FEAT_BTF_QMARK_DATASEC, __FEAT_CNT, }; -int probe_memcg_account(void); +enum kern_feature_result { + FEAT_UNKNOWN = 0, + FEAT_SUPPORTED = 1, + FEAT_MISSING = 2, +}; + +struct kern_feature_cache { + enum kern_feature_result res[__FEAT_CNT]; + int token_fd; +}; + +bool feat_supported(struct kern_feature_cache *cache, enum kern_feature_id feat_id); bool kernel_supports(const struct bpf_object *obj, enum kern_feature_id feat_id); + +int probe_kern_syscall_wrapper(int token_fd); +int probe_memcg_account(int token_fd); int bump_rlimit_memlock(void); int parse_cpu_mask_str(const char *s, bool **mask, int *mask_sz); int parse_cpu_mask_file(const char *fcpu, bool **mask, int *mask_sz); int libbpf__load_raw_btf(const char *raw_types, size_t types_len, - const char *str_sec, size_t str_len); -int btf_load_into_kernel(struct btf *btf, char *log_buf, size_t log_sz, __u32 log_level); + const char *str_sec, size_t str_len, + int token_fd); +int btf_load_into_kernel(struct btf *btf, + char *log_buf, size_t log_sz, __u32 log_level, + int token_fd); struct btf *btf_get_from_fd(int btf_fd, struct btf *base_btf); void btf_get_kernel_prefix_kind(enum bpf_attach_type attach_type, @@ -466,23 +511,38 @@ struct bpf_line_info_min { __u32 line_col; }; +enum btf_field_iter_kind { + BTF_FIELD_ITER_IDS, + BTF_FIELD_ITER_STRS, +}; + +struct btf_field_desc { + /* once-per-type offsets */ + int t_off_cnt, t_offs[2]; + /* member struct size, or zero, if no members */ + int m_sz; + /* repeated per-member offsets */ + int m_off_cnt, m_offs[1]; +}; + +struct btf_field_iter { + struct btf_field_desc desc; + void *p; + int m_idx; + int off_idx; + int vlen; +}; + +int btf_field_iter_init(struct btf_field_iter *it, struct btf_type *t, enum btf_field_iter_kind iter_kind); +__u32 *btf_field_iter_next(struct btf_field_iter *it); typedef int (*type_id_visit_fn)(__u32 *type_id, void *ctx); typedef int (*str_off_visit_fn)(__u32 *str_off, void *ctx); -int btf_type_visit_type_ids(struct btf_type *t, type_id_visit_fn visit, void *ctx); -int btf_type_visit_str_offs(struct btf_type *t, str_off_visit_fn visit, void *ctx); int btf_ext_visit_type_ids(struct btf_ext *btf_ext, type_id_visit_fn visit, void *ctx); int btf_ext_visit_str_offs(struct btf_ext *btf_ext, str_off_visit_fn visit, void *ctx); __s32 btf__find_by_name_kind_own(const struct btf *btf, const char *type_name, __u32 kind); -extern enum libbpf_strict_mode libbpf_mode; - -typedef int (*kallsyms_cb_t)(unsigned long long sym_addr, char sym_type, - const char *sym_name, void *ctx); - -int libbpf_kallsyms_parse(kallsyms_cb_t cb, void *arg); - /* handle direct returned errors */ static inline int libbpf_err(int ret) { @@ -496,12 +556,8 @@ static inline int libbpf_err(int ret) */ static inline int libbpf_err_errno(int ret) { - if (libbpf_mode & LIBBPF_STRICT_DIRECT_ERRS) - /* errno is already assumed to be set on error */ - return ret < 0 ? -errno : ret; - - /* legacy: on error return -1 directly and don't touch errno */ - return ret; + /* errno is already assumed to be set on error */ + return ret < 0 ? -errno : ret; } /* handle error for pointer-returning APIs, err is assumed to be < 0 always */ @@ -509,12 +565,7 @@ static inline void *libbpf_err_ptr(int err) { /* set errno on error, this doesn't break anything */ errno = -err; - - if (libbpf_mode & LIBBPF_STRICT_CLEAN_PTRS) - return NULL; - - /* legacy: encode err as ptr */ - return ERR_PTR(err); + return NULL; } /* handle pointer-returning APIs' error handling */ @@ -524,11 +575,7 @@ static inline void *libbpf_ptr(void *ret) if (IS_ERR(ret)) errno = -PTR_ERR(ret); - if (libbpf_mode & LIBBPF_STRICT_CLEAN_PTRS) - return IS_ERR(ret) ? NULL : ret; - - /* legacy: pass-through original pointer */ - return ret; + return IS_ERR(ret) ? NULL : ret; } static inline bool str_is_empty(const char *s) @@ -541,6 +588,17 @@ static inline bool is_ldimm64_insn(struct bpf_insn *insn) return insn->code == (BPF_LD | BPF_IMM | BPF_DW); } +/* Unconditionally dup FD, ensuring it doesn't use [0, 2] range. + * Original FD is not closed or altered in any other way. + * Preserves original FD value, if it's invalid (negative). + */ +static inline int dup_good_fd(int fd) +{ + if (fd < 0) + return fd; + return fcntl(fd, F_DUPFD_CLOEXEC, 3); +} + /* if fd is stdin, stdout, or stderr, dup to a fd greater than 2 * Takes ownership of the fd passed in, and closes it if calling * fcntl(fd, F_DUPFD_CLOEXEC, 3). @@ -552,9 +610,10 @@ static inline int ensure_good_fd(int fd) if (fd < 0) return fd; if (fd < 3) { - fd = fcntl(fd, F_DUPFD_CLOEXEC, 3); + fd = dup_good_fd(fd); saved_errno = errno; close(old_fd); + errno = saved_errno; if (fd < 0) { pr_warn("failed to dup FD %d to FD > 2: %d\n", old_fd, -saved_errno); errno = saved_errno; @@ -563,6 +622,25 @@ static inline int ensure_good_fd(int fd) return fd; } +static inline int sys_dup3(int oldfd, int newfd, int flags) +{ + return syscall(__NR_dup3, oldfd, newfd, flags); +} + +/* Point *fixed_fd* to the same file that *tmp_fd* points to. + * Regardless of success, *tmp_fd* is closed. + * Whatever *fixed_fd* pointed to is closed silently. + */ +static inline int reuse_fd(int fixed_fd, int tmp_fd) +{ + int err; + + err = sys_dup3(tmp_fd, fixed_fd, O_CLOEXEC); + err = err < 0 ? -errno : 0; + close(tmp_fd); /* clean up temporary FD */ + return err; +} + /* The following two functions are exposed to bpftool */ int bpf_core_add_cands(struct bpf_core_cand *local_cand, size_t local_essent_len, @@ -580,4 +658,33 @@ struct bpf_link * usdt_manager_attach_usdt(struct usdt_manager *man, const char *usdt_provider, const char *usdt_name, __u64 usdt_cookie); +static inline bool is_pow_of_2(size_t x) +{ + return x && (x & (x - 1)) == 0; +} + +#define PROG_LOAD_ATTEMPTS 5 +int sys_bpf_prog_load(union bpf_attr *attr, unsigned int size, int attempts); + +bool glob_match(const char *str, const char *pat); + +long elf_find_func_offset(Elf *elf, const char *binary_path, const char *name); +long elf_find_func_offset_from_file(const char *binary_path, const char *name); + +struct elf_fd { + Elf *elf; + int fd; +}; + +int elf_open(const char *binary_path, struct elf_fd *elf_fd); +void elf_close(struct elf_fd *elf_fd); + +int elf_resolve_syms_offsets(const char *binary_path, int cnt, + const char **syms, unsigned long **poffsets, + int st_type); +int elf_resolve_pattern_offsets(const char *binary_path, const char *pattern, + unsigned long **poffsets, size_t *pcnt); + +int probe_fd(int fd); + #endif /* __LIBBPF_LIBBPF_INTERNAL_H */ diff --git a/pkg/plugin/lib/common/libbpf/_src/libbpf_legacy.h b/pkg/plugin/lib/common/libbpf/_src/libbpf_legacy.h index d7bcbd01f6..1e1be467be 100644 --- a/pkg/plugin/lib/common/libbpf/_src/libbpf_legacy.h +++ b/pkg/plugin/lib/common/libbpf/_src/libbpf_legacy.h @@ -20,6 +20,11 @@ extern "C" { #endif +/* As of libbpf 1.0 libbpf_set_strict_mode() and enum libbpf_struct_mode have + * no effect. But they are left in libbpf_legacy.h so that applications that + * prepared for libbpf 1.0 before final release by using + * libbpf_set_strict_mode() still work with libbpf 1.0+ without any changes. + */ enum libbpf_strict_mode { /* Turn on all supported strict features of libbpf to simulate libbpf * v1.0 behavior. @@ -71,8 +76,8 @@ enum libbpf_strict_mode { * first BPF program or map creation operation. This is done only if * kernel is too old to support memcg-based memory accounting for BPF * subsystem. By default, RLIMIT_MEMLOCK limit is set to RLIM_INFINITY, - * but it can be overriden with libbpf_set_memlock_rlim_max() API. - * Note that libbpf_set_memlock_rlim_max() needs to be called before + * but it can be overriden with libbpf_set_memlock_rlim() API. + * Note that libbpf_set_memlock_rlim() needs to be called before * the very first bpf_prog_load(), bpf_map_create() or bpf_object__load() * operation. */ @@ -88,6 +93,25 @@ enum libbpf_strict_mode { LIBBPF_API int libbpf_set_strict_mode(enum libbpf_strict_mode mode); +/** + * @brief **libbpf_get_error()** extracts the error code from the passed + * pointer + * @param ptr pointer returned from libbpf API function + * @return error code; or 0 if no error occured + * + * Note, as of libbpf 1.0 this function is not necessary and not recommended + * to be used. Libbpf doesn't return error code embedded into the pointer + * itself. Instead, NULL is returned on error and error code is passed through + * thread-local errno variable. **libbpf_get_error()** is just returning -errno + * value if it receives NULL, which is correct only if errno hasn't been + * modified between libbpf API call and corresponding **libbpf_get_error()** + * call. Prefer to check return for NULL and use errno directly. + * + * This API is left in libbpf 1.0 to allow applications that were 1.0-ready + * before final libbpf 1.0 without needing to change them. + */ +LIBBPF_API long libbpf_get_error(const void *ptr); + #define DECLARE_LIBBPF_OPTS LIBBPF_OPTS /* "Discouraged" APIs which don't follow consistent libbpf naming patterns. @@ -101,6 +125,8 @@ struct bpf_map; struct btf; struct btf_ext; +LIBBPF_API struct btf *libbpf_find_kernel_btf(void); + LIBBPF_API enum bpf_prog_type bpf_program__get_type(const struct bpf_program *prog); LIBBPF_API enum bpf_attach_type bpf_program__get_expected_attach_type(const struct bpf_program *prog); LIBBPF_API const char *bpf_map__get_pin_path(const struct bpf_map *map); diff --git a/pkg/plugin/lib/common/libbpf/_src/libbpf_probes.c b/pkg/plugin/lib/common/libbpf/_src/libbpf_probes.c index 97b06cede5..9dfbe7750f 100644 --- a/pkg/plugin/lib/common/libbpf/_src/libbpf_probes.c +++ b/pkg/plugin/lib/common/libbpf/_src/libbpf_probes.c @@ -12,52 +12,102 @@ #include #include #include +#include #include "bpf.h" #include "libbpf.h" #include "libbpf_internal.h" -static bool grep(const char *buffer, const char *pattern) +/* On Ubuntu LINUX_VERSION_CODE doesn't correspond to info.release, + * but Ubuntu provides /proc/version_signature file, as described at + * https://ubuntu.com/kernel, with an example contents below, which we + * can use to get a proper LINUX_VERSION_CODE. + * + * Ubuntu 5.4.0-12.15-generic 5.4.8 + * + * In the above, 5.4.8 is what kernel is actually expecting, while + * uname() call will return 5.4.0 in info.release. + */ +static __u32 get_ubuntu_kernel_version(void) +{ + const char *ubuntu_kver_file = "/proc/version_signature"; + __u32 major, minor, patch; + int ret; + FILE *f; + + if (faccessat(AT_FDCWD, ubuntu_kver_file, R_OK, AT_EACCESS) != 0) + return 0; + + f = fopen(ubuntu_kver_file, "re"); + if (!f) + return 0; + + ret = fscanf(f, "%*s %*s %u.%u.%u\n", &major, &minor, &patch); + fclose(f); + if (ret != 3) + return 0; + + return KERNEL_VERSION(major, minor, patch); +} + +/* On Debian LINUX_VERSION_CODE doesn't correspond to info.release. + * Instead, it is provided in info.version. An example content of + * Debian 10 looks like the below. + * + * utsname::release 4.19.0-22-amd64 + * utsname::version #1 SMP Debian 4.19.260-1 (2022-09-29) + * + * In the above, 4.19.260 is what kernel is actually expecting, while + * uname() call will return 4.19.0 in info.release. + */ +static __u32 get_debian_kernel_version(struct utsname *info) { - return !!strstr(buffer, pattern); + __u32 major, minor, patch; + char *p; + + p = strstr(info->version, "Debian "); + if (!p) { + /* This is not a Debian kernel. */ + return 0; + } + + if (sscanf(p, "Debian %u.%u.%u", &major, &minor, &patch) != 3) + return 0; + + return KERNEL_VERSION(major, minor, patch); } -static int get_vendor_id(int ifindex) +__u32 get_kernel_version(void) { - char ifname[IF_NAMESIZE], path[64], buf[8]; - ssize_t len; - int fd; + __u32 major, minor, patch, version; + struct utsname info; - if (!if_indextoname(ifindex, ifname)) - return -1; + /* Check if this is an Ubuntu kernel. */ + version = get_ubuntu_kernel_version(); + if (version != 0) + return version; - snprintf(path, sizeof(path), "/sys/class/net/%s/device/vendor", ifname); + uname(&info); - fd = open(path, O_RDONLY | O_CLOEXEC); - if (fd < 0) - return -1; + /* Check if this is a Debian kernel. */ + version = get_debian_kernel_version(&info); + if (version != 0) + return version; - len = read(fd, buf, sizeof(buf)); - close(fd); - if (len < 0) - return -1; - if (len >= (ssize_t)sizeof(buf)) - return -1; - buf[len] = '\0'; + if (sscanf(info.release, "%u.%u.%u", &major, &minor, &patch) != 3) + return 0; - return strtol(buf, NULL, 0); + return KERNEL_VERSION(major, minor, patch); } static int probe_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, size_t insns_cnt, - char *log_buf, size_t log_buf_sz, - __u32 ifindex) + char *log_buf, size_t log_buf_sz) { LIBBPF_OPTS(bpf_prog_load_opts, opts, .log_buf = log_buf, .log_size = log_buf_sz, .log_level = log_buf ? 1 : 0, - .prog_ifindex = ifindex, ); int fd, err, exp_err = 0; const char *exp_msg = NULL; @@ -131,6 +181,9 @@ static int probe_prog_load(enum bpf_prog_type prog_type, case BPF_PROG_TYPE_FLOW_DISSECTOR: case BPF_PROG_TYPE_CGROUP_SYSCTL: break; + case BPF_PROG_TYPE_NETFILTER: + opts.expected_attach_type = BPF_NETFILTER; + break; default: return -EOPNOTSUPP; } @@ -161,33 +214,13 @@ int libbpf_probe_bpf_prog_type(enum bpf_prog_type prog_type, const void *opts) if (opts) return libbpf_err(-EINVAL); - ret = probe_prog_load(prog_type, insns, insn_cnt, NULL, 0, 0); + ret = probe_prog_load(prog_type, insns, insn_cnt, NULL, 0); return libbpf_err(ret); } -bool bpf_probe_prog_type(enum bpf_prog_type prog_type, __u32 ifindex) -{ - struct bpf_insn insns[2] = { - BPF_MOV64_IMM(BPF_REG_0, 0), - BPF_EXIT_INSN() - }; - - /* prefer libbpf_probe_bpf_prog_type() unless offload is requested */ - if (ifindex == 0) - return libbpf_probe_bpf_prog_type(prog_type, NULL) == 1; - - if (ifindex && prog_type == BPF_PROG_TYPE_SCHED_CLS) - /* nfp returns -EINVAL on exit(0) with TC offload */ - insns[0].imm = 2; - - errno = 0; - probe_prog_load(prog_type, insns, ARRAY_SIZE(insns), NULL, 0, ifindex); - - return errno != EINVAL && errno != EOPNOTSUPP; -} - int libbpf__load_raw_btf(const char *raw_types, size_t types_len, - const char *str_sec, size_t str_len) + const char *str_sec, size_t str_len, + int token_fd) { struct btf_header hdr = { .magic = BTF_MAGIC, @@ -197,6 +230,10 @@ int libbpf__load_raw_btf(const char *raw_types, size_t types_len, .str_off = types_len, .str_len = str_len, }; + LIBBPF_OPTS(bpf_btf_load_opts, opts, + .token_fd = token_fd, + .btf_flags = token_fd ? BPF_F_TOKEN_FD : 0, + ); int btf_fd, btf_len; __u8 *raw_btf; @@ -209,7 +246,7 @@ int libbpf__load_raw_btf(const char *raw_types, size_t types_len, memcpy(raw_btf + hdr.hdr_len, raw_types, hdr.type_len); memcpy(raw_btf + hdr.hdr_len + hdr.type_len, str_sec, hdr.str_len); - btf_fd = bpf_btf_load(raw_btf, btf_len, NULL); + btf_fd = bpf_btf_load(raw_btf, btf_len, &opts); free(raw_btf); return btf_fd; @@ -239,17 +276,15 @@ static int load_local_storage_btf(void) }; return libbpf__load_raw_btf((char *)types, sizeof(types), - strs, sizeof(strs)); + strs, sizeof(strs), 0); } -static int probe_map_create(enum bpf_map_type map_type, __u32 ifindex) +static int probe_map_create(enum bpf_map_type map_type) { LIBBPF_OPTS(bpf_map_create_opts, opts); int key_size, value_size, max_entries; __u32 btf_key_type_id = 0, btf_value_type_id = 0; - int fd = -1, btf_fd = -1, fd_inner = -1, exp_err = 0, err; - - opts.map_ifindex = ifindex; + int fd = -1, btf_fd = -1, fd_inner = -1, exp_err = 0, err = 0; key_size = sizeof(__u32); value_size = sizeof(__u32); @@ -277,6 +312,7 @@ static int probe_map_create(enum bpf_map_type map_type, __u32 ifindex) case BPF_MAP_TYPE_SK_STORAGE: case BPF_MAP_TYPE_INODE_STORAGE: case BPF_MAP_TYPE_TASK_STORAGE: + case BPF_MAP_TYPE_CGRP_STORAGE: btf_key_type_id = 1; btf_value_type_id = 3; value_size = 8; @@ -287,19 +323,28 @@ static int probe_map_create(enum bpf_map_type map_type, __u32 ifindex) return btf_fd; break; case BPF_MAP_TYPE_RINGBUF: + case BPF_MAP_TYPE_USER_RINGBUF: key_size = 0; value_size = 0; - max_entries = 4096; + max_entries = sysconf(_SC_PAGE_SIZE); break; case BPF_MAP_TYPE_STRUCT_OPS: /* we'll get -ENOTSUPP for invalid BTF type ID for struct_ops */ opts.btf_vmlinux_value_type_id = 1; + opts.value_type_btf_obj_fd = -1; exp_err = -524; /* -ENOTSUPP */ break; case BPF_MAP_TYPE_BLOOM_FILTER: key_size = 0; max_entries = 1; break; + case BPF_MAP_TYPE_ARENA: + key_size = 0; + value_size = 0; + max_entries = 1; /* one page */ + opts.map_extra = 0; /* can mmap() at any address */ + opts.map_flags = BPF_F_MMAPABLE; + break; case BPF_MAP_TYPE_HASH: case BPF_MAP_TYPE_ARRAY: case BPF_MAP_TYPE_PROG_ARRAY: @@ -326,12 +371,6 @@ static int probe_map_create(enum bpf_map_type map_type, __u32 ifindex) if (map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS || map_type == BPF_MAP_TYPE_HASH_OF_MAPS) { - /* TODO: probe for device, once libbpf has a function to create - * map-in-map for offload - */ - if (ifindex) - goto cleanup; - fd_inner = bpf_map_create(BPF_MAP_TYPE_HASH, NULL, sizeof(__u32), sizeof(__u32), 1, NULL); if (fd_inner < 0) @@ -370,15 +409,10 @@ int libbpf_probe_bpf_map_type(enum bpf_map_type map_type, const void *opts) if (opts) return libbpf_err(-EINVAL); - ret = probe_map_create(map_type, 0); + ret = probe_map_create(map_type); return libbpf_err(ret); } -bool bpf_probe_map_type(enum bpf_map_type map_type, __u32 ifindex) -{ - return probe_map_create(map_type, ifindex) == 1; -} - int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helper_id, const void *opts) { @@ -407,14 +441,15 @@ int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helpe } buf[0] = '\0'; - ret = probe_prog_load(prog_type, insns, insn_cnt, buf, sizeof(buf), 0); + ret = probe_prog_load(prog_type, insns, insn_cnt, buf, sizeof(buf)); if (ret < 0) return libbpf_err(ret); /* If BPF verifier doesn't recognize BPF helper ID (enum bpf_func_id) * at all, it will emit something like "invalid func unknown#181". * If BPF verifier recognizes BPF helper but it's not supported for - * given BPF program type, it will emit "unknown func bpf_sys_bpf#166". + * given BPF program type, it will emit "unknown func bpf_sys_bpf#166" + * or "program of this type cannot use helper bpf_sys_bpf#166". * In both cases, provided combination of BPF program type and BPF * helper is not supported by the kernel. * In all other cases, probe_prog_load() above will either succeed (e.g., @@ -423,55 +458,8 @@ int libbpf_probe_bpf_helper(enum bpf_prog_type prog_type, enum bpf_func_id helpe * that), or we'll get some more specific BPF verifier error about * some unsatisfied conditions. */ - if (ret == 0 && (strstr(buf, "invalid func ") || strstr(buf, "unknown func "))) + if (ret == 0 && (strstr(buf, "invalid func ") || strstr(buf, "unknown func ") || + strstr(buf, "program of this type cannot use helper "))) return 0; return 1; /* assume supported */ } - -bool bpf_probe_helper(enum bpf_func_id id, enum bpf_prog_type prog_type, - __u32 ifindex) -{ - struct bpf_insn insns[2] = { - BPF_EMIT_CALL(id), - BPF_EXIT_INSN() - }; - char buf[4096] = {}; - bool res; - - probe_prog_load(prog_type, insns, ARRAY_SIZE(insns), buf, sizeof(buf), ifindex); - res = !grep(buf, "invalid func ") && !grep(buf, "unknown func "); - - if (ifindex) { - switch (get_vendor_id(ifindex)) { - case 0x19ee: /* Netronome specific */ - res = res && !grep(buf, "not supported by FW") && - !grep(buf, "unsupported function id"); - break; - default: - break; - } - } - - return res; -} - -/* - * Probe for availability of kernel commit (5.3): - * - * c04c0d2b968a ("bpf: increase complexity limit and maximum program size") - */ -bool bpf_probe_large_insn_limit(__u32 ifindex) -{ - struct bpf_insn insns[BPF_MAXINSNS + 1]; - int i; - - for (i = 0; i < BPF_MAXINSNS; i++) - insns[i] = BPF_MOV64_IMM(BPF_REG_0, 1); - insns[BPF_MAXINSNS] = BPF_EXIT_INSN(); - - errno = 0; - probe_prog_load(BPF_PROG_TYPE_SCHED_CLS, insns, ARRAY_SIZE(insns), NULL, 0, - ifindex); - - return errno != E2BIG && errno != EINVAL; -} diff --git a/pkg/plugin/lib/common/libbpf/_src/libbpf_version.h b/pkg/plugin/lib/common/libbpf/_src/libbpf_version.h index 61f2039404..d6e5eff967 100644 --- a/pkg/plugin/lib/common/libbpf/_src/libbpf_version.h +++ b/pkg/plugin/lib/common/libbpf/_src/libbpf_version.h @@ -3,7 +3,7 @@ #ifndef __LIBBPF_VERSION_H #define __LIBBPF_VERSION_H -#define LIBBPF_MAJOR_VERSION 0 -#define LIBBPF_MINOR_VERSION 8 +#define LIBBPF_MAJOR_VERSION 1 +#define LIBBPF_MINOR_VERSION 5 #endif /* __LIBBPF_VERSION_H */ diff --git a/pkg/plugin/lib/common/libbpf/_src/linker.c b/pkg/plugin/lib/common/libbpf/_src/linker.c index 9aa016fb55..9cd3d41097 100644 --- a/pkg/plugin/lib/common/libbpf/_src/linker.c +++ b/pkg/plugin/lib/common/libbpf/_src/linker.c @@ -697,11 +697,6 @@ static int linker_load_obj_file(struct bpf_linker *linker, const char *filename, return err; } -static bool is_pow_of_2(size_t x) -{ - return x && (x & (x - 1)) == 0; -} - static int linker_sanity_check_elf(struct src_obj *obj) { struct src_sec *sec; @@ -724,13 +719,28 @@ static int linker_sanity_check_elf(struct src_obj *obj) return -EINVAL; } - if (sec->shdr->sh_addralign && !is_pow_of_2(sec->shdr->sh_addralign)) + if (is_dwarf_sec_name(sec->sec_name)) + continue; + + if (sec->shdr->sh_addralign && !is_pow_of_2(sec->shdr->sh_addralign)) { + pr_warn("ELF section #%zu alignment %llu is non pow-of-2 alignment in %s\n", + sec->sec_idx, (long long unsigned)sec->shdr->sh_addralign, + obj->filename); return -EINVAL; - if (sec->shdr->sh_addralign != sec->data->d_align) + } + if (sec->shdr->sh_addralign != sec->data->d_align) { + pr_warn("ELF section #%zu has inconsistent alignment addr=%llu != d=%llu in %s\n", + sec->sec_idx, (long long unsigned)sec->shdr->sh_addralign, + (long long unsigned)sec->data->d_align, obj->filename); return -EINVAL; + } - if (sec->shdr->sh_size != sec->data->d_size) + if (sec->shdr->sh_size != sec->data->d_size) { + pr_warn("ELF section #%zu has inconsistent section size sh=%llu != d=%llu in %s\n", + sec->sec_idx, (long long unsigned)sec->shdr->sh_size, + (long long unsigned)sec->data->d_size, obj->filename); return -EINVAL; + } switch (sec->shdr->sh_type) { case SHT_SYMTAB: @@ -742,8 +752,12 @@ static int linker_sanity_check_elf(struct src_obj *obj) break; case SHT_PROGBITS: if (sec->shdr->sh_flags & SHF_EXECINSTR) { - if (sec->shdr->sh_size % sizeof(struct bpf_insn) != 0) + if (sec->shdr->sh_size % sizeof(struct bpf_insn) != 0) { + pr_warn("ELF section #%zu has unexpected size alignment %llu in %s\n", + sec->sec_idx, (long long unsigned)sec->shdr->sh_size, + obj->filename); return -EINVAL; + } } break; case SHT_NOBITS: @@ -943,19 +957,33 @@ static int check_btf_str_off(__u32 *str_off, void *ctx) static int linker_sanity_check_btf(struct src_obj *obj) { struct btf_type *t; - int i, n, err = 0; + int i, n, err; if (!obj->btf) return 0; n = btf__type_cnt(obj->btf); for (i = 1; i < n; i++) { + struct btf_field_iter it; + __u32 *type_id, *str_off; + t = btf_type_by_id(obj->btf, i); - err = err ?: btf_type_visit_type_ids(t, check_btf_type_id, obj->btf); - err = err ?: btf_type_visit_str_offs(t, check_btf_str_off, obj->btf); + err = btf_field_iter_init(&it, t, BTF_FIELD_ITER_IDS); if (err) return err; + while ((type_id = btf_field_iter_next(&it))) { + if (*type_id >= n) + return -EINVAL; + } + + err = btf_field_iter_init(&it, t, BTF_FIELD_ITER_STRS); + if (err) + return err; + while ((str_off = btf_field_iter_next(&it))) { + if (!btf__str_by_offset(obj->btf, *str_off)) + return -EINVAL; + } } return 0; @@ -1120,7 +1148,19 @@ static int extend_sec(struct bpf_linker *linker, struct dst_sec *dst, struct src if (src->shdr->sh_type != SHT_NOBITS) { tmp = realloc(dst->raw_data, dst_final_sz); - if (!tmp) + /* If dst_align_sz == 0, realloc() behaves in a special way: + * 1. When dst->raw_data is NULL it returns: + * "either NULL or a pointer suitable to be passed to free()" [1]. + * 2. When dst->raw_data is not-NULL it frees dst->raw_data and returns NULL, + * thus invalidating any "pointer suitable to be passed to free()" obtained + * at step (1). + * + * The dst_align_sz > 0 check avoids error exit after (2), otherwise + * dst->raw_data would be freed again in bpf_linker__free(). + * + * [1] man 3 realloc + */ + if (!tmp && dst_align_sz > 0) return -ENOMEM; dst->raw_data = tmp; @@ -1340,6 +1380,7 @@ static bool glob_sym_btf_matches(const char *sym_name, bool exact, case BTF_KIND_STRUCT: case BTF_KIND_UNION: case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: case BTF_KIND_FWD: case BTF_KIND_FUNC: case BTF_KIND_VAR: @@ -1362,6 +1403,7 @@ static bool glob_sym_btf_matches(const char *sym_name, bool exact, case BTF_KIND_INT: case BTF_KIND_FLOAT: case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: /* ignore encoding for int and enum values for enum */ if (t1->size != t2->size) { pr_warn("global '%s': incompatible %s '%s' size %u and %u\n", @@ -2000,7 +2042,6 @@ static int linker_append_elf_sym(struct bpf_linker *linker, struct src_obj *obj, static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *obj) { struct src_sec *src_symtab = &obj->secs[obj->symtab_sec_idx]; - struct dst_sec *dst_symtab; int i, err; for (i = 1; i < obj->sec_cnt; i++) { @@ -2033,9 +2074,6 @@ static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *ob return -1; } - /* add_dst_sec() above could have invalidated linker->secs */ - dst_symtab = &linker->secs[linker->symtab_sec_idx]; - /* shdr->sh_link points to SYMTAB */ dst_sec->shdr->sh_link = linker->symtab_sec_idx; @@ -2052,16 +2090,13 @@ static int linker_append_elf_relos(struct bpf_linker *linker, struct src_obj *ob dst_rel = dst_sec->raw_data + src_sec->dst_off; n = src_sec->shdr->sh_size / src_sec->shdr->sh_entsize; for (j = 0; j < n; j++, src_rel++, dst_rel++) { - size_t src_sym_idx = ELF64_R_SYM(src_rel->r_info); - size_t sym_type = ELF64_R_TYPE(src_rel->r_info); - Elf64_Sym *src_sym, *dst_sym; - size_t dst_sym_idx; + size_t src_sym_idx, dst_sym_idx, sym_type; + Elf64_Sym *src_sym; src_sym_idx = ELF64_R_SYM(src_rel->r_info); src_sym = src_symtab->data->d_buf + sizeof(*src_sym) * src_sym_idx; dst_sym_idx = obj->sym_map[src_sym_idx]; - dst_sym = dst_symtab->raw_data + sizeof(*dst_sym) * dst_sym_idx; dst_rel->r_offset += src_linked_sec->dst_off; sym_type = ELF64_R_TYPE(src_rel->r_info); dst_rel->r_info = ELF64_R_INFO(dst_sym_idx, sym_type); @@ -2192,10 +2227,17 @@ static int linker_fixup_btf(struct src_obj *obj) vi = btf_var_secinfos(t); for (j = 0, m = btf_vlen(t); j < m; j++, vi++) { const struct btf_type *vt = btf__type_by_id(obj->btf, vi->type); - const char *var_name = btf__str_by_offset(obj->btf, vt->name_off); - int var_linkage = btf_var(vt)->linkage; + const char *var_name; + int var_linkage; Elf64_Sym *sym; + /* could be a variable or function */ + if (!btf_is_var(vt)) + continue; + + var_name = btf__str_by_offset(obj->btf, vt->name_off); + var_linkage = btf_var(vt)->linkage; + /* no need to patch up static or extern vars */ if (var_linkage != BTF_VAR_GLOBAL_ALLOCATED) continue; @@ -2213,26 +2255,10 @@ static int linker_fixup_btf(struct src_obj *obj) return 0; } -static int remap_type_id(__u32 *type_id, void *ctx) -{ - int *id_map = ctx; - int new_id = id_map[*type_id]; - - /* Error out if the type wasn't remapped. Ignore VOID which stays VOID. */ - if (new_id == 0 && *type_id != 0) { - pr_warn("failed to find new ID mapping for original BTF type ID %u\n", *type_id); - return -EINVAL; - } - - *type_id = id_map[*type_id]; - - return 0; -} - static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) { const struct btf_type *t; - int i, j, n, start_id, id; + int i, j, n, start_id, id, err; const char *name; if (!obj->btf) @@ -2303,9 +2329,25 @@ static int linker_append_btf(struct bpf_linker *linker, struct src_obj *obj) n = btf__type_cnt(linker->btf); for (i = start_id; i < n; i++) { struct btf_type *dst_t = btf_type_by_id(linker->btf, i); + struct btf_field_iter it; + __u32 *type_id; - if (btf_type_visit_type_ids(dst_t, remap_type_id, obj->btf_type_map)) - return -EINVAL; + err = btf_field_iter_init(&it, dst_t, BTF_FIELD_ITER_IDS); + if (err) + return err; + + while ((type_id = btf_field_iter_next(&it))) { + int new_id = obj->btf_type_map[*type_id]; + + /* Error out if the type wasn't remapped. Ignore VOID which stays VOID. */ + if (new_id == 0 && *type_id != 0) { + pr_warn("failed to find new ID mapping for original BTF type ID %u\n", + *type_id); + return -EINVAL; + } + + *type_id = obj->btf_type_map[*type_id]; + } } /* Rewrite VAR/FUNC underlying types (i.e., FUNC's FUNC_PROTO and VAR's @@ -2711,7 +2753,7 @@ static int finalize_btf(struct bpf_linker *linker) /* Emit .BTF.ext section */ if (linker->btf_ext) { - raw_data = btf_ext__get_raw_data(linker->btf_ext, &raw_sz); + raw_data = btf_ext__raw_data(linker->btf_ext, &raw_sz); if (!raw_data) return -ENOMEM; diff --git a/pkg/plugin/lib/common/libbpf/_src/netlink.c b/pkg/plugin/lib/common/libbpf/_src/netlink.c index cbc8967d54..68a2def171 100644 --- a/pkg/plugin/lib/common/libbpf/_src/netlink.c +++ b/pkg/plugin/lib/common/libbpf/_src/netlink.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -27,13 +28,28 @@ typedef int (*libbpf_dump_nlmsg_t)(void *cookie, void *msg, struct nlattr **tb); typedef int (*__dump_nlmsg_t)(struct nlmsghdr *nlmsg, libbpf_dump_nlmsg_t, void *cookie); +struct xdp_link_info { + __u32 prog_id; + __u32 drv_prog_id; + __u32 hw_prog_id; + __u32 skb_prog_id; + __u8 attach_mode; +}; + struct xdp_id_md { int ifindex; __u32 flags; struct xdp_link_info info; + __u64 feature_flags; +}; + +struct xdp_features_md { + int ifindex; + __u32 xdp_zc_max_segs; + __u64 flags; }; -static int libbpf_netlink_open(__u32 *nl_pid) +static int libbpf_netlink_open(__u32 *nl_pid, int proto) { struct sockaddr_nl sa; socklen_t addrlen; @@ -43,7 +59,7 @@ static int libbpf_netlink_open(__u32 *nl_pid) memset(&sa, 0, sizeof(sa)); sa.nl_family = AF_NETLINK; - sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE); + sock = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, proto); if (sock < 0) return -errno; @@ -204,14 +220,14 @@ static int libbpf_netlink_recv(int sock, __u32 nl_pid, int seq, } static int libbpf_netlink_send_recv(struct libbpf_nla_req *req, - __dump_nlmsg_t parse_msg, + int proto, __dump_nlmsg_t parse_msg, libbpf_dump_nlmsg_t parse_attr, void *cookie) { __u32 nl_pid = 0; int sock, ret; - sock = libbpf_netlink_open(&nl_pid); + sock = libbpf_netlink_open(&nl_pid, proto); if (sock < 0) return sock; @@ -230,6 +246,43 @@ static int libbpf_netlink_send_recv(struct libbpf_nla_req *req, return ret; } +static int parse_genl_family_id(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn, + void *cookie) +{ + struct genlmsghdr *gnl = NLMSG_DATA(nh); + struct nlattr *na = (struct nlattr *)((void *)gnl + GENL_HDRLEN); + struct nlattr *tb[CTRL_ATTR_FAMILY_ID + 1]; + __u16 *id = cookie; + + libbpf_nla_parse(tb, CTRL_ATTR_FAMILY_ID, na, + NLMSG_PAYLOAD(nh, sizeof(*gnl)), NULL); + if (!tb[CTRL_ATTR_FAMILY_ID]) + return NL_CONT; + + *id = libbpf_nla_getattr_u16(tb[CTRL_ATTR_FAMILY_ID]); + return NL_DONE; +} + +static int libbpf_netlink_resolve_genl_family_id(const char *name, + __u16 len, __u16 *id) +{ + struct libbpf_nla_req req = { + .nh.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN), + .nh.nlmsg_type = GENL_ID_CTRL, + .nh.nlmsg_flags = NLM_F_REQUEST, + .gnl.cmd = CTRL_CMD_GETFAMILY, + .gnl.version = 2, + }; + int err; + + err = nlattr_add(&req, CTRL_ATTR_FAMILY_NAME, name, len); + if (err < 0) + return err; + + return libbpf_netlink_send_recv(&req, NETLINK_GENERIC, + parse_genl_family_id, NULL, id); +} + static int __bpf_set_link_xdp_fd_replace(int ifindex, int fd, int old_fd, __u32 flags) { @@ -263,7 +316,7 @@ static int __bpf_set_link_xdp_fd_replace(int ifindex, int fd, int old_fd, } nlattr_end_nested(&req, nla); - return libbpf_netlink_send_recv(&req, NULL, NULL, NULL); + return libbpf_netlink_send_recv(&req, NETLINK_ROUTE, NULL, NULL, NULL); } int bpf_xdp_attach(int ifindex, int prog_fd, __u32 flags, const struct bpf_xdp_attach_opts *opts) @@ -288,31 +341,6 @@ int bpf_xdp_detach(int ifindex, __u32 flags, const struct bpf_xdp_attach_opts *o return bpf_xdp_attach(ifindex, -1, flags, opts); } -int bpf_set_link_xdp_fd_opts(int ifindex, int fd, __u32 flags, - const struct bpf_xdp_set_link_opts *opts) -{ - int old_fd = -1, ret; - - if (!OPTS_VALID(opts, bpf_xdp_set_link_opts)) - return libbpf_err(-EINVAL); - - if (OPTS_HAS(opts, old_fd)) { - old_fd = OPTS_GET(opts, old_fd, -1); - flags |= XDP_FLAGS_REPLACE; - } - - ret = __bpf_set_link_xdp_fd_replace(ifindex, fd, old_fd, flags); - return libbpf_err(ret); -} - -int bpf_set_link_xdp_fd(int ifindex, int fd, __u32 flags) -{ - int ret; - - ret = __bpf_set_link_xdp_fd_replace(ifindex, fd, 0, flags); - return libbpf_err(ret); -} - static int __dump_link_nlmsg(struct nlmsghdr *nlh, libbpf_dump_nlmsg_t dump_link_nlmsg, void *cookie) { @@ -374,6 +402,32 @@ static int get_xdp_info(void *cookie, void *msg, struct nlattr **tb) return 0; } +static int parse_xdp_features(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn, + void *cookie) +{ + struct genlmsghdr *gnl = NLMSG_DATA(nh); + struct nlattr *na = (struct nlattr *)((void *)gnl + GENL_HDRLEN); + struct nlattr *tb[NETDEV_CMD_MAX + 1]; + struct xdp_features_md *md = cookie; + __u32 ifindex; + + libbpf_nla_parse(tb, NETDEV_CMD_MAX, na, + NLMSG_PAYLOAD(nh, sizeof(*gnl)), NULL); + + if (!tb[NETDEV_A_DEV_IFINDEX] || !tb[NETDEV_A_DEV_XDP_FEATURES]) + return NL_CONT; + + ifindex = libbpf_nla_getattr_u32(tb[NETDEV_A_DEV_IFINDEX]); + if (ifindex != md->ifindex) + return NL_CONT; + + md->flags = libbpf_nla_getattr_u64(tb[NETDEV_A_DEV_XDP_FEATURES]); + if (tb[NETDEV_A_DEV_XDP_ZC_MAX_SEGS]) + md->xdp_zc_max_segs = + libbpf_nla_getattr_u32(tb[NETDEV_A_DEV_XDP_ZC_MAX_SEGS]); + return NL_DONE; +} + int bpf_xdp_query(int ifindex, int xdp_flags, struct bpf_xdp_query_opts *opts) { struct libbpf_nla_req req = { @@ -383,6 +437,10 @@ int bpf_xdp_query(int ifindex, int xdp_flags, struct bpf_xdp_query_opts *opts) .ifinfo.ifi_family = AF_PACKET, }; struct xdp_id_md xdp_id = {}; + struct xdp_features_md md = { + .ifindex = ifindex, + }; + __u16 id; int err; if (!OPTS_VALID(opts, bpf_xdp_query_opts)) @@ -399,7 +457,7 @@ int bpf_xdp_query(int ifindex, int xdp_flags, struct bpf_xdp_query_opts *opts) xdp_id.ifindex = ifindex; xdp_id.flags = xdp_flags; - err = libbpf_netlink_send_recv(&req, __dump_link_nlmsg, + err = libbpf_netlink_send_recv(&req, NETLINK_ROUTE, __dump_link_nlmsg, get_xdp_info, &xdp_id); if (err) return libbpf_err(err); @@ -410,30 +468,38 @@ int bpf_xdp_query(int ifindex, int xdp_flags, struct bpf_xdp_query_opts *opts) OPTS_SET(opts, skb_prog_id, xdp_id.info.skb_prog_id); OPTS_SET(opts, attach_mode, xdp_id.info.attach_mode); - return 0; -} + if (!OPTS_HAS(opts, feature_flags)) + return 0; -int bpf_get_link_xdp_info(int ifindex, struct xdp_link_info *info, - size_t info_size, __u32 flags) -{ - LIBBPF_OPTS(bpf_xdp_query_opts, opts); - size_t sz; - int err; + err = libbpf_netlink_resolve_genl_family_id("netdev", sizeof("netdev"), &id); + if (err < 0) { + if (err == -ENOENT) { + opts->feature_flags = 0; + goto skip_feature_flags; + } + return libbpf_err(err); + } - if (!info_size) - return libbpf_err(-EINVAL); + memset(&req, 0, sizeof(req)); + req.nh.nlmsg_len = NLMSG_LENGTH(GENL_HDRLEN); + req.nh.nlmsg_flags = NLM_F_REQUEST; + req.nh.nlmsg_type = id; + req.gnl.cmd = NETDEV_CMD_DEV_GET; + req.gnl.version = 2; + + err = nlattr_add(&req, NETDEV_A_DEV_IFINDEX, &ifindex, sizeof(ifindex)); + if (err < 0) + return libbpf_err(err); - err = bpf_xdp_query(ifindex, flags, &opts); + err = libbpf_netlink_send_recv(&req, NETLINK_GENERIC, + parse_xdp_features, NULL, &md); if (err) return libbpf_err(err); - /* struct xdp_link_info field layout matches struct bpf_xdp_query_opts - * layout after sz field - */ - sz = min(info_size, offsetofend(struct xdp_link_info, attach_mode)); - memcpy(info, &opts.prog_id, sz); - memset((void *)info + sz, 0, info_size - sz); + OPTS_SET(opts, feature_flags, md.flags); + OPTS_SET(opts, xdp_zc_max_segs, md.xdp_zc_max_segs); +skip_feature_flags: return 0; } @@ -463,11 +529,6 @@ int bpf_xdp_query_id(int ifindex, int flags, __u32 *prog_id) } -int bpf_get_link_xdp_id(int ifindex, __u32 *prog_id, __u32 flags) -{ - return bpf_xdp_query_id(ifindex, flags, prog_id); -} - typedef int (*qdisc_config_t)(struct libbpf_nla_req *req); static int clsact_config(struct libbpf_nla_req *req) @@ -539,7 +600,7 @@ static int tc_qdisc_modify(struct bpf_tc_hook *hook, int cmd, int flags) if (ret < 0) return ret; - return libbpf_netlink_send_recv(&req, NULL, NULL, NULL); + return libbpf_netlink_send_recv(&req, NETLINK_ROUTE, NULL, NULL, NULL); } static int tc_qdisc_create_excl(struct bpf_tc_hook *hook) @@ -633,12 +694,13 @@ static int get_tc_info(struct nlmsghdr *nh, libbpf_dump_nlmsg_t fn, static int tc_add_fd_and_name(struct libbpf_nla_req *req, int fd) { - struct bpf_prog_info info = {}; + struct bpf_prog_info info; __u32 info_len = sizeof(info); char name[256]; int len, ret; - ret = bpf_obj_get_info_by_fd(fd, &info, &info_len); + memset(&info, 0, info_len); + ret = bpf_prog_get_info_by_fd(fd, &info, &info_len); if (ret < 0) return ret; @@ -718,7 +780,8 @@ int bpf_tc_attach(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts) info.opts = opts; - ret = libbpf_netlink_send_recv(&req, get_tc_info, NULL, &info); + ret = libbpf_netlink_send_recv(&req, NETLINK_ROUTE, get_tc_info, NULL, + &info); if (ret < 0) return libbpf_err(ret); if (!info.processed) @@ -784,7 +847,7 @@ static int __bpf_tc_detach(const struct bpf_tc_hook *hook, return ret; } - return libbpf_netlink_send_recv(&req, NULL, NULL, NULL); + return libbpf_netlink_send_recv(&req, NETLINK_ROUTE, NULL, NULL, NULL); } int bpf_tc_detach(const struct bpf_tc_hook *hook, @@ -849,7 +912,8 @@ int bpf_tc_query(const struct bpf_tc_hook *hook, struct bpf_tc_opts *opts) info.opts = opts; - ret = libbpf_netlink_send_recv(&req, get_tc_info, NULL, &info); + ret = libbpf_netlink_send_recv(&req, NETLINK_ROUTE, get_tc_info, NULL, + &info); if (ret < 0) return libbpf_err(ret); if (!info.processed) diff --git a/pkg/plugin/lib/common/libbpf/_src/nlattr.c b/pkg/plugin/lib/common/libbpf/_src/nlattr.c index f57e77a6e4..975e265eab 100644 --- a/pkg/plugin/lib/common/libbpf/_src/nlattr.c +++ b/pkg/plugin/lib/common/libbpf/_src/nlattr.c @@ -32,7 +32,7 @@ static struct nlattr *nla_next(const struct nlattr *nla, int *remaining) static int nla_ok(const struct nlattr *nla, int remaining) { - return remaining >= sizeof(*nla) && + return remaining >= (int)sizeof(*nla) && nla->nla_len >= sizeof(*nla) && nla->nla_len <= remaining; } @@ -178,7 +178,7 @@ int libbpf_nla_dump_errormsg(struct nlmsghdr *nlh) hlen += nlmsg_len(&err->msg); attr = (struct nlattr *) ((void *) err + hlen); - alen = nlh->nlmsg_len - hlen; + alen = (void *)nlh + nlh->nlmsg_len - (void *)attr; if (libbpf_nla_parse(tb, NLMSGERR_ATTR_MAX, attr, alen, extack_policy) != 0) { diff --git a/pkg/plugin/lib/common/libbpf/_src/nlattr.h b/pkg/plugin/lib/common/libbpf/_src/nlattr.h index 4d15ae2ff8..d92d1c1de7 100644 --- a/pkg/plugin/lib/common/libbpf/_src/nlattr.h +++ b/pkg/plugin/lib/common/libbpf/_src/nlattr.h @@ -14,6 +14,7 @@ #include #include #include +#include /* avoid multiple definition of netlink features */ #define __LINUX_NETLINK_H @@ -58,6 +59,7 @@ struct libbpf_nla_req { union { struct ifinfomsg ifinfo; struct tcmsg tc; + struct genlmsghdr gnl; }; char buf[128]; }; @@ -89,11 +91,21 @@ static inline uint8_t libbpf_nla_getattr_u8(const struct nlattr *nla) return *(uint8_t *)libbpf_nla_data(nla); } +static inline uint16_t libbpf_nla_getattr_u16(const struct nlattr *nla) +{ + return *(uint16_t *)libbpf_nla_data(nla); +} + static inline uint32_t libbpf_nla_getattr_u32(const struct nlattr *nla) { return *(uint32_t *)libbpf_nla_data(nla); } +static inline uint64_t libbpf_nla_getattr_u64(const struct nlattr *nla) +{ + return *(uint64_t *)libbpf_nla_data(nla); +} + static inline const char *libbpf_nla_getattr_str(const struct nlattr *nla) { return (const char *)libbpf_nla_data(nla); diff --git a/pkg/plugin/lib/common/libbpf/_src/relo_core.c b/pkg/plugin/lib/common/libbpf/_src/relo_core.c index ba4453dfd1..63a4d5ad12 100644 --- a/pkg/plugin/lib/common/libbpf/_src/relo_core.c +++ b/pkg/plugin/lib/common/libbpf/_src/relo_core.c @@ -95,6 +95,7 @@ static const char *core_relo_kind_str(enum bpf_core_relo_kind kind) case BPF_CORE_TYPE_ID_LOCAL: return "local_type_id"; case BPF_CORE_TYPE_ID_TARGET: return "target_type_id"; case BPF_CORE_TYPE_EXISTS: return "type_exists"; + case BPF_CORE_TYPE_MATCHES: return "type_matches"; case BPF_CORE_TYPE_SIZE: return "type_size"; case BPF_CORE_ENUMVAL_EXISTS: return "enumval_exists"; case BPF_CORE_ENUMVAL_VALUE: return "enumval_value"; @@ -123,6 +124,7 @@ static bool core_relo_is_type_based(enum bpf_core_relo_kind kind) case BPF_CORE_TYPE_ID_LOCAL: case BPF_CORE_TYPE_ID_TARGET: case BPF_CORE_TYPE_EXISTS: + case BPF_CORE_TYPE_MATCHES: case BPF_CORE_TYPE_SIZE: return true; default: @@ -141,6 +143,86 @@ static bool core_relo_is_enumval_based(enum bpf_core_relo_kind kind) } } +int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id, int level) +{ + const struct btf_type *local_type, *targ_type; + int depth = 32; /* max recursion depth */ + + /* caller made sure that names match (ignoring flavor suffix) */ + local_type = btf_type_by_id(local_btf, local_id); + targ_type = btf_type_by_id(targ_btf, targ_id); + if (!btf_kind_core_compat(local_type, targ_type)) + return 0; + +recur: + depth--; + if (depth < 0) + return -EINVAL; + + local_type = skip_mods_and_typedefs(local_btf, local_id, &local_id); + targ_type = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); + if (!local_type || !targ_type) + return -EINVAL; + + if (!btf_kind_core_compat(local_type, targ_type)) + return 0; + + switch (btf_kind(local_type)) { + case BTF_KIND_UNKN: + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + case BTF_KIND_ENUM: + case BTF_KIND_FWD: + case BTF_KIND_ENUM64: + return 1; + case BTF_KIND_INT: + /* just reject deprecated bitfield-like integers; all other + * integers are by default compatible between each other + */ + return btf_int_offset(local_type) == 0 && btf_int_offset(targ_type) == 0; + case BTF_KIND_PTR: + local_id = local_type->type; + targ_id = targ_type->type; + goto recur; + case BTF_KIND_ARRAY: + local_id = btf_array(local_type)->type; + targ_id = btf_array(targ_type)->type; + goto recur; + case BTF_KIND_FUNC_PROTO: { + struct btf_param *local_p = btf_params(local_type); + struct btf_param *targ_p = btf_params(targ_type); + __u16 local_vlen = btf_vlen(local_type); + __u16 targ_vlen = btf_vlen(targ_type); + int i, err; + + if (local_vlen != targ_vlen) + return 0; + + for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { + if (level <= 0) + return -EINVAL; + + skip_mods_and_typedefs(local_btf, local_p->type, &local_id); + skip_mods_and_typedefs(targ_btf, targ_p->type, &targ_id); + err = __bpf_core_types_are_compat(local_btf, local_id, targ_btf, targ_id, + level - 1); + if (err <= 0) + return err; + } + + /* tail recurse for return type check */ + skip_mods_and_typedefs(local_btf, local_type->type, &local_id); + skip_mods_and_typedefs(targ_btf, targ_type->type, &targ_id); + goto recur; + } + default: + pr_warn("unexpected kind %s relocated, local [%d], target [%d]\n", + btf_kind_str(local_type), local_id, targ_id); + return 0; + } +} + /* * Turn bpf_core_relo into a low- and high-level spec representation, * validating correctness along the way, as well as calculating resulting @@ -167,11 +249,11 @@ static bool core_relo_is_enumval_based(enum bpf_core_relo_kind kind) * just a parsed access string representation): [0, 1, 2, 3]. * * High-level spec will capture only 3 points: - * - intial zero-index access by pointer (&s->... is the same as &s[0]...); + * - initial zero-index access by pointer (&s->... is the same as &s[0]...); * - field 'a' access (corresponds to '2' in low-level spec); * - array element #3 access (corresponds to '3' in low-level spec). * - * Type-based relocations (TYPE_EXISTS/TYPE_SIZE, + * Type-based relocations (TYPE_EXISTS/TYPE_MATCHES/TYPE_SIZE, * TYPE_ID_LOCAL/TYPE_ID_TARGET) don't capture any field information. Their * spec and raw_spec are kept empty. * @@ -186,7 +268,7 @@ int bpf_core_parse_spec(const char *prog_name, const struct btf *btf, struct bpf_core_accessor *acc; const struct btf_type *t; const char *name, *spec_str; - __u32 id; + __u32 id, name_off; __s64 sz; spec_str = btf__name_by_offset(btf, relo->access_str_off); @@ -231,11 +313,13 @@ int bpf_core_parse_spec(const char *prog_name, const struct btf *btf, spec->len++; if (core_relo_is_enumval_based(relo->kind)) { - if (!btf_is_enum(t) || spec->raw_len > 1 || access_idx >= btf_vlen(t)) + if (!btf_is_any_enum(t) || spec->raw_len > 1 || access_idx >= btf_vlen(t)) return -EINVAL; /* record enumerator name in a first accessor */ - acc->name = btf__name_by_offset(btf, btf_enum(t)[access_idx].name_off); + name_off = btf_is_enum(t) ? btf_enum(t)[access_idx].name_off + : btf_enum64(t)[access_idx].name_off; + acc->name = btf__name_by_offset(btf, name_off); return 0; } @@ -340,7 +424,7 @@ static int bpf_core_fields_are_compat(const struct btf *local_btf, if (btf_is_composite(local_type) && btf_is_composite(targ_type)) return 1; - if (btf_kind(local_type) != btf_kind(targ_type)) + if (!btf_kind_core_compat(local_type, targ_type)) return 0; switch (btf_kind(local_type)) { @@ -348,6 +432,7 @@ static int bpf_core_fields_are_compat(const struct btf *local_btf, case BTF_KIND_FLOAT: return 1; case BTF_KIND_FWD: + case BTF_KIND_ENUM64: case BTF_KIND_ENUM: { const char *local_name, *targ_name; size_t local_len, targ_len; @@ -477,6 +562,7 @@ static int bpf_core_spec_match(struct bpf_core_spec *local_spec, const struct bpf_core_accessor *local_acc; struct bpf_core_accessor *targ_acc; int i, sz, matched; + __u32 name_off; memset(targ_spec, 0, sizeof(*targ_spec)); targ_spec->btf = targ_btf; @@ -484,9 +570,14 @@ static int bpf_core_spec_match(struct bpf_core_spec *local_spec, targ_spec->relo_kind = local_spec->relo_kind; if (core_relo_is_type_based(local_spec->relo_kind)) { - return bpf_core_types_are_compat(local_spec->btf, - local_spec->root_type_id, - targ_btf, targ_id); + if (local_spec->relo_kind == BPF_CORE_TYPE_MATCHES) + return bpf_core_types_match(local_spec->btf, + local_spec->root_type_id, + targ_btf, targ_id); + else + return bpf_core_types_are_compat(local_spec->btf, + local_spec->root_type_id, + targ_btf, targ_id); } local_acc = &local_spec->spec[0]; @@ -494,18 +585,22 @@ static int bpf_core_spec_match(struct bpf_core_spec *local_spec, if (core_relo_is_enumval_based(local_spec->relo_kind)) { size_t local_essent_len, targ_essent_len; - const struct btf_enum *e; const char *targ_name; /* has to resolve to an enum */ targ_type = skip_mods_and_typedefs(targ_spec->btf, targ_id, &targ_id); - if (!btf_is_enum(targ_type)) + if (!btf_is_any_enum(targ_type)) return 0; local_essent_len = bpf_core_essential_name_len(local_acc->name); - for (i = 0, e = btf_enum(targ_type); i < btf_vlen(targ_type); i++, e++) { - targ_name = btf__name_by_offset(targ_spec->btf, e->name_off); + for (i = 0; i < btf_vlen(targ_type); i++) { + if (btf_is_enum(targ_type)) + name_off = btf_enum(targ_type)[i].name_off; + else + name_off = btf_enum64(targ_type)[i].name_off; + + targ_name = btf__name_by_offset(targ_spec->btf, name_off); targ_essent_len = bpf_core_essential_name_len(targ_name); if (targ_essent_len != local_essent_len) continue; @@ -583,7 +678,7 @@ static int bpf_core_spec_match(struct bpf_core_spec *local_spec, static int bpf_core_calc_field_relo(const char *prog_name, const struct bpf_core_relo *relo, const struct bpf_core_spec *spec, - __u32 *val, __u32 *field_sz, __u32 *type_id, + __u64 *val, __u32 *field_sz, __u32 *type_id, bool *validate) { const struct bpf_core_accessor *acc; @@ -680,9 +775,8 @@ static int bpf_core_calc_field_relo(const char *prog_name, *val = byte_sz; break; case BPF_CORE_FIELD_SIGNED: - /* enums will be assumed unsigned */ - *val = btf_is_enum(mt) || - (btf_int_encoding(mt) & BTF_INT_SIGNED); + *val = (btf_is_any_enum(mt) && BTF_INFO_KFLAG(mt->info)) || + (btf_is_int(mt) && (btf_int_encoding(mt) & BTF_INT_SIGNED)); if (validate) *validate = true; /* signedness is never ambiguous */ break; @@ -708,7 +802,7 @@ static int bpf_core_calc_field_relo(const char *prog_name, static int bpf_core_calc_type_relo(const struct bpf_core_relo *relo, const struct bpf_core_spec *spec, - __u32 *val, bool *validate) + __u64 *val, bool *validate) { __s64 sz; @@ -732,6 +826,7 @@ static int bpf_core_calc_type_relo(const struct bpf_core_relo *relo, *validate = false; break; case BPF_CORE_TYPE_EXISTS: + case BPF_CORE_TYPE_MATCHES: *val = 1; break; case BPF_CORE_TYPE_SIZE: @@ -751,10 +846,9 @@ static int bpf_core_calc_type_relo(const struct bpf_core_relo *relo, static int bpf_core_calc_enumval_relo(const struct bpf_core_relo *relo, const struct bpf_core_spec *spec, - __u32 *val) + __u64 *val) { const struct btf_type *t; - const struct btf_enum *e; switch (relo->kind) { case BPF_CORE_ENUMVAL_EXISTS: @@ -764,8 +858,10 @@ static int bpf_core_calc_enumval_relo(const struct bpf_core_relo *relo, if (!spec) return -EUCLEAN; /* request instruction poisoning */ t = btf_type_by_id(spec->btf, spec->spec[0].type_id); - e = btf_enum(t) + spec->spec[0].idx; - *val = e->val; + if (btf_is_enum(t)) + *val = btf_enum(t)[spec->spec[0].idx].val; + else + *val = btf_enum64_value(btf_enum64(t) + spec->spec[0].idx); break; default: return -EOPNOTSUPP; @@ -929,7 +1025,7 @@ int bpf_core_patch_insn(const char *prog_name, struct bpf_insn *insn, int insn_idx, const struct bpf_core_relo *relo, int relo_idx, const struct bpf_core_relo_res *res) { - __u32 orig_val, new_val; + __u64 orig_val, new_val; __u8 class; class = BPF_CLASS(insn->code); @@ -954,28 +1050,30 @@ int bpf_core_patch_insn(const char *prog_name, struct bpf_insn *insn, if (BPF_SRC(insn->code) != BPF_K) return -EINVAL; if (res->validate && insn->imm != orig_val) { - pr_warn("prog '%s': relo #%d: unexpected insn #%d (ALU/ALU64) value: got %u, exp %u -> %u\n", + pr_warn("prog '%s': relo #%d: unexpected insn #%d (ALU/ALU64) value: got %u, exp %llu -> %llu\n", prog_name, relo_idx, - insn_idx, insn->imm, orig_val, new_val); + insn_idx, insn->imm, (unsigned long long)orig_val, + (unsigned long long)new_val); return -EINVAL; } orig_val = insn->imm; insn->imm = new_val; - pr_debug("prog '%s': relo #%d: patched insn #%d (ALU/ALU64) imm %u -> %u\n", + pr_debug("prog '%s': relo #%d: patched insn #%d (ALU/ALU64) imm %llu -> %llu\n", prog_name, relo_idx, insn_idx, - orig_val, new_val); + (unsigned long long)orig_val, (unsigned long long)new_val); break; case BPF_LDX: case BPF_ST: case BPF_STX: if (res->validate && insn->off != orig_val) { - pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDX/ST/STX) value: got %u, exp %u -> %u\n", - prog_name, relo_idx, insn_idx, insn->off, orig_val, new_val); + pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDX/ST/STX) value: got %u, exp %llu -> %llu\n", + prog_name, relo_idx, insn_idx, insn->off, (unsigned long long)orig_val, + (unsigned long long)new_val); return -EINVAL; } if (new_val > SHRT_MAX) { - pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) value too big: %u\n", - prog_name, relo_idx, insn_idx, new_val); + pr_warn("prog '%s': relo #%d: insn #%d (LDX/ST/STX) value too big: %llu\n", + prog_name, relo_idx, insn_idx, (unsigned long long)new_val); return -ERANGE; } if (res->fail_memsz_adjust) { @@ -987,8 +1085,9 @@ int bpf_core_patch_insn(const char *prog_name, struct bpf_insn *insn, orig_val = insn->off; insn->off = new_val; - pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) off %u -> %u\n", - prog_name, relo_idx, insn_idx, orig_val, new_val); + pr_debug("prog '%s': relo #%d: patched insn #%d (LDX/ST/STX) off %llu -> %llu\n", + prog_name, relo_idx, insn_idx, (unsigned long long)orig_val, + (unsigned long long)new_val); if (res->new_sz != res->orig_sz) { int insn_bytes_sz, insn_bpf_sz; @@ -1024,20 +1123,20 @@ int bpf_core_patch_insn(const char *prog_name, struct bpf_insn *insn, return -EINVAL; } - imm = insn[0].imm + ((__u64)insn[1].imm << 32); + imm = (__u32)insn[0].imm | ((__u64)insn[1].imm << 32); if (res->validate && imm != orig_val) { - pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDIMM64) value: got %llu, exp %u -> %u\n", + pr_warn("prog '%s': relo #%d: unexpected insn #%d (LDIMM64) value: got %llu, exp %llu -> %llu\n", prog_name, relo_idx, insn_idx, (unsigned long long)imm, - orig_val, new_val); + (unsigned long long)orig_val, (unsigned long long)new_val); return -EINVAL; } insn[0].imm = new_val; - insn[1].imm = 0; /* currently only 32-bit values are supported */ - pr_debug("prog '%s': relo #%d: patched insn #%d (LDIMM64) imm64 %llu -> %u\n", + insn[1].imm = new_val >> 32; + pr_debug("prog '%s': relo #%d: patched insn #%d (LDIMM64) imm64 %llu -> %llu\n", prog_name, relo_idx, insn_idx, - (unsigned long long)imm, new_val); + (unsigned long long)imm, (unsigned long long)new_val); break; } default: @@ -1057,7 +1156,6 @@ int bpf_core_patch_insn(const char *prog_name, struct bpf_insn *insn, int bpf_core_format_spec(char *buf, size_t buf_sz, const struct bpf_core_spec *spec) { const struct btf_type *t; - const struct btf_enum *e; const char *s; __u32 type_id; int i, len = 0; @@ -1086,10 +1184,23 @@ int bpf_core_format_spec(char *buf, size_t buf_sz, const struct bpf_core_spec *s if (core_relo_is_enumval_based(spec->relo_kind)) { t = skip_mods_and_typedefs(spec->btf, type_id, NULL); - e = btf_enum(t) + spec->raw_spec[0]; - s = btf__name_by_offset(spec->btf, e->name_off); + if (btf_is_enum(t)) { + const struct btf_enum *e; + const char *fmt_str; + + e = btf_enum(t) + spec->raw_spec[0]; + s = btf__name_by_offset(spec->btf, e->name_off); + fmt_str = BTF_INFO_KFLAG(t->info) ? "::%s = %d" : "::%s = %u"; + append_buf(fmt_str, s, e->val); + } else { + const struct btf_enum64 *e; + const char *fmt_str; - append_buf("::%s = %u", s, e->val); + e = btf_enum64(t) + spec->raw_spec[0]; + s = btf__name_by_offset(spec->btf, e->name_off); + fmt_str = BTF_INFO_KFLAG(t->info) ? "::%s = %lld" : "::%s = %llu"; + append_buf(fmt_str, s, (unsigned long long)btf_enum64_value(e)); + } return len; } @@ -1148,11 +1259,11 @@ int bpf_core_format_spec(char *buf, size_t buf_sz, const struct bpf_core_spec *s * 3. It is supported and expected that there might be multiple flavors * matching the spec. As long as all the specs resolve to the same set of * offsets across all candidates, there is no error. If there is any - * ambiguity, CO-RE relocation will fail. This is necessary to accomodate - * imprefection of BTF deduplication, which can cause slight duplication of + * ambiguity, CO-RE relocation will fail. This is necessary to accommodate + * imperfection of BTF deduplication, which can cause slight duplication of * the same BTF type, if some directly or indirectly referenced (by * pointer) type gets resolved to different actual types in different - * object files. If such situation occurs, deduplicated BTF will end up + * object files. If such a situation occurs, deduplicated BTF will end up * with two (or more) structurally identical types, which differ only in * types they refer to through pointer. This should be OK in most cases and * is not an error. @@ -1261,10 +1372,12 @@ int bpf_core_calc_relo_insn(const char *prog_name, * decision and value, otherwise it's dangerous to * proceed due to ambiguity */ - pr_warn("prog '%s': relo #%d: relocation decision ambiguity: %s %u != %s %u\n", + pr_warn("prog '%s': relo #%d: relocation decision ambiguity: %s %llu != %s %llu\n", prog_name, relo_idx, - cand_res.poison ? "failure" : "success", cand_res.new_val, - targ_res->poison ? "failure" : "success", targ_res->new_val); + cand_res.poison ? "failure" : "success", + (unsigned long long)cand_res.new_val, + targ_res->poison ? "failure" : "success", + (unsigned long long)targ_res->new_val); return -EINVAL; } @@ -1305,3 +1418,270 @@ int bpf_core_calc_relo_insn(const char *prog_name, return 0; } + +static bool bpf_core_names_match(const struct btf *local_btf, size_t local_name_off, + const struct btf *targ_btf, size_t targ_name_off) +{ + const char *local_n, *targ_n; + size_t local_len, targ_len; + + local_n = btf__name_by_offset(local_btf, local_name_off); + targ_n = btf__name_by_offset(targ_btf, targ_name_off); + + if (str_is_empty(targ_n)) + return str_is_empty(local_n); + + targ_len = bpf_core_essential_name_len(targ_n); + local_len = bpf_core_essential_name_len(local_n); + + return targ_len == local_len && strncmp(local_n, targ_n, local_len) == 0; +} + +static int bpf_core_enums_match(const struct btf *local_btf, const struct btf_type *local_t, + const struct btf *targ_btf, const struct btf_type *targ_t) +{ + __u16 local_vlen = btf_vlen(local_t); + __u16 targ_vlen = btf_vlen(targ_t); + int i, j; + + if (local_t->size != targ_t->size) + return 0; + + if (local_vlen > targ_vlen) + return 0; + + /* iterate over the local enum's variants and make sure each has + * a symbolic name correspondent in the target + */ + for (i = 0; i < local_vlen; i++) { + bool matched = false; + __u32 local_n_off, targ_n_off; + + local_n_off = btf_is_enum(local_t) ? btf_enum(local_t)[i].name_off : + btf_enum64(local_t)[i].name_off; + + for (j = 0; j < targ_vlen; j++) { + targ_n_off = btf_is_enum(targ_t) ? btf_enum(targ_t)[j].name_off : + btf_enum64(targ_t)[j].name_off; + + if (bpf_core_names_match(local_btf, local_n_off, targ_btf, targ_n_off)) { + matched = true; + break; + } + } + + if (!matched) + return 0; + } + return 1; +} + +static int bpf_core_composites_match(const struct btf *local_btf, const struct btf_type *local_t, + const struct btf *targ_btf, const struct btf_type *targ_t, + bool behind_ptr, int level) +{ + const struct btf_member *local_m = btf_members(local_t); + __u16 local_vlen = btf_vlen(local_t); + __u16 targ_vlen = btf_vlen(targ_t); + int i, j, err; + + if (local_vlen > targ_vlen) + return 0; + + /* check that all local members have a match in the target */ + for (i = 0; i < local_vlen; i++, local_m++) { + const struct btf_member *targ_m = btf_members(targ_t); + bool matched = false; + + for (j = 0; j < targ_vlen; j++, targ_m++) { + if (!bpf_core_names_match(local_btf, local_m->name_off, + targ_btf, targ_m->name_off)) + continue; + + err = __bpf_core_types_match(local_btf, local_m->type, targ_btf, + targ_m->type, behind_ptr, level - 1); + if (err < 0) + return err; + if (err > 0) { + matched = true; + break; + } + } + + if (!matched) + return 0; + } + return 1; +} + +/* Check that two types "match". This function assumes that root types were + * already checked for name match. + * + * The matching relation is defined as follows: + * - modifiers and typedefs are stripped (and, hence, effectively ignored) + * - generally speaking types need to be of same kind (struct vs. struct, union + * vs. union, etc.) + * - exceptions are struct/union behind a pointer which could also match a + * forward declaration of a struct or union, respectively, and enum vs. + * enum64 (see below) + * Then, depending on type: + * - integers: + * - match if size and signedness match + * - arrays & pointers: + * - target types are recursively matched + * - structs & unions: + * - local members need to exist in target with the same name + * - for each member we recursively check match unless it is already behind a + * pointer, in which case we only check matching names and compatible kind + * - enums: + * - local variants have to have a match in target by symbolic name (but not + * numeric value) + * - size has to match (but enum may match enum64 and vice versa) + * - function pointers: + * - number and position of arguments in local type has to match target + * - for each argument and the return value we recursively check match + */ +int __bpf_core_types_match(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, + __u32 targ_id, bool behind_ptr, int level) +{ + const struct btf_type *local_t, *targ_t; + int depth = 32; /* max recursion depth */ + __u16 local_k, targ_k; + + if (level <= 0) + return -EINVAL; + +recur: + depth--; + if (depth < 0) + return -EINVAL; + + local_t = skip_mods_and_typedefs(local_btf, local_id, &local_id); + targ_t = skip_mods_and_typedefs(targ_btf, targ_id, &targ_id); + if (!local_t || !targ_t) + return -EINVAL; + + /* While the name check happens after typedefs are skipped, root-level + * typedefs would still be name-matched as that's the contract with + * callers. + */ + if (!bpf_core_names_match(local_btf, local_t->name_off, targ_btf, targ_t->name_off)) + return 0; + + local_k = btf_kind(local_t); + targ_k = btf_kind(targ_t); + + switch (local_k) { + case BTF_KIND_UNKN: + return local_k == targ_k; + case BTF_KIND_FWD: { + bool local_f = BTF_INFO_KFLAG(local_t->info); + + if (behind_ptr) { + if (local_k == targ_k) + return local_f == BTF_INFO_KFLAG(targ_t->info); + + /* for forward declarations kflag dictates whether the + * target is a struct (0) or union (1) + */ + return (targ_k == BTF_KIND_STRUCT && !local_f) || + (targ_k == BTF_KIND_UNION && local_f); + } else { + if (local_k != targ_k) + return 0; + + /* match if the forward declaration is for the same kind */ + return local_f == BTF_INFO_KFLAG(targ_t->info); + } + } + case BTF_KIND_ENUM: + case BTF_KIND_ENUM64: + if (!btf_is_any_enum(targ_t)) + return 0; + + return bpf_core_enums_match(local_btf, local_t, targ_btf, targ_t); + case BTF_KIND_STRUCT: + case BTF_KIND_UNION: + if (behind_ptr) { + bool targ_f = BTF_INFO_KFLAG(targ_t->info); + + if (local_k == targ_k) + return 1; + + if (targ_k != BTF_KIND_FWD) + return 0; + + return (local_k == BTF_KIND_UNION) == targ_f; + } else { + if (local_k != targ_k) + return 0; + + return bpf_core_composites_match(local_btf, local_t, targ_btf, targ_t, + behind_ptr, level); + } + case BTF_KIND_INT: { + __u8 local_sgn; + __u8 targ_sgn; + + if (local_k != targ_k) + return 0; + + local_sgn = btf_int_encoding(local_t) & BTF_INT_SIGNED; + targ_sgn = btf_int_encoding(targ_t) & BTF_INT_SIGNED; + + return local_t->size == targ_t->size && local_sgn == targ_sgn; + } + case BTF_KIND_PTR: + if (local_k != targ_k) + return 0; + + behind_ptr = true; + + local_id = local_t->type; + targ_id = targ_t->type; + goto recur; + case BTF_KIND_ARRAY: { + const struct btf_array *local_array = btf_array(local_t); + const struct btf_array *targ_array = btf_array(targ_t); + + if (local_k != targ_k) + return 0; + + if (local_array->nelems != targ_array->nelems) + return 0; + + local_id = local_array->type; + targ_id = targ_array->type; + goto recur; + } + case BTF_KIND_FUNC_PROTO: { + struct btf_param *local_p = btf_params(local_t); + struct btf_param *targ_p = btf_params(targ_t); + __u16 local_vlen = btf_vlen(local_t); + __u16 targ_vlen = btf_vlen(targ_t); + int i, err; + + if (local_k != targ_k) + return 0; + + if (local_vlen != targ_vlen) + return 0; + + for (i = 0; i < local_vlen; i++, local_p++, targ_p++) { + err = __bpf_core_types_match(local_btf, local_p->type, targ_btf, + targ_p->type, behind_ptr, level - 1); + if (err <= 0) + return err; + } + + /* tail recurse for return type check */ + local_id = local_t->type; + targ_id = targ_t->type; + goto recur; + } + default: + pr_warn("unexpected kind %s relocated, local [%d], target [%d]\n", + btf_kind_str(local_t), local_id, targ_id); + return 0; + } +} diff --git a/pkg/plugin/lib/common/libbpf/_src/relo_core.h b/pkg/plugin/lib/common/libbpf/_src/relo_core.h index 073039d8ca..1c0566daf8 100644 --- a/pkg/plugin/lib/common/libbpf/_src/relo_core.h +++ b/pkg/plugin/lib/common/libbpf/_src/relo_core.h @@ -46,9 +46,9 @@ struct bpf_core_spec { struct bpf_core_relo_res { /* expected value in the instruction, unless validate == false */ - __u32 orig_val; + __u64 orig_val; /* new value that needs to be patched up to */ - __u32 new_val; + __u64 new_val; /* relocation unsuccessful, poison instruction, but don't fail load */ bool poison; /* some relocations can't be validated against orig_val */ @@ -68,8 +68,14 @@ struct bpf_core_relo_res { __u32 new_type_id; }; +int __bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, + const struct btf *targ_btf, __u32 targ_id, int level); int bpf_core_types_are_compat(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, __u32 targ_id); +int __bpf_core_types_match(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, + __u32 targ_id, bool behind_ptr, int level); +int bpf_core_types_match(const struct btf *local_btf, __u32 local_id, const struct btf *targ_btf, + __u32 targ_id); size_t bpf_core_essential_name_len(const char *name); diff --git a/pkg/plugin/lib/common/libbpf/_src/ringbuf.c b/pkg/plugin/lib/common/libbpf/_src/ringbuf.c index 8bc117bcc7..bfd8dac4c0 100644 --- a/pkg/plugin/lib/common/libbpf/_src/ringbuf.c +++ b/pkg/plugin/lib/common/libbpf/_src/ringbuf.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "libbpf.h" #include "libbpf_internal.h" @@ -33,13 +34,30 @@ struct ring { struct ring_buffer { struct epoll_event *events; - struct ring *rings; + struct ring **rings; size_t page_size; int epoll_fd; int ring_cnt; }; -static void ringbuf_unmap_ring(struct ring_buffer *rb, struct ring *r) +struct user_ring_buffer { + struct epoll_event event; + unsigned long *consumer_pos; + unsigned long *producer_pos; + void *data; + unsigned long mask; + size_t page_size; + int map_fd; + int epoll_fd; +}; + +/* 8-byte ring buffer header structure */ +struct ringbuf_hdr { + __u32 len; + __u32 pad; +}; + +static void ringbuf_free_ring(struct ring_buffer *rb, struct ring *r) { if (r->consumer_pos) { munmap(r->consumer_pos, rb->page_size); @@ -49,6 +67,8 @@ static void ringbuf_unmap_ring(struct ring_buffer *rb, struct ring *r) munmap(r->producer_pos, rb->page_size + 2 * (r->mask + 1)); r->producer_pos = NULL; } + + free(r); } /* Add extra RINGBUF maps to this ring buffer manager */ @@ -59,12 +79,13 @@ int ring_buffer__add(struct ring_buffer *rb, int map_fd, __u32 len = sizeof(info); struct epoll_event *e; struct ring *r; + __u64 mmap_sz; void *tmp; int err; memset(&info, 0, sizeof(info)); - err = bpf_obj_get_info_by_fd(map_fd, &info, &len); + err = bpf_map_get_info_by_fd(map_fd, &info, &len); if (err) { err = -errno; pr_warn("ringbuf: failed to get map info for fd=%d: %d\n", @@ -88,8 +109,10 @@ int ring_buffer__add(struct ring_buffer *rb, int map_fd, return libbpf_err(-ENOMEM); rb->events = tmp; - r = &rb->rings[rb->ring_cnt]; - memset(r, 0, sizeof(*r)); + r = calloc(1, sizeof(*r)); + if (!r) + return libbpf_err(-ENOMEM); + rb->rings[rb->ring_cnt] = r; r->map_fd = map_fd; r->sample_cb = sample_cb; @@ -97,28 +120,31 @@ int ring_buffer__add(struct ring_buffer *rb, int map_fd, r->mask = info.max_entries - 1; /* Map writable consumer page */ - tmp = mmap(NULL, rb->page_size, PROT_READ | PROT_WRITE, MAP_SHARED, - map_fd, 0); + tmp = mmap(NULL, rb->page_size, PROT_READ | PROT_WRITE, MAP_SHARED, map_fd, 0); if (tmp == MAP_FAILED) { err = -errno; pr_warn("ringbuf: failed to mmap consumer page for map fd=%d: %d\n", map_fd, err); - return libbpf_err(err); + goto err_out; } r->consumer_pos = tmp; /* Map read-only producer page and data pages. We map twice as big * data size to allow simple reading of samples that wrap around the * end of a ring buffer. See kernel implementation for details. - * */ - tmp = mmap(NULL, rb->page_size + 2 * info.max_entries, PROT_READ, - MAP_SHARED, map_fd, rb->page_size); + */ + mmap_sz = rb->page_size + 2 * (__u64)info.max_entries; + if (mmap_sz != (__u64)(size_t)mmap_sz) { + err = -E2BIG; + pr_warn("ringbuf: ring buffer size (%u) is too big\n", info.max_entries); + goto err_out; + } + tmp = mmap(NULL, (size_t)mmap_sz, PROT_READ, MAP_SHARED, map_fd, rb->page_size); if (tmp == MAP_FAILED) { err = -errno; - ringbuf_unmap_ring(rb, r); pr_warn("ringbuf: failed to mmap data pages for map fd=%d: %d\n", map_fd, err); - return libbpf_err(err); + goto err_out; } r->producer_pos = tmp; r->data = tmp + rb->page_size; @@ -130,14 +156,17 @@ int ring_buffer__add(struct ring_buffer *rb, int map_fd, e->data.fd = rb->ring_cnt; if (epoll_ctl(rb->epoll_fd, EPOLL_CTL_ADD, map_fd, e) < 0) { err = -errno; - ringbuf_unmap_ring(rb, r); pr_warn("ringbuf: failed to epoll add map fd=%d: %d\n", map_fd, err); - return libbpf_err(err); + goto err_out; } rb->ring_cnt++; return 0; + +err_out: + ringbuf_free_ring(rb, r); + return libbpf_err(err); } void ring_buffer__free(struct ring_buffer *rb) @@ -148,7 +177,7 @@ void ring_buffer__free(struct ring_buffer *rb) return; for (i = 0; i < rb->ring_cnt; ++i) - ringbuf_unmap_ring(rb, &rb->rings[i]); + ringbuf_free_ring(rb, rb->rings[i]); if (rb->epoll_fd >= 0) close(rb->epoll_fd); @@ -202,7 +231,7 @@ static inline int roundup_len(__u32 len) return (len + 7) / 8 * 8; } -static int64_t ringbuf_process_ring(struct ring* r) +static int64_t ringbuf_process_ring(struct ring *r, size_t n) { int *len_ptr, len, err; /* 64-bit to avoid overflow in case of extreme application behavior */ @@ -239,12 +268,42 @@ static int64_t ringbuf_process_ring(struct ring* r) } smp_store_release(r->consumer_pos, cons_pos); + + if (cnt >= n) + goto done; } } while (got_new_data); done: return cnt; } +/* Consume available ring buffer(s) data without event polling, up to n + * records. + * + * Returns number of records consumed across all registered ring buffers (or + * n, whichever is less), or negative number if any of the callbacks return + * error. + */ +int ring_buffer__consume_n(struct ring_buffer *rb, size_t n) +{ + int64_t err, res = 0; + int i; + + for (i = 0; i < rb->ring_cnt; i++) { + struct ring *ring = rb->rings[i]; + + err = ringbuf_process_ring(ring, n); + if (err < 0) + return libbpf_err(err); + res += err; + n -= err; + + if (n == 0) + break; + } + return res > INT_MAX ? INT_MAX : res; +} + /* Consume available ring buffer(s) data without event polling. * Returns number of records consumed across all registered ring buffers (or * INT_MAX, whichever is less), or negative number if any of the callbacks @@ -256,15 +315,17 @@ int ring_buffer__consume(struct ring_buffer *rb) int i; for (i = 0; i < rb->ring_cnt; i++) { - struct ring *ring = &rb->rings[i]; + struct ring *ring = rb->rings[i]; - err = ringbuf_process_ring(ring); + err = ringbuf_process_ring(ring, INT_MAX); if (err < 0) return libbpf_err(err); res += err; + if (res > INT_MAX) { + res = INT_MAX; + break; + } } - if (res > INT_MAX) - return INT_MAX; return res; } @@ -283,15 +344,15 @@ int ring_buffer__poll(struct ring_buffer *rb, int timeout_ms) for (i = 0; i < cnt; i++) { __u32 ring_id = rb->events[i].data.fd; - struct ring *ring = &rb->rings[ring_id]; + struct ring *ring = rb->rings[ring_id]; - err = ringbuf_process_ring(ring); + err = ringbuf_process_ring(ring, INT_MAX); if (err < 0) return libbpf_err(err); res += err; } if (res > INT_MAX) - return INT_MAX; + res = INT_MAX; return res; } @@ -300,3 +361,323 @@ int ring_buffer__epoll_fd(const struct ring_buffer *rb) { return rb->epoll_fd; } + +struct ring *ring_buffer__ring(struct ring_buffer *rb, unsigned int idx) +{ + if (idx >= rb->ring_cnt) + return errno = ERANGE, NULL; + + return rb->rings[idx]; +} + +unsigned long ring__consumer_pos(const struct ring *r) +{ + /* Synchronizes with smp_store_release() in ringbuf_process_ring(). */ + return smp_load_acquire(r->consumer_pos); +} + +unsigned long ring__producer_pos(const struct ring *r) +{ + /* Synchronizes with smp_store_release() in __bpf_ringbuf_reserve() in + * the kernel. + */ + return smp_load_acquire(r->producer_pos); +} + +size_t ring__avail_data_size(const struct ring *r) +{ + unsigned long cons_pos, prod_pos; + + cons_pos = ring__consumer_pos(r); + prod_pos = ring__producer_pos(r); + return prod_pos - cons_pos; +} + +size_t ring__size(const struct ring *r) +{ + return r->mask + 1; +} + +int ring__map_fd(const struct ring *r) +{ + return r->map_fd; +} + +int ring__consume_n(struct ring *r, size_t n) +{ + int64_t res; + + res = ringbuf_process_ring(r, n); + if (res < 0) + return libbpf_err(res); + + return res > INT_MAX ? INT_MAX : res; +} + +int ring__consume(struct ring *r) +{ + return ring__consume_n(r, INT_MAX); +} + +static void user_ringbuf_unmap_ring(struct user_ring_buffer *rb) +{ + if (rb->consumer_pos) { + munmap(rb->consumer_pos, rb->page_size); + rb->consumer_pos = NULL; + } + if (rb->producer_pos) { + munmap(rb->producer_pos, rb->page_size + 2 * (rb->mask + 1)); + rb->producer_pos = NULL; + } +} + +void user_ring_buffer__free(struct user_ring_buffer *rb) +{ + if (!rb) + return; + + user_ringbuf_unmap_ring(rb); + + if (rb->epoll_fd >= 0) + close(rb->epoll_fd); + + free(rb); +} + +static int user_ringbuf_map(struct user_ring_buffer *rb, int map_fd) +{ + struct bpf_map_info info; + __u32 len = sizeof(info); + __u64 mmap_sz; + void *tmp; + struct epoll_event *rb_epoll; + int err; + + memset(&info, 0, sizeof(info)); + + err = bpf_map_get_info_by_fd(map_fd, &info, &len); + if (err) { + err = -errno; + pr_warn("user ringbuf: failed to get map info for fd=%d: %d\n", map_fd, err); + return err; + } + + if (info.type != BPF_MAP_TYPE_USER_RINGBUF) { + pr_warn("user ringbuf: map fd=%d is not BPF_MAP_TYPE_USER_RINGBUF\n", map_fd); + return -EINVAL; + } + + rb->map_fd = map_fd; + rb->mask = info.max_entries - 1; + + /* Map read-only consumer page */ + tmp = mmap(NULL, rb->page_size, PROT_READ, MAP_SHARED, map_fd, 0); + if (tmp == MAP_FAILED) { + err = -errno; + pr_warn("user ringbuf: failed to mmap consumer page for map fd=%d: %d\n", + map_fd, err); + return err; + } + rb->consumer_pos = tmp; + + /* Map read-write the producer page and data pages. We map the data + * region as twice the total size of the ring buffer to allow the + * simple reading and writing of samples that wrap around the end of + * the buffer. See the kernel implementation for details. + */ + mmap_sz = rb->page_size + 2 * (__u64)info.max_entries; + if (mmap_sz != (__u64)(size_t)mmap_sz) { + pr_warn("user ringbuf: ring buf size (%u) is too big\n", info.max_entries); + return -E2BIG; + } + tmp = mmap(NULL, (size_t)mmap_sz, PROT_READ | PROT_WRITE, MAP_SHARED, + map_fd, rb->page_size); + if (tmp == MAP_FAILED) { + err = -errno; + pr_warn("user ringbuf: failed to mmap data pages for map fd=%d: %d\n", + map_fd, err); + return err; + } + + rb->producer_pos = tmp; + rb->data = tmp + rb->page_size; + + rb_epoll = &rb->event; + rb_epoll->events = EPOLLOUT; + if (epoll_ctl(rb->epoll_fd, EPOLL_CTL_ADD, map_fd, rb_epoll) < 0) { + err = -errno; + pr_warn("user ringbuf: failed to epoll add map fd=%d: %d\n", map_fd, err); + return err; + } + + return 0; +} + +struct user_ring_buffer * +user_ring_buffer__new(int map_fd, const struct user_ring_buffer_opts *opts) +{ + struct user_ring_buffer *rb; + int err; + + if (!OPTS_VALID(opts, user_ring_buffer_opts)) + return errno = EINVAL, NULL; + + rb = calloc(1, sizeof(*rb)); + if (!rb) + return errno = ENOMEM, NULL; + + rb->page_size = getpagesize(); + + rb->epoll_fd = epoll_create1(EPOLL_CLOEXEC); + if (rb->epoll_fd < 0) { + err = -errno; + pr_warn("user ringbuf: failed to create epoll instance: %d\n", err); + goto err_out; + } + + err = user_ringbuf_map(rb, map_fd); + if (err) + goto err_out; + + return rb; + +err_out: + user_ring_buffer__free(rb); + return errno = -err, NULL; +} + +static void user_ringbuf_commit(struct user_ring_buffer *rb, void *sample, bool discard) +{ + __u32 new_len; + struct ringbuf_hdr *hdr; + uintptr_t hdr_offset; + + hdr_offset = rb->mask + 1 + (sample - rb->data) - BPF_RINGBUF_HDR_SZ; + hdr = rb->data + (hdr_offset & rb->mask); + + new_len = hdr->len & ~BPF_RINGBUF_BUSY_BIT; + if (discard) + new_len |= BPF_RINGBUF_DISCARD_BIT; + + /* Synchronizes with smp_load_acquire() in __bpf_user_ringbuf_peek() in + * the kernel. + */ + __atomic_exchange_n(&hdr->len, new_len, __ATOMIC_ACQ_REL); +} + +void user_ring_buffer__discard(struct user_ring_buffer *rb, void *sample) +{ + user_ringbuf_commit(rb, sample, true); +} + +void user_ring_buffer__submit(struct user_ring_buffer *rb, void *sample) +{ + user_ringbuf_commit(rb, sample, false); +} + +void *user_ring_buffer__reserve(struct user_ring_buffer *rb, __u32 size) +{ + __u32 avail_size, total_size, max_size; + /* 64-bit to avoid overflow in case of extreme application behavior */ + __u64 cons_pos, prod_pos; + struct ringbuf_hdr *hdr; + + /* The top two bits are used as special flags */ + if (size & (BPF_RINGBUF_BUSY_BIT | BPF_RINGBUF_DISCARD_BIT)) + return errno = E2BIG, NULL; + + /* Synchronizes with smp_store_release() in __bpf_user_ringbuf_peek() in + * the kernel. + */ + cons_pos = smp_load_acquire(rb->consumer_pos); + /* Synchronizes with smp_store_release() in user_ringbuf_commit() */ + prod_pos = smp_load_acquire(rb->producer_pos); + + max_size = rb->mask + 1; + avail_size = max_size - (prod_pos - cons_pos); + /* Round up total size to a multiple of 8. */ + total_size = (size + BPF_RINGBUF_HDR_SZ + 7) / 8 * 8; + + if (total_size > max_size) + return errno = E2BIG, NULL; + + if (avail_size < total_size) + return errno = ENOSPC, NULL; + + hdr = rb->data + (prod_pos & rb->mask); + hdr->len = size | BPF_RINGBUF_BUSY_BIT; + hdr->pad = 0; + + /* Synchronizes with smp_load_acquire() in __bpf_user_ringbuf_peek() in + * the kernel. + */ + smp_store_release(rb->producer_pos, prod_pos + total_size); + + return (void *)rb->data + ((prod_pos + BPF_RINGBUF_HDR_SZ) & rb->mask); +} + +static __u64 ns_elapsed_timespec(const struct timespec *start, const struct timespec *end) +{ + __u64 start_ns, end_ns, ns_per_s = 1000000000; + + start_ns = (__u64)start->tv_sec * ns_per_s + start->tv_nsec; + end_ns = (__u64)end->tv_sec * ns_per_s + end->tv_nsec; + + return end_ns - start_ns; +} + +void *user_ring_buffer__reserve_blocking(struct user_ring_buffer *rb, __u32 size, int timeout_ms) +{ + void *sample; + int err, ms_remaining = timeout_ms; + struct timespec start; + + if (timeout_ms < 0 && timeout_ms != -1) + return errno = EINVAL, NULL; + + if (timeout_ms != -1) { + err = clock_gettime(CLOCK_MONOTONIC, &start); + if (err) + return NULL; + } + + do { + int cnt, ms_elapsed; + struct timespec curr; + __u64 ns_per_ms = 1000000; + + sample = user_ring_buffer__reserve(rb, size); + if (sample) + return sample; + else if (errno != ENOSPC) + return NULL; + + /* The kernel guarantees at least one event notification + * delivery whenever at least one sample is drained from the + * ring buffer in an invocation to bpf_ringbuf_drain(). Other + * additional events may be delivered at any time, but only one + * event is guaranteed per bpf_ringbuf_drain() invocation, + * provided that a sample is drained, and the BPF program did + * not pass BPF_RB_NO_WAKEUP to bpf_ringbuf_drain(). If + * BPF_RB_FORCE_WAKEUP is passed to bpf_ringbuf_drain(), a + * wakeup event will be delivered even if no samples are + * drained. + */ + cnt = epoll_wait(rb->epoll_fd, &rb->event, 1, ms_remaining); + if (cnt < 0) + return NULL; + + if (timeout_ms == -1) + continue; + + err = clock_gettime(CLOCK_MONOTONIC, &curr); + if (err) + return NULL; + + ms_elapsed = ns_elapsed_timespec(&start, &curr) / ns_per_ms; + ms_remaining = timeout_ms - ms_elapsed; + } while (ms_remaining > 0); + + /* Try one more time to reserve a sample after the specified timeout has elapsed. */ + return user_ring_buffer__reserve(rb, size); +} diff --git a/pkg/plugin/lib/common/libbpf/_src/skel_internal.h b/pkg/plugin/lib/common/libbpf/_src/skel_internal.h index bd6f4505e7..1e82ab06c3 100644 --- a/pkg/plugin/lib/common/libbpf/_src/skel_internal.h +++ b/pkg/plugin/lib/common/libbpf/_src/skel_internal.h @@ -66,13 +66,13 @@ struct bpf_load_and_run_opts { const char *errstr; }; -long bpf_sys_bpf(__u32 cmd, void *attr, __u32 attr_size); +long kern_sys_bpf(__u32 cmd, void *attr, __u32 attr_size); static inline int skel_sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size) { #ifdef __KERNEL__ - return bpf_sys_bpf(cmd, attr, size); + return kern_sys_bpf(cmd, attr, size); #else return syscall(__NR_bpf, cmd, attr, size); #endif @@ -251,6 +251,29 @@ static inline int skel_map_update_elem(int fd, const void *key, return skel_sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, attr_sz); } +static inline int skel_map_delete_elem(int fd, const void *key) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + attr.map_fd = fd; + attr.key = (long)key; + + return skel_sys_bpf(BPF_MAP_DELETE_ELEM, &attr, attr_sz); +} + +static inline int skel_map_get_fd_by_id(__u32 id) +{ + const size_t attr_sz = offsetofend(union bpf_attr, flags); + union bpf_attr attr; + + memset(&attr, 0, attr_sz); + attr.map_id = id; + + return skel_sys_bpf(BPF_MAP_GET_FD_BY_ID, &attr, attr_sz); +} + static inline int skel_raw_tracepoint_open(const char *name, int prog_fd) { const size_t attr_sz = offsetofend(union bpf_attr, raw_tracepoint.prog_fd); @@ -285,6 +308,8 @@ static inline int skel_link_create(int prog_fd, int target_fd, static inline int bpf_load_and_run(struct bpf_load_and_run_opts *opts) { + const size_t prog_load_attr_sz = offsetofend(union bpf_attr, fd_array); + const size_t test_run_attr_sz = offsetofend(union bpf_attr, test); int map_fd = -1, prog_fd = -1, key = 0, err; union bpf_attr attr; @@ -302,7 +327,7 @@ static inline int bpf_load_and_run(struct bpf_load_and_run_opts *opts) goto out; } - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, prog_load_attr_sz); attr.prog_type = BPF_PROG_TYPE_SYSCALL; attr.insns = (long) opts->insns; attr.insn_cnt = opts->insns_sz / sizeof(struct bpf_insn); @@ -313,18 +338,18 @@ static inline int bpf_load_and_run(struct bpf_load_and_run_opts *opts) attr.log_size = opts->ctx->log_size; attr.log_buf = opts->ctx->log_buf; attr.prog_flags = BPF_F_SLEEPABLE; - err = prog_fd = skel_sys_bpf(BPF_PROG_LOAD, &attr, sizeof(attr)); + err = prog_fd = skel_sys_bpf(BPF_PROG_LOAD, &attr, prog_load_attr_sz); if (prog_fd < 0) { opts->errstr = "failed to load loader prog"; set_err; goto out; } - memset(&attr, 0, sizeof(attr)); + memset(&attr, 0, test_run_attr_sz); attr.test.prog_fd = prog_fd; attr.test.ctx_in = (long) opts->ctx; attr.test.ctx_size_in = opts->ctx->sz; - err = skel_sys_bpf(BPF_PROG_RUN, &attr, sizeof(attr)); + err = skel_sys_bpf(BPF_PROG_RUN, &attr, test_run_attr_sz); if (err < 0 || (int)attr.test.retval < 0) { opts->errstr = "failed to execute loader prog"; if (err < 0) { diff --git a/pkg/plugin/lib/common/libbpf/_src/str_error.c b/pkg/plugin/lib/common/libbpf/_src/str_error.c index 146da01979..5e6a1e27dd 100644 --- a/pkg/plugin/lib/common/libbpf/_src/str_error.c +++ b/pkg/plugin/lib/common/libbpf/_src/str_error.c @@ -2,6 +2,7 @@ #undef _GNU_SOURCE #include #include +#include #include "str_error.h" /* make sure libbpf doesn't use kernel-only integer typedefs */ @@ -15,7 +16,18 @@ char *libbpf_strerror_r(int err, char *dst, int len) { int ret = strerror_r(err < 0 ? -err : err, dst, len); - if (ret) - snprintf(dst, len, "ERROR: strerror_r(%d)=%d", err, ret); + /* on glibc <2.13, ret == -1 and errno is set, if strerror_r() can't + * handle the error, on glibc >=2.13 *positive* (errno-like) error + * code is returned directly + */ + if (ret == -1) + ret = errno; + if (ret) { + if (ret == EINVAL) + /* strerror_r() doesn't recognize this specific error */ + snprintf(dst, len, "unknown error (%d)", err < 0 ? err : -err); + else + snprintf(dst, len, "ERROR: strerror_r(%d)=%d", err, ret); + } return dst; } diff --git a/pkg/plugin/lib/common/libbpf/_src/str_error.h b/pkg/plugin/lib/common/libbpf/_src/str_error.h index a139334d57..626d7ffb03 100644 --- a/pkg/plugin/lib/common/libbpf/_src/str_error.h +++ b/pkg/plugin/lib/common/libbpf/_src/str_error.h @@ -2,5 +2,8 @@ #ifndef __LIBBPF_STR_ERROR_H #define __LIBBPF_STR_ERROR_H +#define STRERR_BUFSIZE 128 + char *libbpf_strerror_r(int err, char *dst, int len); + #endif /* __LIBBPF_STR_ERROR_H */ diff --git a/pkg/plugin/lib/common/libbpf/_src/strset.c b/pkg/plugin/lib/common/libbpf/_src/strset.c index ea65531815..2464bcbd04 100644 --- a/pkg/plugin/lib/common/libbpf/_src/strset.c +++ b/pkg/plugin/lib/common/libbpf/_src/strset.c @@ -19,19 +19,19 @@ struct strset { struct hashmap *strs_hash; }; -static size_t strset_hash_fn(const void *key, void *ctx) +static size_t strset_hash_fn(long key, void *ctx) { const struct strset *s = ctx; - const char *str = s->strs_data + (long)key; + const char *str = s->strs_data + key; return str_hash(str); } -static bool strset_equal_fn(const void *key1, const void *key2, void *ctx) +static bool strset_equal_fn(long key1, long key2, void *ctx) { const struct strset *s = ctx; - const char *str1 = s->strs_data + (long)key1; - const char *str2 = s->strs_data + (long)key2; + const char *str1 = s->strs_data + key1; + const char *str2 = s->strs_data + key2; return strcmp(str1, str2) == 0; } @@ -67,7 +67,7 @@ struct strset *strset__new(size_t max_data_sz, const char *init_data, size_t ini /* hashmap__add() returns EEXIST if string with the same * content already is in the hash map */ - err = hashmap__add(hash, (void *)off, (void *)off); + err = hashmap__add(hash, off, off); if (err == -EEXIST) continue; /* duplicate */ if (err) @@ -127,7 +127,7 @@ int strset__find_str(struct strset *set, const char *s) new_off = set->strs_data_len; memcpy(p, s, len); - if (hashmap__find(set->strs_hash, (void *)new_off, (void **)&old_off)) + if (hashmap__find(set->strs_hash, new_off, &old_off)) return old_off; return -ENOENT; @@ -165,8 +165,8 @@ int strset__add_str(struct strset *set, const char *s) * contents doesn't exist already (HASHMAP_ADD strategy). If such * string exists, we'll get its offset in old_off (that's old_key). */ - err = hashmap__insert(set->strs_hash, (void *)new_off, (void *)new_off, - HASHMAP_ADD, (const void **)&old_off, NULL); + err = hashmap__insert(set->strs_hash, new_off, new_off, + HASHMAP_ADD, &old_off, NULL); if (err == -EEXIST) return old_off; /* duplicated string, return existing offset */ if (err) diff --git a/pkg/plugin/lib/common/libbpf/_src/usdt.bpf.h b/pkg/plugin/lib/common/libbpf/_src/usdt.bpf.h index 4181fddb36..76359bcdc9 100644 --- a/pkg/plugin/lib/common/libbpf/_src/usdt.bpf.h +++ b/pkg/plugin/lib/common/libbpf/_src/usdt.bpf.h @@ -4,9 +4,8 @@ #define __USDT_BPF_H__ #include -#include -#include -#include +#include "bpf_helpers.h" +#include "bpf_tracing.h" /* Below types and maps are internal implementation details of libbpf's USDT * support and are subjects to change. Also, bpf_usdt_xxx() API helpers should @@ -30,14 +29,6 @@ #ifndef BPF_USDT_MAX_IP_CNT #define BPF_USDT_MAX_IP_CNT (4 * BPF_USDT_MAX_SPEC_CNT) #endif -/* We use BPF CO-RE to detect support for BPF cookie from BPF side. This is - * the only dependency on CO-RE, so if it's undesirable, user can override - * BPF_USDT_HAS_BPF_COOKIE to specify whether to BPF cookie is supported or not. - */ -#ifndef BPF_USDT_HAS_BPF_COOKIE -#define BPF_USDT_HAS_BPF_COOKIE \ - bpf_core_enum_value_exists(enum bpf_func_id___usdt, BPF_FUNC_get_attach_cookie___usdt) -#endif enum __bpf_usdt_arg_type { BPF_USDT_ARG_CONST, @@ -83,15 +74,12 @@ struct { __type(value, __u32); } __bpf_usdt_ip_to_spec_id SEC(".maps") __weak; -/* don't rely on user's BPF code to have latest definition of bpf_func_id */ -enum bpf_func_id___usdt { - BPF_FUNC_get_attach_cookie___usdt = 0xBAD, /* value doesn't matter */ -}; +extern const _Bool LINUX_HAS_BPF_COOKIE __kconfig; static __always_inline int __bpf_usdt_spec_id(struct pt_regs *ctx) { - if (!BPF_USDT_HAS_BPF_COOKIE) { + if (!LINUX_HAS_BPF_COOKIE) { long ip = PT_REGS_IP(ctx); int *spec_id_ptr; @@ -142,7 +130,10 @@ int bpf_usdt_arg(struct pt_regs *ctx, __u64 arg_num, long *res) if (!spec) return -ESRCH; - if (arg_num >= BPF_USDT_MAX_ARG_CNT || arg_num >= spec->arg_cnt) + if (arg_num >= BPF_USDT_MAX_ARG_CNT) + return -ENOENT; + barrier_var(arg_num); + if (arg_num >= spec->arg_cnt) return -ENOENT; arg_spec = &spec->args[arg_num]; @@ -223,18 +214,18 @@ long bpf_usdt_cookie(struct pt_regs *ctx) /* we rely on ___bpf_apply() and ___bpf_narg() macros already defined in bpf_tracing.h */ #define ___bpf_usdt_args0() ctx -#define ___bpf_usdt_args1(x) ___bpf_usdt_args0(), ({ long _x; bpf_usdt_arg(ctx, 0, &_x); (void *)_x; }) -#define ___bpf_usdt_args2(x, args...) ___bpf_usdt_args1(args), ({ long _x; bpf_usdt_arg(ctx, 1, &_x); (void *)_x; }) -#define ___bpf_usdt_args3(x, args...) ___bpf_usdt_args2(args), ({ long _x; bpf_usdt_arg(ctx, 2, &_x); (void *)_x; }) -#define ___bpf_usdt_args4(x, args...) ___bpf_usdt_args3(args), ({ long _x; bpf_usdt_arg(ctx, 3, &_x); (void *)_x; }) -#define ___bpf_usdt_args5(x, args...) ___bpf_usdt_args4(args), ({ long _x; bpf_usdt_arg(ctx, 4, &_x); (void *)_x; }) -#define ___bpf_usdt_args6(x, args...) ___bpf_usdt_args5(args), ({ long _x; bpf_usdt_arg(ctx, 5, &_x); (void *)_x; }) -#define ___bpf_usdt_args7(x, args...) ___bpf_usdt_args6(args), ({ long _x; bpf_usdt_arg(ctx, 6, &_x); (void *)_x; }) -#define ___bpf_usdt_args8(x, args...) ___bpf_usdt_args7(args), ({ long _x; bpf_usdt_arg(ctx, 7, &_x); (void *)_x; }) -#define ___bpf_usdt_args9(x, args...) ___bpf_usdt_args8(args), ({ long _x; bpf_usdt_arg(ctx, 8, &_x); (void *)_x; }) -#define ___bpf_usdt_args10(x, args...) ___bpf_usdt_args9(args), ({ long _x; bpf_usdt_arg(ctx, 9, &_x); (void *)_x; }) -#define ___bpf_usdt_args11(x, args...) ___bpf_usdt_args10(args), ({ long _x; bpf_usdt_arg(ctx, 10, &_x); (void *)_x; }) -#define ___bpf_usdt_args12(x, args...) ___bpf_usdt_args11(args), ({ long _x; bpf_usdt_arg(ctx, 11, &_x); (void *)_x; }) +#define ___bpf_usdt_args1(x) ___bpf_usdt_args0(), ({ long _x; bpf_usdt_arg(ctx, 0, &_x); _x; }) +#define ___bpf_usdt_args2(x, args...) ___bpf_usdt_args1(args), ({ long _x; bpf_usdt_arg(ctx, 1, &_x); _x; }) +#define ___bpf_usdt_args3(x, args...) ___bpf_usdt_args2(args), ({ long _x; bpf_usdt_arg(ctx, 2, &_x); _x; }) +#define ___bpf_usdt_args4(x, args...) ___bpf_usdt_args3(args), ({ long _x; bpf_usdt_arg(ctx, 3, &_x); _x; }) +#define ___bpf_usdt_args5(x, args...) ___bpf_usdt_args4(args), ({ long _x; bpf_usdt_arg(ctx, 4, &_x); _x; }) +#define ___bpf_usdt_args6(x, args...) ___bpf_usdt_args5(args), ({ long _x; bpf_usdt_arg(ctx, 5, &_x); _x; }) +#define ___bpf_usdt_args7(x, args...) ___bpf_usdt_args6(args), ({ long _x; bpf_usdt_arg(ctx, 6, &_x); _x; }) +#define ___bpf_usdt_args8(x, args...) ___bpf_usdt_args7(args), ({ long _x; bpf_usdt_arg(ctx, 7, &_x); _x; }) +#define ___bpf_usdt_args9(x, args...) ___bpf_usdt_args8(args), ({ long _x; bpf_usdt_arg(ctx, 8, &_x); _x; }) +#define ___bpf_usdt_args10(x, args...) ___bpf_usdt_args9(args), ({ long _x; bpf_usdt_arg(ctx, 9, &_x); _x; }) +#define ___bpf_usdt_args11(x, args...) ___bpf_usdt_args10(args), ({ long _x; bpf_usdt_arg(ctx, 10, &_x); _x; }) +#define ___bpf_usdt_args12(x, args...) ___bpf_usdt_args11(args), ({ long _x; bpf_usdt_arg(ctx, 11, &_x); _x; }) #define ___bpf_usdt_args(args...) ___bpf_apply(___bpf_usdt_args, ___bpf_narg(args))(args) /* @@ -244,7 +235,7 @@ long bpf_usdt_cookie(struct pt_regs *ctx) */ #define BPF_USDT(name, args...) \ name(struct pt_regs *ctx); \ -static __attribute__((always_inline)) typeof(name(0)) \ +static __always_inline typeof(name(0)) \ ____##name(struct pt_regs *ctx, ##args); \ typeof(name(0)) name(struct pt_regs *ctx) \ { \ @@ -253,7 +244,7 @@ typeof(name(0)) name(struct pt_regs *ctx) \ return ____##name(___bpf_usdt_args(args)); \ _Pragma("GCC diagnostic pop") \ } \ -static __attribute__((always_inline)) typeof(name(0)) \ +static __always_inline typeof(name(0)) \ ____##name(struct pt_regs *ctx, ##args) #endif /* __USDT_BPF_H__ */ diff --git a/pkg/plugin/lib/common/libbpf/_src/usdt.c b/pkg/plugin/lib/common/libbpf/_src/usdt.c index f1c9339cfb..93794f01bb 100644 --- a/pkg/plugin/lib/common/libbpf/_src/usdt.c +++ b/pkg/plugin/lib/common/libbpf/_src/usdt.c @@ -250,6 +250,7 @@ struct usdt_manager { bool has_bpf_cookie; bool has_sema_refcnt; + bool has_uprobe_multi; }; struct usdt_manager *usdt_manager_new(struct bpf_object *obj) @@ -282,8 +283,13 @@ struct usdt_manager *usdt_manager_new(struct bpf_object *obj) * If this is not supported, USDTs with semaphores will not be supported. * Added in: a6ca88b241d5 ("trace_uprobe: support reference counter in fd-based uprobe") */ - man->has_sema_refcnt = access(ref_ctr_sysfs_path, F_OK) == 0; + man->has_sema_refcnt = faccessat(AT_FDCWD, ref_ctr_sysfs_path, F_OK, AT_EACCESS) == 0; + /* + * Detect kernel support for uprobe multi link to be used for attaching + * usdt probes. + */ + man->has_uprobe_multi = kernel_supports(obj, FEAT_UPROBE_MULTI_LINK); return man; } @@ -441,7 +447,7 @@ static int parse_elf_segs(Elf *elf, const char *path, struct elf_seg **segs, siz return 0; } -static int parse_lib_segs(int pid, const char *lib_path, struct elf_seg **segs, size_t *seg_cnt) +static int parse_vma_segs(int pid, const char *lib_path, struct elf_seg **segs, size_t *seg_cnt) { char path[PATH_MAX], line[PATH_MAX], mode[16]; size_t seg_start, seg_end, seg_off; @@ -466,7 +472,7 @@ static int parse_lib_segs(int pid, const char *lib_path, struct elf_seg **segs, proceed: sprintf(line, "/proc/%d/maps", pid); - f = fopen(line, "r"); + f = fopen(line, "re"); if (!f) { err = -errno; pr_warn("usdt: failed to open '%s' to get base addr of '%s': %d\n", @@ -531,35 +537,40 @@ static int parse_lib_segs(int pid, const char *lib_path, struct elf_seg **segs, return err; } -static struct elf_seg *find_elf_seg(struct elf_seg *segs, size_t seg_cnt, long addr, bool relative) +static struct elf_seg *find_elf_seg(struct elf_seg *segs, size_t seg_cnt, long virtaddr) { struct elf_seg *seg; int i; - if (relative) { - /* for shared libraries, address is relative offset and thus - * should be fall within logical offset-based range of - * [offset_start, offset_end) - */ - for (i = 0, seg = segs; i < seg_cnt; i++, seg++) { - if (seg->offset <= addr && addr < seg->offset + (seg->end - seg->start)) - return seg; - } - } else { - /* for binaries, address is absolute and thus should be within - * absolute address range of [seg_start, seg_end) - */ - for (i = 0, seg = segs; i < seg_cnt; i++, seg++) { - if (seg->start <= addr && addr < seg->end) - return seg; - } + /* for ELF binaries (both executables and shared libraries), we are + * given virtual address (absolute for executables, relative for + * libraries) which should match address range of [seg_start, seg_end) + */ + for (i = 0, seg = segs; i < seg_cnt; i++, seg++) { + if (seg->start <= virtaddr && virtaddr < seg->end) + return seg; } + return NULL; +} +static struct elf_seg *find_vma_seg(struct elf_seg *segs, size_t seg_cnt, long offset) +{ + struct elf_seg *seg; + int i; + + /* for VMA segments from /proc//maps file, provided "address" is + * actually a file offset, so should be fall within logical + * offset-based range of [offset_start, offset_end) + */ + for (i = 0, seg = segs; i < seg_cnt; i++, seg++) { + if (seg->offset <= offset && offset < seg->offset + (seg->end - seg->start)) + return seg; + } return NULL; } -static int parse_usdt_note(Elf *elf, const char *path, long base_addr, - GElf_Nhdr *nhdr, const char *data, size_t name_off, size_t desc_off, +static int parse_usdt_note(Elf *elf, const char *path, GElf_Nhdr *nhdr, + const char *data, size_t name_off, size_t desc_off, struct usdt_note *usdt_note); static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, __u64 usdt_cookie); @@ -568,8 +579,8 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * const char *usdt_provider, const char *usdt_name, __u64 usdt_cookie, struct usdt_target **out_targets, size_t *out_target_cnt) { - size_t off, name_off, desc_off, seg_cnt = 0, lib_seg_cnt = 0, target_cnt = 0; - struct elf_seg *segs = NULL, *lib_segs = NULL; + size_t off, name_off, desc_off, seg_cnt = 0, vma_seg_cnt = 0, target_cnt = 0; + struct elf_seg *segs = NULL, *vma_segs = NULL; struct usdt_target *targets = NULL, *target; long base_addr = 0; Elf_Scn *notes_scn, *base_scn; @@ -613,8 +624,7 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * struct elf_seg *seg = NULL; void *tmp; - err = parse_usdt_note(elf, path, base_addr, &nhdr, - data->d_buf, name_off, desc_off, ¬e); + err = parse_usdt_note(elf, path, &nhdr, data->d_buf, name_off, desc_off, ¬e); if (err) goto err_out; @@ -648,36 +658,33 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * * * [0] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation */ - usdt_rel_ip = usdt_abs_ip = note.loc_addr; - if (base_addr) { + usdt_abs_ip = note.loc_addr; + if (base_addr) usdt_abs_ip += base_addr - note.base_addr; - usdt_rel_ip += base_addr - note.base_addr; - } - if (ehdr.e_type == ET_EXEC) { - /* When attaching uprobes (which what USDTs basically - * are) kernel expects a relative IP to be specified, - * so if we are attaching to an executable ELF binary - * (i.e., not a shared library), we need to calculate - * proper relative IP based on ELF's load address - */ - seg = find_elf_seg(segs, seg_cnt, usdt_abs_ip, false /* relative */); - if (!seg) { - err = -ESRCH; - pr_warn("usdt: failed to find ELF program segment for '%s:%s' in '%s' at IP 0x%lx\n", - usdt_provider, usdt_name, path, usdt_abs_ip); - goto err_out; - } - if (!seg->is_exec) { - err = -ESRCH; - pr_warn("usdt: matched ELF binary '%s' segment [0x%lx, 0x%lx) for '%s:%s' at IP 0x%lx is not executable\n", - path, seg->start, seg->end, usdt_provider, usdt_name, - usdt_abs_ip); - goto err_out; - } + /* When attaching uprobes (which is what USDTs basically are) + * kernel expects file offset to be specified, not a relative + * virtual address, so we need to translate virtual address to + * file offset, for both ET_EXEC and ET_DYN binaries. + */ + seg = find_elf_seg(segs, seg_cnt, usdt_abs_ip); + if (!seg) { + err = -ESRCH; + pr_warn("usdt: failed to find ELF program segment for '%s:%s' in '%s' at IP 0x%lx\n", + usdt_provider, usdt_name, path, usdt_abs_ip); + goto err_out; + } + if (!seg->is_exec) { + err = -ESRCH; + pr_warn("usdt: matched ELF binary '%s' segment [0x%lx, 0x%lx) for '%s:%s' at IP 0x%lx is not executable\n", + path, seg->start, seg->end, usdt_provider, usdt_name, + usdt_abs_ip); + goto err_out; + } + /* translate from virtual address to file offset */ + usdt_rel_ip = usdt_abs_ip - seg->start + seg->offset; - usdt_rel_ip = usdt_abs_ip - (seg->start - seg->offset); - } else if (!man->has_bpf_cookie) { /* ehdr.e_type == ET_DYN */ + if (ehdr.e_type == ET_DYN && !man->has_bpf_cookie) { /* If we don't have BPF cookie support but need to * attach to a shared library, we'll need to know and * record absolute addresses of attach points due to @@ -697,9 +704,9 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * goto err_out; } - /* lib_segs are lazily initialized only if necessary */ - if (lib_seg_cnt == 0) { - err = parse_lib_segs(pid, path, &lib_segs, &lib_seg_cnt); + /* vma_segs are lazily initialized only if necessary */ + if (vma_seg_cnt == 0) { + err = parse_vma_segs(pid, path, &vma_segs, &vma_seg_cnt); if (err) { pr_warn("usdt: failed to get memory segments in PID %d for shared library '%s': %d\n", pid, path, err); @@ -707,7 +714,7 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * } } - seg = find_elf_seg(lib_segs, lib_seg_cnt, usdt_rel_ip, true /* relative */); + seg = find_vma_seg(vma_segs, vma_seg_cnt, usdt_rel_ip); if (!seg) { err = -ESRCH; pr_warn("usdt: failed to find shared lib memory segment for '%s:%s' in '%s' at relative IP 0x%lx\n", @@ -715,7 +722,7 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * goto err_out; } - usdt_abs_ip = seg->start + (usdt_rel_ip - seg->offset); + usdt_abs_ip = seg->start - seg->offset + usdt_rel_ip; } pr_debug("usdt: probe for '%s:%s' in %s '%s': addr 0x%lx base 0x%lx (resolved abs_ip 0x%lx rel_ip 0x%lx) args '%s' in segment [0x%lx, 0x%lx) at offset 0x%lx\n", @@ -723,7 +730,7 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * note.loc_addr, note.base_addr, usdt_abs_ip, usdt_rel_ip, note.args, seg ? seg->start : 0, seg ? seg->end : 0, seg ? seg->offset : 0); - /* Adjust semaphore address to be a relative offset */ + /* Adjust semaphore address to be a file offset */ if (note.sema_addr) { if (!man->has_sema_refcnt) { pr_warn("usdt: kernel doesn't support USDT semaphore refcounting for '%s:%s' in '%s'\n", @@ -732,7 +739,7 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * goto err_out; } - seg = find_elf_seg(segs, seg_cnt, note.sema_addr, false /* relative */); + seg = find_elf_seg(segs, seg_cnt, note.sema_addr); if (!seg) { err = -ESRCH; pr_warn("usdt: failed to find ELF loadable segment with semaphore of '%s:%s' in '%s' at 0x%lx\n", @@ -747,7 +754,7 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * goto err_out; } - usdt_sema_off = note.sema_addr - (seg->start - seg->offset); + usdt_sema_off = note.sema_addr - seg->start + seg->offset; pr_debug("usdt: sema for '%s:%s' in %s '%s': addr 0x%lx base 0x%lx (resolved 0x%lx) in segment [0x%lx, 0x%lx] at offset 0x%lx\n", usdt_provider, usdt_name, ehdr.e_type == ET_EXEC ? "exec" : "lib ", @@ -770,7 +777,7 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * target->rel_ip = usdt_rel_ip; target->sema_off = usdt_sema_off; - /* notes->args references strings from Elf itself, so they can + /* notes.args references strings from ELF itself, so they can * be referenced safely until elf_end() call */ target->spec_str = note.args; @@ -788,7 +795,7 @@ static int collect_usdt_targets(struct usdt_manager *man, Elf *elf, const char * err_out: free(segs); - free(lib_segs); + free(vma_segs); if (err < 0) free(targets); return err; @@ -807,6 +814,8 @@ struct bpf_link_usdt { long abs_ip; struct bpf_link *link; } *uprobes; + + struct bpf_link *multi_link; }; static int bpf_link_usdt_detach(struct bpf_link *link) @@ -815,6 +824,9 @@ static int bpf_link_usdt_detach(struct bpf_link *link) struct usdt_manager *man = usdt_link->usdt_man; int i; + bpf_link__destroy(usdt_link->multi_link); + + /* When having multi_link, uprobe_cnt is 0 */ for (i = 0; i < usdt_link->uprobe_cnt; i++) { /* detach underlying uprobe link */ bpf_link__destroy(usdt_link->uprobes[i].link); @@ -851,8 +863,11 @@ static int bpf_link_usdt_detach(struct bpf_link *link) * system is so exhausted on memory, it's the least of user's * concerns, probably. * So just do our best here to return those IDs to usdt_manager. + * Another edge case when we can legitimately get NULL is when + * new_cnt is zero, which can happen in some edge cases, so we + * need to be careful about that. */ - if (new_free_ids) { + if (new_free_ids || new_cnt == 0) { memcpy(new_free_ids + man->free_spec_cnt, usdt_link->spec_ids, usdt_link->spec_cnt * sizeof(*usdt_link->spec_ids)); man->free_spec_ids = new_free_ids; @@ -872,31 +887,27 @@ static void bpf_link_usdt_dealloc(struct bpf_link *link) free(usdt_link); } -static size_t specs_hash_fn(const void *key, void *ctx) +static size_t specs_hash_fn(long key, void *ctx) { - const char *s = key; - - return str_hash(s); + return str_hash((char *)key); } -static bool specs_equal_fn(const void *key1, const void *key2, void *ctx) +static bool specs_equal_fn(long key1, long key2, void *ctx) { - const char *s1 = key1; - const char *s2 = key2; - - return strcmp(s1, s2) == 0; + return strcmp((char *)key1, (char *)key2) == 0; } static int allocate_spec_id(struct usdt_manager *man, struct hashmap *specs_hash, struct bpf_link_usdt *link, struct usdt_target *target, int *spec_id, bool *is_new) { - void *tmp; + long tmp; + void *new_ids; int err; /* check if we already allocated spec ID for this spec string */ if (hashmap__find(specs_hash, target->spec_str, &tmp)) { - *spec_id = (long)tmp; + *spec_id = tmp; *is_new = false; return 0; } @@ -904,17 +915,17 @@ static int allocate_spec_id(struct usdt_manager *man, struct hashmap *specs_hash /* otherwise it's a new ID that needs to be set up in specs map and * returned back to usdt_manager when USDT link is detached */ - tmp = libbpf_reallocarray(link->spec_ids, link->spec_cnt + 1, sizeof(*link->spec_ids)); - if (!tmp) + new_ids = libbpf_reallocarray(link->spec_ids, link->spec_cnt + 1, sizeof(*link->spec_ids)); + if (!new_ids) return -ENOMEM; - link->spec_ids = tmp; + link->spec_ids = new_ids; /* get next free spec ID, giving preference to free list, if not empty */ if (man->free_spec_cnt) { *spec_id = man->free_spec_ids[man->free_spec_cnt - 1]; /* cache spec ID for current spec string for future lookups */ - err = hashmap__add(specs_hash, target->spec_str, (void *)(long)*spec_id); + err = hashmap__add(specs_hash, target->spec_str, *spec_id); if (err) return err; @@ -927,7 +938,7 @@ static int allocate_spec_id(struct usdt_manager *man, struct hashmap *specs_hash *spec_id = man->next_free_spec_id; /* cache spec ID for current spec string for future lookups */ - err = hashmap__add(specs_hash, target->spec_str, (void *)(long)*spec_id); + err = hashmap__add(specs_hash, target->spec_str, *spec_id); if (err) return err; @@ -946,33 +957,24 @@ struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct const char *usdt_provider, const char *usdt_name, __u64 usdt_cookie) { - int i, fd, err, spec_map_fd, ip_map_fd; + unsigned long *offsets = NULL, *ref_ctr_offsets = NULL; + int i, err, spec_map_fd, ip_map_fd; LIBBPF_OPTS(bpf_uprobe_opts, opts); struct hashmap *specs_hash = NULL; struct bpf_link_usdt *link = NULL; struct usdt_target *targets = NULL; + __u64 *cookies = NULL; + struct elf_fd elf_fd; size_t target_cnt; - Elf *elf; spec_map_fd = bpf_map__fd(man->specs_map); ip_map_fd = bpf_map__fd(man->ip_to_spec_id_map); - /* TODO: perform path resolution similar to uprobe's */ - fd = open(path, O_RDONLY); - if (fd < 0) { - err = -errno; - pr_warn("usdt: failed to open ELF binary '%s': %d\n", path, err); + err = elf_open(path, &elf_fd); + if (err) return libbpf_err_ptr(err); - } - - elf = elf_begin(fd, ELF_C_READ_MMAP, NULL); - if (!elf) { - err = -EBADF; - pr_warn("usdt: failed to parse ELF binary '%s': %s\n", path, elf_errmsg(-1)); - goto err_out; - } - err = sanity_check_usdt_elf(elf, path); + err = sanity_check_usdt_elf(elf_fd.elf, path); if (err) goto err_out; @@ -985,7 +987,7 @@ struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct /* discover USDT in given binary, optionally limiting * activations to a given PID, if pid > 0 */ - err = collect_usdt_targets(man, elf, path, pid, usdt_provider, usdt_name, + err = collect_usdt_targets(man, elf_fd.elf, path, pid, usdt_provider, usdt_name, usdt_cookie, &targets, &target_cnt); if (err <= 0) { err = (err == 0) ? -ENOENT : err; @@ -1008,10 +1010,21 @@ struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct link->link.detach = &bpf_link_usdt_detach; link->link.dealloc = &bpf_link_usdt_dealloc; - link->uprobes = calloc(target_cnt, sizeof(*link->uprobes)); - if (!link->uprobes) { - err = -ENOMEM; - goto err_out; + if (man->has_uprobe_multi) { + offsets = calloc(target_cnt, sizeof(*offsets)); + cookies = calloc(target_cnt, sizeof(*cookies)); + ref_ctr_offsets = calloc(target_cnt, sizeof(*ref_ctr_offsets)); + + if (!offsets || !ref_ctr_offsets || !cookies) { + err = -ENOMEM; + goto err_out; + } + } else { + link->uprobes = calloc(target_cnt, sizeof(*link->uprobes)); + if (!link->uprobes) { + err = -ENOMEM; + goto err_out; + } } for (i = 0; i < target_cnt; i++) { @@ -1052,45 +1065,73 @@ struct bpf_link *usdt_manager_attach_usdt(struct usdt_manager *man, const struct goto err_out; } - opts.ref_ctr_offset = target->sema_off; - opts.bpf_cookie = man->has_bpf_cookie ? spec_id : 0; - uprobe_link = bpf_program__attach_uprobe_opts(prog, pid, path, - target->rel_ip, &opts); - err = libbpf_get_error(uprobe_link); - if (err) { - pr_warn("usdt: failed to attach uprobe #%d for '%s:%s' in '%s': %d\n", - i, usdt_provider, usdt_name, path, err); + if (man->has_uprobe_multi) { + offsets[i] = target->rel_ip; + ref_ctr_offsets[i] = target->sema_off; + cookies[i] = spec_id; + } else { + opts.ref_ctr_offset = target->sema_off; + opts.bpf_cookie = man->has_bpf_cookie ? spec_id : 0; + uprobe_link = bpf_program__attach_uprobe_opts(prog, pid, path, + target->rel_ip, &opts); + err = libbpf_get_error(uprobe_link); + if (err) { + pr_warn("usdt: failed to attach uprobe #%d for '%s:%s' in '%s': %d\n", + i, usdt_provider, usdt_name, path, err); + goto err_out; + } + + link->uprobes[i].link = uprobe_link; + link->uprobes[i].abs_ip = target->abs_ip; + link->uprobe_cnt++; + } + } + + if (man->has_uprobe_multi) { + LIBBPF_OPTS(bpf_uprobe_multi_opts, opts_multi, + .ref_ctr_offsets = ref_ctr_offsets, + .offsets = offsets, + .cookies = cookies, + .cnt = target_cnt, + ); + + link->multi_link = bpf_program__attach_uprobe_multi(prog, pid, path, + NULL, &opts_multi); + if (!link->multi_link) { + err = -errno; + pr_warn("usdt: failed to attach uprobe multi for '%s:%s' in '%s': %d\n", + usdt_provider, usdt_name, path, err); goto err_out; } - link->uprobes[i].link = uprobe_link; - link->uprobes[i].abs_ip = target->abs_ip; - link->uprobe_cnt++; + free(offsets); + free(ref_ctr_offsets); + free(cookies); } free(targets); hashmap__free(specs_hash); - elf_end(elf); - close(fd); - + elf_close(&elf_fd); return &link->link; err_out: + free(offsets); + free(ref_ctr_offsets); + free(cookies); + if (link) bpf_link__destroy(&link->link); free(targets); hashmap__free(specs_hash); - if (elf) - elf_end(elf); - close(fd); + elf_close(&elf_fd); return libbpf_err_ptr(err); } /* Parse out USDT ELF note from '.note.stapsdt' section. * Logic inspired by perf's code. */ -static int parse_usdt_note(Elf *elf, const char *path, long base_addr, - GElf_Nhdr *nhdr, const char *data, size_t name_off, size_t desc_off, +static int parse_usdt_note(Elf *elf, const char *path, GElf_Nhdr *nhdr, + const char *data, size_t name_off, size_t desc_off, struct usdt_note *note) { const char *provider, *name, *args; @@ -1144,12 +1185,13 @@ static int parse_usdt_note(Elf *elf, const char *path, long base_addr, return 0; } -static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg); +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz); static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, __u64 usdt_cookie) { + struct usdt_arg_spec *arg; const char *s; - int len; + int arg_sz, len; spec->usdt_cookie = usdt_cookie; spec->arg_cnt = 0; @@ -1162,10 +1204,25 @@ static int parse_usdt_spec(struct usdt_spec *spec, const struct usdt_note *note, return -E2BIG; } - len = parse_usdt_arg(s, spec->arg_cnt, &spec->args[spec->arg_cnt]); + arg = &spec->args[spec->arg_cnt]; + len = parse_usdt_arg(s, spec->arg_cnt, arg, &arg_sz); if (len < 0) return len; + arg->arg_signed = arg_sz < 0; + if (arg_sz < 0) + arg_sz = -arg_sz; + + switch (arg_sz) { + case 1: case 2: case 4: case 8: + arg->arg_bitshift = 64 - arg_sz * 8; + break; + default: + pr_warn("usdt: unsupported arg #%d (spec '%s') size: %d\n", + spec->arg_cnt, s, arg_sz); + return -EINVAL; + } + s += len; spec->arg_cnt++; } @@ -1222,32 +1279,38 @@ static int calc_pt_regs_off(const char *reg_name) return -ENOENT; } -static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg) +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) { - char *reg_name = NULL; - int arg_sz, len, reg_off; + char reg_name[16]; + int len, reg_off; long off; - if (sscanf(arg_str, " %d @ %ld ( %%%m[^)] ) %n", &arg_sz, &off, ®_name, &len) == 3) { + if (sscanf(arg_str, " %d @ %ld ( %%%15[^)] ) %n", arg_sz, &off, reg_name, &len) == 3) { /* Memory dereference case, e.g., -4@-20(%rbp) */ arg->arg_type = USDT_ARG_REG_DEREF; arg->val_off = off; reg_off = calc_pt_regs_off(reg_name); - free(reg_name); if (reg_off < 0) return reg_off; arg->reg_off = reg_off; - } else if (sscanf(arg_str, " %d @ %%%ms %n", &arg_sz, ®_name, &len) == 2) { + } else if (sscanf(arg_str, " %d @ ( %%%15[^)] ) %n", arg_sz, reg_name, &len) == 2) { + /* Memory dereference case without offset, e.g., 8@(%rsp) */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = 0; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ %%%15s %n", arg_sz, reg_name, &len) == 2) { /* Register read case, e.g., -4@%eax */ arg->arg_type = USDT_ARG_REG; arg->val_off = 0; reg_off = calc_pt_regs_off(reg_name); - free(reg_name); if (reg_off < 0) return reg_off; arg->reg_off = reg_off; - } else if (sscanf(arg_str, " %d @ $%ld %n", &arg_sz, &off, &len) == 2) { + } else if (sscanf(arg_str, " %d @ $%ld %n", arg_sz, &off, &len) == 2) { /* Constant value case, e.g., 4@$71 */ arg->arg_type = USDT_ARG_CONST; arg->val_off = off; @@ -1257,20 +1320,6 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec return -EINVAL; } - arg->arg_signed = arg_sz < 0; - if (arg_sz < 0) - arg_sz = -arg_sz; - - switch (arg_sz) { - case 1: case 2: case 4: case 8: - arg->arg_bitshift = 64 - arg_sz * 8; - break; - default: - pr_warn("usdt: unsupported arg #%d (spec '%s') size: %d\n", - arg_num, arg_str, arg_sz); - return -EINVAL; - } - return len; } @@ -1278,13 +1327,13 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec /* Do not support __s390__ for now, since user_pt_regs is broken with -m31. */ -static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg) +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) { unsigned int reg; - int arg_sz, len; + int len; long off; - if (sscanf(arg_str, " %d @ %ld ( %%r%u ) %n", &arg_sz, &off, ®, &len) == 3) { + if (sscanf(arg_str, " %d @ %ld ( %%r%u ) %n", arg_sz, &off, ®, &len) == 3) { /* Memory dereference case, e.g., -2@-28(%r15) */ arg->arg_type = USDT_ARG_REG_DEREF; arg->val_off = off; @@ -1293,7 +1342,7 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec return -EINVAL; } arg->reg_off = offsetof(user_pt_regs, gprs[reg]); - } else if (sscanf(arg_str, " %d @ %%r%u %n", &arg_sz, ®, &len) == 2) { + } else if (sscanf(arg_str, " %d @ %%r%u %n", arg_sz, ®, &len) == 2) { /* Register read case, e.g., -8@%r0 */ arg->arg_type = USDT_ARG_REG; arg->val_off = 0; @@ -1302,7 +1351,7 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec return -EINVAL; } arg->reg_off = offsetof(user_pt_regs, gprs[reg]); - } else if (sscanf(arg_str, " %d @ %ld %n", &arg_sz, &off, &len) == 2) { + } else if (sscanf(arg_str, " %d @ %ld %n", arg_sz, &off, &len) == 2) { /* Constant value case, e.g., 4@71 */ arg->arg_type = USDT_ARG_CONST; arg->val_off = off; @@ -1312,20 +1361,6 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec return -EINVAL; } - arg->arg_signed = arg_sz < 0; - if (arg_sz < 0) - arg_sz = -arg_sz; - - switch (arg_sz) { - case 1: case 2: case 4: case 8: - arg->arg_bitshift = 64 - arg_sz * 8; - break; - default: - pr_warn("usdt: unsupported arg #%d (spec '%s') size: %d\n", - arg_num, arg_str, arg_sz); - return -EINVAL; - } - return len; } @@ -1345,41 +1380,38 @@ static int calc_pt_regs_off(const char *reg_name) return -ENOENT; } -static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg) +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) { - char *reg_name = NULL; - int arg_sz, len, reg_off; + char reg_name[16]; + int len, reg_off; long off; - if (sscanf(arg_str, " %d @ \[ %m[a-z0-9], %ld ] %n", &arg_sz, ®_name, &off, &len) == 3) { + if (sscanf(arg_str, " %d @ \[ %15[a-z0-9] , %ld ] %n", arg_sz, reg_name, &off, &len) == 3) { /* Memory dereference case, e.g., -4@[sp, 96] */ arg->arg_type = USDT_ARG_REG_DEREF; arg->val_off = off; reg_off = calc_pt_regs_off(reg_name); - free(reg_name); if (reg_off < 0) return reg_off; arg->reg_off = reg_off; - } else if (sscanf(arg_str, " %d @ \[ %m[a-z0-9] ] %n", &arg_sz, ®_name, &len) == 2) { + } else if (sscanf(arg_str, " %d @ \[ %15[a-z0-9] ] %n", arg_sz, reg_name, &len) == 2) { /* Memory dereference case, e.g., -4@[sp] */ arg->arg_type = USDT_ARG_REG_DEREF; arg->val_off = 0; reg_off = calc_pt_regs_off(reg_name); - free(reg_name); if (reg_off < 0) return reg_off; arg->reg_off = reg_off; - } else if (sscanf(arg_str, " %d @ %ld %n", &arg_sz, &off, &len) == 2) { + } else if (sscanf(arg_str, " %d @ %ld %n", arg_sz, &off, &len) == 2) { /* Constant value case, e.g., 4@5 */ arg->arg_type = USDT_ARG_CONST; arg->val_off = off; arg->reg_off = 0; - } else if (sscanf(arg_str, " %d @ %m[a-z0-9] %n", &arg_sz, ®_name, &len) == 2) { + } else if (sscanf(arg_str, " %d @ %15[a-z0-9] %n", arg_sz, reg_name, &len) == 2) { /* Register read case, e.g., -8@x4 */ arg->arg_type = USDT_ARG_REG; arg->val_off = 0; reg_off = calc_pt_regs_off(reg_name); - free(reg_name); if (reg_off < 0) return reg_off; arg->reg_off = reg_off; @@ -1388,20 +1420,6 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec return -EINVAL; } - arg->arg_signed = arg_sz < 0; - if (arg_sz < 0) - arg_sz = -arg_sz; - - switch (arg_sz) { - case 1: case 2: case 4: case 8: - arg->arg_bitshift = 64 - arg_sz * 8; - break; - default: - pr_warn("usdt: unsupported arg #%d (spec '%s') size: %d\n", - arg_num, arg_str, arg_sz); - return -EINVAL; - } - return len; } @@ -1456,32 +1474,30 @@ static int calc_pt_regs_off(const char *reg_name) return -ENOENT; } -static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg) +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) { - char *reg_name = NULL; - int arg_sz, len, reg_off; + char reg_name[16]; + int len, reg_off; long off; - if (sscanf(arg_str, " %d @ %ld ( %m[a-z0-9] ) %n", &arg_sz, &off, ®_name, &len) == 3) { + if (sscanf(arg_str, " %d @ %ld ( %15[a-z0-9] ) %n", arg_sz, &off, reg_name, &len) == 3) { /* Memory dereference case, e.g., -8@-88(s0) */ arg->arg_type = USDT_ARG_REG_DEREF; arg->val_off = off; reg_off = calc_pt_regs_off(reg_name); - free(reg_name); if (reg_off < 0) return reg_off; arg->reg_off = reg_off; - } else if (sscanf(arg_str, " %d @ %ld %n", &arg_sz, &off, &len) == 2) { + } else if (sscanf(arg_str, " %d @ %ld %n", arg_sz, &off, &len) == 2) { /* Constant value case, e.g., 4@5 */ arg->arg_type = USDT_ARG_CONST; arg->val_off = off; arg->reg_off = 0; - } else if (sscanf(arg_str, " %d @ %m[a-z0-9] %n", &arg_sz, ®_name, &len) == 2) { + } else if (sscanf(arg_str, " %d @ %15[a-z0-9] %n", arg_sz, reg_name, &len) == 2) { /* Register read case, e.g., -8@a1 */ arg->arg_type = USDT_ARG_REG; arg->val_off = 0; reg_off = calc_pt_regs_off(reg_name); - free(reg_name); if (reg_off < 0) return reg_off; arg->reg_off = reg_off; @@ -1490,17 +1506,83 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec return -EINVAL; } - arg->arg_signed = arg_sz < 0; - if (arg_sz < 0) - arg_sz = -arg_sz; + return len; +} - switch (arg_sz) { - case 1: case 2: case 4: case 8: - arg->arg_bitshift = 64 - arg_sz * 8; - break; - default: - pr_warn("usdt: unsupported arg #%d (spec '%s') size: %d\n", - arg_num, arg_str, arg_sz); +#elif defined(__arm__) + +static int calc_pt_regs_off(const char *reg_name) +{ + static struct { + const char *name; + size_t pt_regs_off; + } reg_map[] = { + { "r0", offsetof(struct pt_regs, uregs[0]) }, + { "r1", offsetof(struct pt_regs, uregs[1]) }, + { "r2", offsetof(struct pt_regs, uregs[2]) }, + { "r3", offsetof(struct pt_regs, uregs[3]) }, + { "r4", offsetof(struct pt_regs, uregs[4]) }, + { "r5", offsetof(struct pt_regs, uregs[5]) }, + { "r6", offsetof(struct pt_regs, uregs[6]) }, + { "r7", offsetof(struct pt_regs, uregs[7]) }, + { "r8", offsetof(struct pt_regs, uregs[8]) }, + { "r9", offsetof(struct pt_regs, uregs[9]) }, + { "r10", offsetof(struct pt_regs, uregs[10]) }, + { "fp", offsetof(struct pt_regs, uregs[11]) }, + { "ip", offsetof(struct pt_regs, uregs[12]) }, + { "sp", offsetof(struct pt_regs, uregs[13]) }, + { "lr", offsetof(struct pt_regs, uregs[14]) }, + { "pc", offsetof(struct pt_regs, uregs[15]) }, + }; + int i; + + for (i = 0; i < ARRAY_SIZE(reg_map); i++) { + if (strcmp(reg_name, reg_map[i].name) == 0) + return reg_map[i].pt_regs_off; + } + + pr_warn("usdt: unrecognized register '%s'\n", reg_name); + return -ENOENT; +} + +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) +{ + char reg_name[16]; + int len, reg_off; + long off; + + if (sscanf(arg_str, " %d @ \[ %15[a-z0-9] , #%ld ] %n", + arg_sz, reg_name, &off, &len) == 3) { + /* Memory dereference case, e.g., -4@[fp, #96] */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = off; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ \[ %15[a-z0-9] ] %n", arg_sz, reg_name, &len) == 2) { + /* Memory dereference case, e.g., -4@[sp] */ + arg->arg_type = USDT_ARG_REG_DEREF; + arg->val_off = 0; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else if (sscanf(arg_str, " %d @ #%ld %n", arg_sz, &off, &len) == 2) { + /* Constant value case, e.g., 4@#5 */ + arg->arg_type = USDT_ARG_CONST; + arg->val_off = off; + arg->reg_off = 0; + } else if (sscanf(arg_str, " %d @ %15[a-z0-9] %n", arg_sz, reg_name, &len) == 2) { + /* Register read case, e.g., -8@r4 */ + arg->arg_type = USDT_ARG_REG; + arg->val_off = 0; + reg_off = calc_pt_regs_off(reg_name); + if (reg_off < 0) + return reg_off; + arg->reg_off = reg_off; + } else { + pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str); return -EINVAL; } @@ -1509,7 +1591,7 @@ static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec #else -static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg) +static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz) { pr_warn("usdt: libbpf doesn't support USDTs on current architecture\n"); return -ENOTSUP; diff --git a/pkg/plugin/lib/common/libbpf/_src/zip.c b/pkg/plugin/lib/common/libbpf/_src/zip.c new file mode 100644 index 0000000000..3f26d629b2 --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_src/zip.c @@ -0,0 +1,333 @@ +// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +/* + * Routines for dealing with .zip archives. + * + * Copyright (c) Meta Platforms, Inc. and affiliates. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "libbpf_internal.h" +#include "zip.h" + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wpacked" +#pragma GCC diagnostic ignored "-Wattributes" + +/* Specification of ZIP file format can be found here: + * https://pkware.cachefly.net/webdocs/casestudies/APPNOTE.TXT + * For a high level overview of the structure of a ZIP file see + * sections 4.3.1 - 4.3.6. + * + * Data structures appearing in ZIP files do not contain any + * padding and they might be misaligned. To allow us to safely + * operate on pointers to such structures and their members, we + * declare the types as packed. + */ + +#define END_OF_CD_RECORD_MAGIC 0x06054b50 + +/* See section 4.3.16 of the spec. */ +struct end_of_cd_record { + /* Magic value equal to END_OF_CD_RECORD_MAGIC */ + __u32 magic; + + /* Number of the file containing this structure or 0xFFFF if ZIP64 archive. + * Zip archive might span multiple files (disks). + */ + __u16 this_disk; + + /* Number of the file containing the beginning of the central directory or + * 0xFFFF if ZIP64 archive. + */ + __u16 cd_disk; + + /* Number of central directory records on this disk or 0xFFFF if ZIP64 + * archive. + */ + __u16 cd_records; + + /* Number of central directory records on all disks or 0xFFFF if ZIP64 + * archive. + */ + __u16 cd_records_total; + + /* Size of the central directory record or 0xFFFFFFFF if ZIP64 archive. */ + __u32 cd_size; + + /* Offset of the central directory from the beginning of the archive or + * 0xFFFFFFFF if ZIP64 archive. + */ + __u32 cd_offset; + + /* Length of comment data following end of central directory record. */ + __u16 comment_length; + + /* Up to 64k of arbitrary bytes. */ + /* uint8_t comment[comment_length] */ +} __attribute__((packed)); + +#define CD_FILE_HEADER_MAGIC 0x02014b50 +#define FLAG_ENCRYPTED (1 << 0) +#define FLAG_HAS_DATA_DESCRIPTOR (1 << 3) + +/* See section 4.3.12 of the spec. */ +struct cd_file_header { + /* Magic value equal to CD_FILE_HEADER_MAGIC. */ + __u32 magic; + __u16 version; + /* Minimum zip version needed to extract the file. */ + __u16 min_version; + __u16 flags; + __u16 compression; + __u16 last_modified_time; + __u16 last_modified_date; + __u32 crc; + __u32 compressed_size; + __u32 uncompressed_size; + __u16 file_name_length; + __u16 extra_field_length; + __u16 file_comment_length; + /* Number of the disk where the file starts or 0xFFFF if ZIP64 archive. */ + __u16 disk; + __u16 internal_attributes; + __u32 external_attributes; + /* Offset from the start of the disk containing the local file header to the + * start of the local file header. + */ + __u32 offset; +} __attribute__((packed)); + +#define LOCAL_FILE_HEADER_MAGIC 0x04034b50 + +/* See section 4.3.7 of the spec. */ +struct local_file_header { + /* Magic value equal to LOCAL_FILE_HEADER_MAGIC. */ + __u32 magic; + /* Minimum zip version needed to extract the file. */ + __u16 min_version; + __u16 flags; + __u16 compression; + __u16 last_modified_time; + __u16 last_modified_date; + __u32 crc; + __u32 compressed_size; + __u32 uncompressed_size; + __u16 file_name_length; + __u16 extra_field_length; +} __attribute__((packed)); + +#pragma GCC diagnostic pop + +struct zip_archive { + void *data; + __u32 size; + __u32 cd_offset; + __u32 cd_records; +}; + +static void *check_access(struct zip_archive *archive, __u32 offset, __u32 size) +{ + if (offset + size > archive->size || offset > offset + size) + return NULL; + + return archive->data + offset; +} + +/* Returns 0 on success, -EINVAL on error and -ENOTSUP if the eocd indicates the + * archive uses features which are not supported. + */ +static int try_parse_end_of_cd(struct zip_archive *archive, __u32 offset) +{ + __u16 comment_length, cd_records; + struct end_of_cd_record *eocd; + __u32 cd_offset, cd_size; + + eocd = check_access(archive, offset, sizeof(*eocd)); + if (!eocd || eocd->magic != END_OF_CD_RECORD_MAGIC) + return -EINVAL; + + comment_length = eocd->comment_length; + if (offset + sizeof(*eocd) + comment_length != archive->size) + return -EINVAL; + + cd_records = eocd->cd_records; + if (eocd->this_disk != 0 || eocd->cd_disk != 0 || eocd->cd_records_total != cd_records) + /* This is a valid eocd, but we only support single-file non-ZIP64 archives. */ + return -ENOTSUP; + + cd_offset = eocd->cd_offset; + cd_size = eocd->cd_size; + if (!check_access(archive, cd_offset, cd_size)) + return -EINVAL; + + archive->cd_offset = cd_offset; + archive->cd_records = cd_records; + return 0; +} + +static int find_cd(struct zip_archive *archive) +{ + int64_t limit, offset; + int rc = -EINVAL; + + if (archive->size <= sizeof(struct end_of_cd_record)) + return -EINVAL; + + /* Because the end of central directory ends with a variable length array of + * up to 0xFFFF bytes we can't know exactly where it starts and need to + * search for it at the end of the file, scanning the (limit, offset] range. + */ + offset = archive->size - sizeof(struct end_of_cd_record); + limit = (int64_t)offset - (1 << 16); + + for (; offset >= 0 && offset > limit && rc != 0; offset--) { + rc = try_parse_end_of_cd(archive, offset); + if (rc == -ENOTSUP) + break; + } + return rc; +} + +struct zip_archive *zip_archive_open(const char *path) +{ + struct zip_archive *archive; + int err, fd; + off_t size; + void *data; + + fd = open(path, O_RDONLY | O_CLOEXEC); + if (fd < 0) + return ERR_PTR(-errno); + + size = lseek(fd, 0, SEEK_END); + if (size == (off_t)-1 || size > UINT32_MAX) { + close(fd); + return ERR_PTR(-EINVAL); + } + + data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0); + err = -errno; + close(fd); + + if (data == MAP_FAILED) + return ERR_PTR(err); + + archive = malloc(sizeof(*archive)); + if (!archive) { + munmap(data, size); + return ERR_PTR(-ENOMEM); + }; + + archive->data = data; + archive->size = size; + + err = find_cd(archive); + if (err) { + munmap(data, size); + free(archive); + return ERR_PTR(err); + } + + return archive; +} + +void zip_archive_close(struct zip_archive *archive) +{ + munmap(archive->data, archive->size); + free(archive); +} + +static struct local_file_header *local_file_header_at_offset(struct zip_archive *archive, + __u32 offset) +{ + struct local_file_header *lfh; + + lfh = check_access(archive, offset, sizeof(*lfh)); + if (!lfh || lfh->magic != LOCAL_FILE_HEADER_MAGIC) + return NULL; + + return lfh; +} + +static int get_entry_at_offset(struct zip_archive *archive, __u32 offset, struct zip_entry *out) +{ + struct local_file_header *lfh; + __u32 compressed_size; + const char *name; + void *data; + + lfh = local_file_header_at_offset(archive, offset); + if (!lfh) + return -EINVAL; + + offset += sizeof(*lfh); + if ((lfh->flags & FLAG_ENCRYPTED) || (lfh->flags & FLAG_HAS_DATA_DESCRIPTOR)) + return -EINVAL; + + name = check_access(archive, offset, lfh->file_name_length); + if (!name) + return -EINVAL; + + offset += lfh->file_name_length; + if (!check_access(archive, offset, lfh->extra_field_length)) + return -EINVAL; + + offset += lfh->extra_field_length; + compressed_size = lfh->compressed_size; + data = check_access(archive, offset, compressed_size); + if (!data) + return -EINVAL; + + out->compression = lfh->compression; + out->name_length = lfh->file_name_length; + out->name = name; + out->data = data; + out->data_length = compressed_size; + out->data_offset = offset; + + return 0; +} + +int zip_archive_find_entry(struct zip_archive *archive, const char *file_name, + struct zip_entry *out) +{ + size_t file_name_length = strlen(file_name); + __u32 i, offset = archive->cd_offset; + + for (i = 0; i < archive->cd_records; ++i) { + __u16 cdfh_name_length, cdfh_flags; + struct cd_file_header *cdfh; + const char *cdfh_name; + + cdfh = check_access(archive, offset, sizeof(*cdfh)); + if (!cdfh || cdfh->magic != CD_FILE_HEADER_MAGIC) + return -EINVAL; + + offset += sizeof(*cdfh); + cdfh_name_length = cdfh->file_name_length; + cdfh_name = check_access(archive, offset, cdfh_name_length); + if (!cdfh_name) + return -EINVAL; + + cdfh_flags = cdfh->flags; + if ((cdfh_flags & FLAG_ENCRYPTED) == 0 && + (cdfh_flags & FLAG_HAS_DATA_DESCRIPTOR) == 0 && + file_name_length == cdfh_name_length && + memcmp(file_name, archive->data + offset, file_name_length) == 0) { + return get_entry_at_offset(archive, cdfh->offset, out); + } + + offset += cdfh_name_length; + offset += cdfh->extra_field_length; + offset += cdfh->file_comment_length; + } + + return -ENOENT; +} diff --git a/pkg/plugin/lib/common/libbpf/_src/zip.h b/pkg/plugin/lib/common/libbpf/_src/zip.h new file mode 100644 index 0000000000..1c1bb21fba --- /dev/null +++ b/pkg/plugin/lib/common/libbpf/_src/zip.h @@ -0,0 +1,47 @@ +/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */ + +#ifndef __LIBBPF_ZIP_H +#define __LIBBPF_ZIP_H + +#include + +/* Represents an open zip archive. + * Only basic ZIP files are supported, in particular the following are not + * supported: + * - encryption + * - streaming + * - multi-part ZIP files + * - ZIP64 + */ +struct zip_archive; + +/* Carries information on name, compression method, and data corresponding to a + * file in a zip archive. + */ +struct zip_entry { + /* Compression method as defined in pkzip spec. 0 means data is uncompressed. */ + __u16 compression; + + /* Non-null terminated name of the file. */ + const char *name; + /* Length of the file name. */ + __u16 name_length; + + /* Pointer to the file data. */ + const void *data; + /* Length of the file data. */ + __u32 data_length; + /* Offset of the file data within the archive. */ + __u32 data_offset; +}; + +/* Open a zip archive. Returns NULL in case of an error. */ +struct zip_archive *zip_archive_open(const char *path); + +/* Close a zip archive and release resources. */ +void zip_archive_close(struct zip_archive *archive); + +/* Look up an entry corresponding to a file in given zip archive. */ +int zip_archive_find_entry(struct zip_archive *archive, const char *name, struct zip_entry *out); + +#endif diff --git a/pkg/plugin/packetparser/packetparser_linux.go b/pkg/plugin/packetparser/packetparser_linux.go index fe8132ba77..3e0cb8da9e 100644 --- a/pkg/plugin/packetparser/packetparser_linux.go +++ b/pkg/plugin/packetparser/packetparser_linux.go @@ -32,10 +32,13 @@ import ( "github.com/microsoft/retina/pkg/metrics" "github.com/microsoft/retina/pkg/plugin/api" plugincommon "github.com/microsoft/retina/pkg/plugin/common" - _ "github.com/microsoft/retina/pkg/plugin/lib/_amd64" // nolint - _ "github.com/microsoft/retina/pkg/plugin/lib/_arm64" // nolint - _ "github.com/microsoft/retina/pkg/plugin/lib/common/libbpf/_src" // nolint - _ "github.com/microsoft/retina/pkg/plugin/packetparser/_cprog" // nolint + _ "github.com/microsoft/retina/pkg/plugin/lib/_amd64" // nolint + _ "github.com/microsoft/retina/pkg/plugin/lib/_arm64" // nolint + _ "github.com/microsoft/retina/pkg/plugin/lib/common/libbpf/_include/asm" // nolint + _ "github.com/microsoft/retina/pkg/plugin/lib/common/libbpf/_include/linux" // nolint + _ "github.com/microsoft/retina/pkg/plugin/lib/common/libbpf/_include/uapi/linux" // nolint + _ "github.com/microsoft/retina/pkg/plugin/lib/common/libbpf/_src" // nolint + _ "github.com/microsoft/retina/pkg/plugin/packetparser/_cprog" // nolint "github.com/microsoft/retina/pkg/pubsub" "github.com/microsoft/retina/pkg/utils" "github.com/microsoft/retina/pkg/watchers/endpoint"