From fb57ba0b6a4a0d6f2f89e861cff3522e33025755 Mon Sep 17 00:00:00 2001 From: Kornilios Kourtis Date: Fri, 13 Oct 2023 11:44:15 +0200 Subject: [PATCH] bpf: remove cgrpid_tracker There is an issue with the cgroup tracker id (details can be found below). This issue causes policyfilter to misbehave. Furthermore, the original intention behind the cgroup tracker id was never realized and so it currently serves no purpose. Its only user is the policyfilter. This PR removes the cgroup tracker id, and, instead, computes the cgroup id at the time of the policyfilter check. For most common setups (cgroupv2) the overhead of doing so is negligible. Computing the cgroup id at the time of check is more reliable and also simplifies the code. More details can be found below. We have a namespaced policy installed that mathes connect() calls to 8.8.8.8, and execute the following on a pod in the same namespace: `kubectl exec -it nonqos-demo -- curl 8.8.8.8` We observe two things: * there is no event from the policy (as we were expecting) * the cgroup tracker id of the curl process is the same as the parent runc process (which should not be the case) The exec event we get from curl has: .process.pid: 244958 .process.binary: /usr/bin/curl .parent.pid: 244948 .parent.binary: /usr/bin/runc Looking at the cgroup tracker id in the execve map for these two pids we see: "cgrpid_tracker": 2371 "cgrpid_tracker": 2371 Which means that the cgroup id tracker is not correct, and the curl process will not be matched by the policy. The issue might arise from the fact that in the wake_up_new_task(), computing the cgroup tracker id, uses get_current_cgroup_id(). We are, however, still in the parent so the get_current_cgroup_id() helper will return the cgroup of the parent, not of the child (which is what we want). To simplify the code complexity (both from the perspective of understandabiliuty but also verification complexity), we instead remove the tracker id and retrieve the current cgroup id at the time of the check. This is much simpler, and not less efficient for the common case (cgroupv2). Following the same steps with a version build with the current patch, produces an event as expected. Signed-off-by: Kornilios Kourtis --- bpf/lib/bpf_cgroup.h | 112 +++++++++++---------------- bpf/lib/process.h | 1 - bpf/process/bpf_fork.c | 23 +----- bpf/process/bpf_process_event.h | 65 ++-------------- bpf/process/policy_filter.h | 14 ++-- pkg/grpc/exec/exec.go | 1 - pkg/sensors/exec/execvemap/execve.go | 17 ++-- pkg/sensors/tracing/kprobe_test.go | 3 + pkg/testutils/sensors/load.go | 2 +- 9 files changed, 73 insertions(+), 165 deletions(-) diff --git a/bpf/lib/bpf_cgroup.h b/bpf/lib/bpf_cgroup.h index 7d2a7e2461e..b7892bc5c7b 100644 --- a/bpf/lib/bpf_cgroup.h +++ b/bpf/lib/bpf_cgroup.h @@ -311,7 +311,7 @@ get_task_cgroup(struct task_struct *task, __u32 subsys_idx, __u32 *error_flags) } /** - * tg_get_current_cgroup_id() Returns the accurate cgroup id of current task. + * __tg_get_current_cgroup_id() Returns the accurate cgroup id of current task. * @cgrp: cgroup target of current task. * @cgrpfs_ver: Cgroupfs Magic number either Cgroupv1 or Cgroupv2 * @@ -323,10 +323,8 @@ get_task_cgroup(struct task_struct *task, __u32 subsys_idx, __u32 *error_flags) * Returns the cgroup id of current task on success, zero on failures. */ static inline __attribute__((always_inline)) __u64 -tg_get_current_cgroup_id(struct cgroup *cgrp, __u64 cgrpfs_ver) +__tg_get_current_cgroup_id(struct cgroup *cgrp, __u64 cgrpfs_ver) { - __u64 id = 0; - /* * Try the bpf helper on the default hierarchy if available * and if we are running in unified cgroupv2 @@ -334,12 +332,53 @@ tg_get_current_cgroup_id(struct cgroup *cgrp, __u64 cgrpfs_ver) if (bpf_core_enum_value_exists(enum bpf_func_id, BPF_FUNC_get_current_cgroup_id) && cgrpfs_ver == CGROUP2_SUPER_MAGIC) { - id = get_current_cgroup_id(); + return get_current_cgroup_id(); } else { - id = get_cgroup_id(cgrp); + return get_cgroup_id(cgrp); } +} - return id; +/** + * tg_get_current_cgroup_id() Returns the accurate cgroup id of current task. + * + * It works similar to __tg_get_current_cgroup_id, but computes the cgrp if it is needed. + * Returns the cgroup id of current task on success, zero on failures. + */ +static inline __attribute__((always_inline)) __u64 +tg_get_current_cgroup_id() +{ + __u32 error_flags; + struct cgroup *cgrp; + __u64 cgrpfs_magic = 0; + struct task_struct *task; + struct tetragon_conf *conf; + int zero = 0, subsys_idx = 0; + + conf = map_lookup_elem(&tg_conf_map, &zero); + if (conf) { + /* Select which cgroup version */ + cgrpfs_magic = conf->cgrp_fs_magic; + subsys_idx = conf->tg_cgrp_subsys_idx; + } + + /* + * Try the bpf helper on the default hierarchy if available + * and if we are running in unified cgroupv2 + */ + if (bpf_core_enum_value_exists(enum bpf_func_id, + BPF_FUNC_get_current_cgroup_id) && + cgrpfs_magic == CGROUP2_SUPER_MAGIC) { + return get_current_cgroup_id(); + } + + task = (struct task_struct *)get_current_task(); + + // NB: error_flags are ignored for now + cgrp = get_task_cgroup(task, subsys_idx, &error_flags); + if (!cgrp) + return 0; + + return get_cgroup_id(cgrp); } /** @@ -392,63 +431,4 @@ __init_cgrp_tracking_val_heap(struct cgroup *cgrp, cgroup_state state) return heap; } -/** - * __set_task_cgrpid_tracker() Sets the cgrpid_tracker of a task. - * It checks tetragon_conf if not available then exit. - * If tetragon_conf is available then checks if the task - * execve_map_value->cgrpid_tracker is set, if so do nothing. - * If not, then set the task execve_map_value->cgrpid_tracker - * to the tracking cgroup ID. - */ -static inline __attribute__((always_inline)) int -__set_task_cgrpid_tracker(struct tetragon_conf *conf, struct task_struct *task, - struct execve_map_value *execve_val, __u32 *error_flags) -{ - u32 flags = 0; - struct cgroup *cgrp; - __u64 cgrpid_tracker = 0; - - /* Ensure that execve_val is not null */ - if (unlikely(!conf) || unlikely(!execve_val)) - return 0; - - probe_read(&flags, sizeof(flags), _(&task->flags)); - /* Skip kernel threads */ - if (flags & PF_KTHREAD) - return 0; - - /* Set the tracking cgroup id only if it was not set, - * this avoids cgroup thread granularity mess! - */ - if (execve_val->cgrpid_tracker != 0) - return 0; - - cgrp = get_task_cgroup(task, conf->tg_cgrp_subsys_idx, error_flags); - if (!cgrp) - return 0; - - /* TODO: get current cgroup ancestors and their levels, - * so we can use them as a proper reference for setting, - * cgroup tracking IDs. - * - * For now we just use the current cgroup ID as tracker. - */ - cgrpid_tracker = tg_get_current_cgroup_id(cgrp, conf->cgrp_fs_magic); - if (!cgrpid_tracker) { - /* Failed to get cgrpid_tracker do nothing. This should never happen */ - *error_flags |= EVENT_ERROR_CGROUP_ID; - return 0; - } - - /* Update the execve_val with the tracking cgroup ID */ - execve_val->cgrpid_tracker = cgrpid_tracker; - - /* TODO: - * Lookup cgroup data from the cgroup bpf map, create an entry - * if not found, otherwise update previous with the corresponding - * new cgroup state. - */ - return 0; -} - #endif // __BPF_CGROUP_ diff --git a/bpf/lib/process.h b/bpf/lib/process.h index 4ba6c57561d..358a1ffabb2 100644 --- a/bpf/lib/process.h +++ b/bpf/lib/process.h @@ -302,7 +302,6 @@ struct execve_map_value { __u32 nspid; __u32 binary; __u32 pad; - __u64 cgrpid_tracker; /* Pinned Cgroup ID Tracker */ struct msg_ns ns; struct msg_capabilities caps; } __attribute__((packed)) __attribute__((aligned(8))); diff --git a/bpf/process/bpf_fork.c b/bpf/process/bpf_fork.c index e617753e093..1bdf6bc84b8 100644 --- a/bpf/process/bpf_fork.c +++ b/bpf/process/bpf_fork.c @@ -21,10 +21,7 @@ __attribute__((section("kprobe/wake_up_new_task"), used)) int BPF_KPROBE(event_wake_up_new_task, struct task_struct *task) { struct execve_map_value *curr, *parent; - struct task_struct *current_task; - struct tetragon_conf *config; - u32 tgid = 0, ppid = 0, error_flags = 0; - int zero = 0; + u32 tgid = 0, error_flags = 0; if (!task) return 0; @@ -35,24 +32,6 @@ BPF_KPROBE(event_wake_up_new_task, struct task_struct *task) if (!curr) return 0; - /* Cgroup environment */ - config = map_lookup_elem(&tg_conf_map, &zero); - if (config) { - /* Set the tracking cgroup ID for the new task if not already set */ - __set_task_cgrpid_tracker(config, task, curr, &error_flags); - - /* Let's try to catch current or "parent" if it was not tracked */ - current_task = (struct task_struct *)get_current_task(); - probe_read(&ppid, sizeof(ppid), _(¤t_task->tgid)); - /* If they share same thread group nothing todo... */ - if (tgid != ppid) { - parent = execve_map_get_noinit(ppid); - if (parent) - __set_task_cgrpid_tracker(config, current_task, - parent, &error_flags); - } - } - /* Generate an EVENT_COMMON_FLAG_CLONE event once per process, * that is, thread group. */ diff --git a/bpf/process/bpf_process_event.h b/bpf/process/bpf_process_event.h index 1a6bffddff3..3d3e44e0a06 100644 --- a/bpf/process/bpf_process_event.h +++ b/bpf/process/bpf_process_event.h @@ -531,45 +531,6 @@ get_namespaces(struct msg_ns *msg, struct task_struct *task) } } -/* Gather current task cgroup id */ -static inline __attribute__((always_inline)) void -__event_get_current_cgroup_id(struct tetragon_conf *conf, struct cgroup *cgrp, - struct execve_map_value *execve_val, - struct msg_execve_event *msg) -{ - __u64 cgrpfs_magic = 0; - struct msg_process *process; - - process = &msg->process; - msg->kube.cgrpid = 0; - - /* If tracking Cgroup ID is set use it, otherwise use current - * context one. This way we guarantee to always have a cgrpid - * and return cgroup information even if we miss the tracking - * Cgroup ID. - * - * Ensure that execve_val is not null. - */ - if (execve_val && execve_val->cgrpid_tracker) { - /* Set the k8s cgrpid with the tracking ID. */ - msg->kube.cgrpid = execve_val->cgrpid_tracker; - return; - } - - /* Gather Cgroup information using current context */ - if (conf) { - /* Select which cgroup version */ - cgrpfs_magic = conf->cgrp_fs_magic; - } - - /* Collect event cgroup ID */ - msg->kube.cgrpid = tg_get_current_cgroup_id(cgrp, cgrpfs_magic); - if (execve_val) - execve_val->cgrpid_tracker = msg->kube.cgrpid; - if (!msg->kube.cgrpid) - process->flags |= EVENT_ERROR_CGROUP_ID; -} - /* Gather current task cgroup name */ static inline __attribute__((always_inline)) void __event_get_current_cgroup_name(struct cgroup *cgrp, @@ -610,34 +571,22 @@ static inline __attribute__((always_inline)) void __event_get_cgroup_info(struct task_struct *task, struct msg_execve_event *msg) { - __u32 pid; + __u64 cgrpfs_magic = 0; int zero = 0, subsys_idx = 0; struct cgroup *cgrp; struct msg_process *process; struct tetragon_conf *conf; - struct execve_map_value *execve_val; process = &msg->process; /* Clear cgroup info at the beginning, so if we return early we do not pass previous data */ memset(&msg->kube, 0, sizeof(struct msg_k8s)); - pid = (get_current_pid_tgid() >> 32); - execve_val = execve_map_get_noinit(pid); - /* Even if execve_val is null we must continue collect information */ - - /* Check if cgroup configuration is set */ conf = map_lookup_elem(&tg_conf_map, &zero); if (conf) { - /* Select the right css to use */ - if (conf->tg_cgrp_subsys_idx != 0) - subsys_idx = conf->tg_cgrp_subsys_idx; - - /* Set the tracking cgroup ID of the task if it was not - * already set, this could be the case due to race conditions. - * Do not remove this as following cgroup logic depend on it. - */ - __set_task_cgrpid_tracker(conf, task, execve_val, &process->flags); + /* Select which cgroup version */ + cgrpfs_magic = conf->cgrp_fs_magic; + subsys_idx = conf->tg_cgrp_subsys_idx; } cgrp = get_task_cgroup(task, subsys_idx, &process->flags); @@ -645,9 +594,11 @@ __event_get_cgroup_info(struct task_struct *task, return; /* Collect event cgroup ID */ - __event_get_current_cgroup_id(conf, cgrp, execve_val, msg); + msg->kube.cgrpid = __tg_get_current_cgroup_id(cgrp, cgrpfs_magic); + if (!msg->kube.cgrpid) + process->flags |= EVENT_ERROR_CGROUP_ID; - /* Get the cgroup name of this event. TODO: pass the tracking cgroup ID. */ + /* Get the cgroup name of this event. */ __event_get_current_cgroup_name(cgrp, msg); } #endif diff --git a/bpf/process/policy_filter.h b/bpf/process/policy_filter.h index 977f0e8581b..324a010a174 100644 --- a/bpf/process/policy_filter.h +++ b/bpf/process/policy_filter.h @@ -26,23 +26,21 @@ struct { static inline __attribute__((always_inline)) bool policy_filter_check(u32 policy_id) { - __u32 pid; void *policy_map; - struct execve_map_value *curr; + __u64 cgroupid; if (!policy_id) return true; - pid = (get_current_pid_tgid() >> 32); - curr = execve_map_get_noinit(pid); - if (!curr || curr->cgrpid_tracker == 0) - return false; - policy_map = map_lookup_elem(&policy_filter_maps, &policy_id); if (!policy_map) return false; - return map_lookup_elem(policy_map, &curr->cgrpid_tracker); + cgroupid = tg_get_current_cgroup_id(); + if (!cgroupid) + return false; + + return map_lookup_elem(policy_map, &cgroupid); } #endif /* POLICY_FILTER_MAPS_H__ */ diff --git a/pkg/grpc/exec/exec.go b/pkg/grpc/exec/exec.go index ba2f1473874..368c43fbdae 100644 --- a/pkg/grpc/exec/exec.go +++ b/pkg/grpc/exec/exec.go @@ -129,7 +129,6 @@ func (msg *MsgCgroupEventUnix) HandleMessage() *tetragon.GetEventsResponse { "cgroup.event": op.String(), "PID": msg.PID, "NSPID": msg.NSPID, - "cgroup.IDTracker": msg.CgrpidTracker, "cgroup.ID": msg.Cgrpid, "cgroup.state": st, "cgroup.hierarchyID": msg.CgrpData.HierarchyId, diff --git a/pkg/sensors/exec/execvemap/execve.go b/pkg/sensors/exec/execvemap/execve.go index 1420be6ec6c..3c4ef371ff1 100644 --- a/pkg/sensors/exec/execvemap/execve.go +++ b/pkg/sensors/exec/execvemap/execve.go @@ -12,13 +12,12 @@ type ExecveKey struct { } type ExecveValue struct { - Process processapi.MsgExecveKey `align:"key"` - Parent processapi.MsgExecveKey `align:"pkey"` - Flags uint32 `align:"flags"` - Nspid uint32 `align:"nspid"` - Binary uint32 `align:"binary"` - Pad uint32 `align:"pad"` - CgrpIdTracker uint64 `align:"cgrpid_tracker"` - Namespaces processapi.MsgNamespaces `align:"ns"` - Capabilities processapi.MsgCapabilities `align:"caps"` + Process processapi.MsgExecveKey `align:"key"` + Parent processapi.MsgExecveKey `align:"pkey"` + Flags uint32 `align:"flags"` + Nspid uint32 `align:"nspid"` + Binary uint32 `align:"binary"` + Pad uint32 `align:"pad"` + Namespaces processapi.MsgNamespaces `align:"ns"` + Capabilities processapi.MsgCapabilities `align:"caps"` } diff --git a/pkg/sensors/tracing/kprobe_test.go b/pkg/sensors/tracing/kprobe_test.go index 2b2cd828ec1..7e3b97b97d0 100644 --- a/pkg/sensors/tracing/kprobe_test.go +++ b/pkg/sensors/tracing/kprobe_test.go @@ -3731,6 +3731,9 @@ func TestLoadKprobeSensor(t *testing.T) { // generic_kprobe_process_event*,generic_kprobe_actions,retkprobe tus.SensorMap{Name: "fdinstall_map", Progs: []uint{1, 2, 3, 4, 5, 12, 14}}, + + // generic_kprobe_event + tus.SensorMap{Name: "tg_conf_map", Progs: []uint{0}}, } if kernels.EnableLargeProgs() { diff --git a/pkg/testutils/sensors/load.go b/pkg/testutils/sensors/load.go index d71719dc51e..dfdcdacf823 100644 --- a/pkg/testutils/sensors/load.go +++ b/pkg/testutils/sensors/load.go @@ -131,7 +131,7 @@ func mergeInBaseSensorMaps(t *testing.T, sensorMaps []SensorMap, sensorProgs []S // event_execve SensorMap{Name: "names_map", Progs: []uint{0}}, - SensorMap{Name: "tg_conf_map", Progs: []uint{0, 2}}, + SensorMap{Name: "tg_conf_map", Progs: []uint{0}}, // event_wake_up_new_task SensorMap{Name: "execve_val", Progs: []uint{2}},