Skip to content

Commit

Permalink
xdp: add a new helper for dev map multicast support
Browse files Browse the repository at this point in the history
This patch is for XDP multicast support, which has been discussed
before[0]. The goal is to be able to implement an OVS-like data plane in
XDP, i.e., a software switch that can forward XDP frames to multiple ports.

To achieve this, an application needs to specify a group of interfaces
to forward a packet to. It is also common to want to exclude one or more
physical interfaces from the forwarding operation - e.g., to forward a
packet to all interfaces in the multicast group except the interface it
arrived on. While this could be done simply by adding more groups, this
quickly leads to a combinatorial explosion in the number of groups an
application has to maintain.

To avoid the combinatorial explosion, we propose to include the ability
to specify an "exclude group" as part of the forwarding operation. This
needs to be a group (instead of just a single port index), because a
physical interface can be part of a logical grouping, such as a bond
device.

Thus, the logical forwarding operation becomes a "set difference"
operation, i.e. "forward to all ports in group A that are not also in
group B". This series implements such an operation using device maps to
represent the groups. This means that the XDP program specifies two
device maps, one containing the list of netdevs to redirect to, and the
other containing the exclude list.

To achieve this, I implement a new helper bpf_redirect_map_multi()
that accepts two maps: the forwarding map and the exclude map. The
forwarding map can be a DEVMAP or DEVMAP_HASH, but the exclude map
*must* be a DEVMAP_HASH to get better performance. If users don't want
to use an exclude map and simply want to stop redirecting back to the
ingress device, they can use the flag BPF_F_EXCLUDE_INGRESS.

As both bpf_xdp_redirect_map() and this new helper use struct
bpf_redirect_info, I add a new ex_map field and set tgt_value to NULL in
the new helper to distinguish it from bpf_xdp_redirect_map().

I also keep the generic data path in net/core/filter.c and the native
data path in kernel/bpf/devmap.c, so we can use direct calls to get
better performance.

[0] https://xdp-project.net/#Handling-multicast

Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
  • Loading branch information
liuhangbin authored and intel-lab-lkp committed Jan 20, 2021
1 parent d7bc981 commit 34a3d52
Show file tree
Hide file tree
Showing 9 changed files with 360 additions and 5 deletions.
20 changes: 20 additions & 0 deletions include/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -1427,6 +1427,11 @@ int dev_xdp_enqueue(struct net_device *dev, struct xdp_buff *xdp,
struct net_device *dev_rx);
int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
struct net_device *dev_rx);
bool dev_in_exclude_map(struct bpf_dtab_netdev *obj, struct bpf_map *map,
int exclude_ifindex);
int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
struct bpf_map *map, struct bpf_map *ex_map,
u32 flags);
int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
struct bpf_prog *xdp_prog);
bool dev_map_can_have_prog(struct bpf_map *map);
Expand Down Expand Up @@ -1595,6 +1600,21 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
return 0;
}

/* No-op variant of dev_in_exclude_map(): always reports the device as
 * not excluded.
 * NOTE(review): presumably the stub used when devmap support is not
 * compiled in — confirm against the surrounding #ifdef in bpf.h.
 */
static inline
bool dev_in_exclude_map(struct bpf_dtab_netdev *obj, struct bpf_map *map,
int exclude_ifindex)
{
return false;
}

/* No-op variant of dev_map_enqueue_multi(): enqueues nothing and
 * reports success (0).
 * NOTE(review): presumably the stub used when devmap support is not
 * compiled in — confirm against the surrounding #ifdef in bpf.h.
 */
static inline
int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
struct bpf_map *map, struct bpf_map *ex_map,
u32 flags)
{
return 0;
}

struct sk_buff;

static inline int dev_map_generic_redirect(struct bpf_dtab_netdev *dst,
Expand Down
1 change: 1 addition & 0 deletions include/linux/filter.h
Original file line number Diff line number Diff line change
Expand Up @@ -637,6 +637,7 @@ struct bpf_redirect_info {
u32 tgt_index;
void *tgt_value;
struct bpf_map *map;
struct bpf_map *ex_map;
u32 kern_flags;
struct bpf_nh_params nh;
};
Expand Down
1 change: 1 addition & 0 deletions include/net/xdp.h
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ struct sk_buff *__xdp_build_skb_from_frame(struct xdp_frame *xdpf,
struct net_device *dev);
struct sk_buff *xdp_build_skb_from_frame(struct xdp_frame *xdpf,
struct net_device *dev);
struct xdp_frame *xdpf_clone(struct xdp_frame *xdpf);

static inline
void xdp_convert_frame_to_buff(struct xdp_frame *frame, struct xdp_buff *xdp)
Expand Down
28 changes: 28 additions & 0 deletions include/uapi/linux/bpf.h
Original file line number Diff line number Diff line change
Expand Up @@ -3836,6 +3836,28 @@ union bpf_attr {
* Return
* A pointer to a struct socket on success or NULL if the file is
* not a socket.
*
* long bpf_redirect_map_multi(struct bpf_map *map, struct bpf_map *ex_map, u64 flags)
* Description
* This is a multicast implementation for XDP redirect. It will
* redirect the packet to ALL the interfaces in *map*, but
* exclude the interfaces in *ex_map*.
*
* The forwarding *map* could be either BPF_MAP_TYPE_DEVMAP or
* BPF_MAP_TYPE_DEVMAP_HASH. To get better performance, the
* *ex_map* is limited to BPF_MAP_TYPE_DEVMAP_HASH and must be
* keyed by ifindex for the helper to work.
*
* Currently the *flags* only supports *BPF_F_EXCLUDE_INGRESS*,
* which additionally excludes the current ingress device.
*
* See also bpf_redirect_map() as a unicast implementation,
* which supports redirecting packet to a specific ifindex
* in the map. As both helpers use struct bpf_redirect_info
to store the redirect info, we will use a NULL tgt_value
* to distinguish multicast and unicast redirecting.
* Return
* **XDP_REDIRECT** on success, or **XDP_ABORTED** on error.
*/
#define __BPF_FUNC_MAPPER(FN) \
FN(unspec), \
Expand Down Expand Up @@ -4001,6 +4023,7 @@ union bpf_attr {
FN(ktime_get_coarse_ns), \
FN(ima_inode_hash), \
FN(sock_from_file), \
FN(redirect_map_multi), \
/* */

/* integer value in 'imm' field of BPF_CALL instruction selects which helper
Expand Down Expand Up @@ -4177,6 +4200,11 @@ enum {
BPF_F_BPRM_SECUREEXEC = (1ULL << 0),
};

/* BPF_FUNC_redirect_map_multi flags. */
enum {
BPF_F_EXCLUDE_INGRESS = (1ULL << 0),
};

#define __bpf_md_ptr(type, name) \
union { \
type name; \
Expand Down
128 changes: 128 additions & 0 deletions kernel/bpf/devmap.c
Original file line number Diff line number Diff line change
Expand Up @@ -523,6 +523,134 @@ int dev_map_enqueue(struct bpf_dtab_netdev *dst, struct xdp_buff *xdp,
return __xdp_enqueue(dev, xdp, dev_rx, dst->xdp_prog);
}

/* Dispatch key iteration by map type with direct calls instead of the
 * indirect map->ops->map_get_next_key(), keeping the fast path cheap.
 * Returns 0 and fills *next_key on success, -ENOENT for any map type
 * that is not a devmap variant.
 */
static int devmap_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	if (map->map_type == BPF_MAP_TYPE_DEVMAP)
		return dev_map_get_next_key(map, key, next_key);

	if (map->map_type == BPF_MAP_TYPE_DEVMAP_HASH)
		return dev_map_hash_get_next_key(map, key, next_key);

	return -ENOENT;
}

/* Report whether @obj must be skipped during multicast forwarding:
 * either its ifindex matches @exclude_ifindex (the ingress device when
 * BPF_F_EXCLUDE_INGRESS is set), or it has an entry in the exclude
 * map. A NULL @map means only the ifindex check applies.
 */
bool dev_in_exclude_map(struct bpf_dtab_netdev *obj, struct bpf_map *map,
			int exclude_ifindex)
{
	int ifindex = obj->dev->ifindex;

	if (ifindex == exclude_ifindex)
		return true;

	return map && __dev_map_hash_lookup_elem(map, ifindex) != NULL;
}

/* devmap_get_next_obj() - starting after @key, find the next entry in
 * @map that the frame can actually be forwarded to.
 *
 * An entry is skipped when dev_in_exclude_map() matches it (exclude
 * map hit or @ex_ifindex match), when its device lacks ndo_xdp_xmit,
 * or when xdp_ok_fwd_dev() rejects the frame size for that device.
 *
 * On success the eligible entry is returned and *@next_key holds its
 * key; NULL means no further eligible entry was found.
 */
static struct bpf_dtab_netdev *devmap_get_next_obj(struct xdp_buff *xdp, struct bpf_map *map,
						   struct bpf_map *ex_map, u32 *key,
						   u32 *next_key, int ex_ifindex)
{
	struct bpf_dtab_netdev *obj;
	struct net_device *dev;
	u32 *tmp_key = key;
	u32 index;
	int err;

	err = devmap_get_next_key(map, tmp_key, next_key);
	if (err)
		return NULL;

	/* When using dev map hash, we could restart the hashtab traversal
	 * in case the key has been updated/removed in the mean time.
	 * So we may end up potentially looping due to traversal restarts
	 * from first elem.
	 *
	 * Let's use map's max_entries to limit the loop number.
	 */
	for (index = 0; index < map->max_entries; index++) {
		switch (map->map_type) {
		case BPF_MAP_TYPE_DEVMAP:
			obj = __dev_map_lookup_elem(map, *next_key);
			break;
		case BPF_MAP_TYPE_DEVMAP_HASH:
			obj = __dev_map_hash_lookup_elem(map, *next_key);
			break;
		default:
			/* Fix: the original only did 'break' here, leaving
			 * 'obj' uninitialized — reading it below was
			 * undefined behavior. Treat an unexpected map type
			 * as "no entry" instead.
			 */
			obj = NULL;
			break;
		}

		if (!obj || dev_in_exclude_map(obj, ex_map, ex_ifindex))
			goto find_next;

		dev = obj->dev;

		/* Target must support native XDP transmit. */
		if (!dev->netdev_ops->ndo_xdp_xmit)
			goto find_next;

		err = xdp_ok_fwd_dev(dev, xdp->data_end - xdp->data);
		if (unlikely(err))
			goto find_next;

		return obj;

find_next:
		tmp_key = next_key;
		err = devmap_get_next_key(map, tmp_key, next_key);
		if (err)
			break;
	}

	return NULL;
}

/* dev_map_enqueue_multi() - native-XDP multicast: enqueue @xdp to every
 * eligible device in @map, skipping those in @ex_map and, when
 * BPF_F_EXCLUDE_INGRESS is set in @flags, the ingress device @dev_rx.
 *
 * The frame is cloned once per additional target; the original frame is
 * handed to the final target, so exactly N-1 clones are made for N
 * targets and no clone is wasted.
 *
 * Returns 0 on success (including "no eligible device found"),
 * -EOVERFLOW if the buff cannot be converted to a frame, or -ENOMEM if
 * cloning fails (the original frame is released in that case).
 */
int dev_map_enqueue_multi(struct xdp_buff *xdp, struct net_device *dev_rx,
			  struct bpf_map *map, struct bpf_map *ex_map,
			  u32 flags)
{
	struct bpf_dtab_netdev *obj = NULL, *next_obj = NULL;
	struct xdp_frame *xdpf, *nxdpf;
	int ex_ifindex;
	u32 key, next_key;

	/* 0 is never a valid ifindex, so it disables the ingress check. */
	ex_ifindex = flags & BPF_F_EXCLUDE_INGRESS ? dev_rx->ifindex : 0;

	/* Find first available obj */
	obj = devmap_get_next_obj(xdp, map, ex_map, NULL, &key, ex_ifindex);
	if (!obj)
		return 0;

	/* Convert only after we know there is at least one target. */
	xdpf = xdp_convert_buff_to_frame(xdp);
	if (unlikely(!xdpf))
		return -EOVERFLOW;

	for (;;) {
		/* Check if we still have one more available obj */
		next_obj = devmap_get_next_obj(xdp, map, ex_map, &key,
					       &next_key, ex_ifindex);
		if (!next_obj) {
			/* Last target: enqueue the original frame. */
			bq_enqueue(obj->dev, xdpf, dev_rx, obj->xdp_prog);
			return 0;
		}

		/* More targets follow — this one gets a clone. */
		nxdpf = xdpf_clone(xdpf);
		if (unlikely(!nxdpf)) {
			xdp_return_frame_rx_napi(xdpf);
			return -ENOMEM;
		}

		bq_enqueue(obj->dev, nxdpf, dev_rx, obj->xdp_prog);

		/* Deal with next obj */
		obj = next_obj;
		key = next_key;
	}
}

int dev_map_generic_redirect(struct bpf_dtab_netdev *dst, struct sk_buff *skb,
struct bpf_prog *xdp_prog)
{
Expand Down
6 changes: 6 additions & 0 deletions kernel/bpf/verifier.c
Original file line number Diff line number Diff line change
Expand Up @@ -4461,6 +4461,7 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_DEVMAP:
case BPF_MAP_TYPE_DEVMAP_HASH:
if (func_id != BPF_FUNC_redirect_map &&
func_id != BPF_FUNC_redirect_map_multi &&
func_id != BPF_FUNC_map_lookup_elem)
goto error;
break;
Expand Down Expand Up @@ -4565,6 +4566,11 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
map->map_type != BPF_MAP_TYPE_XSKMAP)
goto error;
break;
case BPF_FUNC_redirect_map_multi:
if (map->map_type != BPF_MAP_TYPE_DEVMAP &&
map->map_type != BPF_MAP_TYPE_DEVMAP_HASH)
goto error;
break;
case BPF_FUNC_sk_redirect_map:
case BPF_FUNC_msg_redirect_map:
case BPF_FUNC_sock_map_update:
Expand Down
Loading

0 comments on commit 34a3d52

Please sign in to comment.