diff --git a/drivers/net/tun.c b/drivers/net/tun.c index f16407242b1891..9880b3bc8fa5d1 100644 --- a/drivers/net/tun.c +++ b/drivers/net/tun.c @@ -75,6 +75,7 @@ #include #include #include +#include #include @@ -121,7 +122,8 @@ do { \ #define TUN_VNET_BE 0x40000000 #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \ - IFF_MULTI_QUEUE | IFF_NAPI) + IFF_MULTI_QUEUE | IFF_NAPI | IFF_NAPI_FRAGS) + #define GOODCOPY_LEN 128 #define FLT_EXACT_COUNT 8 @@ -173,6 +175,7 @@ struct tun_file { unsigned int ifindex; }; struct napi_struct napi; + struct mutex napi_mutex; /* Protects access to the above napi */ struct list_head next; struct tun_struct *detached; struct skb_array tx_array; @@ -277,6 +280,7 @@ static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile, netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll, NAPI_POLL_WEIGHT); napi_enable(&tfile->napi); + mutex_init(&tfile->napi_mutex); } } @@ -292,6 +296,11 @@ static void tun_napi_del(struct tun_struct *tun, struct tun_file *tfile) netif_napi_del(&tfile->napi); } +static bool tun_napi_frags_enabled(const struct tun_struct *tun) +{ + return READ_ONCE(tun->flags) & IFF_NAPI_FRAGS; +} + #ifdef CONFIG_TUN_VNET_CROSS_LE static inline bool tun_legacy_is_little_endian(struct tun_struct *tun) { @@ -1036,7 +1045,8 @@ static void tun_poll_controller(struct net_device *dev) * supports polling, which enables bridge devices in virt setups to * still use netconsole * If NAPI is enabled, however, we need to schedule polling for all - * queues. + * queues unless we are using napi_gro_frags(), which we call in + * process context and not in NAPI context. */ struct tun_struct *tun = netdev_priv(dev); @@ -1044,6 +1054,9 @@ static void tun_poll_controller(struct net_device *dev) struct tun_file *tfile; int i; + if (tun_napi_frags_enabled(tun)) + return; + rcu_read_lock(); for (i = 0; i < tun->numqueues; i++) { tfile = rcu_dereference(tun->tfiles[i]); @@ -1266,6 +1279,64 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait) return mask; } +static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile, + size_t len, + const struct iov_iter *it) +{ + struct sk_buff *skb; + size_t linear; + int err; + int i; + + if (it->nr_segs > MAX_SKB_FRAGS + 1) + return ERR_PTR(-ENOMEM); + + local_bh_disable(); + skb = napi_get_frags(&tfile->napi); + local_bh_enable(); + if (!skb) + return ERR_PTR(-ENOMEM); + + linear = iov_iter_single_seg_count(it); + err = __skb_grow(skb, linear); + if (err) + goto free; + + skb->len = len; + skb->data_len = len - linear; + skb->truesize += skb->data_len; + + for (i = 1; i < it->nr_segs; i++) { + size_t fragsz = it->iov[i].iov_len; + unsigned long offset; + struct page *page; + void *data; + + if (fragsz == 0 || fragsz > PAGE_SIZE) { + err = -EINVAL; + goto free; + } + + local_bh_disable(); + data = napi_alloc_frag(fragsz); + local_bh_enable(); + if (!data) { + err = -ENOMEM; + goto free; + } + + page = virt_to_head_page(data); + offset = data - page_address(page); + skb_fill_page_desc(skb, i - 1, page, offset, fragsz); + } + + return skb; +free: + /* frees skb and all frags allocated with napi_alloc_frag() */ + napi_free_frags(&tfile->napi); + return ERR_PTR(err); +} + /* prepad is the amount to reserve at front. len is length after that. * linear is a hint as to how much to copy (usually headers). */ static struct sk_buff *tun_alloc_skb(struct tun_file *tfile, @@ -1478,6 +1549,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, int err; u32 rxhash; int skb_xdp = 1; + bool frags = tun_napi_frags_enabled(tun); if (!(tun->dev->flags & IFF_UP)) return -EIO; @@ -1535,7 +1607,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, zerocopy = true; } - if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) { + if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) { /* For the packet that is not easy to be processed * (e.g gso or jumbo packet), we will do it at after * skb was created with generic XDP routine. @@ -1556,10 +1628,24 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, linear = tun16_to_cpu(tun, gso.hdr_len); } - skb = tun_alloc_skb(tfile, align, copylen, linear, noblock); + if (frags) { + mutex_lock(&tfile->napi_mutex); + skb = tun_napi_alloc_frags(tfile, copylen, from); + /* tun_napi_alloc_frags() enforces a layout for the skb. + * If zerocopy is enabled, then this layout will be + * overwritten by zerocopy_sg_from_iter(). + */ + zerocopy = false; + } else { + skb = tun_alloc_skb(tfile, align, copylen, linear, + noblock); + } + if (IS_ERR(skb)) { if (PTR_ERR(skb) != -EAGAIN) this_cpu_inc(tun->pcpu_stats->rx_dropped); + if (frags) + mutex_unlock(&tfile->napi_mutex); return PTR_ERR(skb); } @@ -1571,6 +1657,11 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, if (err) { this_cpu_inc(tun->pcpu_stats->rx_dropped); kfree_skb(skb); + if (frags) { + tfile->napi.skb = NULL; + mutex_unlock(&tfile->napi_mutex); + } + return -EFAULT; } } @@ -1578,6 +1669,11 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) { this_cpu_inc(tun->pcpu_stats->rx_frame_errors); kfree_skb(skb); + if (frags) { + tfile->napi.skb = NULL; + mutex_unlock(&tfile->napi_mutex); + } + return -EINVAL; } @@ -1603,7 +1699,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, skb->dev = tun->dev; break; case IFF_TAP: - skb->protocol = eth_type_trans(skb, tun->dev); + if (!frags) + skb->protocol = eth_type_trans(skb, tun->dev); break; } @@ -1638,7 +1735,23 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, rxhash = __skb_get_hash_symmetric(skb); - if (tun->flags & IFF_NAPI) { + if (frags) { + /* Exercise flow dissector code path. */ + u32 headlen = eth_get_headlen(skb->data, skb_headlen(skb)); + + if (headlen > skb_headlen(skb) || headlen < ETH_HLEN) { + this_cpu_inc(tun->pcpu_stats->rx_dropped); + napi_free_frags(&tfile->napi); + mutex_unlock(&tfile->napi_mutex); + WARN_ON(1); + return -ENOMEM; + } + + local_bh_disable(); + napi_gro_frags(&tfile->napi); + local_bh_enable(); + mutex_unlock(&tfile->napi_mutex); + } else if (tun->flags & IFF_NAPI) { struct sk_buff_head *queue = &tfile->sk.sk_write_queue; int queue_len; @@ -2061,6 +2174,15 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) if (tfile->detached) return -EINVAL; + if ((ifr->ifr_flags & IFF_NAPI_FRAGS)) { + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (!(ifr->ifr_flags & IFF_NAPI) || + (ifr->ifr_flags & TUN_TYPE_MASK) != IFF_TAP) + return -EINVAL; + } + dev = __dev_get_by_name(net, ifr->ifr_name); if (dev) { if (ifr->ifr_flags & IFF_TUN_EXCL) diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h index 30b6184884ebc4..365ade5685c98f 100644 --- a/include/uapi/linux/if_tun.h +++ b/include/uapi/linux/if_tun.h @@ -61,6 +61,7 @@ #define IFF_TUN 0x0001 #define IFF_TAP 0x0002 #define IFF_NAPI 0x0010 +#define IFF_NAPI_FRAGS 0x0020 #define IFF_NO_PI 0x1000 /* This flag has no real effect */ #define IFF_ONE_QUEUE 0x2000