From b8f6fad5b1ef47e2b188ab639122fb1f61120ac8 Mon Sep 17 00:00:00 2001 From: Hsiao-keng Jerry Chu Date: Sat, 30 Jul 2016 09:37:54 -0700 Subject: [PATCH] lkl: Add offload (TSO4, CSUM) support to LKL device, #1 of 2 This patch prepares the LKL device code for offload support. The gist of it is to change the tx() and rx() interfaces to accept a vector of buffers (like iovec) rather than just a single one. This allows a scatter-gather list originating from an skb's frags, coming down from the stack above, to be passed to a backend device without the need to copy. The patch introduces a new environment variable "LKL_HIJACK_OFFLOAD" to the hijack code to allow easy enabling of any combination of offload features. Once the desired offload features are decided, the code will program the tap device accordingly, e.g., to properly translate offload flags, and decide whether IFF_VNET_HDR needs to be enabled. It will also program the selected features into the LKL kernel's virtio driver. Note that the actual code to program the ring properly for various offload flavors is in the 2nd commit. The patch also allows an existing env var "LKL_HIJACK_DEBUG" to take on a hex value, e.g., "export LKL_HIJACK_DEBUG=0x100". This is used to set an internal debug flag "lkl_debug" in the hijack code for diagnostic purposes. The above export statement, for example, will cause the LKL kernel to pause after the hijack'ed app exits. This allows one to debug or collect info from the LKL kernel before it quits. Signed-off-by: H.K. 
Jerry Chu --- Documentation/lkl.txt | 19 +++++++++- tools/lkl/include/lkl.h | 4 +- tools/lkl/include/lkl_host.h | 20 +++++----- tools/lkl/lib/hijack/init.c | 51 +++++++++++++++++++------- tools/lkl/lib/virtio_net.c | 23 ++++++------ tools/lkl/lib/virtio_net_dpdk.c | 17 +++++---- tools/lkl/lib/virtio_net_linux_fdnet.c | 24 ++++++------ tools/lkl/lib/virtio_net_tap.c | 34 +++++++++++++++-- tools/lkl/lib/virtio_net_vde.c | 21 ++++++----- tools/lkl/tests/boot.c | 4 +- tools/lkl/tests/net-test.c | 4 +- 11 files changed, 147 insertions(+), 74 deletions(-) diff --git a/Documentation/lkl.txt b/Documentation/lkl.txt index ef4b1afeef59ce..c0cbaf09573b57 100644 --- a/Documentation/lkl.txt +++ b/Documentation/lkl.txt @@ -198,7 +198,12 @@ are the list of those variable for your environment. ``` * LKL_HIJACK_DEBUG - increase the verbose level of debug information. + Setting it causes some debug information (both from the kernel and the + LKL library) to be enabled. + It is also used as a bit mask to turn on specific debugging facilities. + E.g., setting it to 0x100 ("export LKL_HIJACK_DEBUG=0x100") will cause + the LKL kernel to pause after the hijack'ed app exits. This allows one + to debug or collect info from the LKL kernel before it quits. ``` $ LKL_HIJACK_DEBUG=1 lkl-hijack.sh ip address show ``` @@ -210,6 +215,18 @@ are the list of those variable for your environment. ``` $ LKL_HIJACK_SINGLE_CPU=1 lkl-hijack.sh ip address show ``` +* LKL_HIJACK_OFFLOAD + + Work as a bit mask to enable selective device offload features. E.g., + to enable "mergeable RX buffer" (LKL_VIRTIO_NET_F_MRG_RXBUF) + + "guest csum" (LKL_VIRTIO_NET_F_GUEST_CSUM) device features, simply set + it to 0x8002. + + See virtio_net.h for a list of offload features and their bit masks. 
+``` + $ LKL_HIJACK_OFFLOAD=0x8002 lkl-hijack.sh ./netserver -D -f +``` + FAQ === diff --git a/tools/lkl/include/lkl.h b/tools/lkl/include/lkl.h index 50b21a619b6b24..2310a28700e294 100644 --- a/tools/lkl/include/lkl.h +++ b/tools/lkl/include/lkl.h @@ -237,7 +237,7 @@ struct lkl_netdev; * @returns a network device id (0 is valid) or a strictly negative value in * case of error */ -int lkl_netdev_add(struct lkl_netdev *nd, void *mac); +int lkl_netdev_add(struct lkl_netdev *nd, void *mac, int offload); /** * lkl_netdevs_remove - destroy all network devices @@ -283,7 +283,7 @@ int lkl_stop_syscall_thread(void); * @ifname - interface name for the TAP device. need to be configured * on host in advance */ -struct lkl_netdev *lkl_netdev_tap_create(const char *ifname); +struct lkl_netdev *lkl_netdev_tap_create(const char *ifname, int offload); /** * lkl_netdev_dpdk_create - create DPDK net_device for the virtio net backend diff --git a/tools/lkl/include/lkl_host.h b/tools/lkl/include/lkl_host.h index 2ccaa90153c22d..9a7b5e45f572f2 100644 --- a/tools/lkl/include/lkl_host.h +++ b/tools/lkl/include/lkl_host.h @@ -21,7 +21,7 @@ extern char lkl_virtio_devs[256]; struct lkl_dev_buf { void *addr; - unsigned int len; + size_t len; }; extern struct lkl_dev_blk_ops lkl_dev_blk_ops; @@ -50,6 +50,7 @@ struct lkl_dev_blk_ops { struct lkl_netdev { struct lkl_dev_net_ops *ops; lkl_thread_t rx_tid, tx_tid; + uint8_t has_vnet_hdr: 1; }; struct lkl_dev_net_ops { @@ -58,11 +59,11 @@ struct lkl_dev_net_ops { * The data buffer can only hold 0 or 1 complete packets. * * @nd - pointer to the network device - * @data - pointer to the buffer - * @len - size of the buffer in bytes - * @returns 0 for success and -1 for failure. + * @iov - pointer to the buffer vector + * @cnt - # of vectors in iov. 
+ * @returns number of bytes transmitted */ - int (*tx)(struct lkl_netdev *nd, void *data, int len); + int (*tx)(struct lkl_netdev *nd, struct lkl_dev_buf *iov, int cnt); /* Reads a packet from the net device. * * It must only read one complete packet if present. @@ -71,12 +72,11 @@ struct lkl_dev_net_ops { * decide to drop it or trim it. * * @nd - pointer to the network device - * @data - pointer to the buffer to store the packet - * @len - pointer to the maximum size of the buffer. Also stores the - * real number of bytes read after return. - * @returns 0 for success and -1 if nothing is read. + * @iov - pointer to the buffer vector to store the packet + * @cnt - # of vectors in iov. + * @returns number of bytes read for success or < 0 if error */ - int (*rx)(struct lkl_netdev *nd, void *data, int *len); + int (*rx)(struct lkl_netdev *nd, struct lkl_dev_buf *iov, int cnt); #define LKL_DEV_NET_POLL_RX 1 #define LKL_DEV_NET_POLL_TX 2 /* Polls a net device. diff --git a/tools/lkl/lib/hijack/init.c b/tools/lkl/lib/hijack/init.c index 2f2f59c5048679..9307977e77ace0 100644 --- a/tools/lkl/lib/hijack/init.c +++ b/tools/lkl/lib/hijack/init.c @@ -202,6 +202,8 @@ static void PinToFirstCpu(const cpu_set_t* cpus) } } +int lkl_debug; + void __attribute__((constructor(102))) hijack_init(void) { @@ -234,11 +236,17 @@ hijack_init(void) char *single_cpu= getenv("LKL_HIJACK_SINGLE_CPU"); int single_cpu_mode = 0; cpu_set_t ori_cpu; + char *offload1 = getenv("LKL_HIJACK_OFFLOAD"); + int offload = 0; - if (!debug) + if (!debug) { lkl_host_ops.print = NULL; - else + } else { lkl_register_dbg_handler(); + lkl_debug = strtol(debug, NULL, 0); + } + if (offload1) + offload = strtol(offload1, NULL, 0); if (single_cpu) { single_cpu_mode = atoi(single_cpu); @@ -274,18 +282,28 @@ hijack_init(void) "WARN: variable LKL_HIJACK_NET_TAP is now obsoleted.\n" " please use LKL_HIJACK_NET_IFTYPE and " "LKL_HIJACK_NET_IFPARAMS instead.\n"); - nd = lkl_netdev_tap_create(tap); + nd = 
lkl_netdev_tap_create(tap, offload); } if (!nd && iftype && ifparams) { - if ((strcmp(iftype, "tap") == 0)) - nd = lkl_netdev_tap_create(ifparams); - else if (strcmp(iftype, "dpdk") == 0) - nd = lkl_netdev_dpdk_create(ifparams); - else if (strcmp(iftype, "vde") == 0) - nd = lkl_netdev_vde_create(ifparams); - else if (strcmp(iftype, "raw") == 0) - nd = lkl_netdev_raw_create(ifparams); + if ((strcmp(iftype, "tap") == 0)) { + nd = lkl_netdev_tap_create(ifparams, offload); + } else { + if (offload) { + fprintf(stderr, + "WARN: LKL_HIJACK_OFFLOAD is only " + "supported on tap device (for now)!\n" + "No offload features will be " + "enabled.\n"); + } + offload = 0; + if (strcmp(iftype, "dpdk") == 0) + nd = lkl_netdev_dpdk_create(ifparams); + else if (strcmp(iftype, "vde") == 0) + nd = lkl_netdev_vde_create(ifparams); + else if (strcmp(iftype, "raw") == 0) + nd = lkl_netdev_raw_create(ifparams); + } } if (nd) { @@ -295,9 +313,9 @@ hijack_init(void) fprintf(stderr, "failed to parse mac\n"); return; } else if (ret > 0) { - ret = lkl_netdev_add(nd, mac); + ret = lkl_netdev_add(nd, mac, offload); } else { - ret = lkl_netdev_add(nd, NULL); + ret = lkl_netdev_add(nd, NULL, offload); } if (ret < 0) { @@ -388,6 +406,13 @@ hijack_fini(void) int i; char *dump = getenv("LKL_HIJACK_DUMP"); + /* The following pauses the kernel before exiting allowing one + * to debug or collect statistics/diagnostic info from it. 
+ */ + if (lkl_debug & 0x100) { + while (1) + pause(); + } if (dump) mount_cmds_exec(dump, dump_file); diff --git a/tools/lkl/lib/virtio_net.c b/tools/lkl/lib/virtio_net.c index d402b7de7bc938..43231e2f1d2ed1 100644 --- a/tools/lkl/lib/virtio_net.c +++ b/tools/lkl/lib/virtio_net.c @@ -73,28 +73,28 @@ static int net_enqueue(struct virtio_dev *dev, struct virtio_req *req) { struct lkl_virtio_net_hdr_v1 *header; struct virtio_net_dev *net_dev; - int ret, len; - void *buf; + int ret; + struct lkl_dev_buf iov[1]; header = req->buf[0].addr; net_dev = netdev_of(dev); - len = req->buf[0].len - sizeof(*header); + iov[0].len = req->buf[0].len - sizeof(*header); - buf = &header[1]; + iov[0].addr = &header[1]; - if (!len && req->buf_count > 1) { - buf = req->buf[1].addr; - len = req->buf[1].len; + if (!iov[0].len && req->buf_count > 1) { + iov[0].addr = req->buf[1].addr; + iov[0].len = req->buf[1].len; } /* Pick which virtqueue to send the buffer(s) to */ if (is_tx_queue(dev, req->q)) { - ret = net_dev->ops->tx(net_dev->nd, buf, len); + ret = net_dev->ops->tx(net_dev->nd, iov, 1); if (ret < 0) return -1; } else if (is_rx_queue(dev, req->q)) { header->num_buffers = 1; - ret = net_dev->ops->rx(net_dev->nd, buf, &len); + ret = net_dev->ops->rx(net_dev->nd, iov, 1); if (ret < 0) return -1; } else { @@ -102,7 +102,7 @@ static int net_enqueue(struct virtio_dev *dev, struct virtio_req *req) return -1; } - virtio_req_complete(req, len + sizeof(*header)); + virtio_req_complete(req, iov[0].len + sizeof(*header)); return 0; } @@ -174,7 +174,7 @@ static struct lkl_mutex **init_queue_locks(int num_queues) return ret; } -int lkl_netdev_add(struct lkl_netdev *nd, void *mac) +int lkl_netdev_add(struct lkl_netdev *nd, void *mac, int offload) { struct virtio_net_dev *dev; int ret = -LKL_ENOMEM; @@ -188,6 +188,7 @@ int lkl_netdev_add(struct lkl_netdev *nd, void *mac) dev->dev.device_id = LKL_VIRTIO_ID_NET; if (mac) dev->dev.device_features |= BIT(LKL_VIRTIO_NET_F_MAC); + dev->dev.device_features 
|= offload; dev->dev.config_data = &dev->config; dev->dev.config_len = sizeof(dev->config); dev->dev.ops = &net_ops; diff --git a/tools/lkl/lib/virtio_net_dpdk.c b/tools/lkl/lib/virtio_net_dpdk.c index bf31fe8a0dca98..10a8884d55111d 100644 --- a/tools/lkl/lib/virtio_net_dpdk.c +++ b/tools/lkl/lib/virtio_net_dpdk.c @@ -58,11 +58,13 @@ struct lkl_netdev_dpdk { int bufidx; }; -static int net_tx(struct lkl_netdev *nd, void *data, int len) +static int net_tx(struct lkl_netdev *nd, struct lkl_dev_buf *iov, int cnt) { void *pkt; struct rte_mbuf *rm; struct lkl_netdev_dpdk *nd_dpdk; + void *data = iov[0].addr; + int len = (int)iov[0].len; nd_dpdk = (struct lkl_netdev_dpdk *) nd; @@ -80,7 +82,7 @@ static int net_tx(struct lkl_netdev *nd, void *data, int len) /* XXX: should be bulk-trasmitted !! */ rte_eth_tx_burst(nd_dpdk->portid, 0, &rm, 1); - return 0; + return len; } /* @@ -90,10 +92,12 @@ static int net_tx(struct lkl_netdev *nd, void *data, int len) * refactor allows us to read in parallel, the buffer (nd_dpdk->rms) shall * be guarded. */ -static int net_rx(struct lkl_netdev *nd, void *data, int *len) +static int net_rx(struct lkl_netdev *nd, struct lkl_dev_buf *iov, int cnt) { struct lkl_netdev_dpdk *nd_dpdk; int i, nb_rx, read = 0; + void *data = iov[0].addr; + int len = (int)iov[0].len; nd_dpdk = (struct lkl_netdev_dpdk *) nd; @@ -122,8 +126,8 @@ static int net_rx(struct lkl_netdev *nd, void *data, int *len) r_data = rte_pktmbuf_mtod(rm, void *); r_size = rte_pktmbuf_data_len(rm); - *len -= r_size; - if (*len < 0) { + len -= r_size; + if (len < 0) { fprintf(stderr, "dpdk: buffer full. 
skip it\n"); goto end; } @@ -144,8 +148,7 @@ static int net_rx(struct lkl_netdev *nd, void *data, int *len) for (i = 0; i < nb_rx; i++) rte_pktmbuf_free(nd_dpdk->rms[i]); - *len = read; - return 0; + return read; } static int net_poll(struct lkl_netdev *nd, int events) diff --git a/tools/lkl/lib/virtio_net_linux_fdnet.c b/tools/lkl/lib/virtio_net_linux_fdnet.c index 7cbe7db2ff182c..fdcf65b5073243 100644 --- a/tools/lkl/lib/virtio_net_linux_fdnet.c +++ b/tools/lkl/lib/virtio_net_linux_fdnet.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "virtio.h" #include "virtio_net_linux_fdnet.h" @@ -31,40 +32,37 @@ struct lkl_netdev_linux_fdnet_ops lkl_netdev_linux_fdnet_ops = { #endif /* __NR_eventfd */ }; -static int linux_fdnet_net_tx(struct lkl_netdev *nd, void *data, int len) +static int linux_fdnet_net_tx(struct lkl_netdev *nd, + struct lkl_dev_buf *iov, int cnt) { int ret; struct lkl_netdev_linux_fdnet *nd_fdnet = container_of(nd, struct lkl_netdev_linux_fdnet, dev); do { - ret = write(nd_fdnet->fd, data, len); + ret = writev(nd_fdnet->fd, (struct iovec *)iov, cnt); } while (ret == -1 && errno == EINVAL); - if (ret > 0) - return 0; + if (ret < 0 && errno != EAGAIN) perror("write to Linux fd netdev fails"); - - return -1; + return ret; } -static int linux_fdnet_net_rx(struct lkl_netdev *nd, void *data, int *len) +static int linux_fdnet_net_rx(struct lkl_netdev *nd, + struct lkl_dev_buf *iov, int cnt) { int ret; struct lkl_netdev_linux_fdnet *nd_fdnet = container_of(nd, struct lkl_netdev_linux_fdnet, dev); do { - ret = read(nd_fdnet->fd, data, *len); + ret = readv(nd_fdnet->fd, (struct iovec *)iov, cnt); } while (ret == -1 && errno == EINVAL); - if (ret > 0) { - *len = ret; - return 0; - } + if (ret < 0 && errno != EAGAIN) perror("read from fdnet device fails"); - return -1; + return ret; } static int linux_fdnet_net_poll(struct lkl_netdev *nd, int events) diff --git a/tools/lkl/lib/virtio_net_tap.c b/tools/lkl/lib/virtio_net_tap.c index 
2013b33b936102..76ee2bb23089d2 100644 --- a/tools/lkl/lib/virtio_net_tap.c +++ b/tools/lkl/lib/virtio_net_tap.c @@ -21,15 +21,30 @@ #include "virtio.h" #include "virtio_net_linux_fdnet.h" -struct lkl_netdev *lkl_netdev_tap_create(const char *ifname) +#define BIT(x) (1ULL << x) + +struct lkl_netdev *lkl_netdev_tap_create(const char *ifname, int offload) { struct lkl_netdev_linux_fdnet *nd; - int fd, ret; + int fd, ret, tap_arg = 0; + int vnet_hdr_sz = 0; struct ifreq ifr = { .ifr_flags = IFF_TAP | IFF_NO_PI, }; + if (offload & BIT(LKL_VIRTIO_NET_F_GUEST_CSUM)) + tap_arg |= TUN_F_CSUM; + if (offload & (BIT(LKL_VIRTIO_NET_F_GUEST_TSO4) | + BIT(LKL_VIRTIO_NET_F_MRG_RXBUF))) + tap_arg |= TUN_F_TSO4 | TUN_F_CSUM; + + if (tap_arg || (offload & (BIT(LKL_VIRTIO_NET_F_CSUM) | + BIT(LKL_VIRTIO_NET_F_HOST_TSO4)))) { + ifr.ifr_flags |= IFF_VNET_HDR; + vnet_hdr_sz = sizeof(struct lkl_virtio_net_hdr_v1); + } + strncpy(ifr.ifr_name, ifname, IFNAMSIZ); fd = open("/dev/net/tun", O_RDWR|O_NONBLOCK); @@ -45,12 +60,23 @@ struct lkl_netdev *lkl_netdev_tap_create(const char *ifname) close(fd); return NULL; } - + if (vnet_hdr_sz && ioctl(fd, TUNSETVNETHDRSZ, &vnet_hdr_sz) != 0) { + fprintf(stderr, "tap: failed to TUNSETVNETHDRSZ to %s: %s\n", + ifr.ifr_name, strerror(errno)); + close(fd); + return NULL; + } + if (tap_arg && ioctl(fd, TUNSETOFFLOAD, tap_arg) != 0) { + fprintf(stderr, "tap: failed to TUNSETOFFLOAD to %s: %s\n", + ifr.ifr_name, strerror(errno)); + close(fd); + return NULL; + } nd = lkl_register_netdev_linux_fdnet(fd); if (!nd) { perror("failed to register to."); return NULL; } - + nd->dev.has_vnet_hdr = (vnet_hdr_sz != 0); return (struct lkl_netdev *)nd; } diff --git a/tools/lkl/lib/virtio_net_vde.c b/tools/lkl/lib/virtio_net_vde.c index 6624bda6a9e41b..20535439fae309 100644 --- a/tools/lkl/lib/virtio_net_vde.c +++ b/tools/lkl/lib/virtio_net_vde.c @@ -16,8 +16,8 @@ struct lkl_netdev_vde { }; struct lkl_netdev *nuse_vif_vde_create(char *switch_path); -static int 
net_vde_tx(struct lkl_netdev *nd, void *data, int len); -static int net_vde_rx(struct lkl_netdev *nd, void *data, int *len); +static int net_vde_tx(struct lkl_netdev *nd, struct lkl_dev_buf *iov, int cnt); +static int net_vde_rx(struct lkl_netdev *nd, struct lkl_dev_buf *iov, int cnt); static int net_vde_poll_with_timeout(struct lkl_netdev *nd, int events, int timeout); static int net_vde_poll(struct lkl_netdev *nd, int events); @@ -28,23 +28,27 @@ struct lkl_dev_net_ops vde_net_ops = { .poll = net_vde_poll, }; -int net_vde_tx(struct lkl_netdev *nd, void *data, int len) +int net_vde_tx(struct lkl_netdev *nd, struct lkl_dev_buf *iov, int cnt) { int ret; struct lkl_netdev_vde *nd_vde; + void *data = iov[0].addr; + int len = (int)iov[0].len; nd_vde = (struct lkl_netdev_vde *) nd; ret = vde_send(nd_vde->conn, data, len, 0); if (ret <= 0 && errno == EAGAIN) return -1; - return 0; + return ret; } -int net_vde_rx(struct lkl_netdev *nd, void *data, int *len) +int net_vde_rx(struct lkl_netdev *nd, struct lkl_dev_buf *iov, int cnt) { int ret; struct lkl_netdev_vde *nd_vde; + void *data = iov[0].addr; + int len = (int)iov[0].len; nd_vde = (struct lkl_netdev_vde *) nd; @@ -52,17 +56,16 @@ int net_vde_rx(struct lkl_netdev *nd, void *data, int *len) * Due to a bug in libvdeplug we have to first poll to make sure * that there is data available. * The correct solution would be to just use - * ret = vde_recv(nd_vde->conn, data, *len, MSG_DONTWAIT); + * ret = vde_recv(nd_vde->conn, data, len, MSG_DONTWAIT); * This should be changed once libvdeplug is fixed. 
*/ ret = 0; if (net_vde_poll_with_timeout(nd, LKL_DEV_NET_POLL_RX, 0) & LKL_DEV_NET_POLL_RX) - ret = vde_recv(nd_vde->conn, data, *len, 0); + ret = vde_recv(nd_vde->conn, data, len, 0); if (ret <= 0) return -1; - *len = ret; - return 0; + return ret; } int net_vde_poll_with_timeout(struct lkl_netdev *nd, int events, int timeout) diff --git a/tools/lkl/tests/boot.c b/tools/lkl/tests/boot.c index 5620c062d6bc7b..6b0b7ac8f626d4 100644 --- a/tools/lkl/tests/boot.c +++ b/tools/lkl/tests/boot.c @@ -328,11 +328,11 @@ int test_netdev_add(char *str, int len) struct lkl_netdev *netdev; int ret = 0; - netdev = lkl_netdev_tap_create(cla.tap_ifname); + netdev = lkl_netdev_tap_create(cla.tap_ifname, 0); if (!netdev) goto out; - ret = lkl_netdev_add((struct lkl_netdev *)netdev, NULL); + ret = lkl_netdev_add((struct lkl_netdev *)netdev, NULL, 0); if (ret < 0) goto out; diff --git a/tools/lkl/tests/net-test.c b/tools/lkl/tests/net-test.c index 90225e6ff21b59..480ff7809f6e87 100644 --- a/tools/lkl/tests/net-test.c +++ b/tools/lkl/tests/net-test.c @@ -130,7 +130,7 @@ static int test_net_init(int argc, char **argv) gateway = argv[6]; if (iftype && ifname && (strncmp(iftype, "tap", 3) == 0)) - nd = lkl_netdev_tap_create(ifname); + nd = lkl_netdev_tap_create(ifname, 0); #ifdef CONFIG_AUTO_LKL_VIRTIO_NET_DPDK else if (iftype && ifname && (strncmp(iftype, "dpdk", 4) == 0)) nd = lkl_netdev_dpdk_create(ifname); @@ -143,7 +143,7 @@ static int test_net_init(int argc, char **argv) return -1; } - ret = lkl_netdev_add(nd, NULL); + ret = lkl_netdev_add(nd, NULL, 0); if (ret < 0) { fprintf(stderr, "failed to add netdev: %s\n", lkl_strerror(ret));