diff --git a/Documentation/lkl.txt b/Documentation/lkl.txt index ef4b1afeef59ce..c0cbaf09573b57 100644 --- a/Documentation/lkl.txt +++ b/Documentation/lkl.txt @@ -198,7 +198,12 @@ are the list of those variable for your environment. ``` * LKL_HIJACK_DEBUG - increase the verbose level of debug information. + Setting it causes some debug information (both from the kernel and the + LKL library) to be enabled. + It is also used as a bit mask to turn on specific debugging facilities. + E.g., setting it to 0x100 ("export LKL_HIJACK_DEBUG=0x100") will cause + the LKL kernel to pause after the hijack'ed app exits. This allows one + to debug or collect info from the LKL kernel before it quits. ``` $ LKL_HIJACK_DEBUG=1 lkl-hijack.sh ip address show ``` @@ -210,6 +215,18 @@ are the list of those variable for your environment. ``` $ LKL_HIJACK_SINGLE_CPU=1 lkl-hijack.sh ip address show ``` +* LKL_HIJACK_OFFLOAD + + Works as a bit mask to enable selective device offload features. E.g., + to enable "mergeable RX buffer" (LKL_VIRTIO_NET_F_MRG_RXBUF) + + "guest csum" (LKL_VIRTIO_NET_F_GUEST_CSUM) device features, simply set + it to 0x8002. + + See virtio_net.h for a list of offload features and their bit masks. +``` + $ LKL_HIJACK_OFFLOAD=0x8002 lkl-hijack.sh ./netserver -D -f +``` + FAQ === diff --git a/tools/lkl/include/lkl.h b/tools/lkl/include/lkl.h index 50b21a619b6b24..2310a28700e294 100644 --- a/tools/lkl/include/lkl.h +++ b/tools/lkl/include/lkl.h @@ -237,7 +237,7 @@ struct lkl_netdev; * @returns a network device id (0 is valid) or a strictly negative value in * case of error */ -int lkl_netdev_add(struct lkl_netdev *nd, void *mac); +int lkl_netdev_add(struct lkl_netdev *nd, void *mac, int offload); /** * lkl_netdevs_remove - destroy all network devices @@ -283,7 +283,7 @@ int lkl_stop_syscall_thread(void); * @ifname - interface name for the TAP device. 
need to be configured * on host in advance */ -struct lkl_netdev *lkl_netdev_tap_create(const char *ifname); +struct lkl_netdev *lkl_netdev_tap_create(const char *ifname, int offload); /** * lkl_netdev_dpdk_create - create DPDK net_device for the virtio net backend diff --git a/tools/lkl/include/lkl_host.h b/tools/lkl/include/lkl_host.h index 2ccaa90153c22d..9a7b5e45f572f2 100644 --- a/tools/lkl/include/lkl_host.h +++ b/tools/lkl/include/lkl_host.h @@ -21,7 +21,7 @@ extern char lkl_virtio_devs[256]; struct lkl_dev_buf { void *addr; - unsigned int len; + size_t len; }; extern struct lkl_dev_blk_ops lkl_dev_blk_ops; @@ -50,6 +50,7 @@ struct lkl_dev_blk_ops { struct lkl_netdev { struct lkl_dev_net_ops *ops; lkl_thread_t rx_tid, tx_tid; + uint8_t has_vnet_hdr: 1; }; struct lkl_dev_net_ops { @@ -58,11 +59,11 @@ struct lkl_dev_net_ops { * The data buffer can only hold 0 or 1 complete packets. * * @nd - pointer to the network device - * @data - pointer to the buffer - * @len - size of the buffer in bytes - * @returns 0 for success and -1 for failure. + * @iov - pointer to the buffer vector + * @cnt - # of vectors in iov. + * @returns number of bytes transmitted */ - int (*tx)(struct lkl_netdev *nd, void *data, int len); + int (*tx)(struct lkl_netdev *nd, struct lkl_dev_buf *iov, int cnt); /* Reads a packet from the net device. * * It must only read one complete packet if present. @@ -71,12 +72,11 @@ struct lkl_dev_net_ops { * decide to drop it or trim it. * * @nd - pointer to the network device - * @data - pointer to the buffer to store the packet - * @len - pointer to the maximum size of the buffer. Also stores the - * real number of bytes read after return. - * @returns 0 for success and -1 if nothing is read. + * @iov - pointer to the buffer vector to store the packet + * @cnt - # of vectors in iov. 
+ * @returns number of bytes read for success or < 0 if error */ - int (*rx)(struct lkl_netdev *nd, void *data, int *len); + int (*rx)(struct lkl_netdev *nd, struct lkl_dev_buf *iov, int cnt); #define LKL_DEV_NET_POLL_RX 1 #define LKL_DEV_NET_POLL_TX 2 /* Polls a net device. diff --git a/tools/lkl/lib/hijack/init.c b/tools/lkl/lib/hijack/init.c index 2f2f59c5048679..9307977e77ace0 100644 --- a/tools/lkl/lib/hijack/init.c +++ b/tools/lkl/lib/hijack/init.c @@ -202,6 +202,8 @@ static void PinToFirstCpu(const cpu_set_t* cpus) } } +int lkl_debug; + void __attribute__((constructor(102))) hijack_init(void) { @@ -234,11 +236,17 @@ hijack_init(void) char *single_cpu= getenv("LKL_HIJACK_SINGLE_CPU"); int single_cpu_mode = 0; cpu_set_t ori_cpu; + char *offload1 = getenv("LKL_HIJACK_OFFLOAD"); + int offload = 0; - if (!debug) + if (!debug) { lkl_host_ops.print = NULL; - else + } else { lkl_register_dbg_handler(); + lkl_debug = strtol(debug, NULL, 0); + } + if (offload1) + offload = strtol(offload1, NULL, 0); if (single_cpu) { single_cpu_mode = atoi(single_cpu); @@ -274,18 +282,28 @@ hijack_init(void) "WARN: variable LKL_HIJACK_NET_TAP is now obsoleted.\n" " please use LKL_HIJACK_NET_IFTYPE and " "LKL_HIJACK_NET_IFPARAMS instead.\n"); - nd = lkl_netdev_tap_create(tap); + nd = lkl_netdev_tap_create(tap, offload); } if (!nd && iftype && ifparams) { - if ((strcmp(iftype, "tap") == 0)) - nd = lkl_netdev_tap_create(ifparams); - else if (strcmp(iftype, "dpdk") == 0) - nd = lkl_netdev_dpdk_create(ifparams); - else if (strcmp(iftype, "vde") == 0) - nd = lkl_netdev_vde_create(ifparams); - else if (strcmp(iftype, "raw") == 0) - nd = lkl_netdev_raw_create(ifparams); + if ((strcmp(iftype, "tap") == 0)) { + nd = lkl_netdev_tap_create(ifparams, offload); + } else { + if (offload) { + fprintf(stderr, + "WARN: LKL_HIJACK_OFFLOAD is only " + "supported on tap device (for now)!\n" + "No offload features will be " + "enabled.\n"); + } + offload = 0; + if (strcmp(iftype, "dpdk") == 0) + nd = 
lkl_netdev_dpdk_create(ifparams); + else if (strcmp(iftype, "vde") == 0) + nd = lkl_netdev_vde_create(ifparams); + else if (strcmp(iftype, "raw") == 0) + nd = lkl_netdev_raw_create(ifparams); + } } if (nd) { @@ -295,9 +313,9 @@ hijack_init(void) fprintf(stderr, "failed to parse mac\n"); return; } else if (ret > 0) { - ret = lkl_netdev_add(nd, mac); + ret = lkl_netdev_add(nd, mac, offload); } else { - ret = lkl_netdev_add(nd, NULL); + ret = lkl_netdev_add(nd, NULL, offload); } if (ret < 0) { @@ -388,6 +406,13 @@ hijack_fini(void) int i; char *dump = getenv("LKL_HIJACK_DUMP"); + /* The following pauses the kernel before exiting allowing one + * to debug or collect statistics/diagnostic info from it. + */ + if (lkl_debug & 0x100) { + while (1) + pause(); + } if (dump) mount_cmds_exec(dump, dump_file); diff --git a/tools/lkl/lib/virtio_net.c b/tools/lkl/lib/virtio_net.c index d402b7de7bc938..43231e2f1d2ed1 100644 --- a/tools/lkl/lib/virtio_net.c +++ b/tools/lkl/lib/virtio_net.c @@ -73,28 +73,28 @@ static int net_enqueue(struct virtio_dev *dev, struct virtio_req *req) { struct lkl_virtio_net_hdr_v1 *header; struct virtio_net_dev *net_dev; - int ret, len; - void *buf; + int ret; + struct lkl_dev_buf iov[1]; header = req->buf[0].addr; net_dev = netdev_of(dev); - len = req->buf[0].len - sizeof(*header); + iov[0].len = req->buf[0].len - sizeof(*header); - buf = &header[1]; + iov[0].addr = &header[1]; - if (!len && req->buf_count > 1) { - buf = req->buf[1].addr; - len = req->buf[1].len; + if (!iov[0].len && req->buf_count > 1) { + iov[0].addr = req->buf[1].addr; + iov[0].len = req->buf[1].len; } /* Pick which virtqueue to send the buffer(s) to */ if (is_tx_queue(dev, req->q)) { - ret = net_dev->ops->tx(net_dev->nd, buf, len); + ret = net_dev->ops->tx(net_dev->nd, iov, 1); if (ret < 0) return -1; } else if (is_rx_queue(dev, req->q)) { header->num_buffers = 1; - ret = net_dev->ops->rx(net_dev->nd, buf, &len); + ret = net_dev->ops->rx(net_dev->nd, iov, 1); if (ret < 0) return 
-1; } else { @@ -102,7 +102,7 @@ static int net_enqueue(struct virtio_dev *dev, struct virtio_req *req) return -1; } - virtio_req_complete(req, len + sizeof(*header)); + virtio_req_complete(req, ret + sizeof(*header)); return 0; } @@ -174,7 +174,7 @@ static struct lkl_mutex **init_queue_locks(int num_queues) return ret; } -int lkl_netdev_add(struct lkl_netdev *nd, void *mac) +int lkl_netdev_add(struct lkl_netdev *nd, void *mac, int offload) { struct virtio_net_dev *dev; int ret = -LKL_ENOMEM; @@ -188,6 +188,7 @@ int lkl_netdev_add(struct lkl_netdev *nd, void *mac) dev->dev.device_id = LKL_VIRTIO_ID_NET; if (mac) dev->dev.device_features |= BIT(LKL_VIRTIO_NET_F_MAC); + dev->dev.device_features |= offload; dev->dev.config_data = &dev->config; dev->dev.config_len = sizeof(dev->config); dev->dev.ops = &net_ops; diff --git a/tools/lkl/lib/virtio_net_dpdk.c b/tools/lkl/lib/virtio_net_dpdk.c index bf31fe8a0dca98..10a8884d55111d 100644 --- a/tools/lkl/lib/virtio_net_dpdk.c +++ b/tools/lkl/lib/virtio_net_dpdk.c @@ -58,11 +58,13 @@ struct lkl_netdev_dpdk { int bufidx; }; -static int net_tx(struct lkl_netdev *nd, void *data, int len) +static int net_tx(struct lkl_netdev *nd, struct lkl_dev_buf *iov, int cnt) { void *pkt; struct rte_mbuf *rm; struct lkl_netdev_dpdk *nd_dpdk; + void *data = iov[0].addr; + int len = (int)iov[0].len; nd_dpdk = (struct lkl_netdev_dpdk *) nd; @@ -80,7 +82,7 @@ static int net_tx(struct lkl_netdev *nd, void *data, int len) /* XXX: should be bulk-trasmitted !! */ rte_eth_tx_burst(nd_dpdk->portid, 0, &rm, 1); - return 0; + return len; } /* @@ -90,10 +92,12 @@ static int net_tx(struct lkl_netdev *nd, void *data, int len) * refactor allows us to read in parallel, the buffer (nd_dpdk->rms) shall * be guarded. 
*/ -static int net_rx(struct lkl_netdev *nd, void *data, int *len) +static int net_rx(struct lkl_netdev *nd, struct lkl_dev_buf *iov, int cnt) { struct lkl_netdev_dpdk *nd_dpdk; int i, nb_rx, read = 0; + void *data = iov[0].addr; + int len = (int)iov[0].len; nd_dpdk = (struct lkl_netdev_dpdk *) nd; @@ -122,8 +126,8 @@ static int net_rx(struct lkl_netdev *nd, void *data, int *len) r_data = rte_pktmbuf_mtod(rm, void *); r_size = rte_pktmbuf_data_len(rm); - *len -= r_size; - if (*len < 0) { + len -= r_size; + if (len < 0) { fprintf(stderr, "dpdk: buffer full. skip it\n"); goto end; } @@ -144,8 +148,7 @@ static int net_rx(struct lkl_netdev *nd, void *data, int *len) for (i = 0; i < nb_rx; i++) rte_pktmbuf_free(nd_dpdk->rms[i]); - *len = read; - return 0; + return read; } static int net_poll(struct lkl_netdev *nd, int events) diff --git a/tools/lkl/lib/virtio_net_linux_fdnet.c b/tools/lkl/lib/virtio_net_linux_fdnet.c index 7cbe7db2ff182c..fdcf65b5073243 100644 --- a/tools/lkl/lib/virtio_net_linux_fdnet.c +++ b/tools/lkl/lib/virtio_net_linux_fdnet.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "virtio.h" #include "virtio_net_linux_fdnet.h" @@ -31,40 +32,37 @@ struct lkl_netdev_linux_fdnet_ops lkl_netdev_linux_fdnet_ops = { #endif /* __NR_eventfd */ }; -static int linux_fdnet_net_tx(struct lkl_netdev *nd, void *data, int len) +static int linux_fdnet_net_tx(struct lkl_netdev *nd, + struct lkl_dev_buf *iov, int cnt) { int ret; struct lkl_netdev_linux_fdnet *nd_fdnet = container_of(nd, struct lkl_netdev_linux_fdnet, dev); do { - ret = write(nd_fdnet->fd, data, len); + ret = writev(nd_fdnet->fd, (struct iovec *)iov, cnt); } while (ret == -1 && errno == EINVAL); - if (ret > 0) - return 0; + if (ret < 0 && errno != EAGAIN) perror("write to Linux fd netdev fails"); - - return -1; + return ret; } -static int linux_fdnet_net_rx(struct lkl_netdev *nd, void *data, int *len) +static int linux_fdnet_net_rx(struct lkl_netdev *nd, + struct lkl_dev_buf *iov, int cnt) { 
int ret; struct lkl_netdev_linux_fdnet *nd_fdnet = container_of(nd, struct lkl_netdev_linux_fdnet, dev); do { - ret = read(nd_fdnet->fd, data, *len); + ret = readv(nd_fdnet->fd, (struct iovec *)iov, cnt); } while (ret == -1 && errno == EINVAL); - if (ret > 0) { - *len = ret; - return 0; - } + if (ret < 0 && errno != EAGAIN) perror("read from fdnet device fails"); - return -1; + return ret; } static int linux_fdnet_net_poll(struct lkl_netdev *nd, int events) diff --git a/tools/lkl/lib/virtio_net_tap.c b/tools/lkl/lib/virtio_net_tap.c index 2013b33b936102..76ee2bb23089d2 100644 --- a/tools/lkl/lib/virtio_net_tap.c +++ b/tools/lkl/lib/virtio_net_tap.c @@ -21,15 +21,30 @@ #include "virtio.h" #include "virtio_net_linux_fdnet.h" -struct lkl_netdev *lkl_netdev_tap_create(const char *ifname) +#define BIT(x) (1ULL << (x)) + +struct lkl_netdev *lkl_netdev_tap_create(const char *ifname, int offload) { struct lkl_netdev_linux_fdnet *nd; - int fd, ret; + int fd, ret, tap_arg = 0; + int vnet_hdr_sz = 0; struct ifreq ifr = { .ifr_flags = IFF_TAP | IFF_NO_PI, }; + if (offload & BIT(LKL_VIRTIO_NET_F_GUEST_CSUM)) + tap_arg |= TUN_F_CSUM; + if (offload & (BIT(LKL_VIRTIO_NET_F_GUEST_TSO4) | + BIT(LKL_VIRTIO_NET_F_MRG_RXBUF))) + tap_arg |= TUN_F_TSO4 | TUN_F_CSUM; + + if (tap_arg || (offload & (BIT(LKL_VIRTIO_NET_F_CSUM) | + BIT(LKL_VIRTIO_NET_F_HOST_TSO4)))) { + ifr.ifr_flags |= IFF_VNET_HDR; + vnet_hdr_sz = sizeof(struct lkl_virtio_net_hdr_v1); + } + strncpy(ifr.ifr_name, ifname, IFNAMSIZ); fd = open("/dev/net/tun", O_RDWR|O_NONBLOCK); @@ -45,12 +60,23 @@ struct lkl_netdev *lkl_netdev_tap_create(const char *ifname) close(fd); return NULL; } - + if (vnet_hdr_sz && ioctl(fd, TUNSETVNETHDRSZ, &vnet_hdr_sz) != 0) { + fprintf(stderr, "tap: failed to TUNSETVNETHDRSZ to %s: %s\n", + ifr.ifr_name, strerror(errno)); + close(fd); + return NULL; + } + if (tap_arg && ioctl(fd, TUNSETOFFLOAD, tap_arg) != 0) { + fprintf(stderr, "tap: failed to TUNSETOFFLOAD to %s: %s\n", + ifr.ifr_name, 
strerror(errno)); + close(fd); + return NULL; + } nd = lkl_register_netdev_linux_fdnet(fd); if (!nd) { perror("failed to register to."); return NULL; } - + nd->dev.has_vnet_hdr = (vnet_hdr_sz != 0); return (struct lkl_netdev *)nd; } diff --git a/tools/lkl/lib/virtio_net_vde.c b/tools/lkl/lib/virtio_net_vde.c index 6624bda6a9e41b..20535439fae309 100644 --- a/tools/lkl/lib/virtio_net_vde.c +++ b/tools/lkl/lib/virtio_net_vde.c @@ -16,8 +16,8 @@ struct lkl_netdev_vde { }; struct lkl_netdev *nuse_vif_vde_create(char *switch_path); -static int net_vde_tx(struct lkl_netdev *nd, void *data, int len); -static int net_vde_rx(struct lkl_netdev *nd, void *data, int *len); +static int net_vde_tx(struct lkl_netdev *nd, struct lkl_dev_buf *iov, int cnt); +static int net_vde_rx(struct lkl_netdev *nd, struct lkl_dev_buf *iov, int cnt); static int net_vde_poll_with_timeout(struct lkl_netdev *nd, int events, int timeout); static int net_vde_poll(struct lkl_netdev *nd, int events); @@ -28,23 +28,27 @@ struct lkl_dev_net_ops vde_net_ops = { .poll = net_vde_poll, }; -int net_vde_tx(struct lkl_netdev *nd, void *data, int len) +int net_vde_tx(struct lkl_netdev *nd, struct lkl_dev_buf *iov, int cnt) { int ret; struct lkl_netdev_vde *nd_vde; + void *data = iov[0].addr; + int len = (int)iov[0].len; nd_vde = (struct lkl_netdev_vde *) nd; ret = vde_send(nd_vde->conn, data, len, 0); if (ret <= 0 && errno == EAGAIN) return -1; - return 0; + return ret; } -int net_vde_rx(struct lkl_netdev *nd, void *data, int *len) +int net_vde_rx(struct lkl_netdev *nd, struct lkl_dev_buf *iov, int cnt) { int ret; struct lkl_netdev_vde *nd_vde; + void *data = iov[0].addr; + int len = (int)iov[0].len; nd_vde = (struct lkl_netdev_vde *) nd; @@ -52,17 +56,16 @@ int net_vde_rx(struct lkl_netdev *nd, void *data, int *len) * Due to a bug in libvdeplug we have to first poll to make sure * that there is data available. 
* The correct solution would be to just use - * ret = vde_recv(nd_vde->conn, data, *len, MSG_DONTWAIT); + * ret = vde_recv(nd_vde->conn, data, len, MSG_DONTWAIT); * This should be changed once libvdeplug is fixed. */ ret = 0; if (net_vde_poll_with_timeout(nd, LKL_DEV_NET_POLL_RX, 0) & LKL_DEV_NET_POLL_RX) - ret = vde_recv(nd_vde->conn, data, *len, 0); + ret = vde_recv(nd_vde->conn, data, len, 0); if (ret <= 0) return -1; - *len = ret; - return 0; + return ret; } int net_vde_poll_with_timeout(struct lkl_netdev *nd, int events, int timeout) diff --git a/tools/lkl/tests/boot.c b/tools/lkl/tests/boot.c index 5620c062d6bc7b..6b0b7ac8f626d4 100644 --- a/tools/lkl/tests/boot.c +++ b/tools/lkl/tests/boot.c @@ -328,11 +328,11 @@ int test_netdev_add(char *str, int len) struct lkl_netdev *netdev; int ret = 0; - netdev = lkl_netdev_tap_create(cla.tap_ifname); + netdev = lkl_netdev_tap_create(cla.tap_ifname, 0); if (!netdev) goto out; - ret = lkl_netdev_add((struct lkl_netdev *)netdev, NULL); + ret = lkl_netdev_add((struct lkl_netdev *)netdev, NULL, 0); if (ret < 0) goto out; diff --git a/tools/lkl/tests/net-test.c b/tools/lkl/tests/net-test.c index 90225e6ff21b59..480ff7809f6e87 100644 --- a/tools/lkl/tests/net-test.c +++ b/tools/lkl/tests/net-test.c @@ -130,7 +130,7 @@ static int test_net_init(int argc, char **argv) gateway = argv[6]; if (iftype && ifname && (strncmp(iftype, "tap", 3) == 0)) - nd = lkl_netdev_tap_create(ifname); + nd = lkl_netdev_tap_create(ifname, 0); #ifdef CONFIG_AUTO_LKL_VIRTIO_NET_DPDK else if (iftype && ifname && (strncmp(iftype, "dpdk", 4) == 0)) nd = lkl_netdev_dpdk_create(ifname); @@ -143,7 +143,7 @@ static int test_net_init(int argc, char **argv) return -1; } - ret = lkl_netdev_add(nd, NULL); + ret = lkl_netdev_add(nd, NULL, 0); if (ret < 0) { fprintf(stderr, "failed to add netdev: %s\n", lkl_strerror(ret));