Skip to content

Commit

Permalink
lkl: Add offload (TSO4, CSUM) support to LKL device, #1 of 2
Browse files Browse the repository at this point in the history
This patch prepares the LKL device code for the offload support.
The gist of it is to change the tx() and rx() interfaces to accept a
vector of buffers (like iovec) rather than just a single one. This
allows a scatter-gather (sg) list that originates from an skb's frags,
as it comes down from the stack above, to be passed to a backend
device without the need to copy.

The patch introduces a new environment variable "LKL_HIJACK_OFFLOAD"
to the hijack code to allow easy enabling of any combination of
offload features. Once the desired offload features are decided,
the code will program the tap device accordingly, e.g., to properly
translate offload flags, and decide whether IFF_VNET_HDR needs to be
enabled. It will also program the selected features into LKL kernel's
virtio driver. Note that the actual code to program the ring properly
for various offload flavors is in the 2nd commit.

The patch also allows an existing env var "LKL_HIJACK_DEBUG" to take
on a hex value, e.g., "export LKL_HIJACK_DEBUG=0x100". This is used
to set an internal debug flag "lkl_debug" in the hijack code for
diagnostic purposes. The above export statement, for example, will
cause the LKL kernel to pause after the hijack'ed app exits. This
allows one to debug or collect info from the LKL kernel before it
quits.

Signed-off-by: H.K. Jerry Chu <hkchu@google.com>
  • Loading branch information
Hsiao-keng Jerry Chu committed Jul 30, 2016
1 parent 35df18a commit b8f6fad
Show file tree
Hide file tree
Showing 11 changed files with 147 additions and 74 deletions.
19 changes: 18 additions & 1 deletion Documentation/lkl.txt
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,12 @@ are the list of those variable for your environment.
```
* LKL_HIJACK_DEBUG

increase the verbose level of debug information.
Setting it causes some debug information (both from the kernel and the
LKL library) to be enabled.
It is also used as a bit mask to turn on specific debugging facilities.
E.g., setting it to 0x100 ("export LKL_HIJACK_DEBUG=0x100") will cause
the LKL kernel to pause after the hijack'ed app exits. This allows one
to debug or collect info from the LKL kernel before it quits.
```
$ LKL_HIJACK_DEBUG=1 lkl-hijack.sh ip address show
```
Expand All @@ -210,6 +215,18 @@ are the list of those variable for your environment.
```
$ LKL_HIJACK_SINGLE_CPU=1 lkl-hijack.sh ip address show
```
* LKL_HIJACK_OFFLOAD

Works as a bit mask to selectively enable device offload features. E.g.,
to enable "mergeable RX buffer" (LKL_VIRTIO_NET_F_MRG_RXBUF) +
"guest csum" (LKL_VIRTIO_NET_F_GUEST_CSUM) device features, simply set
it to 0x8002.

See virtio_net.h for a list of offload features and their bit masks.
```
$ LKL_HIJACK_OFFLOAD=0x8002 lkl-hijack.sh ./netserver -D -f
```

FAQ
===

Expand Down
4 changes: 2 additions & 2 deletions tools/lkl/include/lkl.h
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ struct lkl_netdev;
* @returns a network device id (0 is valid) or a strictly negative value in
* case of error
*/
int lkl_netdev_add(struct lkl_netdev *nd, void *mac);
int lkl_netdev_add(struct lkl_netdev *nd, void *mac, int offload);

/**
* lkl_netdevs_remove - destroy all network devices
Expand Down Expand Up @@ -283,7 +283,7 @@ int lkl_stop_syscall_thread(void);
* @ifname - interface name for the TAP device. need to be configured
* on host in advance
*/
struct lkl_netdev *lkl_netdev_tap_create(const char *ifname);
struct lkl_netdev *lkl_netdev_tap_create(const char *ifname, int offload);

/**
* lkl_netdev_dpdk_create - create DPDK net_device for the virtio net backend
Expand Down
20 changes: 10 additions & 10 deletions tools/lkl/include/lkl_host.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ extern char lkl_virtio_devs[256];

struct lkl_dev_buf {
void *addr;
unsigned int len;
size_t len;
};

extern struct lkl_dev_blk_ops lkl_dev_blk_ops;
Expand Down Expand Up @@ -50,6 +50,7 @@ struct lkl_dev_blk_ops {
struct lkl_netdev {
struct lkl_dev_net_ops *ops;
lkl_thread_t rx_tid, tx_tid;
uint8_t has_vnet_hdr: 1;
};

struct lkl_dev_net_ops {
Expand All @@ -58,11 +59,11 @@ struct lkl_dev_net_ops {
* The data buffer can only hold 0 or 1 complete packets.
*
* @nd - pointer to the network device
* @data - pointer to the buffer
* @len - size of the buffer in bytes
* @returns 0 for success and -1 for failure.
* @iov - pointer to the buffer vector
* @cnt - # of vectors in iov.
* @returns number of bytes transmitted
*/
int (*tx)(struct lkl_netdev *nd, void *data, int len);
int (*tx)(struct lkl_netdev *nd, struct lkl_dev_buf *iov, int cnt);
/* Reads a packet from the net device.
*
* It must only read one complete packet if present.
Expand All @@ -71,12 +72,11 @@ struct lkl_dev_net_ops {
* decide to drop it or trim it.
*
* @nd - pointer to the network device
* @data - pointer to the buffer to store the packet
* @len - pointer to the maximum size of the buffer. Also stores the
* real number of bytes read after return.
* @returns 0 for success and -1 if nothing is read.
* @iov - pointer to the buffer vector to store the packet
* @cnt - # of vectors in iov.
* @returns number of bytes read for success or < 0 if error
*/
int (*rx)(struct lkl_netdev *nd, void *data, int *len);
int (*rx)(struct lkl_netdev *nd, struct lkl_dev_buf *iov, int cnt);
#define LKL_DEV_NET_POLL_RX 1
#define LKL_DEV_NET_POLL_TX 2
/* Polls a net device.
Expand Down
51 changes: 38 additions & 13 deletions tools/lkl/lib/hijack/init.c
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,8 @@ static void PinToFirstCpu(const cpu_set_t* cpus)
}
}

int lkl_debug;

void __attribute__((constructor(102)))
hijack_init(void)
{
Expand Down Expand Up @@ -234,11 +236,17 @@ hijack_init(void)
char *single_cpu= getenv("LKL_HIJACK_SINGLE_CPU");
int single_cpu_mode = 0;
cpu_set_t ori_cpu;
char *offload1 = getenv("LKL_HIJACK_OFFLOAD");
int offload = 0;

if (!debug)
if (!debug) {
lkl_host_ops.print = NULL;
else
} else {
lkl_register_dbg_handler();
lkl_debug = strtol(debug, NULL, 0);
}
if (offload1)
offload = strtol(offload1, NULL, 0);

if (single_cpu) {
single_cpu_mode = atoi(single_cpu);
Expand Down Expand Up @@ -274,18 +282,28 @@ hijack_init(void)
"WARN: variable LKL_HIJACK_NET_TAP is now obsoleted.\n"
" please use LKL_HIJACK_NET_IFTYPE and "
"LKL_HIJACK_NET_IFPARAMS instead.\n");
nd = lkl_netdev_tap_create(tap);
nd = lkl_netdev_tap_create(tap, offload);
}

if (!nd && iftype && ifparams) {
if ((strcmp(iftype, "tap") == 0))
nd = lkl_netdev_tap_create(ifparams);
else if (strcmp(iftype, "dpdk") == 0)
nd = lkl_netdev_dpdk_create(ifparams);
else if (strcmp(iftype, "vde") == 0)
nd = lkl_netdev_vde_create(ifparams);
else if (strcmp(iftype, "raw") == 0)
nd = lkl_netdev_raw_create(ifparams);
if ((strcmp(iftype, "tap") == 0)) {
nd = lkl_netdev_tap_create(ifparams, offload);
} else {
if (offload) {
fprintf(stderr,
"WARN: LKL_HIJACK_OFFLOAD is only "
"supported on tap device (for now)!\n"
"No offload features will be "
"enabled.\n");
}
offload = 0;
if (strcmp(iftype, "dpdk") == 0)
nd = lkl_netdev_dpdk_create(ifparams);
else if (strcmp(iftype, "vde") == 0)
nd = lkl_netdev_vde_create(ifparams);
else if (strcmp(iftype, "raw") == 0)
nd = lkl_netdev_raw_create(ifparams);
}
}

if (nd) {
Expand All @@ -295,9 +313,9 @@ hijack_init(void)
fprintf(stderr, "failed to parse mac\n");
return;
} else if (ret > 0) {
ret = lkl_netdev_add(nd, mac);
ret = lkl_netdev_add(nd, mac, offload);
} else {
ret = lkl_netdev_add(nd, NULL);
ret = lkl_netdev_add(nd, NULL, offload);
}

if (ret < 0) {
Expand Down Expand Up @@ -388,6 +406,13 @@ hijack_fini(void)
int i;
char *dump = getenv("LKL_HIJACK_DUMP");

/* The following pauses the kernel before exiting allowing one
* to debug or collect statistics/diagnostic info from it.
*/
if (lkl_debug & 0x100) {
while (1)
pause();
}
if (dump)
mount_cmds_exec(dump, dump_file);

Expand Down
23 changes: 12 additions & 11 deletions tools/lkl/lib/virtio_net.c
Original file line number Diff line number Diff line change
Expand Up @@ -73,36 +73,36 @@ static int net_enqueue(struct virtio_dev *dev, struct virtio_req *req)
{
struct lkl_virtio_net_hdr_v1 *header;
struct virtio_net_dev *net_dev;
int ret, len;
void *buf;
int ret;
struct lkl_dev_buf iov[1];

header = req->buf[0].addr;
net_dev = netdev_of(dev);
len = req->buf[0].len - sizeof(*header);
iov[0].len = req->buf[0].len - sizeof(*header);

buf = &header[1];
iov[0].addr = &header[1];

if (!len && req->buf_count > 1) {
buf = req->buf[1].addr;
len = req->buf[1].len;
if (!iov[0].len && req->buf_count > 1) {
iov[0].addr = req->buf[1].addr;
iov[0].len = req->buf[1].len;
}

/* Pick which virtqueue to send the buffer(s) to */
if (is_tx_queue(dev, req->q)) {
ret = net_dev->ops->tx(net_dev->nd, buf, len);
ret = net_dev->ops->tx(net_dev->nd, iov, 1);
if (ret < 0)
return -1;
} else if (is_rx_queue(dev, req->q)) {
header->num_buffers = 1;
ret = net_dev->ops->rx(net_dev->nd, buf, &len);
ret = net_dev->ops->rx(net_dev->nd, iov, 1);
if (ret < 0)
return -1;
} else {
bad_request("tried to push on non-existent queue");
return -1;
}

virtio_req_complete(req, len + sizeof(*header));
virtio_req_complete(req, iov[0].len + sizeof(*header));
return 0;
}

Expand Down Expand Up @@ -174,7 +174,7 @@ static struct lkl_mutex **init_queue_locks(int num_queues)
return ret;
}

int lkl_netdev_add(struct lkl_netdev *nd, void *mac)
int lkl_netdev_add(struct lkl_netdev *nd, void *mac, int offload)
{
struct virtio_net_dev *dev;
int ret = -LKL_ENOMEM;
Expand All @@ -188,6 +188,7 @@ int lkl_netdev_add(struct lkl_netdev *nd, void *mac)
dev->dev.device_id = LKL_VIRTIO_ID_NET;
if (mac)
dev->dev.device_features |= BIT(LKL_VIRTIO_NET_F_MAC);
dev->dev.device_features |= offload;
dev->dev.config_data = &dev->config;
dev->dev.config_len = sizeof(dev->config);
dev->dev.ops = &net_ops;
Expand Down
17 changes: 10 additions & 7 deletions tools/lkl/lib/virtio_net_dpdk.c
Original file line number Diff line number Diff line change
Expand Up @@ -58,11 +58,13 @@ struct lkl_netdev_dpdk {
int bufidx;
};

static int net_tx(struct lkl_netdev *nd, void *data, int len)
static int net_tx(struct lkl_netdev *nd, struct lkl_dev_buf *iov, int cnt)
{
void *pkt;
struct rte_mbuf *rm;
struct lkl_netdev_dpdk *nd_dpdk;
void *data = iov[0].addr;
int len = (int)iov[0].len;

nd_dpdk = (struct lkl_netdev_dpdk *) nd;

Expand All @@ -80,7 +82,7 @@ static int net_tx(struct lkl_netdev *nd, void *data, int len)
/* XXX: should be bulk-transmitted !! */
rte_eth_tx_burst(nd_dpdk->portid, 0, &rm, 1);

return 0;
return len;
}

/*
Expand All @@ -90,10 +92,12 @@ static int net_tx(struct lkl_netdev *nd, void *data, int len)
* refactor allows us to read in parallel, the buffer (nd_dpdk->rms) shall
* be guarded.
*/
static int net_rx(struct lkl_netdev *nd, void *data, int *len)
static int net_rx(struct lkl_netdev *nd, struct lkl_dev_buf *iov, int cnt)
{
struct lkl_netdev_dpdk *nd_dpdk;
int i, nb_rx, read = 0;
void *data = iov[0].addr;
int len = (int)iov[0].len;

nd_dpdk = (struct lkl_netdev_dpdk *) nd;

Expand Down Expand Up @@ -122,8 +126,8 @@ static int net_rx(struct lkl_netdev *nd, void *data, int *len)
r_data = rte_pktmbuf_mtod(rm, void *);
r_size = rte_pktmbuf_data_len(rm);

*len -= r_size;
if (*len < 0) {
len -= r_size;
if (len < 0) {
fprintf(stderr, "dpdk: buffer full. skip it\n");
goto end;
}
Expand All @@ -144,8 +148,7 @@ static int net_rx(struct lkl_netdev *nd, void *data, int *len)
for (i = 0; i < nb_rx; i++)
rte_pktmbuf_free(nd_dpdk->rms[i]);

*len = read;
return 0;
return read;
}

static int net_poll(struct lkl_netdev *nd, int events)
Expand Down
24 changes: 11 additions & 13 deletions tools/lkl/lib/virtio_net_linux_fdnet.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include <string.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <sys/uio.h>

#include "virtio.h"
#include "virtio_net_linux_fdnet.h"
Expand All @@ -31,40 +32,37 @@ struct lkl_netdev_linux_fdnet_ops lkl_netdev_linux_fdnet_ops = {
#endif /* __NR_eventfd */
};

static int linux_fdnet_net_tx(struct lkl_netdev *nd, void *data, int len)
static int linux_fdnet_net_tx(struct lkl_netdev *nd,
struct lkl_dev_buf *iov, int cnt)
{
int ret;
struct lkl_netdev_linux_fdnet *nd_fdnet =
container_of(nd, struct lkl_netdev_linux_fdnet, dev);

do {
ret = write(nd_fdnet->fd, data, len);
ret = writev(nd_fdnet->fd, (struct iovec *)iov, cnt);
} while (ret == -1 && errno == EINVAL);
if (ret > 0)
return 0;

if (ret < 0 && errno != EAGAIN)
perror("write to Linux fd netdev fails");

return -1;
return ret;
}

static int linux_fdnet_net_rx(struct lkl_netdev *nd, void *data, int *len)
static int linux_fdnet_net_rx(struct lkl_netdev *nd,
struct lkl_dev_buf *iov, int cnt)
{
int ret;
struct lkl_netdev_linux_fdnet *nd_fdnet =
container_of(nd, struct lkl_netdev_linux_fdnet, dev);

do {
ret = read(nd_fdnet->fd, data, *len);
ret = readv(nd_fdnet->fd, (struct iovec *)iov, cnt);
} while (ret == -1 && errno == EINVAL);
if (ret > 0) {
*len = ret;
return 0;
}

if (ret < 0 && errno != EAGAIN)
perror("read from fdnet device fails");

return -1;
return ret;
}

static int linux_fdnet_net_poll(struct lkl_netdev *nd, int events)
Expand Down
Loading

0 comments on commit b8f6fad

Please sign in to comment.