Skip to content

Commit

Permalink
net: add support for skbs with unreadable frags
Browse files Browse the repository at this point in the history
For device memory TCP, we expect the skb headers to be available in host
memory for access, and we expect the skb frags to be in device memory
and unaccessible to the host. We expect there to be no mixing and
matching of device memory frags (unaccessible) with host memory frags
(accessible) in the same skb.

Add a skb->devmem flag which indicates whether the frags in this skb
are device memory frags or not.

__skb_fill_netmem_desc() now checks frags added to skbs for net_iov,
and marks the skb as skb->devmem accordingly.

Add checks through the network stack to avoid accessing the frags of
devmem skbs and avoid coalescing devmem skbs with non devmem skbs.

Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Kaiyuan Zhang <kaiyuanz@google.com>
Signed-off-by: Mina Almasry <almasrymina@google.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Link: https://patch.msgid.link/20240910171458.219195-9-almasrymina@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
  • Loading branch information
mina authored and kuba-moo committed Sep 12, 2024
1 parent 9f6b619 commit 65249fe
Show file tree
Hide file tree
Showing 9 changed files with 89 additions and 11 deletions.
19 changes: 17 additions & 2 deletions include/linux/skbuff.h
Original file line number Diff line number Diff line change
Expand Up @@ -827,6 +827,8 @@ enum skb_tstamp_type {
* @csum_level: indicates the number of consecutive checksums found in
* the packet minus one that have been verified as
* CHECKSUM_UNNECESSARY (max 3)
* @unreadable: indicates that at least 1 of the fragments in this skb is
* unreadable.
* @dst_pending_confirm: need to confirm neighbour
* @decrypted: Decrypted SKB
* @slow_gro: state present at GRO time, slower prepare step required
Expand Down Expand Up @@ -1008,7 +1010,7 @@ struct sk_buff {
#if IS_ENABLED(CONFIG_IP_SCTP)
__u8 csum_not_inet:1;
#endif

__u8 unreadable:1;
#if defined(CONFIG_NET_SCHED) || defined(CONFIG_NET_XGRESS)
__u16 tc_index; /* traffic control index */
#endif
Expand Down Expand Up @@ -1824,6 +1826,12 @@ static inline void skb_zcopy_downgrade_managed(struct sk_buff *skb)
__skb_zcopy_downgrade_managed(skb);
}

/* Return true if frags in this skb are readable by the host. */
static inline bool skb_frags_readable(const struct sk_buff *skb)
{
return !skb->unreadable;
}

static inline void skb_mark_not_on_list(struct sk_buff *skb)
{
skb->next = NULL;
Expand Down Expand Up @@ -2540,10 +2548,17 @@ static inline void skb_len_add(struct sk_buff *skb, int delta)
static inline void __skb_fill_netmem_desc(struct sk_buff *skb, int i,
netmem_ref netmem, int off, int size)
{
struct page *page = netmem_to_page(netmem);
struct page *page;

__skb_fill_netmem_desc_noacc(skb_shinfo(skb), i, netmem, off, size);

if (netmem_is_net_iov(netmem)) {
skb->unreadable = true;
return;
}

page = netmem_to_page(netmem);

/* Propagate page pfmemalloc to the skb if we can. The problem is
* that not all callers have unique ownership of the page but rely
* on page_is_pfmemalloc doing the right thing(tm).
Expand Down
3 changes: 2 additions & 1 deletion include/net/tcp.h
Original file line number Diff line number Diff line change
Expand Up @@ -1069,7 +1069,8 @@ static inline bool tcp_skb_can_collapse(const struct sk_buff *to,
/* skb_cmp_decrypted() not needed, use tcp_write_collapse_fence() */
return likely(tcp_skb_can_collapse_to(to) &&
mptcp_skb_can_collapse(to, from) &&
skb_pure_zcopy_same(to, from));
skb_pure_zcopy_same(to, from) &&
skb_frags_readable(to) == skb_frags_readable(from));
}

static inline bool tcp_skb_can_collapse_rx(const struct sk_buff *to,
Expand Down
6 changes: 6 additions & 0 deletions net/core/datagram.c
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,9 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset,
return 0;
}

if (!skb_frags_readable(skb))
goto short_copy;

/* Copy paged appendix. Hmm... why does this look so complicated? */
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
Expand Down Expand Up @@ -623,6 +626,9 @@ int zerocopy_fill_skb_from_iter(struct sk_buff *skb,
{
int frag = skb_shinfo(skb)->nr_frags;

if (!skb_frags_readable(skb))
return -EFAULT;

while (length && iov_iter_count(from)) {
struct page *head, *last_head = NULL;
struct page *pages[MAX_SKB_FRAGS];
Expand Down
4 changes: 4 additions & 0 deletions net/core/dev.c
Original file line number Diff line number Diff line change
Expand Up @@ -3312,6 +3312,10 @@ int skb_checksum_help(struct sk_buff *skb)
return -EINVAL;
}

if (!skb_frags_readable(skb)) {
return -EFAULT;
}

/* Before computing a checksum, we should make sure no frag could
* be modified by an external entity : checksum could be wrong.
*/
Expand Down
43 changes: 41 additions & 2 deletions net/core/skbuff.c
Original file line number Diff line number Diff line change
Expand Up @@ -1972,6 +1972,9 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
return -EINVAL;

if (!skb_frags_readable(skb))
return -EFAULT;

if (!num_frags)
goto release;

Expand Down Expand Up @@ -2145,6 +2148,9 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
unsigned int size;
int headerlen;

if (!skb_frags_readable(skb))
return NULL;

if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
return NULL;

Expand Down Expand Up @@ -2483,6 +2489,9 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb,
struct sk_buff *n;
int oldheadroom;

if (!skb_frags_readable(skb))
return NULL;

if (WARN_ON_ONCE(skb_shinfo(skb)->gso_type & SKB_GSO_FRAGLIST))
return NULL;

Expand Down Expand Up @@ -2827,6 +2836,9 @@ void *__pskb_pull_tail(struct sk_buff *skb, int delta)
*/
int i, k, eat = (skb->tail + delta) - skb->end;

if (!skb_frags_readable(skb))
return NULL;

if (eat > 0 || skb_cloned(skb)) {
if (pskb_expand_head(skb, 0, eat > 0 ? eat + 128 : 0,
GFP_ATOMIC))
Expand Down Expand Up @@ -2980,6 +2992,9 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
to += copy;
}

if (!skb_frags_readable(skb))
goto fault;

for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
skb_frag_t *f = &skb_shinfo(skb)->frags[i];
Expand Down Expand Up @@ -3168,6 +3183,9 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
/*
* then map the fragments
*/
if (!skb_frags_readable(skb))
return false;

for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];

Expand Down Expand Up @@ -3391,6 +3409,9 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
from += copy;
}

if (!skb_frags_readable(skb))
goto fault;

for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
int end;
Expand Down Expand Up @@ -3470,6 +3491,9 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
pos = copy;
}

if (WARN_ON_ONCE(!skb_frags_readable(skb)))
return 0;

for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
Expand Down Expand Up @@ -3570,6 +3594,9 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
pos = copy;
}

if (!skb_frags_readable(skb))
return 0;

for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;

Expand Down Expand Up @@ -4061,6 +4088,7 @@ static inline void skb_split_inside_header(struct sk_buff *skb,
skb_shinfo(skb1)->frags[i] = skb_shinfo(skb)->frags[i];

skb_shinfo(skb1)->nr_frags = skb_shinfo(skb)->nr_frags;
skb1->unreadable = skb->unreadable;
skb_shinfo(skb)->nr_frags = 0;
skb1->data_len = skb->data_len;
skb1->len += skb1->data_len;
Expand Down Expand Up @@ -4108,6 +4136,8 @@ static inline void skb_split_no_header(struct sk_buff *skb,
pos += size;
}
skb_shinfo(skb1)->nr_frags = k;

skb1->unreadable = skb->unreadable;
}

/**
Expand Down Expand Up @@ -4345,6 +4375,9 @@ unsigned int skb_seq_read(unsigned int consumed, const u8 **data,
return block_limit - abs_offset;
}

if (!skb_frags_readable(st->cur_skb))
return 0;

if (st->frag_idx == 0 && !st->frag_data)
st->stepped_offset += skb_headlen(st->cur_skb);

Expand Down Expand Up @@ -5992,7 +6025,10 @@ bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from,
if (to->pp_recycle != from->pp_recycle)
return false;

if (len <= skb_tailroom(to)) {
if (skb_frags_readable(from) != skb_frags_readable(to))
return false;

if (len <= skb_tailroom(to) && skb_frags_readable(from)) {
if (len)
BUG_ON(skb_copy_bits(from, 0, skb_put(to, len), len));
*delta_truesize = 0;
Expand Down Expand Up @@ -6169,6 +6205,9 @@ int skb_ensure_writable(struct sk_buff *skb, unsigned int write_len)
if (!pskb_may_pull(skb, write_len))
return -ENOMEM;

if (!skb_frags_readable(skb))
return -EFAULT;

if (!skb_cloned(skb) || skb_clone_writable(skb, write_len))
return 0;

Expand Down Expand Up @@ -6848,7 +6887,7 @@ void skb_condense(struct sk_buff *skb)
{
if (skb->data_len) {
if (skb->data_len > skb->end - skb->tail ||
skb_cloned(skb))
skb_cloned(skb) || !skb_frags_readable(skb))
return;

/* Nice, we can free page frag(s) right now */
Expand Down
3 changes: 3 additions & 0 deletions net/ipv4/tcp.c
Original file line number Diff line number Diff line change
Expand Up @@ -2160,6 +2160,9 @@ static int tcp_zerocopy_receive(struct sock *sk,
skb = tcp_recv_skb(sk, seq, &offset);
}

if (!skb_frags_readable(skb))
break;

if (TCP_SKB_CB(skb)->has_rxtstamp) {
tcp_update_recv_tstamps(skb, tss);
zc->msg_flags |= TCP_CMSG_TS;
Expand Down
13 changes: 10 additions & 3 deletions net/ipv4/tcp_input.c
Original file line number Diff line number Diff line change
Expand Up @@ -5391,6 +5391,9 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
for (end_of_skbs = true; skb != NULL && skb != tail; skb = n) {
n = tcp_skb_next(skb, list);

if (!skb_frags_readable(skb))
goto skip_this;

/* No new bits? It is possible on ofo queue. */
if (!before(start, TCP_SKB_CB(skb)->end_seq)) {
skb = tcp_collapse_one(sk, skb, list, root);
Expand All @@ -5411,17 +5414,20 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
break;
}

if (n && n != tail && tcp_skb_can_collapse_rx(skb, n) &&
if (n && n != tail && skb_frags_readable(n) &&
tcp_skb_can_collapse_rx(skb, n) &&
TCP_SKB_CB(skb)->end_seq != TCP_SKB_CB(n)->seq) {
end_of_skbs = false;
break;
}

skip_this:
/* Decided to skip this, advance start seq. */
start = TCP_SKB_CB(skb)->end_seq;
}
if (end_of_skbs ||
(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) ||
!skb_frags_readable(skb))
return;

__skb_queue_head_init(&tmp);
Expand Down Expand Up @@ -5463,7 +5469,8 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, struct rb_root *root,
if (!skb ||
skb == tail ||
!tcp_skb_can_collapse_rx(nskb, skb) ||
(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)))
(TCP_SKB_CB(skb)->tcp_flags & (TCPHDR_SYN | TCPHDR_FIN)) ||
!skb_frags_readable(skb))
goto end;
}
}
Expand Down
5 changes: 4 additions & 1 deletion net/ipv4/tcp_output.c
Original file line number Diff line number Diff line change
Expand Up @@ -2344,7 +2344,8 @@ static bool tcp_can_coalesce_send_queue_head(struct sock *sk, int len)

if (unlikely(TCP_SKB_CB(skb)->eor) ||
tcp_has_tx_tstamp(skb) ||
!skb_pure_zcopy_same(skb, next))
!skb_pure_zcopy_same(skb, next) ||
skb_frags_readable(skb) != skb_frags_readable(next))
return false;

len -= skb->len;
Expand Down Expand Up @@ -3264,6 +3265,8 @@ static bool tcp_can_collapse(const struct sock *sk, const struct sk_buff *skb)
return false;
if (skb_cloned(skb))
return false;
if (!skb_frags_readable(skb))
return false;
/* Some heuristics for collapsing over SACK'd could be invented */
if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
return false;
Expand Down
4 changes: 2 additions & 2 deletions net/packet/af_packet.c
Original file line number Diff line number Diff line change
Expand Up @@ -2216,7 +2216,7 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
}
}

snaplen = skb->len;
snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb);

res = run_filter(skb, sk, snaplen);
if (!res)
Expand Down Expand Up @@ -2336,7 +2336,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
}
}

snaplen = skb->len;
snaplen = skb_frags_readable(skb) ? skb->len : skb_headlen(skb);

res = run_filter(skb, sk, snaplen);
if (!res)
Expand Down

0 comments on commit 65249fe

Please sign in to comment.