net-zerocopy: Defer vm zap unless actually needed.
author    Arjun Roy <arjunroy@google.com>        Wed, 2 Dec 2020 22:53:49 +0000 (14:53 -0800)
committer Jakub Kicinski <kuba@kernel.org>       Fri, 4 Dec 2020 21:40:53 +0000 (13:40 -0800)

Zapping pages is required only if we are calling vm_insert_page into a
region where pages had previously been mapped. Receive zerocopy allows
reusing such regions, and until now it unconditionally called
zap_page_range() before calling vm_insert_page() in that range.

zap_page_range() can also be triggered from userspace with
madvise(MADV_DONTNEED). If userspace is configured to call this before
reusing a segment, or if there was nothing mapped at this virtual
address to begin with, we can avoid calling zap_page_range() under the
socket lock. That said, if userspace does not do that, then we are
still responsible for calling zap_page_range().
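
As an illustration of that userspace-driven path, here is a minimal
sketch of pre-cleaning a receive region with madvise(); it is not part
of this patch, and the helper name and calling convention are
hypothetical:

  #include <stdio.h>      /* perror() */
  #include <sys/mman.h>   /* madvise(), MADV_DONTNEED */

  /* Hypothetical helper: userspace drops the old mappings itself before
   * handing the region back to TCP_ZEROCOPY_RECEIVE, so the kernel does
   * not have to zap_page_range() under the socket lock when reusing it.
   */
  static void recycle_rx_region(void *addr, size_t len)
  {
          if (madvise(addr, len, MADV_DONTNEED))
                  perror("madvise(MADV_DONTNEED)");
  }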

This patch adds a flag that the user can use to hint to the kernel
that a zap is not required. If the flag is not set, or if an older
user application does not have a flags field at all, then the kernel
calls zap_page_range as before. Also, if the flag is set but a zap is
still required, the kernel performs that zap as necessary. Thus
incorrectly indicating that a zap can be avoided does not change the
correctness of operation. This patch also increases the batch size for
vm_insert_pages() and prefetches the page structs for the batch, since
we are about to bump their refcounts.
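
For reference, a rough sketch of how an application might opt in to the
new hint, assuming the usual getsockopt()-based invocation of
TCP_ZEROCOPY_RECEIVE; the wrapper name is hypothetical and error
handling is elided:

  #include <string.h>         /* memset() */
  #include <sys/socket.h>     /* getsockopt(), socklen_t */
  #include <netinet/in.h>     /* IPPROTO_TCP */
  #include <linux/tcp.h>      /* struct tcp_zerocopy_receive, TCP_* */

  /* Hypothetical wrapper: request a zerocopy receive into 'addr' while
   * hinting that the region needs no zap. Per the commit message, older
   * applications whose struct has no 'flags' field keep the old zapping
   * behaviour; setting the hint here lets the kernel skip it.
   */
  static int zerocopy_recv_clean_hint(int fd, void *addr, __u32 len)
  {
          struct tcp_zerocopy_receive zc;
          socklen_t zc_len = sizeof(zc);

          memset(&zc, 0, sizeof(zc));
          zc.address = (__u64)(unsigned long)addr;
          zc.length  = len;
          zc.flags   = TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT;

          return getsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE,
                            &zc, &zc_len);
  }

Combined with the madvise() pre-cleaning sketched above (or with a
freshly mapped region), this lets the kernel skip zap_page_range()
under the socket lock; if the hint turns out to be wrong, the kernel
still zaps and retries as described.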

An alternative mechanism could be to not have a flag, assume by
default a zap is not needed, and fall back to zapping if needed.
However, this would harm performance for older applications for which
a zap is necessary, and thus we implement it with an explicit flag
so newer applications can opt in.

When using RPC-style traffic with medium-sized (tens of KB) RPCs, this
change yields an efficiency improvement of about 30% in QPS/CPU usage.

Signed-off-by: Arjun Roy <arjunroy@google.com>
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
include/uapi/linux/tcp.h
net/ipv4/tcp.c

diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h
index 62db78b..13ceeb3 100644
@@ -343,6 +343,7 @@ struct tcp_diag_md5sig {
 
 /* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */
 
+#define TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT 0x1
 struct tcp_zerocopy_receive {
        __u64 address;          /* in: address of mapping */
        __u32 length;           /* in/out: number of bytes to map/mapped */
@@ -351,5 +352,6 @@ struct tcp_zerocopy_receive {
        __s32 err; /* out: socket error */
        __u64 copybuf_address;  /* in: copybuf address (small reads) */
        __s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */
+       __u32 flags; /* in: flags */
 };
 #endif /* _UAPI_LINUX_TCP_H */
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index df6dd15..3c99d48 100644
@@ -1924,51 +1924,101 @@ static int tcp_zerocopy_handle_leftover_data(struct tcp_zerocopy_receive *zc,
        return zc->copybuf_len < 0 ? 0 : copylen;
 }
 
+static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma,
+                                             struct page **pending_pages,
+                                             unsigned long pages_remaining,
+                                             unsigned long *address,
+                                             u32 *length,
+                                             u32 *seq,
+                                             struct tcp_zerocopy_receive *zc,
+                                             u32 total_bytes_to_map,
+                                             int err)
+{
+       /* At least one page did not map. Try zapping if we skipped earlier. */
+       if (err == -EBUSY &&
+           zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) {
+               u32 maybe_zap_len;
+
+               maybe_zap_len = total_bytes_to_map -  /* All bytes to map */
+                               *length + /* Mapped or pending */
+                               (pages_remaining * PAGE_SIZE); /* Failed map. */
+               zap_page_range(vma, *address, maybe_zap_len);
+               err = 0;
+       }
+
+       if (!err) {
+               unsigned long leftover_pages = pages_remaining;
+               int bytes_mapped;
+
+               /* We called zap_page_range, try to reinsert. */
+               err = vm_insert_pages(vma, *address,
+                                     pending_pages,
+                                     &pages_remaining);
+               bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining);
+               *seq += bytes_mapped;
+               *address += bytes_mapped;
+       }
+       if (err) {
+               /* Either we were unable to zap, OR we zapped, retried an
+                * insert, and still had an issue. Either way, pages_remaining
+                * is the number of pages we were unable to map, and we unroll
+                * some state we speculatively touched before.
+                */
+               const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
+
+               *length -= bytes_not_mapped;
+               zc->recv_skip_hint += bytes_not_mapped;
+       }
+       return err;
+}
+
 static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
                                        struct page **pages,
-                                       unsigned long pages_to_map,
-                                       unsigned long *insert_addr,
-                                       u32 *length_with_pending,
+                                       unsigned int pages_to_map,
+                                       unsigned long *address,
+                                       u32 *length,
                                        u32 *seq,
-                                       struct tcp_zerocopy_receive *zc)
+                                       struct tcp_zerocopy_receive *zc,
+                                       u32 total_bytes_to_map)
 {
        unsigned long pages_remaining = pages_to_map;
-       int bytes_mapped;
-       int ret;
+       unsigned int pages_mapped;
+       unsigned int bytes_mapped;
+       int err;
 
-       ret = vm_insert_pages(vma, *insert_addr, pages, &pages_remaining);
-       bytes_mapped = PAGE_SIZE * (pages_to_map - pages_remaining);
+       err = vm_insert_pages(vma, *address, pages, &pages_remaining);
+       pages_mapped = pages_to_map - (unsigned int)pages_remaining;
+       bytes_mapped = PAGE_SIZE * pages_mapped;
        /* Even if vm_insert_pages fails, it may have partially succeeded in
         * mapping (some but not all of the pages).
         */
        *seq += bytes_mapped;
-       *insert_addr += bytes_mapped;
-       if (ret) {
-               /* But if vm_insert_pages did fail, we have to unroll some state
-                * we speculatively touched before.
-                */
-               const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
-               *length_with_pending -= bytes_not_mapped;
-               zc->recv_skip_hint += bytes_not_mapped;
-       }
-       return ret;
+       *address += bytes_mapped;
+
+       if (likely(!err))
+               return 0;
+
+       /* Error: maybe zap and retry + rollback state for failed inserts. */
+       return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped,
+               pages_remaining, address, length, seq, zc, total_bytes_to_map,
+               err);
 }
 
+#define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32
 static int tcp_zerocopy_receive(struct sock *sk,
                                struct tcp_zerocopy_receive *zc)
 {
-       u32 length = 0, offset, vma_len, avail_len, aligned_len, copylen = 0;
+       u32 length = 0, offset, vma_len, avail_len, copylen = 0;
        unsigned long address = (unsigned long)zc->address;
+       struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE];
        s32 copybuf_len = zc->copybuf_len;
        struct tcp_sock *tp = tcp_sk(sk);
-       #define PAGE_BATCH_SIZE 8
-       struct page *pages[PAGE_BATCH_SIZE];
        const skb_frag_t *frags = NULL;
+       unsigned int pages_to_map = 0;
        struct vm_area_struct *vma;
        struct sk_buff *skb = NULL;
-       unsigned long pg_idx = 0;
-       unsigned long curr_addr;
        u32 seq = tp->copied_seq;
+       u32 total_bytes_to_map;
        int inq = tcp_inq(sk);
        int ret;
 
@@ -2002,34 +2052,24 @@ static int tcp_zerocopy_receive(struct sock *sk,
        }
        vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
        avail_len = min_t(u32, vma_len, inq);
-       aligned_len = avail_len & ~(PAGE_SIZE - 1);
-       if (aligned_len) {
-               zap_page_range(vma, address, aligned_len);
-               zc->length = aligned_len;
+       total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1);
+       if (total_bytes_to_map) {
+               if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT))
+                       zap_page_range(vma, address, total_bytes_to_map);
+               zc->length = total_bytes_to_map;
                zc->recv_skip_hint = 0;
        } else {
                zc->length = avail_len;
                zc->recv_skip_hint = avail_len;
        }
        ret = 0;
-       curr_addr = address;
        while (length + PAGE_SIZE <= zc->length) {
                int mappable_offset;
+               struct page *page;
 
                if (zc->recv_skip_hint < PAGE_SIZE) {
                        u32 offset_frag;
 
-                       /* If we're here, finish the current batch. */
-                       if (pg_idx) {
-                               ret = tcp_zerocopy_vm_insert_batch(vma, pages,
-                                                                  pg_idx,
-                                                                  &curr_addr,
-                                                                  &length,
-                                                                  &seq, zc);
-                               if (ret)
-                                       goto out;
-                               pg_idx = 0;
-                       }
                        if (skb) {
                                if (zc->recv_skip_hint > 0)
                                        break;
@@ -2050,24 +2090,31 @@ static int tcp_zerocopy_receive(struct sock *sk,
                        zc->recv_skip_hint = mappable_offset;
                        break;
                }
-               pages[pg_idx] = skb_frag_page(frags);
-               pg_idx++;
+               page = skb_frag_page(frags);
+               prefetchw(page);
+               pages[pages_to_map++] = page;
                length += PAGE_SIZE;
                zc->recv_skip_hint -= PAGE_SIZE;
                frags++;
-               if (pg_idx == PAGE_BATCH_SIZE) {
-                       ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
-                                                          &curr_addr, &length,
-                                                          &seq, zc);
+               if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE ||
+                   zc->recv_skip_hint < PAGE_SIZE) {
+                       /* Either full batch, or we're about to go to next skb
+                        * (and we cannot unroll failed ops across skbs).
+                        */
+                       ret = tcp_zerocopy_vm_insert_batch(vma, pages,
+                                                          pages_to_map,
+                                                          &address, &length,
+                                                          &seq, zc,
+                                                          total_bytes_to_map);
                        if (ret)
                                goto out;
-                       pg_idx = 0;
+                       pages_to_map = 0;
                }
        }
-       if (pg_idx) {
-               ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
-                                                  &curr_addr, &length, &seq,
-                                                  zc);
+       if (pages_to_map) {
+               ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map,
+                                                  &address, &length, &seq,
+                                                  zc, total_bytes_to_map);
        }
 out:
        mmap_read_unlock(current->mm);