OSDN Git Service

userfaultfd: provide unmasked address on page-fault
author Nadav Amit <namit@vmware.com>
Tue, 22 Mar 2022 21:45:32 +0000 (14:45 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 22 Mar 2022 22:57:08 +0000 (15:57 -0700)
Userfaultfd is supposed to provide the full address (i.e., unmasked) of
the faulting access back to userspace.  However, that has not been the
case for quite some time.

Even running "userfaultfd_demo" from the userfaultfd man page provides the
wrong output (and contradicts the man page).  Notice that
"UFFD_EVENT_PAGEFAULT event" shows the masked address (7fc5e30b3000) and
not the first read address (0x7fc5e30b300f).

Address returned by mmap() = 0x7fc5e30b3000

fault_handler_thread():
    poll() returns: nready = 1; POLLIN = 1; POLLERR = 0
    UFFD_EVENT_PAGEFAULT event: flags = 0; address = 7fc5e30b3000
(uffdio_copy.copy returned 4096)
Read address 0x7fc5e30b300f in main(): A
Read address 0x7fc5e30b340f in main(): A
Read address 0x7fc5e30b380f in main(): A
Read address 0x7fc5e30b3c0f in main(): A

The exact address is useful for various reasons and specifically for
prefetching decisions.  If it is known that the memory is populated by
certain objects whose size is not page-aligned, then based on the faulting
address, the uffd-monitor can decide whether to prefetch and prefault the
adjacent page.

This bug has been in the kernel for quite some time: since commit
1a29d85eb0f1 ("mm: use vmf->address instead of of vmf->virtual_address"),
which dates back to 2016.  A concern has been raised that existing
userspace applications might rely on the old/wrong behavior in which the
address is masked.  Therefore, it was suggested to
provide the masked address unless the user explicitly asks for the exact
address.

Add a new userfaultfd feature UFFD_FEATURE_EXACT_ADDRESS to direct
userfaultfd to provide the exact address.  Add a new "real_address" field
to vmf to hold the unmasked address.  Provide the address to userspace
accordingly.

Initialize real_address in various code-paths to be consistent with
address, even when it is not used, to be on the safe side.

[namit@vmware.com: initialize real_address on all code paths, per Jan]
Link: https://lkml.kernel.org/r/20220226022655.350562-1-namit@vmware.com
[akpm@linux-foundation.org: fix typo in comment, per Jan]

Link: https://lkml.kernel.org/r/20220218041003.3508-1-namit@vmware.com
Signed-off-by: Nadav Amit <namit@vmware.com>
Acked-by: Peter Xu <peterx@redhat.com>
Reviewed-by: David Hildenbrand <david@redhat.com>
Acked-by: Mike Rapoport <rppt@linux.ibm.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
fs/userfaultfd.c
include/linux/mm.h
include/uapi/linux/userfaultfd.h
mm/hugetlb.c
mm/memory.c
mm/swapfile.c

index 8e03b3d..aa0c47c 100644 (file)
@@ -198,6 +198,9 @@ static inline struct uffd_msg userfault_msg(unsigned long address,
        struct uffd_msg msg;
        msg_init(&msg);
        msg.event = UFFD_EVENT_PAGEFAULT;
+
+       if (!(features & UFFD_FEATURE_EXACT_ADDRESS))
+               address &= PAGE_MASK;
        msg.arg.pagefault.address = address;
        /*
         * These flags indicate why the userfault occurred:
@@ -482,7 +485,7 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
 
        init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function);
        uwq.wq.private = current;
-       uwq.msg = userfault_msg(vmf->address, vmf->flags, reason,
+       uwq.msg = userfault_msg(vmf->real_address, vmf->flags, reason,
                        ctx->features);
        uwq.ctx = ctx;
        uwq.waken = false;
index 9d58321..0e4fd10 100644 (file)
@@ -478,7 +478,8 @@ struct vm_fault {
                struct vm_area_struct *vma;     /* Target VMA */
                gfp_t gfp_mask;                 /* gfp mask to be used for allocations */
                pgoff_t pgoff;                  /* Logical page offset based on vma */
-               unsigned long address;          /* Faulting virtual address */
+               unsigned long address;          /* Faulting virtual address - masked */
+               unsigned long real_address;     /* Faulting virtual address - unmasked */
        };
        enum fault_flag flags;          /* FAULT_FLAG_xxx flags
                                         * XXX: should really be 'const' */
index 05b31d6..ef73905 100644 (file)
@@ -32,7 +32,8 @@
                           UFFD_FEATURE_SIGBUS |                \
                           UFFD_FEATURE_THREAD_ID |             \
                           UFFD_FEATURE_MINOR_HUGETLBFS |       \
-                          UFFD_FEATURE_MINOR_SHMEM)
+                          UFFD_FEATURE_MINOR_SHMEM |           \
+                          UFFD_FEATURE_EXACT_ADDRESS)
 #define UFFD_API_IOCTLS                                \
        ((__u64)1 << _UFFDIO_REGISTER |         \
         (__u64)1 << _UFFDIO_UNREGISTER |       \
@@ -189,6 +190,10 @@ struct uffdio_api {
         *
         * UFFD_FEATURE_MINOR_SHMEM indicates the same support as
         * UFFD_FEATURE_MINOR_HUGETLBFS, but for shmem-backed pages instead.
+        *
+        * UFFD_FEATURE_EXACT_ADDRESS indicates that the exact address of page
+        * faults would be provided and the offset within the page would not be
+        * masked.
         */
 #define UFFD_FEATURE_PAGEFAULT_FLAG_WP         (1<<0)
 #define UFFD_FEATURE_EVENT_FORK                        (1<<1)
@@ -201,6 +206,7 @@ struct uffdio_api {
 #define UFFD_FEATURE_THREAD_ID                 (1<<8)
 #define UFFD_FEATURE_MINOR_HUGETLBFS           (1<<9)
 #define UFFD_FEATURE_MINOR_SHMEM               (1<<10)
+#define UFFD_FEATURE_EXACT_ADDRESS             (1<<11)
        __u64 features;
 
        __u64 ioctls;
index f425147..75b4187 100644 (file)
@@ -5341,6 +5341,7 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
                                                  pgoff_t idx,
                                                  unsigned int flags,
                                                  unsigned long haddr,
+                                                 unsigned long addr,
                                                  unsigned long reason)
 {
        vm_fault_t ret;
@@ -5348,6 +5349,7 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma,
        struct vm_fault vmf = {
                .vma = vma,
                .address = haddr,
+               .real_address = addr,
                .flags = flags,
 
                /*
@@ -5416,7 +5418,7 @@ retry:
                /* Check for page in userfault range */
                if (userfaultfd_missing(vma)) {
                        ret = hugetlb_handle_userfault(vma, mapping, idx,
-                                                      flags, haddr,
+                                                      flags, haddr, address,
                                                       VM_UFFD_MISSING);
                        goto out;
                }
@@ -5480,7 +5482,7 @@ retry:
                        unlock_page(page);
                        put_page(page);
                        ret = hugetlb_handle_userfault(vma, mapping, idx,
-                                                      flags, haddr,
+                                                      flags, haddr, address,
                                                       VM_UFFD_MINOR);
                        goto out;
                }
index 1a55b4c..e0f3410 100644 (file)
@@ -4633,6 +4633,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
        struct vm_fault vmf = {
                .vma = vma,
                .address = address & PAGE_MASK,
+               .real_address = address,
                .flags = flags,
                .pgoff = linear_page_index(vma, address),
                .gfp_mask = __get_fault_gfp_mask(vma),
index bf0df7a..33c7abb 100644 (file)
@@ -1951,6 +1951,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                        struct vm_fault vmf = {
                                .vma = vma,
                                .address = addr,
+                               .real_address = addr,
                                .pmd = pmd,
                        };