OSDN Git Service

Merge tag 'fs_for_v5.18-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jack...
[uclinux-h8/linux.git] / kernel / fork.c
index d94f002..9796897 100644 (file)
@@ -97,6 +97,7 @@
 #include <linux/scs.h>
 #include <linux/io_uring.h>
 #include <linux/bpf.h>
+#include <linux/sched/mm.h>
 
 #include <asm/pgalloc.h>
 #include <linux/uaccess.h>
@@ -185,7 +186,7 @@ static inline void free_task_struct(struct task_struct *tsk)
  */
 # if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)
 
-#ifdef CONFIG_VMAP_STACK
+#  ifdef CONFIG_VMAP_STACK
 /*
  * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB
  * flush.  Try to minimize the number of calls by caching stacks.
@@ -193,6 +194,41 @@ static inline void free_task_struct(struct task_struct *tsk)
 #define NR_CACHED_STACKS 2
 static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]);
 
+struct vm_stack {
+       struct rcu_head rcu;
+       struct vm_struct *stack_vm_area;
+};
+
+static bool try_release_thread_stack_to_cache(struct vm_struct *vm)
+{
+       unsigned int i;
+
+       for (i = 0; i < NR_CACHED_STACKS; i++) {
+               if (this_cpu_cmpxchg(cached_stacks[i], NULL, vm) != NULL)
+                       continue;
+               return true;
+       }
+       return false;
+}
+
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+       struct vm_stack *vm_stack = container_of(rh, struct vm_stack, rcu);
+
+       if (try_release_thread_stack_to_cache(vm_stack->stack_vm_area))
+               return;
+
+       vfree(vm_stack);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+       struct vm_stack *vm_stack = tsk->stack;
+
+       vm_stack->stack_vm_area = tsk->stack_vm_area;
+       call_rcu(&vm_stack->rcu, thread_stack_free_rcu);
+}
+
 static int free_vm_stack_cache(unsigned int cpu)
 {
        struct vm_struct **cached_vm_stacks = per_cpu_ptr(cached_stacks, cpu);
@@ -210,11 +246,35 @@ static int free_vm_stack_cache(unsigned int cpu)
 
        return 0;
 }
-#endif
 
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
+static int memcg_charge_kernel_stack(struct vm_struct *vm)
 {
-#ifdef CONFIG_VMAP_STACK
+       int i;
+       int ret;
+
+       BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+       BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
+
+       for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
+               ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL, 0);
+               if (ret)
+                       goto err;
+       }
+       return 0;
+err:
+       /*
+        * If memcg_kmem_charge_page() fails, page's memory cgroup pointer is
+        * NULL, and memcg_kmem_uncharge_page() in free_thread_stack() will
+        * ignore this page.
+        */
+       for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+               memcg_kmem_uncharge_page(vm->pages[i], 0);
+       return ret;
+}
+
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
+       struct vm_struct *vm;
        void *stack;
        int i;
 
@@ -226,15 +286,22 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
                if (!s)
                        continue;
 
-               /* Mark stack accessible for KASAN. */
+               /* Reset stack metadata. */
                kasan_unpoison_range(s->addr, THREAD_SIZE);
 
+               stack = kasan_reset_tag(s->addr);
+
                /* Clear stale pointers from reused stack. */
-               memset(s->addr, 0, THREAD_SIZE);
+               memset(stack, 0, THREAD_SIZE);
+
+               if (memcg_charge_kernel_stack(s)) {
+                       vfree(s->addr);
+                       return -ENOMEM;
+               }
 
                tsk->stack_vm_area = s;
-               tsk->stack = s->addr;
-               return s->addr;
+               tsk->stack = stack;
+               return 0;
        }
 
        /*
@@ -247,71 +314,96 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node)
                                     THREADINFO_GFP & ~__GFP_ACCOUNT,
                                     PAGE_KERNEL,
                                     0, node, __builtin_return_address(0));
+       if (!stack)
+               return -ENOMEM;
 
+       vm = find_vm_area(stack);
+       if (memcg_charge_kernel_stack(vm)) {
+               vfree(stack);
+               return -ENOMEM;
+       }
        /*
         * We can't call find_vm_area() in interrupt context, and
         * free_thread_stack() can be called in interrupt context,
         * so cache the vm_struct.
         */
-       if (stack) {
-               tsk->stack_vm_area = find_vm_area(stack);
-               tsk->stack = stack;
-       }
-       return stack;
-#else
+       tsk->stack_vm_area = vm;
+       stack = kasan_reset_tag(stack);
+       tsk->stack = stack;
+       return 0;
+}
+
+static void free_thread_stack(struct task_struct *tsk)
+{
+       if (!try_release_thread_stack_to_cache(tsk->stack_vm_area))
+               thread_stack_delayed_free(tsk);
+
+       tsk->stack = NULL;
+       tsk->stack_vm_area = NULL;
+}
+
+#  else /* !CONFIG_VMAP_STACK */
+
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+       __free_pages(virt_to_page(rh), THREAD_SIZE_ORDER);
+}
+
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+       struct rcu_head *rh = tsk->stack;
+
+       call_rcu(rh, thread_stack_free_rcu);
+}
+
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
        struct page *page = alloc_pages_node(node, THREADINFO_GFP,
                                             THREAD_SIZE_ORDER);
 
        if (likely(page)) {
                tsk->stack = kasan_reset_tag(page_address(page));
-               return tsk->stack;
+               return 0;
        }
-       return NULL;
-#endif
+       return -ENOMEM;
 }
 
-static inline void free_thread_stack(struct task_struct *tsk)
+static void free_thread_stack(struct task_struct *tsk)
 {
-#ifdef CONFIG_VMAP_STACK
-       struct vm_struct *vm = task_stack_vm_area(tsk);
-
-       if (vm) {
-               int i;
+       thread_stack_delayed_free(tsk);
+       tsk->stack = NULL;
+}
 
-               for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
-                       memcg_kmem_uncharge_page(vm->pages[i], 0);
+#  endif /* CONFIG_VMAP_STACK */
+# else /* !(THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK)) */
 
-               for (i = 0; i < NR_CACHED_STACKS; i++) {
-                       if (this_cpu_cmpxchg(cached_stacks[i],
-                                       NULL, tsk->stack_vm_area) != NULL)
-                               continue;
+static struct kmem_cache *thread_stack_cache;
 
-                       return;
-               }
+static void thread_stack_free_rcu(struct rcu_head *rh)
+{
+       kmem_cache_free(thread_stack_cache, rh);
+}
 
-               vfree_atomic(tsk->stack);
-               return;
-       }
-#endif
+static void thread_stack_delayed_free(struct task_struct *tsk)
+{
+       struct rcu_head *rh = tsk->stack;
 
-       __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER);
+       call_rcu(rh, thread_stack_free_rcu);
 }
-# else
-static struct kmem_cache *thread_stack_cache;
 
-static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
-                                                 int node)
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
 {
        unsigned long *stack;
        stack = kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
        stack = kasan_reset_tag(stack);
        tsk->stack = stack;
-       return stack;
+       return stack ? 0 : -ENOMEM;
 }
 
 static void free_thread_stack(struct task_struct *tsk)
 {
-       kmem_cache_free(thread_stack_cache, tsk->stack);
+       thread_stack_delayed_free(tsk);
+       tsk->stack = NULL;
 }
 
 void thread_stack_cache_init(void)
@@ -321,8 +413,26 @@ void thread_stack_cache_init(void)
                                        THREAD_SIZE, NULL);
        BUG_ON(thread_stack_cache == NULL);
 }
-# endif
-#endif
+
+# endif /* THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) */
+#else /* CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
+
+static int alloc_thread_stack_node(struct task_struct *tsk, int node)
+{
+       unsigned long *stack;
+
+       stack = arch_alloc_thread_stack_node(tsk, node);
+       tsk->stack = stack;
+       return stack ? 0 : -ENOMEM;
+}
+
+static void free_thread_stack(struct task_struct *tsk)
+{
+       arch_free_thread_stack(tsk);
+       tsk->stack = NULL;
+}
+
+#endif /* !CONFIG_ARCH_THREAD_STACK_ALLOCATOR */
 
 /* SLAB cache for signal_struct structures (tsk->signal) */
 static struct kmem_cache *signal_cachep;
@@ -379,50 +489,34 @@ void vm_area_free(struct vm_area_struct *vma)
 
 static void account_kernel_stack(struct task_struct *tsk, int account)
 {
-       void *stack = task_stack_page(tsk);
-       struct vm_struct *vm = task_stack_vm_area(tsk);
-
-       if (vm) {
+       if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+               struct vm_struct *vm = task_stack_vm_area(tsk);
                int i;
 
                for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
                        mod_lruvec_page_state(vm->pages[i], NR_KERNEL_STACK_KB,
                                              account * (PAGE_SIZE / 1024));
        } else {
+               void *stack = task_stack_page(tsk);
+
                /* All stack pages are in the same node. */
                mod_lruvec_kmem_state(stack, NR_KERNEL_STACK_KB,
                                      account * (THREAD_SIZE / 1024));
        }
 }
 
-static int memcg_charge_kernel_stack(struct task_struct *tsk)
+void exit_task_stack_account(struct task_struct *tsk)
 {
-#ifdef CONFIG_VMAP_STACK
-       struct vm_struct *vm = task_stack_vm_area(tsk);
-       int ret;
-
-       BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0);
+       account_kernel_stack(tsk, -1);
 
-       if (vm) {
+       if (IS_ENABLED(CONFIG_VMAP_STACK)) {
+               struct vm_struct *vm;
                int i;
 
-               BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE);
-
-               for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) {
-                       /*
-                        * If memcg_kmem_charge_page() fails, page's
-                        * memory cgroup pointer is NULL, and
-                        * memcg_kmem_uncharge_page() in free_thread_stack()
-                        * will ignore this page.
-                        */
-                       ret = memcg_kmem_charge_page(vm->pages[i], GFP_KERNEL,
-                                                    0);
-                       if (ret)
-                               return ret;
-               }
+               vm = task_stack_vm_area(tsk);
+               for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++)
+                       memcg_kmem_uncharge_page(vm->pages[i], 0);
        }
-#endif
-       return 0;
 }
 
 static void release_task_stack(struct task_struct *tsk)
@@ -430,12 +524,7 @@ static void release_task_stack(struct task_struct *tsk)
        if (WARN_ON(READ_ONCE(tsk->__state) != TASK_DEAD))
                return;  /* Better to leak the stack than to free prematurely */
 
-       account_kernel_stack(tsk, -1);
        free_thread_stack(tsk);
-       tsk->stack = NULL;
-#ifdef CONFIG_VMAP_STACK
-       tsk->stack_vm_area = NULL;
-#endif
 }
 
 #ifdef CONFIG_THREAD_INFO_IN_TASK
@@ -874,8 +963,6 @@ void set_task_stack_end_magic(struct task_struct *tsk)
 static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 {
        struct task_struct *tsk;
-       unsigned long *stack;
-       struct vm_struct *stack_vm_area __maybe_unused;
        int err;
 
        if (node == NUMA_NO_NODE)
@@ -884,32 +971,18 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
        if (!tsk)
                return NULL;
 
-       stack = alloc_thread_stack_node(tsk, node);
-       if (!stack)
+       err = arch_dup_task_struct(tsk, orig);
+       if (err)
                goto free_tsk;
 
-       if (memcg_charge_kernel_stack(tsk))
-               goto free_stack;
-
-       stack_vm_area = task_stack_vm_area(tsk);
-
-       err = arch_dup_task_struct(tsk, orig);
+       err = alloc_thread_stack_node(tsk, node);
+       if (err)
+               goto free_tsk;
 
-       /*
-        * arch_dup_task_struct() clobbers the stack-related fields.  Make
-        * sure they're properly initialized before using any stack-related
-        * functions again.
-        */
-       tsk->stack = stack;
-#ifdef CONFIG_VMAP_STACK
-       tsk->stack_vm_area = stack_vm_area;
-#endif
 #ifdef CONFIG_THREAD_INFO_IN_TASK
        refcount_set(&tsk->stack_refcount, 1);
 #endif
-
-       if (err)
-               goto free_stack;
+       account_kernel_stack(tsk, 1);
 
        err = scs_prepare(tsk, node);
        if (err)
@@ -953,8 +1026,6 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
        tsk->wake_q.next = NULL;
        tsk->worker_private = NULL;
 
-       account_kernel_stack(tsk, 1);
-
        kcov_task_init(tsk);
        kmap_local_fork(tsk);
 
@@ -967,12 +1038,17 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
        tsk->use_memdelay = 0;
 #endif
 
+#ifdef CONFIG_IOMMU_SVA
+       tsk->pasid_activated = 0;
+#endif
+
 #ifdef CONFIG_MEMCG
        tsk->active_memcg = NULL;
 #endif
        return tsk;
 
 free_stack:
+       exit_task_stack_account(tsk);
        free_thread_stack(tsk);
 free_tsk:
        free_task_struct(tsk);
@@ -1019,13 +1095,6 @@ static void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
 #endif
 }
 
-static void mm_init_pasid(struct mm_struct *mm)
-{
-#ifdef CONFIG_IOMMU_SUPPORT
-       mm->pasid = INIT_PASID;
-#endif
-}
-
 static void mm_init_uprobes_state(struct mm_struct *mm)
 {
 #ifdef CONFIG_UPROBES
@@ -1054,7 +1123,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p,
        mm_init_cpumask(mm);
        mm_init_aio(mm);
        mm_init_owner(mm, p);
-       mm_init_pasid(mm);
+       mm_pasid_init(mm);
        RCU_INIT_POINTER(mm->exe_file, NULL);
        mmu_notifier_subscriptions_init(mm);
        init_tlb_flush_pending(mm);
@@ -1121,6 +1190,7 @@ static inline void __mmput(struct mm_struct *mm)
        }
        if (mm->binfmt)
                module_put(mm->binfmt->module);
+       mm_pasid_drop(mm);
        mmdrop(mm);
 }
 
@@ -2454,6 +2524,7 @@ bad_fork_cleanup_count:
        exit_creds(p);
 bad_fork_free:
        WRITE_ONCE(p->__state, TASK_DEAD);
+       exit_task_stack_account(p);
        put_task_stack(p);
        delayed_free_task(p);
 fork_out: