io_uring: ensure RCU callback ordering with rcu_barrier()

diff --git a/fs/io_uring.c b/fs/io_uring.c
index 5a82601..1b25172 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -183,17 +183,12 @@ struct fixed_file_table {
        struct file             **files;
 };
 
-enum {
-       FFD_F_ATOMIC,
-};
-
 struct fixed_file_data {
        struct fixed_file_table         *table;
        struct io_ring_ctx              *ctx;
 
        struct percpu_ref               refs;
        struct llist_head               put_llist;
-       unsigned long                   state;
        struct work_struct              ref_work;
        struct completion               done;
 };
@@ -1004,6 +999,7 @@ static void io_kill_timeout(struct io_kiocb *req)
        if (ret != -1) {
                atomic_inc(&req->ctx->cq_timeouts);
                list_del_init(&req->list);
+               req->flags |= REQ_F_COMP_LOCKED;
                io_cqring_fill_event(req, 0);
                io_put_req(req);
        }
@@ -1260,6 +1256,9 @@ static void __io_req_aux_free(struct io_kiocb *req)
 {
        struct io_ring_ctx *ctx = req->ctx;
 
+       if (req->flags & REQ_F_NEED_CLEANUP)
+               io_cleanup_req(req);
+
        kfree(req->io);
        if (req->file) {
                if (req->flags & REQ_F_FIXED_FILE)
@@ -1275,9 +1274,6 @@ static void __io_free_req(struct io_kiocb *req)
 {
        __io_req_aux_free(req);
 
-       if (req->flags & REQ_F_NEED_CLEANUP)
-               io_cleanup_req(req);
-
        if (req->flags & REQ_F_INFLIGHT) {
                struct io_ring_ctx *ctx = req->ctx;
                unsigned long flags;
@@ -1483,10 +1479,10 @@ static void io_free_req(struct io_kiocb *req)
 __attribute__((nonnull))
 static void io_put_req_find_next(struct io_kiocb *req, struct io_kiocb **nxtptr)
 {
-       io_req_find_next(req, nxtptr);
-
-       if (refcount_dec_and_test(&req->refs))
+       if (refcount_dec_and_test(&req->refs)) {
+               io_req_find_next(req, nxtptr);
                __io_free_req(req);
+       }
 }
 
 static void io_put_req(struct io_kiocb *req)
@@ -1672,11 +1668,17 @@ static void io_iopoll_reap_events(struct io_ring_ctx *ctx)
        mutex_unlock(&ctx->uring_lock);
 }
 
-static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
-                           long min)
+static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
+                          long min)
 {
        int iters = 0, ret = 0;
 
+       /*
+        * We disallow the app entering submit/complete with polling, but we
+        * still need to lock the ring to prevent racing with polled issue
+        * that got punted to a workqueue.
+        */
+       mutex_lock(&ctx->uring_lock);
        do {
                int tmin = 0;
 
@@ -1712,21 +1714,6 @@ static int __io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
                ret = 0;
        } while (min && !*nr_events && !need_resched());
 
-       return ret;
-}
-
-static int io_iopoll_check(struct io_ring_ctx *ctx, unsigned *nr_events,
-                          long min)
-{
-       int ret;
-
-       /*
-        * We disallow the app entering submit/complete with polling, but we
-        * still need to lock the ring to prevent racing with polled issue
-        * that got punted to a workqueue.
-        */
-       mutex_lock(&ctx->uring_lock);
-       ret = __io_iopoll_check(ctx, nr_events, min);
        mutex_unlock(&ctx->uring_lock);
        return ret;
 }
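
The old __io_iopoll_check()/io_iopoll_check() split is collapsed into a single function that takes uring_lock itself; the SQPOLL thread now calls io_iopoll_getevents() directly under the lock instead (see the io_sq_thread() hunk further down). For context, an IORING_SETUP_IOPOLL ring has no completion interrupts at all: completions are only found by this polling path, entered via io_uring_enter(IORING_ENTER_GETEVENTS). A minimal userspace sketch of driving such a ring, assuming liburing and an O_DIRECT-capable file (illustrative only):

/* Minimal IOPOLL sketch: the ring is created with IORING_SETUP_IOPOLL and the
 * file must be opened O_DIRECT; waiting for the CQE makes the kernel poll for
 * the completion rather than relying on an interrupt. liburing assumed. */
#define _GNU_SOURCE
#include <liburing.h>
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

int iopoll_read(const char *path)
{
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        void *buf;
        int fd, ret = -1;

        if (posix_memalign(&buf, 4096, 4096))
                return -1;
        fd = open(path, O_RDONLY | O_DIRECT);
        if (fd < 0)
                goto out_free;
        if (io_uring_queue_init(8, &ring, IORING_SETUP_IOPOLL) < 0)
                goto out_close;

        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_read(sqe, fd, buf, 4096, 0);
        io_uring_submit(&ring);

        /* Reaping completions enters the kernel with IORING_ENTER_GETEVENTS,
         * which is where io_iopoll_check() runs under uring_lock. */
        if (!io_uring_wait_cqe(&ring, &cqe)) {
                ret = cqe->res;
                io_uring_cqe_seen(&ring, cqe);
        }
        io_uring_queue_exit(&ring);
out_close:
        close(fd);
out_free:
        free(buf);
        return ret;
}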
@@ -1830,6 +1817,10 @@ static void io_iopoll_req_issued(struct io_kiocb *req)
                list_add(&req->list, &ctx->poll_list);
        else
                list_add_tail(&req->list, &ctx->poll_list);
+
+       if ((ctx->flags & IORING_SETUP_SQPOLL) &&
+           wq_has_sleeper(&ctx->sqo_wait))
+               wake_up(&ctx->sqo_wait);
 }
 
 static void io_file_put(struct io_submit_state *state)
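
The wake-up added here pairs with the prepare_to_wait()/poll_list re-check added to io_sq_thread() below: the issuing side publishes the request on poll_list before testing for a sleeper, and the SQPOLL thread re-checks poll_list after queueing itself on sqo_wait, so a request punted to io-wq cannot slip in unnoticed while the thread goes to sleep. A schematic, kernel-context sketch of that pairing (generic names, not a standalone program; <linux/wait.h>, <linux/list.h> and <linux/sched.h> assumed):

/* Producer side (cf. io_iopoll_req_issued): publish first, then wake if a
 * waiter is queued. */
static void publish_and_wake(struct list_head *item, struct list_head *list,
                             wait_queue_head_t *wq)
{
        list_add_tail(item, list);
        /* wq_has_sleeper() contains the barrier that orders the list update
         * against the waiter's prepare_to_wait() */
        if (wq_has_sleeper(wq))
                wake_up(wq);
}

/* Consumer side (cf. io_sq_thread): queue on the waitqueue, then re-check the
 * list before actually sleeping. */
static void wait_for_work(struct list_head *list, wait_queue_head_t *wq)
{
        DEFINE_WAIT(wait);

        prepare_to_wait(wq, &wait, TASK_INTERRUPTIBLE);
        if (!list_empty_careful(list)) {
                finish_wait(wq, &wait);
                return;         /* work arrived while we were queueing up */
        }
        schedule();
        finish_wait(wq, &wait);
}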
@@ -2080,7 +2071,7 @@ static ssize_t io_import_iovec(int rw, struct io_kiocb *req,
                ssize_t ret;
                ret = import_single_range(rw, buf, sqe_len, *iovec, iter);
                *iovec = NULL;
-               return ret;
+               return ret < 0 ? ret : sqe_len;
        }
 
        if (req->io) {
@@ -2517,6 +2508,9 @@ static void io_fallocate_finish(struct io_wq_work **workptr)
        struct io_kiocb *nxt = NULL;
        int ret;
 
+       if (io_req_cancelled(req))
+               return;
+
        ret = vfs_fallocate(req->file, req->sync.mode, req->sync.off,
                                req->sync.len);
        if (ret < 0)
@@ -2904,6 +2898,7 @@ static void io_close_finish(struct io_wq_work **workptr)
        struct io_kiocb *req = container_of(*workptr, struct io_kiocb, work);
        struct io_kiocb *nxt = NULL;
 
+       /* not cancellable, don't do io_req_cancelled() */
        __io_close_finish(req, &nxt);
        if (nxt)
                io_wq_assign_next(workptr, nxt);
@@ -3007,6 +3002,11 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
        sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
        sr->len = READ_ONCE(sqe->len);
 
+#ifdef CONFIG_COMPAT
+       if (req->ctx->compat)
+               sr->msg_flags |= MSG_CMSG_COMPAT;
+#endif
+
        if (!io || req->opcode == IORING_OP_SEND)
                return 0;
        /* iovec is already imported */
@@ -3071,7 +3071,7 @@ static int io_sendmsg(struct io_kiocb *req, struct io_kiocb **nxt,
                        if (req->io)
                                return -EAGAIN;
                        if (io_alloc_async_ctx(req)) {
-                               if (kmsg && kmsg->iov != kmsg->fast_iov)
+                               if (kmsg->iov != kmsg->fast_iov)
                                        kfree(kmsg->iov);
                                return -ENOMEM;
                        }
@@ -3159,6 +3159,11 @@ static int io_recvmsg_prep(struct io_kiocb *req,
        sr->msg = u64_to_user_ptr(READ_ONCE(sqe->addr));
        sr->len = READ_ONCE(sqe->len);
 
+#ifdef CONFIG_COMPAT
+       if (req->ctx->compat)
+               sr->msg_flags |= MSG_CMSG_COMPAT;
+#endif
+
        if (!io || req->opcode == IORING_OP_RECV)
                return 0;
        /* iovec is already imported */
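
MSG_CMSG_COMPAT is a kernel-internal flag telling the socket layer to expect 32-bit (compat) control-message layouts; setting it in the sendmsg/recvmsg prep handlers when req->ctx->compat is set means ancillary data from 32-bit tasks is no longer parsed with the native 64-bit layout. Nothing changes for userspace; a minimal liburing sketch of the recvmsg side, for reference (helper names from liburing, error handling trimmed):

/* Minimal sketch: IORING_OP_RECVMSG with room for ancillary data. Assumes
 * liburing; sockfd is an already-connected socket. */
#include <liburing.h>
#include <sys/socket.h>
#include <sys/uio.h>

int recvmsg_once(int sockfd)
{
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        char buf[512], cbuf[CMSG_SPACE(sizeof(int))];
        struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
        struct msghdr msg = {
                .msg_iov = &iov, .msg_iovlen = 1,
                .msg_control = cbuf, .msg_controllen = sizeof(cbuf),
        };
        int ret = -1;

        if (io_uring_queue_init(8, &ring, 0) < 0)
                return -1;
        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_recvmsg(sqe, sockfd, &msg, 0);
        io_uring_submit(&ring);
        if (!io_uring_wait_cqe(&ring, &cqe)) {
                ret = cqe->res;         /* bytes received, or -errno */
                io_uring_cqe_seen(&ring, cqe);
        }
        io_uring_queue_exit(&ring);
        return ret;
}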
@@ -3225,7 +3230,7 @@ static int io_recvmsg(struct io_kiocb *req, struct io_kiocb **nxt,
                        if (req->io)
                                return -EAGAIN;
                        if (io_alloc_async_ctx(req)) {
-                               if (kmsg && kmsg->iov != kmsg->fast_iov)
+                               if (kmsg->iov != kmsg->fast_iov)
                                        kfree(kmsg->iov);
                                return -ENOMEM;
                        }
@@ -4710,11 +4715,21 @@ static void __io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
 {
        struct io_kiocb *linked_timeout;
        struct io_kiocb *nxt = NULL;
+       const struct cred *old_creds = NULL;
        int ret;
 
 again:
        linked_timeout = io_prep_linked_timeout(req);
 
+       if (req->work.creds && req->work.creds != current_cred()) {
+               if (old_creds)
+                       revert_creds(old_creds);
+               if (old_creds == req->work.creds)
+                       old_creds = NULL; /* restored original creds */
+               else
+                       old_creds = override_creds(req->work.creds);
+       }
+
        ret = io_issue_sqe(req, sqe, &nxt, true);
 
        /*
@@ -4740,7 +4755,7 @@ punt:
 
 err:
        /* drop submission reference */
-       io_put_req(req);
+       io_put_req_find_next(req, &nxt);
 
        if (linked_timeout) {
                if (!ret)
@@ -4764,6 +4779,8 @@ done_req:
                        goto punt;
                goto again;
        }
+       if (old_creds)
+               revert_creds(old_creds);
 }
 
 static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -4808,7 +4825,6 @@ static inline void io_queue_link_head(struct io_kiocb *req)
 static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
                          struct io_submit_state *state, struct io_kiocb **link)
 {
-       const struct cred *old_creds = NULL;
        struct io_ring_ctx *ctx = req->ctx;
        unsigned int sqe_flags;
        int ret, id;
@@ -4823,14 +4839,12 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 
        id = READ_ONCE(sqe->personality);
        if (id) {
-               const struct cred *personality_creds;
-
-               personality_creds = idr_find(&ctx->personality_idr, id);
-               if (unlikely(!personality_creds)) {
+               req->work.creds = idr_find(&ctx->personality_idr, id);
+               if (unlikely(!req->work.creds)) {
                        ret = -EINVAL;
                        goto err_req;
                }
-               old_creds = override_creds(personality_creds);
+               get_cred(req->work.creds);
        }
 
        /* same numerical values with corresponding REQ_F_*, safe to copy */
@@ -4842,8 +4856,6 @@ static bool io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe,
 err_req:
                io_cqring_add_event(req, ret);
                io_double_put_req(req);
-               if (old_creds)
-                       revert_creds(old_creds);
                return false;
        }
 
@@ -4904,8 +4916,6 @@ err_req:
                }
        }
 
-       if (old_creds)
-               revert_creds(old_creds);
        return true;
 }
 
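
Together with the __io_queue_sqe() hunk above, these changes make credentials travel with the request: io_submit_sqe() only resolves sqe->personality into req->work.creds (taking a reference), and the override/revert now happens around the actual issue instead of around submission. The userspace contract is unchanged: register a personality, then reference its id from an sqe. A hedged sketch using liburing's io_uring_register_personality() helper:

/* Sketch: register the calling task's credentials as a personality and tag a
 * read request with it. Error handling reduced; liburing assumed. */
#include <liburing.h>
#include <fcntl.h>
#include <unistd.h>

int read_with_personality(const char *path)
{
        struct io_uring ring;
        struct io_uring_sqe *sqe;
        struct io_uring_cqe *cqe;
        char buf[4096];
        int fd, id, ret = -1;

        if (io_uring_queue_init(8, &ring, 0) < 0)
                return -1;

        /* Snapshot the calling task's credentials; the returned id can be put
         * in any later sqe, e.g. after the process has dropped privileges. */
        id = io_uring_register_personality(&ring);
        fd = open(path, O_RDONLY);
        if (id < 0 || fd < 0)
                goto out;

        sqe = io_uring_get_sqe(&ring);
        io_uring_prep_read(sqe, fd, buf, sizeof(buf), 0);
        sqe->personality = id;          /* issue with the registered creds */
        io_uring_submit(&ring);

        if (!io_uring_wait_cqe(&ring, &cqe)) {
                ret = cqe->res;         /* bytes read, or -errno */
                io_uring_cqe_seen(&ring, cqe);
        }
out:
        if (fd >= 0)
                close(fd);
        io_uring_queue_exit(&ring);
        return ret;
}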
@@ -5086,9 +5096,8 @@ static int io_sq_thread(void *data)
        const struct cred *old_cred;
        mm_segment_t old_fs;
        DEFINE_WAIT(wait);
-       unsigned inflight;
        unsigned long timeout;
-       int ret;
+       int ret = 0;
 
        complete(&ctx->completions[1]);
 
@@ -5096,39 +5105,19 @@ static int io_sq_thread(void *data)
        set_fs(USER_DS);
        old_cred = override_creds(ctx->creds);
 
-       ret = timeout = inflight = 0;
+       timeout = jiffies + ctx->sq_thread_idle;
        while (!kthread_should_park()) {
                unsigned int to_submit;
 
-               if (inflight) {
+               if (!list_empty(&ctx->poll_list)) {
                        unsigned nr_events = 0;
 
-                       if (ctx->flags & IORING_SETUP_IOPOLL) {
-                               /*
-                                * inflight is the count of the maximum possible
-                                * entries we submitted, but it can be smaller
-                                * if we dropped some of them. If we don't have
-                                * poll entries available, then we know that we
-                                * have nothing left to poll for. Reset the
-                                * inflight count to zero in that case.
-                                */
-                               mutex_lock(&ctx->uring_lock);
-                               if (!list_empty(&ctx->poll_list))
-                                       __io_iopoll_check(ctx, &nr_events, 0);
-                               else
-                                       inflight = 0;
-                               mutex_unlock(&ctx->uring_lock);
-                       } else {
-                               /*
-                                * Normal IO, just pretend everything completed.
-                                * We don't have to poll completions for that.
-                                */
-                               nr_events = inflight;
-                       }
-
-                       inflight -= nr_events;
-                       if (!inflight)
+                       mutex_lock(&ctx->uring_lock);
+                       if (!list_empty(&ctx->poll_list))
+                               io_iopoll_getevents(ctx, &nr_events, 0);
+                       else
                                timeout = jiffies + ctx->sq_thread_idle;
+                       mutex_unlock(&ctx->uring_lock);
                }
 
                to_submit = io_sqring_entries(ctx);
@@ -5139,34 +5128,47 @@ static int io_sq_thread(void *data)
                 */
                if (!to_submit || ret == -EBUSY) {
                        /*
+                        * Drop cur_mm before scheduling, we can't hold it for
+                        * long periods (or over schedule()). Do this before
+                        * adding ourselves to the waitqueue, as the unuse/drop
+                        * may sleep.
+                        */
+                       if (cur_mm) {
+                               unuse_mm(cur_mm);
+                               mmput(cur_mm);
+                               cur_mm = NULL;
+                       }
+
+                       /*
                         * We're polling. If we're within the defined idle
                         * period, then let us spin without work before going
                         * to sleep. The exception is if we got EBUSY doing
                         * more IO, we should wait for the application to
                         * reap events and wake us up.
                         */
-                       if (inflight ||
+                       if (!list_empty(&ctx->poll_list) ||
                            (!time_after(jiffies, timeout) && ret != -EBUSY &&
                            !percpu_ref_is_dying(&ctx->refs))) {
                                cond_resched();
                                continue;
                        }
 
+                       prepare_to_wait(&ctx->sqo_wait, &wait,
+                                               TASK_INTERRUPTIBLE);
+
                        /*
-                        * Drop cur_mm before scheduling, we can't hold it for
-                        * long periods (or over schedule()). Do this before
-                        * adding ourselves to the waitqueue, as the unuse/drop
-                        * may sleep.
+                        * While doing polled IO, before going to sleep, we need
+                        * to check if there are new reqs added to poll_list, it
+                        * is because reqs may have been punted to io worker and
+                        * will be added to poll_list later, hence check the
+                        * poll_list again.
                         */
-                       if (cur_mm) {
-                               unuse_mm(cur_mm);
-                               mmput(cur_mm);
-                               cur_mm = NULL;
+                       if ((ctx->flags & IORING_SETUP_IOPOLL) &&
+                           !list_empty_careful(&ctx->poll_list)) {
+                               finish_wait(&ctx->sqo_wait, &wait);
+                               continue;
                        }
 
-                       prepare_to_wait(&ctx->sqo_wait, &wait,
-                                               TASK_INTERRUPTIBLE);
-
                        /* Tell userspace we may need a wakeup call */
                        ctx->rings->sq_flags |= IORING_SQ_NEED_WAKEUP;
                        /* make sure to read SQ tail after writing flags */
@@ -5194,8 +5196,7 @@ static int io_sq_thread(void *data)
                mutex_lock(&ctx->uring_lock);
                ret = io_submit_sqes(ctx, to_submit, NULL, -1, &cur_mm, true);
                mutex_unlock(&ctx->uring_lock);
-               if (ret > 0)
-                       inflight += ret;
+               timeout = jiffies + ctx->sq_thread_idle;
        }
 
        set_fs(old_fs);
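
The reworked loop keys the idle timeout off poll_list and submission results instead of a separately tracked inflight count, drops cur_mm before any potential sleep, and re-checks poll_list after prepare_to_wait() as the new comment explains. It still advertises IORING_SQ_NEED_WAKEUP in the SQ ring flags before sleeping, so userspace driving an SQPOLL ring must check that flag and kick the thread with io_uring_enter(). A sketch of that userspace side, assuming raw syscalls and a pointer to the mmap'd SQ flags word (sq_ring_kick() and sys_io_uring_enter() are illustrative names):

/* Userspace side of IORING_SQ_NEED_WAKEUP for SQPOLL rings (illustrative
 * helpers; assumes kernel headers that define __NR_io_uring_enter). */
#include <linux/io_uring.h>
#include <sys/syscall.h>
#include <unistd.h>

static int sys_io_uring_enter(int fd, unsigned to_submit,
                              unsigned min_complete, unsigned flags)
{
        return syscall(__NR_io_uring_enter, fd, to_submit, min_complete,
                       flags, NULL, 0);
}

/* Call after writing new SQEs and publishing the SQ tail. sq_flags points at
 * the mmap'd SQ ring flags word (offset params.sq_off.flags). */
static void sq_ring_kick(int ring_fd, const unsigned *sq_flags)
{
        /* The SQPOLL thread normally picks entries up on its own; it only
         * sets NEED_WAKEUP once it has decided to go to sleep. */
        if (__atomic_load_n(sq_flags, __ATOMIC_ACQUIRE) & IORING_SQ_NEED_WAKEUP)
                sys_io_uring_enter(ring_fd, 0, 0, IORING_ENTER_SQ_WAKEUP);
}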
@@ -5329,6 +5330,23 @@ static void io_file_ref_kill(struct percpu_ref *ref)
        complete(&data->done);
 }
 
+static void io_file_ref_exit_and_free(struct work_struct *work)
+{
+       struct fixed_file_data *data;
+
+       data = container_of(work, struct fixed_file_data, ref_work);
+
+       /*
+        * Ensure any percpu-ref atomic switch callback has run, it could have
+        * been in progress when the files were being unregistered. Once
+        * that's done, we can safely exit and free the ref and containing
+        * data structure.
+        */
+       rcu_barrier();
+       percpu_ref_exit(&data->refs);
+       kfree(data);
+}
+
 static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
 {
        struct fixed_file_data *data = ctx->file_data;
@@ -5341,14 +5359,14 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx)
        flush_work(&data->ref_work);
        wait_for_completion(&data->done);
        io_ring_file_ref_flush(data);
-       percpu_ref_exit(&data->refs);
 
        __io_sqe_files_unregister(ctx);
        nr_tables = DIV_ROUND_UP(ctx->nr_user_files, IORING_MAX_FILES_TABLE);
        for (i = 0; i < nr_tables; i++)
                kfree(data->table[i].files);
        kfree(data->table);
-       kfree(data);
+       INIT_WORK(&data->ref_work, io_file_ref_exit_and_free);
+       queue_work(system_wq, &data->ref_work);
        ctx->file_data = NULL;
        ctx->nr_user_files = 0;
        return 0;
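
This is the change named in the subject line: percpu_ref's atomic-switch confirmation runs from an RCU callback, so freeing the fixed_file_data as soon as unregistration completes could let a still-pending callback touch freed memory. The final percpu_ref_exit()/kfree() is therefore punted to a workqueue, where rcu_barrier() (which sleeps, and can take a while) drains every previously queued RCU callback first. The generic shape of the pattern, as a kernel-context sketch with a hypothetical my_obj type:

/* Kernel-context sketch of the teardown ordering: nothing referenced from a
 * pending RCU callback may be freed until rcu_barrier() has drained those
 * callbacks. */
struct my_obj {
        struct percpu_ref refs;
        struct work_struct free_work;
};

static void my_obj_free_work(struct work_struct *work)
{
        struct my_obj *obj = container_of(work, struct my_obj, free_work);

        /* flush every previously queued call_rcu() callback, including
         * percpu_ref's internal atomic-switch confirmation */
        rcu_barrier();
        percpu_ref_exit(&obj->refs);
        kfree(obj);
}

static void my_obj_release(struct my_obj *obj)
{
        /* rcu_barrier() sleeps and may take a while, so punt the final
         * exit-and-free to a workqueue rather than doing it inline */
        INIT_WORK(&obj->free_work, my_obj_free_work);
        queue_work(system_wq, &obj->free_work);
}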
@@ -5600,7 +5618,6 @@ static void io_ring_file_ref_switch(struct work_struct *work)
 
        data = container_of(work, struct fixed_file_data, ref_work);
        io_ring_file_ref_flush(data);
-       percpu_ref_get(&data->refs);
        percpu_ref_switch_to_percpu(&data->refs);
 }
 
@@ -5776,8 +5793,13 @@ static void io_atomic_switch(struct percpu_ref *ref)
 {
        struct fixed_file_data *data;
 
+       /*
+        * Juggle reference to ensure we hit zero, if needed, so we can
+        * switch back to percpu mode
+        */
        data = container_of(ref, struct fixed_file_data, refs);
-       clear_bit(FFD_F_ATOMIC, &data->state);
+       percpu_ref_put(&data->refs);
+       percpu_ref_get(&data->refs);
 }
 
 static bool io_queue_file_removal(struct fixed_file_data *data,
@@ -5800,11 +5822,7 @@ static bool io_queue_file_removal(struct fixed_file_data *data,
        llist_add(&pfile->llist, &data->put_llist);
 
        if (pfile == &pfile_stack) {
-               if (!test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
-                       percpu_ref_put(&data->refs);
-                       percpu_ref_switch_to_atomic(&data->refs,
-                                                       io_atomic_switch);
-               }
+               percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
                wait_for_completion(&done);
                flush_work(&data->ref_work);
                return false;
@@ -5878,10 +5896,8 @@ static int __io_sqe_files_update(struct io_ring_ctx *ctx,
                up->offset++;
        }
 
-       if (ref_switch && !test_and_set_bit(FFD_F_ATOMIC, &data->state)) {
-               percpu_ref_put(&data->refs);
+       if (ref_switch)
                percpu_ref_switch_to_atomic(&data->refs, io_atomic_switch);
-       }
 
        return done ? done : err;
 }
@@ -6339,6 +6355,7 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx)
        io_sqe_buffer_unregister(ctx);
        io_sqe_files_unregister(ctx);
        io_eventfd_unregister(ctx);
+       idr_destroy(&ctx->personality_idr);
 
 #if defined(CONFIG_UNIX)
        if (ctx->ring_sock) {
@@ -6652,6 +6669,7 @@ out_fput:
        return submitted ? submitted : ret;
 }
 
+#ifdef CONFIG_PROC_FS
 static int io_uring_show_cred(int id, void *p, void *data)
 {
        const struct cred *cred = p;
@@ -6725,6 +6743,7 @@ static void io_uring_show_fdinfo(struct seq_file *m, struct file *f)
                percpu_ref_put(&ctx->refs);
        }
 }
+#endif
 
 static const struct file_operations io_uring_fops = {
        .release        = io_uring_release,
@@ -6736,7 +6755,9 @@ static const struct file_operations io_uring_fops = {
 #endif
        .poll           = io_uring_poll,
        .fasync         = io_uring_fasync,
+#ifdef CONFIG_PROC_FS
        .show_fdinfo    = io_uring_show_fdinfo,
+#endif
 };
 
 static int io_allocate_scq_urings(struct io_ring_ctx *ctx,
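
Building the fdinfo hook only when CONFIG_PROC_FS is set follows the usual pattern for show_fdinfo implementations. When procfs is available, the ring's registered resources (credentials/personalities via io_uring_show_cred(), plus other state depending on kernel version) can be inspected through the fd's fdinfo entry. A small userspace sketch reading it via liburing (output format is kernel-dependent):

/* Dump /proc/self/fdinfo/<ring_fd> for an io_uring instance; requires
 * CONFIG_PROC_FS. liburing assumed. */
#include <liburing.h>
#include <stdio.h>

int dump_ring_fdinfo(void)
{
        struct io_uring ring;
        char path[64], line[256];
        FILE *f;

        if (io_uring_queue_init(8, &ring, 0) < 0)
                return -1;
        snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", ring.ring_fd);
        f = fopen(path, "r");
        if (f) {
                while (fgets(line, sizeof(line), f))
                        fputs(line, stdout);
                fclose(f);
        }
        io_uring_queue_exit(&ring);
        return f ? 0 : -1;
}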