OSDN Git Service

perf: Drop sample rate when sampling is too slow
[android-x86/kernel.git] / kernel / futex.c
1 /*
2  *  Fast Userspace Mutexes (which I call "Futexes!").
3  *  (C) Rusty Russell, IBM 2002
4  *
5  *  Generalized futexes, futex requeueing, misc fixes by Ingo Molnar
6  *  (C) Copyright 2003 Red Hat Inc, All Rights Reserved
7  *
8  *  Removed page pinning, fix privately mapped COW pages and other cleanups
9  *  (C) Copyright 2003, 2004 Jamie Lokier
10  *
11  *  Robust futex support started by Ingo Molnar
12  *  (C) Copyright 2006 Red Hat Inc, All Rights Reserved
13  *  Thanks to Thomas Gleixner for suggestions, analysis and fixes.
14  *
15  *  PI-futex support started by Ingo Molnar and Thomas Gleixner
16  *  Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
17  *  Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com>
18  *
19  *  PRIVATE futexes by Eric Dumazet
20  *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
21  *
22  *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
23  *  Copyright (C) IBM Corporation, 2009
24  *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
25  *
26  *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
27  *  enough at me, Linus for the original (flawed) idea, Matthew
28  *  Kirkwood for proof-of-concept implementation.
29  *
30  *  "The futexes are also cursed."
31  *  "But they come in a choice of three flavours!"
32  *
33  *  This program is free software; you can redistribute it and/or modify
34  *  it under the terms of the GNU General Public License as published by
35  *  the Free Software Foundation; either version 2 of the License, or
36  *  (at your option) any later version.
37  *
38  *  This program is distributed in the hope that it will be useful,
39  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
40  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
41  *  GNU General Public License for more details.
42  *
43  *  You should have received a copy of the GNU General Public License
44  *  along with this program; if not, write to the Free Software
45  *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
46  */
47 #include <linux/slab.h>
48 #include <linux/poll.h>
49 #include <linux/fs.h>
50 #include <linux/file.h>
51 #include <linux/jhash.h>
52 #include <linux/init.h>
53 #include <linux/futex.h>
54 #include <linux/mount.h>
55 #include <linux/pagemap.h>
56 #include <linux/syscalls.h>
57 #include <linux/signal.h>
58 #include <linux/export.h>
59 #include <linux/magic.h>
60 #include <linux/pid.h>
61 #include <linux/nsproxy.h>
62 #include <linux/ptrace.h>
63 #include <linux/sched/rt.h>
64 #include <linux/hugetlb.h>
65
66 #include <asm/futex.h>
67
68 #include "rtmutex_common.h"
69
70 #ifndef CONFIG_HAVE_FUTEX_CMPXCHG
71 int __read_mostly futex_cmpxchg_enabled;
72 #endif
73
74 #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8)
75
76 /*
77  * Futex flags used to encode options to functions and preserve them across
78  * restarts.
79  */
80 #define FLAGS_SHARED            0x01
81 #define FLAGS_CLOCKRT           0x02
82 #define FLAGS_HAS_TIMEOUT       0x04
83
84 /*
85  * Priority Inheritance state:
86  */
87 struct futex_pi_state {
88         /*
89          * list of 'owned' pi_state instances - these have to be
90          * cleaned up in do_exit() if the task exits prematurely:
91          */
92         struct list_head list;
93
94         /*
95          * The PI object:
96          */
97         struct rt_mutex pi_mutex;
98
99         struct task_struct *owner;
100         atomic_t refcount;
101
102         union futex_key key;
103 };
104
105 /**
106  * struct futex_q - The hashed futex queue entry, one per waiting task
107  * @list:               priority-sorted list of tasks waiting on this futex
108  * @task:               the task waiting on the futex
109  * @lock_ptr:           the hash bucket lock
110  * @key:                the key the futex is hashed on
111  * @pi_state:           optional priority inheritance state
112  * @rt_waiter:          rt_waiter storage for use with requeue_pi
113  * @requeue_pi_key:     the requeue_pi target futex key
114  * @bitset:             bitset for the optional bitmasked wakeup
115  *
116  * We use this hashed waitqueue, instead of a normal wait_queue_t, so
117  * we can wake only the relevant ones (hashed queues may be shared).
118  *
119  * A futex_q has a woken state, just like tasks have TASK_RUNNING.
120  * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
121  * The order of wakeup is always to make the first condition true, then
122  * the second.
123  *
124  * PI futexes are typically woken before they are removed from the hash list via
125  * the rt_mutex code. See unqueue_me_pi().
126  */
127 struct futex_q {
128         struct plist_node list;
129
130         struct task_struct *task;
131         spinlock_t *lock_ptr;
132         union futex_key key;
133         struct futex_pi_state *pi_state;
134         struct rt_mutex_waiter *rt_waiter;
135         union futex_key *requeue_pi_key;
136         u32 bitset;
137 };
138
139 static const struct futex_q futex_q_init = {
140         /* list gets initialized in queue_me()*/
141         .key = FUTEX_KEY_INIT,
142         .bitset = FUTEX_BITSET_MATCH_ANY
143 };
144
145 /*
146  * Hash buckets are shared by all the futex_keys that hash to the same
147  * location.  Each key may have multiple futex_q structures, one for each task
148  * waiting on a futex.
149  */
150 struct futex_hash_bucket {
151         spinlock_t lock;
152         struct plist_head chain;
153 };
154
155 static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
156
157 /*
158  * We hash on the keys returned from get_futex_key (see below).
159  */
160 static struct futex_hash_bucket *hash_futex(union futex_key *key)
161 {
162         u32 hash = jhash2((u32*)&key->both.word,
163                           (sizeof(key->both.word)+sizeof(key->both.ptr))/4,
164                           key->both.offset);
165         return &futex_queues[hash & ((1 << FUTEX_HASHBITS)-1)];
166 }
167
168 /*
169  * Return 1 if two futex_keys are equal, 0 otherwise.
170  */
171 static inline int match_futex(union futex_key *key1, union futex_key *key2)
172 {
173         return (key1 && key2
174                 && key1->both.word == key2->both.word
175                 && key1->both.ptr == key2->both.ptr
176                 && key1->both.offset == key2->both.offset);
177 }
178
179 /*
180  * Take a reference to the resource addressed by a key.
181  * Can be called while holding spinlocks.
182  *
183  */
184 static void get_futex_key_refs(union futex_key *key)
185 {
186         if (!key->both.ptr)
187                 return;
188
189         switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
190         case FUT_OFF_INODE:
191                 ihold(key->shared.inode);
192                 break;
193         case FUT_OFF_MMSHARED:
194                 atomic_inc(&key->private.mm->mm_count);
195                 break;
196         }
197 }
198
199 /*
200  * Drop a reference to the resource addressed by a key.
201  * The hash bucket spinlock must not be held.
202  */
203 static void drop_futex_key_refs(union futex_key *key)
204 {
205         if (!key->both.ptr) {
206                 /* If we're here then we tried to put a key we failed to get */
207                 WARN_ON_ONCE(1);
208                 return;
209         }
210
211         switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
212         case FUT_OFF_INODE:
213                 iput(key->shared.inode);
214                 break;
215         case FUT_OFF_MMSHARED:
216                 mmdrop(key->private.mm);
217                 break;
218         }
219 }
220
221 /**
222  * get_futex_key() - Get parameters which are the keys for a futex
223  * @uaddr:      virtual address of the futex
224  * @fshared:    0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
225  * @key:        address where result is stored.
226  * @rw:         mapping needs to be read/write (values: VERIFY_READ,
227  *              VERIFY_WRITE)
228  *
229  * Return: a negative error code or 0
230  *
231  * The key words are stored in *key on success.
232  *
233  * For shared mappings, it's (page->index, file_inode(vma->vm_file),
234  * offset_within_page).  For private mappings, it's (uaddr, current->mm).
235  * We can usually work out the index without swapping in the page.
236  *
237  * lock_page() might sleep, the caller should not hold a spinlock.
238  */
239 static int
240 get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
241 {
242         unsigned long address = (unsigned long)uaddr;
243         struct mm_struct *mm = current->mm;
244         struct page *page, *page_head;
245         int err, ro = 0;
246
247         /*
248          * The futex address must be "naturally" aligned.
249          */
250         key->both.offset = address % PAGE_SIZE;
251         if (unlikely((address % sizeof(u32)) != 0))
252                 return -EINVAL;
253         address -= key->both.offset;
254
255         /*
256          * PROCESS_PRIVATE futexes are fast.
257          * As the mm cannot disappear under us and the 'key' only needs
258          * virtual address, we dont even have to find the underlying vma.
259          * Note : We do have to check 'uaddr' is a valid user address,
260          *        but access_ok() should be faster than find_vma()
261          */
262         if (!fshared) {
263                 if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
264                         return -EFAULT;
265                 key->private.mm = mm;
266                 key->private.address = address;
267                 get_futex_key_refs(key);
268                 return 0;
269         }
270
271 again:
272         err = get_user_pages_fast(address, 1, 1, &page);
273         /*
274          * If write access is not required (eg. FUTEX_WAIT), try
275          * and get read-only access.
276          */
277         if (err == -EFAULT && rw == VERIFY_READ) {
278                 err = get_user_pages_fast(address, 1, 0, &page);
279                 ro = 1;
280         }
281         if (err < 0)
282                 return err;
283         else
284                 err = 0;
285
286 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
287         page_head = page;
288         if (unlikely(PageTail(page))) {
289                 put_page(page);
290                 /* serialize against __split_huge_page_splitting() */
291                 local_irq_disable();
292                 if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
293                         page_head = compound_head(page);
294                         /*
295                          * page_head is valid pointer but we must pin
296                          * it before taking the PG_lock and/or
297                          * PG_compound_lock. The moment we re-enable
298                          * irqs __split_huge_page_splitting() can
299                          * return and the head page can be freed from
300                          * under us. We can't take the PG_lock and/or
301                          * PG_compound_lock on a page that could be
302                          * freed from under us.
303                          */
304                         if (page != page_head) {
305                                 get_page(page_head);
306                                 put_page(page);
307                         }
308                         local_irq_enable();
309                 } else {
310                         local_irq_enable();
311                         goto again;
312                 }
313         }
314 #else
315         page_head = compound_head(page);
316         if (page != page_head) {
317                 get_page(page_head);
318                 put_page(page);
319         }
320 #endif
321
322         lock_page(page_head);
323
324         /*
325          * If page_head->mapping is NULL, then it cannot be a PageAnon
326          * page; but it might be the ZERO_PAGE or in the gate area or
327          * in a special mapping (all cases which we are happy to fail);
328          * or it may have been a good file page when get_user_pages_fast
329          * found it, but truncated or holepunched or subjected to
330          * invalidate_complete_page2 before we got the page lock (also
331          * cases which we are happy to fail).  And we hold a reference,
332          * so refcount care in invalidate_complete_page's remove_mapping
333          * prevents drop_caches from setting mapping to NULL beneath us.
334          *
335          * The case we do have to guard against is when memory pressure made
336          * shmem_writepage move it from filecache to swapcache beneath us:
337          * an unlikely race, but we do need to retry for page_head->mapping.
338          */
339         if (!page_head->mapping) {
340                 int shmem_swizzled = PageSwapCache(page_head);
341                 unlock_page(page_head);
342                 put_page(page_head);
343                 if (shmem_swizzled)
344                         goto again;
345                 return -EFAULT;
346         }
347
348         /*
349          * Private mappings are handled in a simple way.
350          *
351          * NOTE: When userspace waits on a MAP_SHARED mapping, even if
352          * it's a read-only handle, it's expected that futexes attach to
353          * the object not the particular process.
354          */
355         if (PageAnon(page_head)) {
356                 /*
357                  * A RO anonymous page will never change and thus doesn't make
358                  * sense for futex operations.
359                  */
360                 if (ro) {
361                         err = -EFAULT;
362                         goto out;
363                 }
364
365                 key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
366                 key->private.mm = mm;
367                 key->private.address = address;
368         } else {
369                 key->both.offset |= FUT_OFF_INODE; /* inode-based key */
370                 key->shared.inode = page_head->mapping->host;
371                 key->shared.pgoff = basepage_index(page);
372         }
373
374         get_futex_key_refs(key);
375
376 out:
377         unlock_page(page_head);
378         put_page(page_head);
379         return err;
380 }
381
382 static inline void put_futex_key(union futex_key *key)
383 {
384         drop_futex_key_refs(key);
385 }
386
387 /**
388  * fault_in_user_writeable() - Fault in user address and verify RW access
389  * @uaddr:      pointer to faulting user space address
390  *
391  * Slow path to fixup the fault we just took in the atomic write
392  * access to @uaddr.
393  *
394  * We have no generic implementation of a non-destructive write to the
395  * user address. We know that we faulted in the atomic pagefault
396  * disabled section so we can as well avoid the #PF overhead by
397  * calling get_user_pages() right away.
398  */
399 static int fault_in_user_writeable(u32 __user *uaddr)
400 {
401         struct mm_struct *mm = current->mm;
402         int ret;
403
404         down_read(&mm->mmap_sem);
405         ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
406                                FAULT_FLAG_WRITE);
407         up_read(&mm->mmap_sem);
408
409         return ret < 0 ? ret : 0;
410 }
411
412 /**
413  * futex_top_waiter() - Return the highest priority waiter on a futex
414  * @hb:         the hash bucket the futex_q's reside in
415  * @key:        the futex key (to distinguish it from other futex futex_q's)
416  *
417  * Must be called with the hb lock held.
418  */
419 static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
420                                         union futex_key *key)
421 {
422         struct futex_q *this;
423
424         plist_for_each_entry(this, &hb->chain, list) {
425                 if (match_futex(&this->key, key))
426                         return this;
427         }
428         return NULL;
429 }
430
431 static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
432                                       u32 uval, u32 newval)
433 {
434         int ret;
435
436         pagefault_disable();
437         ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
438         pagefault_enable();
439
440         return ret;
441 }
442
443 static int get_futex_value_locked(u32 *dest, u32 __user *from)
444 {
445         int ret;
446
447         pagefault_disable();
448         ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
449         pagefault_enable();
450
451         return ret ? -EFAULT : 0;
452 }
453
454
455 /*
456  * PI code:
457  */
458 static int refill_pi_state_cache(void)
459 {
460         struct futex_pi_state *pi_state;
461
462         if (likely(current->pi_state_cache))
463                 return 0;
464
465         pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
466
467         if (!pi_state)
468                 return -ENOMEM;
469
470         INIT_LIST_HEAD(&pi_state->list);
471         /* pi_mutex gets initialized later */
472         pi_state->owner = NULL;
473         atomic_set(&pi_state->refcount, 1);
474         pi_state->key = FUTEX_KEY_INIT;
475
476         current->pi_state_cache = pi_state;
477
478         return 0;
479 }
480
481 static struct futex_pi_state * alloc_pi_state(void)
482 {
483         struct futex_pi_state *pi_state = current->pi_state_cache;
484
485         WARN_ON(!pi_state);
486         current->pi_state_cache = NULL;
487
488         return pi_state;
489 }
490
491 static void free_pi_state(struct futex_pi_state *pi_state)
492 {
493         if (!atomic_dec_and_test(&pi_state->refcount))
494                 return;
495
496         /*
497          * If pi_state->owner is NULL, the owner is most probably dying
498          * and has cleaned up the pi_state already
499          */
500         if (pi_state->owner) {
501                 raw_spin_lock_irq(&pi_state->owner->pi_lock);
502                 list_del_init(&pi_state->list);
503                 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
504
505                 rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
506         }
507
508         if (current->pi_state_cache)
509                 kfree(pi_state);
510         else {
511                 /*
512                  * pi_state->list is already empty.
513                  * clear pi_state->owner.
514                  * refcount is at 0 - put it back to 1.
515                  */
516                 pi_state->owner = NULL;
517                 atomic_set(&pi_state->refcount, 1);
518                 current->pi_state_cache = pi_state;
519         }
520 }
521
522 /*
523  * Look up the task based on what TID userspace gave us.
524  * We dont trust it.
525  */
526 static struct task_struct * futex_find_get_task(pid_t pid)
527 {
528         struct task_struct *p;
529
530         rcu_read_lock();
531         p = find_task_by_vpid(pid);
532         if (p)
533                 get_task_struct(p);
534
535         rcu_read_unlock();
536
537         return p;
538 }
539
540 /*
541  * This task is holding PI mutexes at exit time => bad.
542  * Kernel cleans up PI-state, but userspace is likely hosed.
543  * (Robust-futex cleanup is separate and might save the day for userspace.)
544  */
545 void exit_pi_state_list(struct task_struct *curr)
546 {
547         struct list_head *next, *head = &curr->pi_state_list;
548         struct futex_pi_state *pi_state;
549         struct futex_hash_bucket *hb;
550         union futex_key key = FUTEX_KEY_INIT;
551
552         if (!futex_cmpxchg_enabled)
553                 return;
554         /*
555          * We are a ZOMBIE and nobody can enqueue itself on
556          * pi_state_list anymore, but we have to be careful
557          * versus waiters unqueueing themselves:
558          */
559         raw_spin_lock_irq(&curr->pi_lock);
560         while (!list_empty(head)) {
561
562                 next = head->next;
563                 pi_state = list_entry(next, struct futex_pi_state, list);
564                 key = pi_state->key;
565                 hb = hash_futex(&key);
566                 raw_spin_unlock_irq(&curr->pi_lock);
567
568                 spin_lock(&hb->lock);
569
570                 raw_spin_lock_irq(&curr->pi_lock);
571                 /*
572                  * We dropped the pi-lock, so re-check whether this
573                  * task still owns the PI-state:
574                  */
575                 if (head->next != next) {
576                         spin_unlock(&hb->lock);
577                         continue;
578                 }
579
580                 WARN_ON(pi_state->owner != curr);
581                 WARN_ON(list_empty(&pi_state->list));
582                 list_del_init(&pi_state->list);
583                 pi_state->owner = NULL;
584                 raw_spin_unlock_irq(&curr->pi_lock);
585
586                 rt_mutex_unlock(&pi_state->pi_mutex);
587
588                 spin_unlock(&hb->lock);
589
590                 raw_spin_lock_irq(&curr->pi_lock);
591         }
592         raw_spin_unlock_irq(&curr->pi_lock);
593 }
594
595 /*
596  * We need to check the following states:
597  *
598  *      Waiter | pi_state | pi->owner | uTID      | uODIED | ?
599  *
600  * [1]  NULL   | ---      | ---       | 0         | 0/1    | Valid
601  * [2]  NULL   | ---      | ---       | >0        | 0/1    | Valid
602  *
603  * [3]  Found  | NULL     | --        | Any       | 0/1    | Invalid
604  *
605  * [4]  Found  | Found    | NULL      | 0         | 1      | Valid
606  * [5]  Found  | Found    | NULL      | >0        | 1      | Invalid
607  *
608  * [6]  Found  | Found    | task      | 0         | 1      | Valid
609  *
610  * [7]  Found  | Found    | NULL      | Any       | 0      | Invalid
611  *
612  * [8]  Found  | Found    | task      | ==taskTID | 0/1    | Valid
613  * [9]  Found  | Found    | task      | 0         | 0      | Invalid
614  * [10] Found  | Found    | task      | !=taskTID | 0/1    | Invalid
615  *
616  * [1]  Indicates that the kernel can acquire the futex atomically. We
617  *      came came here due to a stale FUTEX_WAITERS/FUTEX_OWNER_DIED bit.
618  *
619  * [2]  Valid, if TID does not belong to a kernel thread. If no matching
620  *      thread is found then it indicates that the owner TID has died.
621  *
622  * [3]  Invalid. The waiter is queued on a non PI futex
623  *
624  * [4]  Valid state after exit_robust_list(), which sets the user space
625  *      value to FUTEX_WAITERS | FUTEX_OWNER_DIED.
626  *
627  * [5]  The user space value got manipulated between exit_robust_list()
628  *      and exit_pi_state_list()
629  *
630  * [6]  Valid state after exit_pi_state_list() which sets the new owner in
631  *      the pi_state but cannot access the user space value.
632  *
633  * [7]  pi_state->owner can only be NULL when the OWNER_DIED bit is set.
634  *
635  * [8]  Owner and user space value match
636  *
637  * [9]  There is no transient state which sets the user space TID to 0
638  *      except exit_robust_list(), but this is indicated by the
639  *      FUTEX_OWNER_DIED bit. See [4]
640  *
641  * [10] There is no transient state which leaves owner and user space
642  *      TID out of sync.
643  */
644 static int
645 lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
646                 union futex_key *key, struct futex_pi_state **ps)
647 {
648         struct futex_pi_state *pi_state = NULL;
649         struct futex_q *this, *next;
650         struct plist_head *head;
651         struct task_struct *p;
652         pid_t pid = uval & FUTEX_TID_MASK;
653
654         head = &hb->chain;
655
656         plist_for_each_entry_safe(this, next, head, list) {
657                 if (match_futex(&this->key, key)) {
658                         /*
659                          * Sanity check the waiter before increasing
660                          * the refcount and attaching to it.
661                          */
662                         pi_state = this->pi_state;
663                         /*
664                          * Userspace might have messed up non-PI and
665                          * PI futexes [3]
666                          */
667                         if (unlikely(!pi_state))
668                                 return -EINVAL;
669
670                         WARN_ON(!atomic_read(&pi_state->refcount));
671
672                         /*
673                          * Handle the owner died case:
674                          */
675                         if (uval & FUTEX_OWNER_DIED) {
676                                 /*
677                                  * exit_pi_state_list sets owner to NULL and
678                                  * wakes the topmost waiter. The task which
679                                  * acquires the pi_state->rt_mutex will fixup
680                                  * owner.
681                                  */
682                                 if (!pi_state->owner) {
683                                         /*
684                                          * No pi state owner, but the user
685                                          * space TID is not 0. Inconsistent
686                                          * state. [5]
687                                          */
688                                         if (pid)
689                                                 return -EINVAL;
690                                         /*
691                                          * Take a ref on the state and
692                                          * return. [4]
693                                          */
694                                         goto out_state;
695                                 }
696
697                                 /*
698                                  * If TID is 0, then either the dying owner
699                                  * has not yet executed exit_pi_state_list()
700                                  * or some waiter acquired the rtmutex in the
701                                  * pi state, but did not yet fixup the TID in
702                                  * user space.
703                                  *
704                                  * Take a ref on the state and return. [6]
705                                  */
706                                 if (!pid)
707                                         goto out_state;
708                         } else {
709                                 /*
710                                  * If the owner died bit is not set,
711                                  * then the pi_state must have an
712                                  * owner. [7]
713                                  */
714                                 if (!pi_state->owner)
715                                         return -EINVAL;
716                         }
717
718                         /*
719                          * Bail out if user space manipulated the
720                          * futex value. If pi state exists then the
721                          * owner TID must be the same as the user
722                          * space TID. [9/10]
723                          */
724                         if (pid != task_pid_vnr(pi_state->owner))
725                                 return -EINVAL;
726
727                 out_state:
728                         atomic_inc(&pi_state->refcount);
729                         *ps = pi_state;
730                         return 0;
731                 }
732         }
733
734         /*
735          * We are the first waiter - try to look up the real owner and attach
736          * the new pi_state to it, but bail out when TID = 0 [1]
737          */
738         if (!pid)
739                 return -ESRCH;
740         p = futex_find_get_task(pid);
741         if (!p)
742                 return -ESRCH;
743
744         if (!p->mm) {
745                 put_task_struct(p);
746                 return -EPERM;
747         }
748
749         /*
750          * We need to look at the task state flags to figure out,
751          * whether the task is exiting. To protect against the do_exit
752          * change of the task flags, we do this protected by
753          * p->pi_lock:
754          */
755         raw_spin_lock_irq(&p->pi_lock);
756         if (unlikely(p->flags & PF_EXITING)) {
757                 /*
758                  * The task is on the way out. When PF_EXITPIDONE is
759                  * set, we know that the task has finished the
760                  * cleanup:
761                  */
762                 int ret = (p->flags & PF_EXITPIDONE) ? -ESRCH : -EAGAIN;
763
764                 raw_spin_unlock_irq(&p->pi_lock);
765                 put_task_struct(p);
766                 return ret;
767         }
768
769         /*
770          * No existing pi state. First waiter. [2]
771          */
772         pi_state = alloc_pi_state();
773
774         /*
775          * Initialize the pi_mutex in locked state and make 'p'
776          * the owner of it:
777          */
778         rt_mutex_init_proxy_locked(&pi_state->pi_mutex, p);
779
780         /* Store the key for possible exit cleanups: */
781         pi_state->key = *key;
782
783         WARN_ON(!list_empty(&pi_state->list));
784         list_add(&pi_state->list, &p->pi_state_list);
785         pi_state->owner = p;
786         raw_spin_unlock_irq(&p->pi_lock);
787
788         put_task_struct(p);
789
790         *ps = pi_state;
791
792         return 0;
793 }
794
795 /**
796  * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
797  * @uaddr:              the pi futex user address
798  * @hb:                 the pi futex hash bucket
799  * @key:                the futex key associated with uaddr and hb
800  * @ps:                 the pi_state pointer where we store the result of the
801  *                      lookup
802  * @task:               the task to perform the atomic lock work for.  This will
803  *                      be "current" except in the case of requeue pi.
804  * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
805  *
806  * Return:
807  *  0 - ready to wait;
808  *  1 - acquired the lock;
809  * <0 - error
810  *
811  * The hb->lock and futex_key refs shall be held by the caller.
812  */
813 static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
814                                 union futex_key *key,
815                                 struct futex_pi_state **ps,
816                                 struct task_struct *task, int set_waiters)
817 {
818         int lock_taken, ret, force_take = 0;
819         u32 uval, newval, curval, vpid = task_pid_vnr(task);
820
821 retry:
822         ret = lock_taken = 0;
823
824         /*
825          * To avoid races, we attempt to take the lock here again
826          * (by doing a 0 -> TID atomic cmpxchg), while holding all
827          * the locks. It will most likely not succeed.
828          */
829         newval = vpid;
830         if (set_waiters)
831                 newval |= FUTEX_WAITERS;
832
833         if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
834                 return -EFAULT;
835
836         /*
837          * Detect deadlocks.
838          */
839         if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
840                 return -EDEADLK;
841
842         /*
843          * Surprise - we got the lock, but we do not trust user space at all.
844          */
845         if (unlikely(!curval)) {
846                 /*
847                  * We verify whether there is kernel state for this
848                  * futex. If not, we can safely assume, that the 0 ->
849                  * TID transition is correct. If state exists, we do
850                  * not bother to fixup the user space state as it was
851                  * corrupted already.
852                  */
853                 return futex_top_waiter(hb, key) ? -EINVAL : 1;
854         }
855
856         uval = curval;
857
858         /*
859          * Set the FUTEX_WAITERS flag, so the owner will know it has someone
860          * to wake at the next unlock.
861          */
862         newval = curval | FUTEX_WAITERS;
863
864         /*
865          * Should we force take the futex? See below.
866          */
867         if (unlikely(force_take)) {
868                 /*
869                  * Keep the OWNER_DIED and the WAITERS bit and set the
870                  * new TID value.
871                  */
872                 newval = (curval & ~FUTEX_TID_MASK) | vpid;
873                 force_take = 0;
874                 lock_taken = 1;
875         }
876
877         if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
878                 return -EFAULT;
879         if (unlikely(curval != uval))
880                 goto retry;
881
882         /*
883          * We took the lock due to forced take over.
884          */
885         if (unlikely(lock_taken))
886                 return 1;
887
888         /*
889          * We dont have the lock. Look up the PI state (or create it if
890          * we are the first waiter):
891          */
892         ret = lookup_pi_state(uval, hb, key, ps);
893
894         if (unlikely(ret)) {
895                 switch (ret) {
896                 case -ESRCH:
897                         /*
898                          * We failed to find an owner for this
899                          * futex. So we have no pi_state to block
900                          * on. This can happen in two cases:
901                          *
902                          * 1) The owner died
903                          * 2) A stale FUTEX_WAITERS bit
904                          *
905                          * Re-read the futex value.
906                          */
907                         if (get_futex_value_locked(&curval, uaddr))
908                                 return -EFAULT;
909
910                         /*
911                          * If the owner died or we have a stale
912                          * WAITERS bit the owner TID in the user space
913                          * futex is 0.
914                          */
915                         if (!(curval & FUTEX_TID_MASK)) {
916                                 force_take = 1;
917                                 goto retry;
918                         }
919                 default:
920                         break;
921                 }
922         }
923
924         return ret;
925 }
926
927 /**
928  * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
929  * @q:  The futex_q to unqueue
930  *
931  * The q->lock_ptr must not be NULL and must be held by the caller.
932  */
933 static void __unqueue_futex(struct futex_q *q)
934 {
935         struct futex_hash_bucket *hb;
936
937         if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
938             || WARN_ON(plist_node_empty(&q->list)))
939                 return;
940
941         hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
942         plist_del(&q->list, &hb->chain);
943 }
944
945 /*
946  * The hash bucket lock must be held when this is called.
947  * Afterwards, the futex_q must not be accessed.
948  */
949 static void wake_futex(struct futex_q *q)
950 {
951         struct task_struct *p = q->task;
952
953         if (WARN(q->pi_state || q->rt_waiter, "refusing to wake PI futex\n"))
954                 return;
955
956         /*
957          * We set q->lock_ptr = NULL _before_ we wake up the task. If
958          * a non-futex wake up happens on another CPU then the task
959          * might exit and p would dereference a non-existing task
960          * struct. Prevent this by holding a reference on p across the
961          * wake up.
962          */
963         get_task_struct(p);
964
965         __unqueue_futex(q);
966         /*
967          * The waiting task can free the futex_q as soon as
968          * q->lock_ptr = NULL is written, without taking any locks. A
969          * memory barrier is required here to prevent the following
970          * store to lock_ptr from getting ahead of the plist_del.
971          */
972         smp_wmb();
973         q->lock_ptr = NULL;
974
975         wake_up_state(p, TASK_NORMAL);
976         put_task_struct(p);
977 }
978
979 static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
980 {
981         struct task_struct *new_owner;
982         struct futex_pi_state *pi_state = this->pi_state;
983         u32 uninitialized_var(curval), newval;
984         int ret = 0;
985
986         if (!pi_state)
987                 return -EINVAL;
988
989         /*
990          * If current does not own the pi_state then the futex is
991          * inconsistent and user space fiddled with the futex value.
992          */
993         if (pi_state->owner != current)
994                 return -EINVAL;
995
996         raw_spin_lock(&pi_state->pi_mutex.wait_lock);
997         new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
998
999         /*
1000          * It is possible that the next waiter (the one that brought
1001          * this owner to the kernel) timed out and is no longer
1002          * waiting on the lock.
1003          */
1004         if (!new_owner)
1005                 new_owner = this->task;
1006
1007         /*
1008          * We pass it to the next owner. The WAITERS bit is always
1009          * kept enabled while there is PI state around. We cleanup the
1010          * owner died bit, because we are the owner.
1011          */
1012         newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
1013
1014         if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1015                 ret = -EFAULT;
1016         else if (curval != uval)
1017                 ret = -EINVAL;
1018         if (ret) {
1019                 raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
1020                 return ret;
1021         }
1022
1023         raw_spin_lock_irq(&pi_state->owner->pi_lock);
1024         WARN_ON(list_empty(&pi_state->list));
1025         list_del_init(&pi_state->list);
1026         raw_spin_unlock_irq(&pi_state->owner->pi_lock);
1027
1028         raw_spin_lock_irq(&new_owner->pi_lock);
1029         WARN_ON(!list_empty(&pi_state->list));
1030         list_add(&pi_state->list, &new_owner->pi_state_list);
1031         pi_state->owner = new_owner;
1032         raw_spin_unlock_irq(&new_owner->pi_lock);
1033
1034         raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
1035         rt_mutex_unlock(&pi_state->pi_mutex);
1036
1037         return 0;
1038 }
1039
1040 static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
1041 {
1042         u32 uninitialized_var(oldval);
1043
1044         /*
1045          * There is no waiter, so we unlock the futex. The owner died
1046          * bit has not to be preserved here. We are the owner:
1047          */
1048         if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
1049                 return -EFAULT;
1050         if (oldval != uval)
1051                 return -EAGAIN;
1052
1053         return 0;
1054 }
1055
1056 /*
1057  * Express the locking dependencies for lockdep:
1058  */
1059 static inline void
1060 double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
1061 {
1062         if (hb1 <= hb2) {
1063                 spin_lock(&hb1->lock);
1064                 if (hb1 < hb2)
1065                         spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
1066         } else { /* hb1 > hb2 */
1067                 spin_lock(&hb2->lock);
1068                 spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
1069         }
1070 }
1071
1072 static inline void
1073 double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
1074 {
1075         spin_unlock(&hb1->lock);
1076         if (hb1 != hb2)
1077                 spin_unlock(&hb2->lock);
1078 }
1079
1080 /*
1081  * Wake up waiters matching bitset queued on this futex (uaddr).
1082  */
1083 static int
1084 futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset)
1085 {
1086         struct futex_hash_bucket *hb;
1087         struct futex_q *this, *next;
1088         struct plist_head *head;
1089         union futex_key key = FUTEX_KEY_INIT;
1090         int ret;
1091
1092         if (!bitset)
1093                 return -EINVAL;
1094
1095         ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_READ);
1096         if (unlikely(ret != 0))
1097                 goto out;
1098
1099         hb = hash_futex(&key);
1100         spin_lock(&hb->lock);
1101         head = &hb->chain;
1102
1103         plist_for_each_entry_safe(this, next, head, list) {
1104                 if (match_futex (&this->key, &key)) {
1105                         if (this->pi_state || this->rt_waiter) {
1106                                 ret = -EINVAL;
1107                                 break;
1108                         }
1109
1110                         /* Check if one of the bits is set in both bitsets */
1111                         if (!(this->bitset & bitset))
1112                                 continue;
1113
1114                         wake_futex(this);
1115                         if (++ret >= nr_wake)
1116                                 break;
1117                 }
1118         }
1119
1120         spin_unlock(&hb->lock);
1121         put_futex_key(&key);
1122 out:
1123         return ret;
1124 }
1125
1126 /*
1127  * Wake up all waiters hashed on the physical page that is mapped
1128  * to this virtual address:
1129  */
1130 static int
1131 futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2,
1132               int nr_wake, int nr_wake2, int op)
1133 {
1134         union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1135         struct futex_hash_bucket *hb1, *hb2;
1136         struct plist_head *head;
1137         struct futex_q *this, *next;
1138         int ret, op_ret;
1139
1140 retry:
1141         ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1142         if (unlikely(ret != 0))
1143                 goto out;
1144         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
1145         if (unlikely(ret != 0))
1146                 goto out_put_key1;
1147
1148         hb1 = hash_futex(&key1);
1149         hb2 = hash_futex(&key2);
1150
1151 retry_private:
1152         double_lock_hb(hb1, hb2);
1153         op_ret = futex_atomic_op_inuser(op, uaddr2);
1154         if (unlikely(op_ret < 0)) {
1155
1156                 double_unlock_hb(hb1, hb2);
1157
1158 #ifndef CONFIG_MMU
1159                 /*
1160                  * we don't get EFAULT from MMU faults if we don't have an MMU,
1161                  * but we might get them from range checking
1162                  */
1163                 ret = op_ret;
1164                 goto out_put_keys;
1165 #endif
1166
1167                 if (unlikely(op_ret != -EFAULT)) {
1168                         ret = op_ret;
1169                         goto out_put_keys;
1170                 }
1171
1172                 ret = fault_in_user_writeable(uaddr2);
1173                 if (ret)
1174                         goto out_put_keys;
1175
1176                 if (!(flags & FLAGS_SHARED))
1177                         goto retry_private;
1178
1179                 put_futex_key(&key2);
1180                 put_futex_key(&key1);
1181                 goto retry;
1182         }
1183
1184         head = &hb1->chain;
1185
1186         plist_for_each_entry_safe(this, next, head, list) {
1187                 if (match_futex (&this->key, &key1)) {
1188                         if (this->pi_state || this->rt_waiter) {
1189                                 ret = -EINVAL;
1190                                 goto out_unlock;
1191                         }
1192                         wake_futex(this);
1193                         if (++ret >= nr_wake)
1194                                 break;
1195                 }
1196         }
1197
1198         if (op_ret > 0) {
1199                 head = &hb2->chain;
1200
1201                 op_ret = 0;
1202                 plist_for_each_entry_safe(this, next, head, list) {
1203                         if (match_futex (&this->key, &key2)) {
1204                                 if (this->pi_state || this->rt_waiter) {
1205                                         ret = -EINVAL;
1206                                         goto out_unlock;
1207                                 }
1208                                 wake_futex(this);
1209                                 if (++op_ret >= nr_wake2)
1210                                         break;
1211                         }
1212                 }
1213                 ret += op_ret;
1214         }
1215
1216 out_unlock:
1217         double_unlock_hb(hb1, hb2);
1218 out_put_keys:
1219         put_futex_key(&key2);
1220 out_put_key1:
1221         put_futex_key(&key1);
1222 out:
1223         return ret;
1224 }
1225
1226 /**
1227  * requeue_futex() - Requeue a futex_q from one hb to another
1228  * @q:          the futex_q to requeue
1229  * @hb1:        the source hash_bucket
1230  * @hb2:        the target hash_bucket
1231  * @key2:       the new key for the requeued futex_q
1232  */
1233 static inline
1234 void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
1235                    struct futex_hash_bucket *hb2, union futex_key *key2)
1236 {
1237
1238         /*
1239          * If key1 and key2 hash to the same bucket, no need to
1240          * requeue.
1241          */
1242         if (likely(&hb1->chain != &hb2->chain)) {
1243                 plist_del(&q->list, &hb1->chain);
1244                 plist_add(&q->list, &hb2->chain);
1245                 q->lock_ptr = &hb2->lock;
1246         }
1247         get_futex_key_refs(key2);
1248         q->key = *key2;
1249 }
1250
1251 /**
1252  * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
1253  * @q:          the futex_q
1254  * @key:        the key of the requeue target futex
1255  * @hb:         the hash_bucket of the requeue target futex
1256  *
1257  * During futex_requeue, with requeue_pi=1, it is possible to acquire the
1258  * target futex if it is uncontended or via a lock steal.  Set the futex_q key
1259  * to the requeue target futex so the waiter can detect the wakeup on the right
1260  * futex, but remove it from the hb and NULL the rt_waiter so it can detect
1261  * atomic lock acquisition.  Set the q->lock_ptr to the requeue target hb->lock
1262  * to protect access to the pi_state to fixup the owner later.  Must be called
1263  * with both q->lock_ptr and hb->lock held.
1264  */
1265 static inline
1266 void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
1267                            struct futex_hash_bucket *hb)
1268 {
1269         get_futex_key_refs(key);
1270         q->key = *key;
1271
1272         __unqueue_futex(q);
1273
1274         WARN_ON(!q->rt_waiter);
1275         q->rt_waiter = NULL;
1276
1277         q->lock_ptr = &hb->lock;
1278
1279         wake_up_state(q->task, TASK_NORMAL);
1280 }
1281
1282 /**
1283  * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
1284  * @pifutex:            the user address of the to futex
1285  * @hb1:                the from futex hash bucket, must be locked by the caller
1286  * @hb2:                the to futex hash bucket, must be locked by the caller
1287  * @key1:               the from futex key
1288  * @key2:               the to futex key
1289  * @ps:                 address to store the pi_state pointer
1290  * @set_waiters:        force setting the FUTEX_WAITERS bit (1) or not (0)
1291  *
1292  * Try and get the lock on behalf of the top waiter if we can do it atomically.
1293  * Wake the top waiter if we succeed.  If the caller specified set_waiters,
1294  * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
1295  * hb1 and hb2 must be held by the caller.
1296  *
1297  * Return:
1298  *  0 - failed to acquire the lock atomically;
1299  * >0 - acquired the lock, return value is vpid of the top_waiter
1300  * <0 - error
1301  */
1302 static int futex_proxy_trylock_atomic(u32 __user *pifutex,
1303                                  struct futex_hash_bucket *hb1,
1304                                  struct futex_hash_bucket *hb2,
1305                                  union futex_key *key1, union futex_key *key2,
1306                                  struct futex_pi_state **ps, int set_waiters)
1307 {
1308         struct futex_q *top_waiter = NULL;
1309         u32 curval;
1310         int ret, vpid;
1311
1312         if (get_futex_value_locked(&curval, pifutex))
1313                 return -EFAULT;
1314
1315         /*
1316          * Find the top_waiter and determine if there are additional waiters.
1317          * If the caller intends to requeue more than 1 waiter to pifutex,
1318          * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
1319          * as we have means to handle the possible fault.  If not, don't set
1320          * the bit unecessarily as it will force the subsequent unlock to enter
1321          * the kernel.
1322          */
1323         top_waiter = futex_top_waiter(hb1, key1);
1324
1325         /* There are no waiters, nothing for us to do. */
1326         if (!top_waiter)
1327                 return 0;
1328
1329         /* Ensure we requeue to the expected futex. */
1330         if (!match_futex(top_waiter->requeue_pi_key, key2))
1331                 return -EINVAL;
1332
1333         /*
1334          * Try to take the lock for top_waiter.  Set the FUTEX_WAITERS bit in
1335          * the contended case or if set_waiters is 1.  The pi_state is returned
1336          * in ps in contended cases.
1337          */
1338         vpid = task_pid_vnr(top_waiter->task);
1339         ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
1340                                    set_waiters);
1341         if (ret == 1) {
1342                 requeue_pi_wake_futex(top_waiter, key2, hb2);
1343                 return vpid;
1344         }
1345         return ret;
1346 }
1347
1348 /**
1349  * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
1350  * @uaddr1:     source futex user address
1351  * @flags:      futex flags (FLAGS_SHARED, etc.)
1352  * @uaddr2:     target futex user address
1353  * @nr_wake:    number of waiters to wake (must be 1 for requeue_pi)
1354  * @nr_requeue: number of waiters to requeue (0-INT_MAX)
1355  * @cmpval:     @uaddr1 expected value (or %NULL)
1356  * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
1357  *              pi futex (pi to pi requeue is not supported)
1358  *
1359  * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
1360  * uaddr2 atomically on behalf of the top waiter.
1361  *
1362  * Return:
1363  * >=0 - on success, the number of tasks requeued or woken;
1364  *  <0 - on error
1365  */
1366 static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
1367                          u32 __user *uaddr2, int nr_wake, int nr_requeue,
1368                          u32 *cmpval, int requeue_pi)
1369 {
1370         union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
1371         int drop_count = 0, task_count = 0, ret;
1372         struct futex_pi_state *pi_state = NULL;
1373         struct futex_hash_bucket *hb1, *hb2;
1374         struct plist_head *head1;
1375         struct futex_q *this, *next;
1376
1377         if (requeue_pi) {
1378                 /*
1379                  * Requeue PI only works on two distinct uaddrs. This
1380                  * check is only valid for private futexes. See below.
1381                  */
1382                 if (uaddr1 == uaddr2)
1383                         return -EINVAL;
1384
1385                 /*
1386                  * requeue_pi requires a pi_state, try to allocate it now
1387                  * without any locks in case it fails.
1388                  */
1389                 if (refill_pi_state_cache())
1390                         return -ENOMEM;
1391                 /*
1392                  * requeue_pi must wake as many tasks as it can, up to nr_wake
1393                  * + nr_requeue, since it acquires the rt_mutex prior to
1394                  * returning to userspace, so as to not leave the rt_mutex with
1395                  * waiters and no owner.  However, second and third wake-ups
1396                  * cannot be predicted as they involve race conditions with the
1397                  * first wake and a fault while looking up the pi_state.  Both
1398                  * pthread_cond_signal() and pthread_cond_broadcast() should
1399                  * use nr_wake=1.
1400                  */
1401                 if (nr_wake != 1)
1402                         return -EINVAL;
1403         }
1404
1405 retry:
1406         if (pi_state != NULL) {
1407                 /*
1408                  * We will have to lookup the pi_state again, so free this one
1409                  * to keep the accounting correct.
1410                  */
1411                 free_pi_state(pi_state);
1412                 pi_state = NULL;
1413         }
1414
1415         ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ);
1416         if (unlikely(ret != 0))
1417                 goto out;
1418         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2,
1419                             requeue_pi ? VERIFY_WRITE : VERIFY_READ);
1420         if (unlikely(ret != 0))
1421                 goto out_put_key1;
1422
1423         /*
1424          * The check above which compares uaddrs is not sufficient for
1425          * shared futexes. We need to compare the keys:
1426          */
1427         if (requeue_pi && match_futex(&key1, &key2)) {
1428                 ret = -EINVAL;
1429                 goto out_put_keys;
1430         }
1431
1432         hb1 = hash_futex(&key1);
1433         hb2 = hash_futex(&key2);
1434
1435 retry_private:
1436         double_lock_hb(hb1, hb2);
1437
1438         if (likely(cmpval != NULL)) {
1439                 u32 curval;
1440
1441                 ret = get_futex_value_locked(&curval, uaddr1);
1442
1443                 if (unlikely(ret)) {
1444                         double_unlock_hb(hb1, hb2);
1445
1446                         ret = get_user(curval, uaddr1);
1447                         if (ret)
1448                                 goto out_put_keys;
1449
1450                         if (!(flags & FLAGS_SHARED))
1451                                 goto retry_private;
1452
1453                         put_futex_key(&key2);
1454                         put_futex_key(&key1);
1455                         goto retry;
1456                 }
1457                 if (curval != *cmpval) {
1458                         ret = -EAGAIN;
1459                         goto out_unlock;
1460                 }
1461         }
1462
1463         if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
1464                 /*
1465                  * Attempt to acquire uaddr2 and wake the top waiter. If we
1466                  * intend to requeue waiters, force setting the FUTEX_WAITERS
1467                  * bit.  We force this here where we are able to easily handle
1468                  * faults rather in the requeue loop below.
1469                  */
1470                 ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
1471                                                  &key2, &pi_state, nr_requeue);
1472
1473                 /*
1474                  * At this point the top_waiter has either taken uaddr2 or is
1475                  * waiting on it.  If the former, then the pi_state will not
1476                  * exist yet, look it up one more time to ensure we have a
1477                  * reference to it. If the lock was taken, ret contains the
1478                  * vpid of the top waiter task.
1479                  */
1480                 if (ret > 0) {
1481                         WARN_ON(pi_state);
1482                         drop_count++;
1483                         task_count++;
1484                         /*
1485                          * If we acquired the lock, then the user
1486                          * space value of uaddr2 should be vpid. It
1487                          * cannot be changed by the top waiter as it
1488                          * is blocked on hb2 lock if it tries to do
1489                          * so. If something fiddled with it behind our
1490                          * back the pi state lookup might unearth
1491                          * it. So we rather use the known value than
1492                          * rereading and handing potential crap to
1493                          * lookup_pi_state.
1494                          */
1495                         ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
1496                 }
1497
1498                 switch (ret) {
1499                 case 0:
1500                         break;
1501                 case -EFAULT:
1502                         double_unlock_hb(hb1, hb2);
1503                         put_futex_key(&key2);
1504                         put_futex_key(&key1);
1505                         ret = fault_in_user_writeable(uaddr2);
1506                         if (!ret)
1507                                 goto retry;
1508                         goto out;
1509                 case -EAGAIN:
1510                         /* The owner was exiting, try again. */
1511                         double_unlock_hb(hb1, hb2);
1512                         put_futex_key(&key2);
1513                         put_futex_key(&key1);
1514                         cond_resched();
1515                         goto retry;
1516                 default:
1517                         goto out_unlock;
1518                 }
1519         }
1520
1521         head1 = &hb1->chain;
1522         plist_for_each_entry_safe(this, next, head1, list) {
1523                 if (task_count - nr_wake >= nr_requeue)
1524                         break;
1525
1526                 if (!match_futex(&this->key, &key1))
1527                         continue;
1528
1529                 /*
1530                  * FUTEX_WAIT_REQEUE_PI and FUTEX_CMP_REQUEUE_PI should always
1531                  * be paired with each other and no other futex ops.
1532                  *
1533                  * We should never be requeueing a futex_q with a pi_state,
1534                  * which is awaiting a futex_unlock_pi().
1535                  */
1536                 if ((requeue_pi && !this->rt_waiter) ||
1537                     (!requeue_pi && this->rt_waiter) ||
1538                     this->pi_state) {
1539                         ret = -EINVAL;
1540                         break;
1541                 }
1542
1543                 /*
1544                  * Wake nr_wake waiters.  For requeue_pi, if we acquired the
1545                  * lock, we already woke the top_waiter.  If not, it will be
1546                  * woken by futex_unlock_pi().
1547                  */
1548                 if (++task_count <= nr_wake && !requeue_pi) {
1549                         wake_futex(this);
1550                         continue;
1551                 }
1552
1553                 /* Ensure we requeue to the expected futex for requeue_pi. */
1554                 if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
1555                         ret = -EINVAL;
1556                         break;
1557                 }
1558
1559                 /*
1560                  * Requeue nr_requeue waiters and possibly one more in the case
1561                  * of requeue_pi if we couldn't acquire the lock atomically.
1562                  */
1563                 if (requeue_pi) {
1564                         /* Prepare the waiter to take the rt_mutex. */
1565                         atomic_inc(&pi_state->refcount);
1566                         this->pi_state = pi_state;
1567                         ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
1568                                                         this->rt_waiter,
1569                                                         this->task, 1);
1570                         if (ret == 1) {
1571                                 /* We got the lock. */
1572                                 requeue_pi_wake_futex(this, &key2, hb2);
1573                                 drop_count++;
1574                                 continue;
1575                         } else if (ret) {
1576                                 /* -EDEADLK */
1577                                 this->pi_state = NULL;
1578                                 free_pi_state(pi_state);
1579                                 goto out_unlock;
1580                         }
1581                 }
1582                 requeue_futex(this, hb1, hb2, &key2);
1583                 drop_count++;
1584         }
1585
1586 out_unlock:
1587         double_unlock_hb(hb1, hb2);
1588
1589         /*
1590          * drop_futex_key_refs() must be called outside the spinlocks. During
1591          * the requeue we moved futex_q's from the hash bucket at key1 to the
1592          * one at key2 and updated their key pointer.  We no longer need to
1593          * hold the references to key1.
1594          */
1595         while (--drop_count >= 0)
1596                 drop_futex_key_refs(&key1);
1597
1598 out_put_keys:
1599         put_futex_key(&key2);
1600 out_put_key1:
1601         put_futex_key(&key1);
1602 out:
1603         if (pi_state != NULL)
1604                 free_pi_state(pi_state);
1605         return ret ? ret : task_count;
1606 }
1607
1608 /* The key must be already stored in q->key. */
1609 static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
1610         __acquires(&hb->lock)
1611 {
1612         struct futex_hash_bucket *hb;
1613
1614         hb = hash_futex(&q->key);
1615         q->lock_ptr = &hb->lock;
1616
1617         spin_lock(&hb->lock);
1618         return hb;
1619 }
1620
1621 static inline void
1622 queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
1623         __releases(&hb->lock)
1624 {
1625         spin_unlock(&hb->lock);
1626 }
1627
1628 /**
1629  * queue_me() - Enqueue the futex_q on the futex_hash_bucket
1630  * @q:  The futex_q to enqueue
1631  * @hb: The destination hash bucket
1632  *
1633  * The hb->lock must be held by the caller, and is released here. A call to
1634  * queue_me() is typically paired with exactly one call to unqueue_me().  The
1635  * exceptions involve the PI related operations, which may use unqueue_me_pi()
1636  * or nothing if the unqueue is done as part of the wake process and the unqueue
1637  * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
1638  * an example).
1639  */
1640 static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
1641         __releases(&hb->lock)
1642 {
1643         int prio;
1644
1645         /*
1646          * The priority used to register this element is
1647          * - either the real thread-priority for the real-time threads
1648          * (i.e. threads with a priority lower than MAX_RT_PRIO)
1649          * - or MAX_RT_PRIO for non-RT threads.
1650          * Thus, all RT-threads are woken first in priority order, and
1651          * the others are woken last, in FIFO order.
1652          */
1653         prio = min(current->normal_prio, MAX_RT_PRIO);
1654
1655         plist_node_init(&q->list, prio);
1656         plist_add(&q->list, &hb->chain);
1657         q->task = current;
1658         spin_unlock(&hb->lock);
1659 }
1660
1661 /**
1662  * unqueue_me() - Remove the futex_q from its futex_hash_bucket
1663  * @q:  The futex_q to unqueue
1664  *
1665  * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
1666  * be paired with exactly one earlier call to queue_me().
1667  *
1668  * Return:
1669  *   1 - if the futex_q was still queued (and we removed unqueued it);
1670  *   0 - if the futex_q was already removed by the waking thread
1671  */
1672 static int unqueue_me(struct futex_q *q)
1673 {
1674         spinlock_t *lock_ptr;
1675         int ret = 0;
1676
1677         /* In the common case we don't take the spinlock, which is nice. */
1678 retry:
1679         lock_ptr = q->lock_ptr;
1680         barrier();
1681         if (lock_ptr != NULL) {
1682                 spin_lock(lock_ptr);
1683                 /*
1684                  * q->lock_ptr can change between reading it and
1685                  * spin_lock(), causing us to take the wrong lock.  This
1686                  * corrects the race condition.
1687                  *
1688                  * Reasoning goes like this: if we have the wrong lock,
1689                  * q->lock_ptr must have changed (maybe several times)
1690                  * between reading it and the spin_lock().  It can
1691                  * change again after the spin_lock() but only if it was
1692                  * already changed before the spin_lock().  It cannot,
1693                  * however, change back to the original value.  Therefore
1694                  * we can detect whether we acquired the correct lock.
1695                  */
1696                 if (unlikely(lock_ptr != q->lock_ptr)) {
1697                         spin_unlock(lock_ptr);
1698                         goto retry;
1699                 }
1700                 __unqueue_futex(q);
1701
1702                 BUG_ON(q->pi_state);
1703
1704                 spin_unlock(lock_ptr);
1705                 ret = 1;
1706         }
1707
1708         drop_futex_key_refs(&q->key);
1709         return ret;
1710 }
1711
1712 /*
1713  * PI futexes can not be requeued and must remove themself from the
1714  * hash bucket. The hash bucket lock (i.e. lock_ptr) is held on entry
1715  * and dropped here.
1716  */
1717 static void unqueue_me_pi(struct futex_q *q)
1718         __releases(q->lock_ptr)
1719 {
1720         __unqueue_futex(q);
1721
1722         BUG_ON(!q->pi_state);
1723         free_pi_state(q->pi_state);
1724         q->pi_state = NULL;
1725
1726         spin_unlock(q->lock_ptr);
1727 }
1728
1729 /*
1730  * Fixup the pi_state owner with the new owner.
1731  *
1732  * Must be called with hash bucket lock held and mm->sem held for non
1733  * private futexes.
1734  */
1735 static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
1736                                 struct task_struct *newowner)
1737 {
1738         u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
1739         struct futex_pi_state *pi_state = q->pi_state;
1740         struct task_struct *oldowner = pi_state->owner;
1741         u32 uval, uninitialized_var(curval), newval;
1742         int ret;
1743
1744         /* Owner died? */
1745         if (!pi_state->owner)
1746                 newtid |= FUTEX_OWNER_DIED;
1747
1748         /*
1749          * We are here either because we stole the rtmutex from the
1750          * previous highest priority waiter or we are the highest priority
1751          * waiter but failed to get the rtmutex the first time.
1752          * We have to replace the newowner TID in the user space variable.
1753          * This must be atomic as we have to preserve the owner died bit here.
1754          *
1755          * Note: We write the user space value _before_ changing the pi_state
1756          * because we can fault here. Imagine swapped out pages or a fork
1757          * that marked all the anonymous memory readonly for cow.
1758          *
1759          * Modifying pi_state _before_ the user space value would
1760          * leave the pi_state in an inconsistent state when we fault
1761          * here, because we need to drop the hash bucket lock to
1762          * handle the fault. This might be observed in the PID check
1763          * in lookup_pi_state.
1764          */
1765 retry:
1766         if (get_futex_value_locked(&uval, uaddr))
1767                 goto handle_fault;
1768
1769         while (1) {
1770                 newval = (uval & FUTEX_OWNER_DIED) | newtid;
1771
1772                 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
1773                         goto handle_fault;
1774                 if (curval == uval)
1775                         break;
1776                 uval = curval;
1777         }
1778
1779         /*
1780          * We fixed up user space. Now we need to fix the pi_state
1781          * itself.
1782          */
1783         if (pi_state->owner != NULL) {
1784                 raw_spin_lock_irq(&pi_state->owner->pi_lock);
1785                 WARN_ON(list_empty(&pi_state->list));
1786                 list_del_init(&pi_state->list);
1787                 raw_spin_unlock_irq(&pi_state->owner->pi_lock);
1788         }
1789
1790         pi_state->owner = newowner;
1791
1792         raw_spin_lock_irq(&newowner->pi_lock);
1793         WARN_ON(!list_empty(&pi_state->list));
1794         list_add(&pi_state->list, &newowner->pi_state_list);
1795         raw_spin_unlock_irq(&newowner->pi_lock);
1796         return 0;
1797
1798         /*
1799          * To handle the page fault we need to drop the hash bucket
1800          * lock here. That gives the other task (either the highest priority
1801          * waiter itself or the task which stole the rtmutex) the
1802          * chance to try the fixup of the pi_state. So once we are
1803          * back from handling the fault we need to check the pi_state
1804          * after reacquiring the hash bucket lock and before trying to
1805          * do another fixup. When the fixup has been done already we
1806          * simply return.
1807          */
1808 handle_fault:
1809         spin_unlock(q->lock_ptr);
1810
1811         ret = fault_in_user_writeable(uaddr);
1812
1813         spin_lock(q->lock_ptr);
1814
1815         /*
1816          * Check if someone else fixed it for us:
1817          */
1818         if (pi_state->owner != oldowner)
1819                 return 0;
1820
1821         if (ret)
1822                 return ret;
1823
1824         goto retry;
1825 }
1826
1827 static long futex_wait_restart(struct restart_block *restart);
1828
1829 /**
1830  * fixup_owner() - Post lock pi_state and corner case management
1831  * @uaddr:      user address of the futex
1832  * @q:          futex_q (contains pi_state and access to the rt_mutex)
1833  * @locked:     if the attempt to take the rt_mutex succeeded (1) or not (0)
1834  *
1835  * After attempting to lock an rt_mutex, this function is called to cleanup
1836  * the pi_state owner as well as handle race conditions that may allow us to
1837  * acquire the lock. Must be called with the hb lock held.
1838  *
1839  * Return:
1840  *  1 - success, lock taken;
1841  *  0 - success, lock not taken;
1842  * <0 - on error (-EFAULT)
1843  */
1844 static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
1845 {
1846         struct task_struct *owner;
1847         int ret = 0;
1848
1849         if (locked) {
1850                 /*
1851                  * Got the lock. We might not be the anticipated owner if we
1852                  * did a lock-steal - fix up the PI-state in that case:
1853                  */
1854                 if (q->pi_state->owner != current)
1855                         ret = fixup_pi_state_owner(uaddr, q, current);
1856                 goto out;
1857         }
1858
1859         /*
1860          * Catch the rare case, where the lock was released when we were on the
1861          * way back before we locked the hash bucket.
1862          */
1863         if (q->pi_state->owner == current) {
1864                 /*
1865                  * Try to get the rt_mutex now. This might fail as some other
1866                  * task acquired the rt_mutex after we removed ourself from the
1867                  * rt_mutex waiters list.
1868                  */
1869                 if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
1870                         locked = 1;
1871                         goto out;
1872                 }
1873
1874                 /*
1875                  * pi_state is incorrect, some other task did a lock steal and
1876                  * we returned due to timeout or signal without taking the
1877                  * rt_mutex. Too late.
1878                  */
1879                 raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
1880                 owner = rt_mutex_owner(&q->pi_state->pi_mutex);
1881                 if (!owner)
1882                         owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
1883                 raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
1884                 ret = fixup_pi_state_owner(uaddr, q, owner);
1885                 goto out;
1886         }
1887
1888         /*
1889          * Paranoia check. If we did not take the lock, then we should not be
1890          * the owner of the rt_mutex.
1891          */
1892         if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
1893                 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
1894                                 "pi-state %p\n", ret,
1895                                 q->pi_state->pi_mutex.owner,
1896                                 q->pi_state->owner);
1897
1898 out:
1899         return ret ? ret : locked;
1900 }
1901
1902 /**
1903  * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
1904  * @hb:         the futex hash bucket, must be locked by the caller
1905  * @q:          the futex_q to queue up on
1906  * @timeout:    the prepared hrtimer_sleeper, or null for no timeout
1907  */
1908 static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
1909                                 struct hrtimer_sleeper *timeout)
1910 {
1911         /*
1912          * The task state is guaranteed to be set before another task can
1913          * wake it. set_current_state() is implemented using set_mb() and
1914          * queue_me() calls spin_unlock() upon completion, both serializing
1915          * access to the hash list and forcing another memory barrier.
1916          */
1917         set_current_state(TASK_INTERRUPTIBLE);
1918         queue_me(q, hb);
1919
1920         /* Arm the timer */
1921         if (timeout) {
1922                 hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
1923                 if (!hrtimer_active(&timeout->timer))
1924                         timeout->task = NULL;
1925         }
1926
1927         /*
1928          * If we have been removed from the hash list, then another task
1929          * has tried to wake us, and we can skip the call to schedule().
1930          */
1931         if (likely(!plist_node_empty(&q->list))) {
1932                 /*
1933                  * If the timer has already expired, current will already be
1934                  * flagged for rescheduling. Only call schedule if there
1935                  * is no timeout, or if it has yet to expire.
1936                  */
1937                 if (!timeout || timeout->task)
1938                         schedule();
1939         }
1940         __set_current_state(TASK_RUNNING);
1941 }
1942
1943 /**
1944  * futex_wait_setup() - Prepare to wait on a futex
1945  * @uaddr:      the futex userspace address
1946  * @val:        the expected value
1947  * @flags:      futex flags (FLAGS_SHARED, etc.)
1948  * @q:          the associated futex_q
1949  * @hb:         storage for hash_bucket pointer to be returned to caller
1950  *
1951  * Setup the futex_q and locate the hash_bucket.  Get the futex value and
1952  * compare it with the expected value.  Handle atomic faults internally.
1953  * Return with the hb lock held and a q.key reference on success, and unlocked
1954  * with no q.key reference on failure.
1955  *
1956  * Return:
1957  *  0 - uaddr contains val and hb has been locked;
1958  * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
1959  */
1960 static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
1961                            struct futex_q *q, struct futex_hash_bucket **hb)
1962 {
1963         u32 uval;
1964         int ret;
1965
1966         /*
1967          * Access the page AFTER the hash-bucket is locked.
1968          * Order is important:
1969          *
1970          *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
1971          *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
1972          *
1973          * The basic logical guarantee of a futex is that it blocks ONLY
1974          * if cond(var) is known to be true at the time of blocking, for
1975          * any cond.  If we locked the hash-bucket after testing *uaddr, that
1976          * would open a race condition where we could block indefinitely with
1977          * cond(var) false, which would violate the guarantee.
1978          *
1979          * On the other hand, we insert q and release the hash-bucket only
1980          * after testing *uaddr.  This guarantees that futex_wait() will NOT
1981          * absorb a wakeup if *uaddr does not match the desired values
1982          * while the syscall executes.
1983          */
1984 retry:
1985         ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key, VERIFY_READ);
1986         if (unlikely(ret != 0))
1987                 return ret;
1988
1989 retry_private:
1990         *hb = queue_lock(q);
1991
1992         ret = get_futex_value_locked(&uval, uaddr);
1993
1994         if (ret) {
1995                 queue_unlock(q, *hb);
1996
1997                 ret = get_user(uval, uaddr);
1998                 if (ret)
1999                         goto out;
2000
2001                 if (!(flags & FLAGS_SHARED))
2002                         goto retry_private;
2003
2004                 put_futex_key(&q->key);
2005                 goto retry;
2006         }
2007
2008         if (uval != val) {
2009                 queue_unlock(q, *hb);
2010                 ret = -EWOULDBLOCK;
2011         }
2012
2013 out:
2014         if (ret)
2015                 put_futex_key(&q->key);
2016         return ret;
2017 }
2018
2019 static int futex_wait(u32 __user *uaddr, unsigned int flags, u32 val,
2020                       ktime_t *abs_time, u32 bitset)
2021 {
2022         struct hrtimer_sleeper timeout, *to = NULL;
2023         struct restart_block *restart;
2024         struct futex_hash_bucket *hb;
2025         struct futex_q q = futex_q_init;
2026         int ret;
2027
2028         if (!bitset)
2029                 return -EINVAL;
2030         q.bitset = bitset;
2031
2032         if (abs_time) {
2033                 to = &timeout;
2034
2035                 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2036                                       CLOCK_REALTIME : CLOCK_MONOTONIC,
2037                                       HRTIMER_MODE_ABS);
2038                 hrtimer_init_sleeper(to, current);
2039                 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2040                                              current->timer_slack_ns);
2041         }
2042
2043 retry:
2044         /*
2045          * Prepare to wait on uaddr. On success, holds hb lock and increments
2046          * q.key refs.
2047          */
2048         ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2049         if (ret)
2050                 goto out;
2051
2052         /* queue_me and wait for wakeup, timeout, or a signal. */
2053         futex_wait_queue_me(hb, &q, to);
2054
2055         /* If we were woken (and unqueued), we succeeded, whatever. */
2056         ret = 0;
2057         /* unqueue_me() drops q.key ref */
2058         if (!unqueue_me(&q))
2059                 goto out;
2060         ret = -ETIMEDOUT;
2061         if (to && !to->task)
2062                 goto out;
2063
2064         /*
2065          * We expect signal_pending(current), but we might be the
2066          * victim of a spurious wakeup as well.
2067          */
2068         if (!signal_pending(current))
2069                 goto retry;
2070
2071         ret = -ERESTARTSYS;
2072         if (!abs_time)
2073                 goto out;
2074
2075         restart = &current_thread_info()->restart_block;
2076         restart->fn = futex_wait_restart;
2077         restart->futex.uaddr = uaddr;
2078         restart->futex.val = val;
2079         restart->futex.time = abs_time->tv64;
2080         restart->futex.bitset = bitset;
2081         restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
2082
2083         ret = -ERESTART_RESTARTBLOCK;
2084
2085 out:
2086         if (to) {
2087                 hrtimer_cancel(&to->timer);
2088                 destroy_hrtimer_on_stack(&to->timer);
2089         }
2090         return ret;
2091 }
2092
2093
2094 static long futex_wait_restart(struct restart_block *restart)
2095 {
2096         u32 __user *uaddr = restart->futex.uaddr;
2097         ktime_t t, *tp = NULL;
2098
2099         if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
2100                 t.tv64 = restart->futex.time;
2101                 tp = &t;
2102         }
2103         restart->fn = do_no_restart_syscall;
2104
2105         return (long)futex_wait(uaddr, restart->futex.flags,
2106                                 restart->futex.val, tp, restart->futex.bitset);
2107 }
2108
2109
2110 /*
2111  * Userspace tried a 0 -> TID atomic transition of the futex value
2112  * and failed. The kernel side here does the whole locking operation:
2113  * if there are waiters then it will block, it does PI, etc. (Due to
2114  * races the kernel might see a 0 value of the futex too.)
2115  */
2116 static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, int detect,
2117                          ktime_t *time, int trylock)
2118 {
2119         struct hrtimer_sleeper timeout, *to = NULL;
2120         struct futex_hash_bucket *hb;
2121         struct futex_q q = futex_q_init;
2122         int res, ret;
2123
2124         if (refill_pi_state_cache())
2125                 return -ENOMEM;
2126
2127         if (time) {
2128                 to = &timeout;
2129                 hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME,
2130                                       HRTIMER_MODE_ABS);
2131                 hrtimer_init_sleeper(to, current);
2132                 hrtimer_set_expires(&to->timer, *time);
2133         }
2134
2135 retry:
2136         ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q.key, VERIFY_WRITE);
2137         if (unlikely(ret != 0))
2138                 goto out;
2139
2140 retry_private:
2141         hb = queue_lock(&q);
2142
2143         ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
2144         if (unlikely(ret)) {
2145                 switch (ret) {
2146                 case 1:
2147                         /* We got the lock. */
2148                         ret = 0;
2149                         goto out_unlock_put_key;
2150                 case -EFAULT:
2151                         goto uaddr_faulted;
2152                 case -EAGAIN:
2153                         /*
2154                          * Task is exiting and we just wait for the
2155                          * exit to complete.
2156                          */
2157                         queue_unlock(&q, hb);
2158                         put_futex_key(&q.key);
2159                         cond_resched();
2160                         goto retry;
2161                 default:
2162                         goto out_unlock_put_key;
2163                 }
2164         }
2165
2166         /*
2167          * Only actually queue now that the atomic ops are done:
2168          */
2169         queue_me(&q, hb);
2170
2171         WARN_ON(!q.pi_state);
2172         /*
2173          * Block on the PI mutex:
2174          */
2175         if (!trylock)
2176                 ret = rt_mutex_timed_lock(&q.pi_state->pi_mutex, to, 1);
2177         else {
2178                 ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
2179                 /* Fixup the trylock return value: */
2180                 ret = ret ? 0 : -EWOULDBLOCK;
2181         }
2182
2183         spin_lock(q.lock_ptr);
2184         /*
2185          * Fixup the pi_state owner and possibly acquire the lock if we
2186          * haven't already.
2187          */
2188         res = fixup_owner(uaddr, &q, !ret);
2189         /*
2190          * If fixup_owner() returned an error, proprogate that.  If it acquired
2191          * the lock, clear our -ETIMEDOUT or -EINTR.
2192          */
2193         if (res)
2194                 ret = (res < 0) ? res : 0;
2195
2196         /*
2197          * If fixup_owner() faulted and was unable to handle the fault, unlock
2198          * it and return the fault to userspace.
2199          */
2200         if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
2201                 rt_mutex_unlock(&q.pi_state->pi_mutex);
2202
2203         /* Unqueue and drop the lock */
2204         unqueue_me_pi(&q);
2205
2206         goto out_put_key;
2207
2208 out_unlock_put_key:
2209         queue_unlock(&q, hb);
2210
2211 out_put_key:
2212         put_futex_key(&q.key);
2213 out:
2214         if (to)
2215                 destroy_hrtimer_on_stack(&to->timer);
2216         return ret != -EINTR ? ret : -ERESTARTNOINTR;
2217
2218 uaddr_faulted:
2219         queue_unlock(&q, hb);
2220
2221         ret = fault_in_user_writeable(uaddr);
2222         if (ret)
2223                 goto out_put_key;
2224
2225         if (!(flags & FLAGS_SHARED))
2226                 goto retry_private;
2227
2228         put_futex_key(&q.key);
2229         goto retry;
2230 }
2231
2232 /*
2233  * Userspace attempted a TID -> 0 atomic transition, and failed.
2234  * This is the in-kernel slowpath: we look up the PI state (if any),
2235  * and do the rt-mutex unlock.
2236  */
2237 static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
2238 {
2239         struct futex_hash_bucket *hb;
2240         struct futex_q *this, *next;
2241         struct plist_head *head;
2242         union futex_key key = FUTEX_KEY_INIT;
2243         u32 uval, vpid = task_pid_vnr(current);
2244         int ret;
2245
2246 retry:
2247         if (get_user(uval, uaddr))
2248                 return -EFAULT;
2249         /*
2250          * We release only a lock we actually own:
2251          */
2252         if ((uval & FUTEX_TID_MASK) != vpid)
2253                 return -EPERM;
2254
2255         ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key, VERIFY_WRITE);
2256         if (unlikely(ret != 0))
2257                 goto out;
2258
2259         hb = hash_futex(&key);
2260         spin_lock(&hb->lock);
2261
2262         /*
2263          * To avoid races, try to do the TID -> 0 atomic transition
2264          * again. If it succeeds then we can return without waking
2265          * anyone else up. We only try this if neither the waiters nor
2266          * the owner died bit are set.
2267          */
2268         if (!(uval & ~FUTEX_TID_MASK) &&
2269             cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
2270                 goto pi_faulted;
2271         /*
2272          * Rare case: we managed to release the lock atomically,
2273          * no need to wake anyone else up:
2274          */
2275         if (unlikely(uval == vpid))
2276                 goto out_unlock;
2277
2278         /*
2279          * Ok, other tasks may need to be woken up - check waiters
2280          * and do the wakeup if necessary:
2281          */
2282         head = &hb->chain;
2283
2284         plist_for_each_entry_safe(this, next, head, list) {
2285                 if (!match_futex (&this->key, &key))
2286                         continue;
2287                 ret = wake_futex_pi(uaddr, uval, this);
2288                 /*
2289                  * The atomic access to the futex value
2290                  * generated a pagefault, so retry the
2291                  * user-access and the wakeup:
2292                  */
2293                 if (ret == -EFAULT)
2294                         goto pi_faulted;
2295                 goto out_unlock;
2296         }
2297         /*
2298          * No waiters - kernel unlocks the futex:
2299          */
2300         ret = unlock_futex_pi(uaddr, uval);
2301         if (ret == -EFAULT)
2302                 goto pi_faulted;
2303
2304 out_unlock:
2305         spin_unlock(&hb->lock);
2306         put_futex_key(&key);
2307
2308 out:
2309         return ret;
2310
2311 pi_faulted:
2312         spin_unlock(&hb->lock);
2313         put_futex_key(&key);
2314
2315         ret = fault_in_user_writeable(uaddr);
2316         if (!ret)
2317                 goto retry;
2318
2319         return ret;
2320 }
2321
2322 /**
2323  * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
2324  * @hb:         the hash_bucket futex_q was original enqueued on
2325  * @q:          the futex_q woken while waiting to be requeued
2326  * @key2:       the futex_key of the requeue target futex
2327  * @timeout:    the timeout associated with the wait (NULL if none)
2328  *
2329  * Detect if the task was woken on the initial futex as opposed to the requeue
2330  * target futex.  If so, determine if it was a timeout or a signal that caused
2331  * the wakeup and return the appropriate error code to the caller.  Must be
2332  * called with the hb lock held.
2333  *
2334  * Return:
2335  *  0 = no early wakeup detected;
2336  * <0 = -ETIMEDOUT or -ERESTARTNOINTR
2337  */
2338 static inline
2339 int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
2340                                    struct futex_q *q, union futex_key *key2,
2341                                    struct hrtimer_sleeper *timeout)
2342 {
2343         int ret = 0;
2344
2345         /*
2346          * With the hb lock held, we avoid races while we process the wakeup.
2347          * We only need to hold hb (and not hb2) to ensure atomicity as the
2348          * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
2349          * It can't be requeued from uaddr2 to something else since we don't
2350          * support a PI aware source futex for requeue.
2351          */
2352         if (!match_futex(&q->key, key2)) {
2353                 WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
2354                 /*
2355                  * We were woken prior to requeue by a timeout or a signal.
2356                  * Unqueue the futex_q and determine which it was.
2357                  */
2358                 plist_del(&q->list, &hb->chain);
2359
2360                 /* Handle spurious wakeups gracefully */
2361                 ret = -EWOULDBLOCK;
2362                 if (timeout && !timeout->task)
2363                         ret = -ETIMEDOUT;
2364                 else if (signal_pending(current))
2365                         ret = -ERESTARTNOINTR;
2366         }
2367         return ret;
2368 }
2369
2370 /**
2371  * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
2372  * @uaddr:      the futex we initially wait on (non-pi)
2373  * @flags:      futex flags (FLAGS_SHARED, FLAGS_CLOCKRT, etc.), they must be
2374  *              the same type, no requeueing from private to shared, etc.
2375  * @val:        the expected value of uaddr
2376  * @abs_time:   absolute timeout
2377  * @bitset:     32 bit wakeup bitset set by userspace, defaults to all
2378  * @uaddr2:     the pi futex we will take prior to returning to user-space
2379  *
2380  * The caller will wait on uaddr and will be requeued by futex_requeue() to
2381  * uaddr2 which must be PI aware and unique from uaddr.  Normal wakeup will wake
2382  * on uaddr2 and complete the acquisition of the rt_mutex prior to returning to
2383  * userspace.  This ensures the rt_mutex maintains an owner when it has waiters;
2384  * without one, the pi logic would not know which task to boost/deboost, if
2385  * there was a need to.
2386  *
2387  * We call schedule in futex_wait_queue_me() when we enqueue and return there
2388  * via the following--
2389  * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
2390  * 2) wakeup on uaddr2 after a requeue
2391  * 3) signal
2392  * 4) timeout
2393  *
2394  * If 3, cleanup and return -ERESTARTNOINTR.
2395  *
2396  * If 2, we may then block on trying to take the rt_mutex and return via:
2397  * 5) successful lock
2398  * 6) signal
2399  * 7) timeout
2400  * 8) other lock acquisition failure
2401  *
2402  * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
2403  *
2404  * If 4 or 7, we cleanup and return with -ETIMEDOUT.
2405  *
2406  * Return:
2407  *  0 - On success;
2408  * <0 - On error
2409  */
2410 static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
2411                                  u32 val, ktime_t *abs_time, u32 bitset,
2412                                  u32 __user *uaddr2)
2413 {
2414         struct hrtimer_sleeper timeout, *to = NULL;
2415         struct rt_mutex_waiter rt_waiter;
2416         struct rt_mutex *pi_mutex = NULL;
2417         struct futex_hash_bucket *hb;
2418         union futex_key key2 = FUTEX_KEY_INIT;
2419         struct futex_q q = futex_q_init;
2420         int res, ret;
2421
2422         if (uaddr == uaddr2)
2423                 return -EINVAL;
2424
2425         if (!bitset)
2426                 return -EINVAL;
2427
2428         if (abs_time) {
2429                 to = &timeout;
2430                 hrtimer_init_on_stack(&to->timer, (flags & FLAGS_CLOCKRT) ?
2431                                       CLOCK_REALTIME : CLOCK_MONOTONIC,
2432                                       HRTIMER_MODE_ABS);
2433                 hrtimer_init_sleeper(to, current);
2434                 hrtimer_set_expires_range_ns(&to->timer, *abs_time,
2435                                              current->timer_slack_ns);
2436         }
2437
2438         /*
2439          * The waiter is allocated on our stack, manipulated by the requeue
2440          * code while we sleep on uaddr.
2441          */
2442         debug_rt_mutex_init_waiter(&rt_waiter);
2443         rt_waiter.task = NULL;
2444
2445         ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
2446         if (unlikely(ret != 0))
2447                 goto out;
2448
2449         q.bitset = bitset;
2450         q.rt_waiter = &rt_waiter;
2451         q.requeue_pi_key = &key2;
2452
2453         /*
2454          * Prepare to wait on uaddr. On success, increments q.key (key1) ref
2455          * count.
2456          */
2457         ret = futex_wait_setup(uaddr, val, flags, &q, &hb);
2458         if (ret)
2459                 goto out_key2;
2460
2461         /*
2462          * The check above which compares uaddrs is not sufficient for
2463          * shared futexes. We need to compare the keys:
2464          */
2465         if (match_futex(&q.key, &key2)) {
2466                 ret = -EINVAL;
2467                 goto out_put_keys;
2468         }
2469
2470         /* Queue the futex_q, drop the hb lock, wait for wakeup. */
2471         futex_wait_queue_me(hb, &q, to);
2472
2473         spin_lock(&hb->lock);
2474         ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
2475         spin_unlock(&hb->lock);
2476         if (ret)
2477                 goto out_put_keys;
2478
2479         /*
2480          * In order for us to be here, we know our q.key == key2, and since
2481          * we took the hb->lock above, we also know that futex_requeue() has
2482          * completed and we no longer have to concern ourselves with a wakeup
2483          * race with the atomic proxy lock acquisition by the requeue code. The
2484          * futex_requeue dropped our key1 reference and incremented our key2
2485          * reference count.
2486          */
2487
2488         /* Check if the requeue code acquired the second futex for us. */
2489         if (!q.rt_waiter) {
2490                 /*
2491                  * Got the lock. We might not be the anticipated owner if we
2492                  * did a lock-steal - fix up the PI-state in that case.
2493                  */
2494                 if (q.pi_state && (q.pi_state->owner != current)) {
2495                         spin_lock(q.lock_ptr);
2496                         ret = fixup_pi_state_owner(uaddr2, &q, current);
2497                         spin_unlock(q.lock_ptr);
2498                 }
2499         } else {
2500                 /*
2501                  * We have been woken up by futex_unlock_pi(), a timeout, or a
2502                  * signal.  futex_unlock_pi() will not destroy the lock_ptr nor
2503                  * the pi_state.
2504                  */
2505                 WARN_ON(!q.pi_state);
2506                 pi_mutex = &q.pi_state->pi_mutex;
2507                 ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
2508                 debug_rt_mutex_free_waiter(&rt_waiter);
2509
2510                 spin_lock(q.lock_ptr);
2511                 /*
2512                  * Fixup the pi_state owner and possibly acquire the lock if we
2513                  * haven't already.
2514                  */
2515                 res = fixup_owner(uaddr2, &q, !ret);
2516                 /*
2517                  * If fixup_owner() returned an error, proprogate that.  If it
2518                  * acquired the lock, clear -ETIMEDOUT or -EINTR.
2519                  */
2520                 if (res)
2521                         ret = (res < 0) ? res : 0;
2522
2523                 /* Unqueue and drop the lock. */
2524                 unqueue_me_pi(&q);
2525         }
2526
2527         /*
2528          * If fixup_pi_state_owner() faulted and was unable to handle the
2529          * fault, unlock the rt_mutex and return the fault to userspace.
2530          */
2531         if (ret == -EFAULT) {
2532                 if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
2533                         rt_mutex_unlock(pi_mutex);
2534         } else if (ret == -EINTR) {
2535                 /*
2536                  * We've already been requeued, but cannot restart by calling
2537                  * futex_lock_pi() directly. We could restart this syscall, but
2538                  * it would detect that the user space "val" changed and return
2539                  * -EWOULDBLOCK.  Save the overhead of the restart and return
2540                  * -EWOULDBLOCK directly.
2541                  */
2542                 ret = -EWOULDBLOCK;
2543         }
2544
2545 out_put_keys:
2546         put_futex_key(&q.key);
2547 out_key2:
2548         put_futex_key(&key2);
2549
2550 out:
2551         if (to) {
2552                 hrtimer_cancel(&to->timer);
2553                 destroy_hrtimer_on_stack(&to->timer);
2554         }
2555         return ret;
2556 }
2557
2558 /*
2559  * Support for robust futexes: the kernel cleans up held futexes at
2560  * thread exit time.
2561  *
2562  * Implementation: user-space maintains a per-thread list of locks it
2563  * is holding. Upon do_exit(), the kernel carefully walks this list,
2564  * and marks all locks that are owned by this thread with the
2565  * FUTEX_OWNER_DIED bit, and wakes up a waiter (if any). The list is
2566  * always manipulated with the lock held, so the list is private and
2567  * per-thread. Userspace also maintains a per-thread 'list_op_pending'
2568  * field, to allow the kernel to clean up if the thread dies after
2569  * acquiring the lock, but just before it could have added itself to
2570  * the list. There can only be one such pending lock.
2571  */
2572
2573 /**
2574  * sys_set_robust_list() - Set the robust-futex list head of a task
2575  * @head:       pointer to the list-head
2576  * @len:        length of the list-head, as userspace expects
2577  */
2578 SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
2579                 size_t, len)
2580 {
2581         if (!futex_cmpxchg_enabled)
2582                 return -ENOSYS;
2583         /*
2584          * The kernel knows only one size for now:
2585          */
2586         if (unlikely(len != sizeof(*head)))
2587                 return -EINVAL;
2588
2589         current->robust_list = head;
2590
2591         return 0;
2592 }
2593
2594 /**
2595  * sys_get_robust_list() - Get the robust-futex list head of a task
2596  * @pid:        pid of the process [zero for current task]
2597  * @head_ptr:   pointer to a list-head pointer, the kernel fills it in
2598  * @len_ptr:    pointer to a length field, the kernel fills in the header size
2599  */
2600 SYSCALL_DEFINE3(get_robust_list, int, pid,
2601                 struct robust_list_head __user * __user *, head_ptr,
2602                 size_t __user *, len_ptr)
2603 {
2604         struct robust_list_head __user *head;
2605         unsigned long ret;
2606         struct task_struct *p;
2607
2608         if (!futex_cmpxchg_enabled)
2609                 return -ENOSYS;
2610
2611         rcu_read_lock();
2612
2613         ret = -ESRCH;
2614         if (!pid)
2615                 p = current;
2616         else {
2617                 p = find_task_by_vpid(pid);
2618                 if (!p)
2619                         goto err_unlock;
2620         }
2621
2622         ret = -EPERM;
2623         if (!ptrace_may_access(p, PTRACE_MODE_READ))
2624                 goto err_unlock;
2625
2626         head = p->robust_list;
2627         rcu_read_unlock();
2628
2629         if (put_user(sizeof(*head), len_ptr))
2630                 return -EFAULT;
2631         return put_user(head, head_ptr);
2632
2633 err_unlock:
2634         rcu_read_unlock();
2635
2636         return ret;
2637 }
2638
2639 /*
2640  * Process a futex-list entry, check whether it's owned by the
2641  * dying task, and do notification if so:
2642  */
2643 int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
2644 {
2645         u32 uval, uninitialized_var(nval), mval;
2646
2647 retry:
2648         if (get_user(uval, uaddr))
2649                 return -1;
2650
2651         if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) {
2652                 /*
2653                  * Ok, this dying thread is truly holding a futex
2654                  * of interest. Set the OWNER_DIED bit atomically
2655                  * via cmpxchg, and if the value had FUTEX_WAITERS
2656                  * set, wake up a waiter (if any). (We have to do a
2657                  * futex_wake() even if OWNER_DIED is already set -
2658                  * to handle the rare but possible case of recursive
2659                  * thread-death.) The rest of the cleanup is done in
2660                  * userspace.
2661                  */
2662                 mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
2663                 /*
2664                  * We are not holding a lock here, but we want to have
2665                  * the pagefault_disable/enable() protection because
2666                  * we want to handle the fault gracefully. If the
2667                  * access fails we try to fault in the futex with R/W
2668                  * verification via get_user_pages. get_user() above
2669                  * does not guarantee R/W access. If that fails we
2670                  * give up and leave the futex locked.
2671                  */
2672                 if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
2673                         if (fault_in_user_writeable(uaddr))
2674                                 return -1;
2675                         goto retry;
2676                 }
2677                 if (nval != uval)
2678                         goto retry;
2679
2680                 /*
2681                  * Wake robust non-PI futexes here. The wakeup of
2682                  * PI futexes happens in exit_pi_state():
2683                  */
2684                 if (!pi && (uval & FUTEX_WAITERS))
2685                         futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY);
2686         }
2687         return 0;
2688 }
2689
2690 /*
2691  * Fetch a robust-list pointer. Bit 0 signals PI futexes:
2692  */
2693 static inline int fetch_robust_entry(struct robust_list __user **entry,
2694                                      struct robust_list __user * __user *head,
2695                                      unsigned int *pi)
2696 {
2697         unsigned long uentry;
2698
2699         if (get_user(uentry, (unsigned long __user *)head))
2700                 return -EFAULT;
2701
2702         *entry = (void __user *)(uentry & ~1UL);
2703         *pi = uentry & 1;
2704
2705         return 0;
2706 }
2707
2708 /*
2709  * Walk curr->robust_list (very carefully, it's a userspace list!)
2710  * and mark any locks found there dead, and notify any waiters.
2711  *
2712  * We silently return on any sign of list-walking problem.
2713  */
2714 void exit_robust_list(struct task_struct *curr)
2715 {
2716         struct robust_list_head __user *head = curr->robust_list;
2717         struct robust_list __user *entry, *next_entry, *pending;
2718         unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
2719         unsigned int uninitialized_var(next_pi);
2720         unsigned long futex_offset;
2721         int rc;
2722
2723         if (!futex_cmpxchg_enabled)
2724                 return;
2725
2726         /*
2727          * Fetch the list head (which was registered earlier, via
2728          * sys_set_robust_list()):
2729          */
2730         if (fetch_robust_entry(&entry, &head->list.next, &pi))
2731                 return;
2732         /*
2733          * Fetch the relative futex offset:
2734          */
2735         if (get_user(futex_offset, &head->futex_offset))
2736                 return;
2737         /*
2738          * Fetch any possibly pending lock-add first, and handle it
2739          * if it exists:
2740          */
2741         if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
2742                 return;
2743
2744         next_entry = NULL;      /* avoid warning with gcc */
2745         while (entry != &head->list) {
2746                 /*
2747                  * Fetch the next entry in the list before calling
2748                  * handle_futex_death:
2749                  */
2750                 rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi);
2751                 /*
2752                  * A pending lock might already be on the list, so
2753                  * don't process it twice:
2754                  */
2755                 if (entry != pending)
2756                         if (handle_futex_death((void __user *)entry + futex_offset,
2757                                                 curr, pi))
2758                                 return;
2759                 if (rc)
2760                         return;
2761                 entry = next_entry;
2762                 pi = next_pi;
2763                 /*
2764                  * Avoid excessively long or circular lists:
2765                  */
2766                 if (!--limit)
2767                         break;
2768
2769                 cond_resched();
2770         }
2771
2772         if (pending)
2773                 handle_futex_death((void __user *)pending + futex_offset,
2774                                    curr, pip);
2775 }
2776
2777 long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
2778                 u32 __user *uaddr2, u32 val2, u32 val3)
2779 {
2780         int cmd = op & FUTEX_CMD_MASK;
2781         unsigned int flags = 0;
2782
2783         if (!(op & FUTEX_PRIVATE_FLAG))
2784                 flags |= FLAGS_SHARED;
2785
2786         if (op & FUTEX_CLOCK_REALTIME) {
2787                 flags |= FLAGS_CLOCKRT;
2788                 if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
2789                         return -ENOSYS;
2790         }
2791
2792         switch (cmd) {
2793         case FUTEX_LOCK_PI:
2794         case FUTEX_UNLOCK_PI:
2795         case FUTEX_TRYLOCK_PI:
2796         case FUTEX_WAIT_REQUEUE_PI:
2797         case FUTEX_CMP_REQUEUE_PI:
2798                 if (!futex_cmpxchg_enabled)
2799                         return -ENOSYS;
2800         }
2801
2802         switch (cmd) {
2803         case FUTEX_WAIT:
2804                 val3 = FUTEX_BITSET_MATCH_ANY;
2805         case FUTEX_WAIT_BITSET:
2806                 return futex_wait(uaddr, flags, val, timeout, val3);
2807         case FUTEX_WAKE:
2808                 val3 = FUTEX_BITSET_MATCH_ANY;
2809         case FUTEX_WAKE_BITSET:
2810                 return futex_wake(uaddr, flags, val, val3);
2811         case FUTEX_REQUEUE:
2812                 return futex_requeue(uaddr, flags, uaddr2, val, val2, NULL, 0);
2813         case FUTEX_CMP_REQUEUE:
2814                 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 0);
2815         case FUTEX_WAKE_OP:
2816                 return futex_wake_op(uaddr, flags, uaddr2, val, val2, val3);
2817         case FUTEX_LOCK_PI:
2818                 return futex_lock_pi(uaddr, flags, val, timeout, 0);
2819         case FUTEX_UNLOCK_PI:
2820                 return futex_unlock_pi(uaddr, flags);
2821         case FUTEX_TRYLOCK_PI:
2822                 return futex_lock_pi(uaddr, flags, 0, timeout, 1);
2823         case FUTEX_WAIT_REQUEUE_PI:
2824                 val3 = FUTEX_BITSET_MATCH_ANY;
2825                 return futex_wait_requeue_pi(uaddr, flags, val, timeout, val3,
2826                                              uaddr2);
2827         case FUTEX_CMP_REQUEUE_PI:
2828                 return futex_requeue(uaddr, flags, uaddr2, val, val2, &val3, 1);
2829         }
2830         return -ENOSYS;
2831 }
2832
2833
2834 SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
2835                 struct timespec __user *, utime, u32 __user *, uaddr2,
2836                 u32, val3)
2837 {
2838         struct timespec ts;
2839         ktime_t t, *tp = NULL;
2840         u32 val2 = 0;
2841         int cmd = op & FUTEX_CMD_MASK;
2842
2843         if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
2844                       cmd == FUTEX_WAIT_BITSET ||
2845                       cmd == FUTEX_WAIT_REQUEUE_PI)) {
2846                 if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
2847                         return -EFAULT;
2848                 if (!timespec_valid(&ts))
2849                         return -EINVAL;
2850
2851                 t = timespec_to_ktime(ts);
2852                 if (cmd == FUTEX_WAIT)
2853                         t = ktime_add_safe(ktime_get(), t);
2854                 tp = &t;
2855         }
2856         /*
2857          * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
2858          * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
2859          */
2860         if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
2861             cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
2862                 val2 = (u32) (unsigned long) utime;
2863
2864         return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
2865 }
2866
2867 static void __init futex_detect_cmpxchg(void)
2868 {
2869 #ifndef CONFIG_HAVE_FUTEX_CMPXCHG
2870         u32 curval;
2871
2872         /*
2873          * This will fail and we want it. Some arch implementations do
2874          * runtime detection of the futex_atomic_cmpxchg_inatomic()
2875          * functionality. We want to know that before we call in any
2876          * of the complex code paths. Also we want to prevent
2877          * registration of robust lists in that case. NULL is
2878          * guaranteed to fault and we get -EFAULT on functional
2879          * implementation, the non-functional ones will return
2880          * -ENOSYS.
2881          */
2882         if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
2883                 futex_cmpxchg_enabled = 1;
2884 #endif
2885 }
2886
2887 static int __init futex_init(void)
2888 {
2889         int i;
2890
2891         futex_detect_cmpxchg();
2892
2893         for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
2894                 plist_head_init(&futex_queues[i].chain);
2895                 spin_lock_init(&futex_queues[i].lock);
2896         }
2897
2898         return 0;
2899 }
2900 __initcall(futex_init);