libpthread/nptl/allocatestack.c

   1 /* Copyright (C) 2002-2007, 2009 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, write to the Free
  17    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  18    02111-1307 USA.  */
  19
  20 #include <assert.h>
  21 #include <errno.h>
  22 #include <signal.h>
  23 #include <stdint.h>
  24 #include <string.h>
  25 #include <unistd.h>
  26 #include <sys/mman.h>
  27 #include <sys/param.h>
  28 #include <tls.h>
  29 #include <lowlevellock.h>
  30 #include <link.h>
  31 #include <bits/kernel-features.h>
  32
  33
  34 #ifndef NEED_SEPARATE_REGISTER_STACK
  35
  36 /* Most architectures have exactly one stack pointer.  Some have more.
        The macros in this branch cover the single-stack case, where only
        the stack address has to be carried around.  */
  37 # define STACK_VARIABLES void *stackaddr = NULL
  38
  39 /* How to pass the values to the 'create_thread' function.  */
  40 # define STACK_VARIABLES_ARGS stackaddr
  41
  42 /* How to declare a function which takes these parameters.  */
  43 # define STACK_VARIABLES_PARMS void *stackaddr
  44
  45 /* How to declare allocate_stack: the trailing output parameter(s)
        through which it reports the stack to its caller.  */
  46 # define ALLOCATE_STACK_PARMS void **stack
  47
  48 /* This is how the function is called.  We do it this way to allow
  49    other variants of the function to have more parameters.  It relies
        on a local variable named 'stackaddr', as declared by
        STACK_VARIABLES, being in scope at the call site.  */
  50 # define ALLOCATE_STACK(attr, pd) allocate_stack (attr, pd, &stackaddr)
  51
  52 #else
  53
  54 /* We need two stacks.  The kernel will place them but we have to tell
  55    the kernel about the size of the reserved address space.  Hence this
        variant tracks a size in addition to the address.  */
  56 # define STACK_VARIABLES void *stackaddr = NULL; size_t stacksize = 0
  57
  58 /* How to pass the values to the 'create_thread' function.  */
  59 # define STACK_VARIABLES_ARGS stackaddr, stacksize
  60
  61 /* How to declare a function which takes these parameters.  */
  62 # define STACK_VARIABLES_PARMS void *stackaddr, size_t stacksize
  63
  64 /* How to declare allocate_stack: the trailing output parameters
        through which it reports the stack and its size to its caller.  */
  65 # define ALLOCATE_STACK_PARMS void **stack, size_t *stacksize
  66
  67 /* This is how the function is called.  We do it this way to allow
  68    other variants of the function to have more parameters.  It relies
        on locals named 'stackaddr' and 'stacksize', as declared by
        STACK_VARIABLES, being in scope at the call site.  */
  69 # define ALLOCATE_STACK(attr, pd) \
  70   allocate_stack (attr, pd, &stackaddr, &stacksize)
  71
  72 #endif
  73
  74
  75 /* Default alignment of stack.  An architecture may pre-define
        STACK_ALIGN; otherwise the alignment of 'long double' is used.  */
  76 #ifndef STACK_ALIGN
  77 # define STACK_ALIGN __alignof__ (long double)
  78 #endif
  79
  80 /* Default value for minimal stack size after allocating thread
  81    descriptor and guard.  The value is in bytes; allocate_stack adds
        it to the static TLS size (and the guard size) when it checks
        whether a requested stack is large enough.  */
  82 #ifndef MINIMAL_REST_STACK
  83 # define MINIMAL_REST_STACK     4096
  84 #endif
  85
  86
  87 /* Newer kernels have the MAP_STACK flag to indicate a mapping is used for
  88    a stack.  Use it when possible.  When the headers do not provide it
        we define it as 0 so that OR-ing it into the mmap flags has no
        effect.  */
  89 #ifndef MAP_STACK
  90 # define MAP_STACK 0
  91 #endif
  92
  93 /* This yields the pointer that TLS support code calls the thread pointer.
        With TLS_TCB_AT_TP it is the descriptor address itself; with
        TLS_DTV_AT_TP it lies TLS_PRE_TCB_SIZE bytes past the descriptor.
        If neither macro is defined TLS_TPADJ is left undefined.  */
  94 #if defined(TLS_TCB_AT_TP)
  95 # define TLS_TPADJ(pd) (pd)
  96 #elif defined(TLS_DTV_AT_TP)
  97 # define TLS_TPADJ(pd) ((struct pthread *)((char *) (pd) + TLS_PRE_TCB_SIZE))
  98 #endif
  99
 100 /* Cache handling for not-yet free stacks.  */
 101
 102 /* Maximum size of the cache and its current size, both in bytes.
        stack_cache_actsize is maintained by adding and subtracting the
        stackblock_size of descriptors as they enter and leave the cache.  */
 103 static size_t stack_cache_maxsize = 40 * 1024 * 1024; /* 40 MiB by default.  */
 104 static size_t stack_cache_actsize;
 105
 106 /* Lock taken in this file around updates of the stack lists below and
        of stack_cache_actsize.  */
 107 static int stack_cache_lock = LLL_LOCK_INITIALIZER;
 108
 109 /* List of queued stack frames.  */
 110 static LIST_HEAD (stack_cache);
 111
 112 /* List of the stacks in use.  */
 113 static LIST_HEAD (stack_used);
 114
 115 /* We need to record what list operations we are going to do so that,
 116    in case of an asynchronous interruption due to a fork() call, we
 117    can correct for the work.
        Encoding: 0 means no operation is in progress.  Otherwise the value
        is the address of the list element being operated on, with the
        lowest bit set for an add (stack_list_add) and clear for a delete
        (stack_list_del).  __reclaim_stacks decodes it.  */
 118 static uintptr_t in_flight_stack;
 119
 120 /* List of the threads with user provided stacks in use.  No need to
 121    initialize this, since it's done in __pthread_initialize_minimal.  */
 122 list_t __stack_user __attribute__ ((nocommon));
 123 hidden_data_def (__stack_user)
 124
 125 #if defined COLORING_INCREMENT && COLORING_INCREMENT != 0
 126 /* Counter used to derive the stack coloring offset.  allocate_stack
        increments it each time it maps a fresh stack; stacks taken from
        the cache do not touch it.  */
 127 static unsigned int nptl_ncreated;
 128 #endif
 129
 130
 131 /* Check whether the stack is still used or not.  A descriptor whose
        'tid' field is zero or negative is treated as free.  */
 132 #define FREE_P(descr) ((descr)->tid <= 0)
 133
 134
 135 static void
 136 stack_list_del (list_t *elem)
 137 {
       /* Unlink ELEM from the list it is on while advertising the
          operation in in_flight_stack, so that __reclaim_stacks can
          repair the list if a fork() catches us in the middle.  The low
          bit is left clear to mark this as a delete.  The statement order
          matters: the record must be published before the list is touched
          and cleared only after the list is consistent again.  */
 138   in_flight_stack = (uintptr_t) elem;
 139
       /* Make the record visible before the list is modified.  */
 140   atomic_write_barrier ();
 141
 142   list_del (elem);
 143
       /* Make the list update visible before the record is cleared.  */
 144   atomic_write_barrier ();
 145
 146   in_flight_stack = 0;
 147 }
 148
 149
 150 static void
 151 stack_list_add (list_t *elem, list_t *list)
 152 {
       /* Insert ELEM into LIST while advertising the operation in
          in_flight_stack, so that __reclaim_stacks can complete the
          insertion if a fork() catches us in the middle.  The low bit is
          set to mark this as an add.  The statement order matters: the
          record must be published before the list is touched and cleared
          only after the list is consistent again.  */
 153   in_flight_stack = (uintptr_t) elem | 1;
 154
       /* Make the record visible before the list is modified.  */
 155   atomic_write_barrier ();
 156
 157   list_add (elem, list);
 158
       /* Make the list update visible before the record is cleared.  */
 159   atomic_write_barrier ();
 160
 161   in_flight_stack = 0;
 162 }
 163
 164
 165 /* We create a double linked list of all cache entries.  Double linked
 166    because this allows removing entries from the end.  */
 167
 168
 169 /* Get a stack frame from the cache.  We have to match by size since
 170    some blocks might be too small or far too large.

        On entry *SIZEP is the requested block size.  On success the
        function returns the descriptor of the chosen stack, which has
        been moved from stack_cache to stack_used, and stores the actual
        block size in *SIZEP and the block address in *MEMP.  If no free
        block is large enough, or the best candidate is more than four
        times the requested size, it returns NULL and leaves *SIZEP and
        *MEMP untouched.  The function takes and releases
        stack_cache_lock itself.  */
 171 static struct pthread *
 172 get_cached_stack (size_t *sizep, void **memp)
 173 {
 174   size_t size = *sizep;
 175   struct pthread *result = NULL;
 176   list_t *entry;
 177
 178   lll_lock (stack_cache_lock, LLL_PRIVATE);
 179
 180   /* Search the cache for a matching entry.  We search for the
 181      smallest stack which has at least the required size.  Note that
 182      in normal situations the size of all allocated stacks is the
 183      same.  At the very least there are only a few different sizes.
 184      Therefore this loop will exit early most of the time with an
 185      exact match.  */
 186   list_for_each (entry, &stack_cache)
 187     {
 188       struct pthread *curr;
 189
 190       curr = list_entry (entry, struct pthread, list);
           /* Entries whose 'tid' is still positive are skipped: they are
              queued but not yet marked as free.  */
 191       if (FREE_P (curr) && curr->stackblock_size >= size)
 192         {
 193           if (curr->stackblock_size == size)
 194             {
 195               result = curr;
 196               break;
 197             }
 198
               /* Not an exact match: remember the smallest fitting block
                  seen so far.  */
 199           if (result == NULL
 200               || result->stackblock_size > curr->stackblock_size)
 201             result = curr;
 202         }
 203     }
 204
 205   if (__builtin_expect (result == NULL, 0)
 206       /* Make sure the size difference is not too excessive.  In that
 207          case we do not use the block.  */
 208       || __builtin_expect (result->stackblock_size > 4 * size, 0))
 209     {
 210       /* Release the lock.  */
 211       lll_unlock (stack_cache_lock, LLL_PRIVATE);
 212
 213       return NULL;
 214     }
 215
 216   /* Dequeue the entry.  */
 217   stack_list_del (&result->list);
 218
 219   /* And add to the list of stacks in use.  */
 220   stack_list_add (&result->list, &stack_used);
 221
 222   /* And decrease the cache size.  */
 223   stack_cache_actsize -= result->stackblock_size;
 224
 225   /* Release the lock early.  The remaining work only touches the
          descriptor we just took off the cache list.  */
 226   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 227
 228   /* Report size and location of the stack to the caller.  */
 229   *sizep = result->stackblock_size;
 230   *memp = result->stackblock;
 231
 232   /* Cancellation handling is back to the default.  */
 233   result->cancelhandling = 0;
 234   result->cleanup = NULL;
 235
 236   /* No pending event.  */
 237   result->nextevent = NULL;
 238
 239   /* Clear the DTV.  The element just before the DTV (dtv[-1]) holds a
          counter; counter + 1 entries are zeroed starting at dtv[0].  */
 240   dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
 241   memset (dtv, '\0', (dtv[-1].counter + 1) * sizeof (dtv_t));
 242
 243   /* Re-initialize the TLS.  */
 244   _dl_allocate_tls_init (TLS_TPADJ (result));
 245
 246   return result;
 247 }
 248
 249
 250 /* Free stacks until the cache size is no larger than LIMIT (bytes).
        Only entries that are marked free are released; if there are not
        enough of them the cache may stay above LIMIT.  The function
        takes no lock itself; queue_stack in this file calls it with
        stack_cache_lock held.  It aborts the process if unmapping a
        stack fails.  */
 251 void
 252 __free_stacks (size_t limit)
 253 {
 254   /* We reduce the size of the cache.  Remove the last entries until
 255      the size is below the limit.  */
 256   list_t *entry;
 257   list_t *prev;
 258
 259   /* Search from the end of the list.  The "safe" iterator is needed
          because the current entry is unlinked inside the loop.  */
 260   list_for_each_prev_safe (entry, prev, &stack_cache)
 261     {
 262       struct pthread *curr;
 263
 264       curr = list_entry (entry, struct pthread, list);
 265       if (FREE_P (curr))
 266         {
 267           /* Unlink the block.  */
 268           stack_list_del (entry);
 269
 270           /* Account for the freed memory.  */
 271           stack_cache_actsize -= curr->stackblock_size;
 272
 273           /* Free the memory associated with the ELF TLS.  */
 274           _dl_deallocate_tls (TLS_TPADJ (curr), false);
 275
 276           /* Remove this block.  This should never fail.  If it does
 277              something is really wrong.  */
 278           if (munmap (curr->stackblock, curr->stackblock_size) != 0)
 279             abort ();
 280
 281           /* Maybe we have freed enough.  */
 282           if (stack_cache_actsize <= limit)
 283             break;
 284         }
 285     }
 286 }
 287
 288
 289 /* Add a stack frame which is not used anymore to the cache.  Must be
 290    called with the cache lock held.  If the addition pushes the cache
        above stack_cache_maxsize, free cached stacks are released via
        __free_stacks.  */
 291 static inline void
 292 __attribute ((always_inline))
 293 queue_stack (struct pthread *stack)
 294 {
 295   /* We unconditionally add the stack to the list.  The memory may
 296      still be in use but it will not be reused until the kernel marks
 297      the stack as not used anymore.  */
 298   stack_list_add (&stack->list, &stack_cache);
 299
       /* Account for the block, then trim the cache if it grew too big.  */
 300   stack_cache_actsize += stack->stackblock_size;
 301   if (__builtin_expect (stack_cache_actsize > stack_cache_maxsize, 0))
 302     __free_stacks (stack_cache_maxsize);
 303 }
 304
 305
 306 static int
 307 internal_function
 308 change_stack_perm (struct pthread *pd
 309 #ifdef NEED_SEPARATE_REGISTER_STACK
 310                    , size_t pagemask
 311 #endif
 312                    )
 313 {
       /* Give part of PD's stack block read, write and execute
          permission with a single mprotect call.  Returns 0 on success
          or the errno value left by mprotect on failure.  The region is
          computed per stack layout:
            - separate register stack: from a page-masked point derived
              from the middle of the block plus the guard size up to the
              end of the block;
            - stack grows down: everything above the guard, which sits at
              the start of the block;
            - stack grows up: from the start of the block up to
              guardsize bytes below the descriptor.  */
 314 #ifdef NEED_SEPARATE_REGISTER_STACK
 315   void *stack = (pd->stackblock
 316                  + (((((pd->stackblock_size - pd->guardsize) / 2)
 317                       & pagemask) + pd->guardsize) & pagemask));
 318   size_t len = pd->stackblock + pd->stackblock_size - stack;
 319 #elif defined _STACK_GROWS_DOWN
 320   void *stack = pd->stackblock + pd->guardsize;
 321   size_t len = pd->stackblock_size - pd->guardsize;
 322 #elif defined _STACK_GROWS_UP
 323   void *stack = pd->stackblock;
 324   size_t len = (uintptr_t) pd - pd->guardsize - (uintptr_t) pd->stackblock;
 325 #else
 326 # error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
 327 #endif
 328   if (mprotect (stack, len, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
 329     return errno;
 330
 331   return 0;
 332 }
 333
 334
 335 static int
 336 allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 337                 ALLOCATE_STACK_PARMS)
 338 {
       /* Set up the stack and the thread descriptor for a new thread.

          ATTR supplies the stack size, the guard size and, when
          ATTR_FLAG_STACKADDR is set, a user-provided stack.  Otherwise a
          stack is taken from the cache or freshly mapped.

          On success the descriptor is stored in *PDP, the stack address
          is stored through STACK (plus the size through STACKSIZE when
          NEED_SEPARATE_REGISTER_STACK is defined), and 0 is returned.

          Error returns visible in this function:
            EINVAL  the requested stack is too small for the static TLS,
                    the guard and MINIMAL_REST_STACK;
            EAGAIN  the TLS allocation failed, or mmap failed with ENOMEM;
            other   the errno value of a failed mmap or mprotect.  */
 339   struct pthread *pd;
 340   size_t size;
 341   size_t pagesize_m1 = __getpagesize () - 1;
 342   void *stacktop;
 343
 344   assert (attr != NULL);
 345   assert (powerof2 (pagesize_m1 + 1));
 346   assert (TCB_ALIGNMENT >= STACK_ALIGN);
 347
 348   /* Get the stack size from the attribute if it is set.  Otherwise we
 349      use the default we determined at start time.  (GNU '?:' yields
          the left operand unless it is zero.)  */
 350   size = attr->stacksize ?: __default_stacksize;
 351
 352   /* Get memory for the stack.  */
 353   if (__builtin_expect (attr->flags & ATTR_FLAG_STACKADDR, 0))
 354     {
 355       uintptr_t adj;
 356
 357       /* If the user also specified the size of the stack make sure it
 358          is large enough.  */
 359       if (attr->stacksize != 0
 360           && attr->stacksize < (__static_tls_size + MINIMAL_REST_STACK))
 361         return EINVAL;
 362
 363       /* Adjust stack size for alignment of the TLS block.  ADJ is the
              number of bytes the descriptor has to move down so that it
              satisfies the static TLS alignment.  */
 364 #if defined(TLS_TCB_AT_TP)
 365       adj = ((uintptr_t) attr->stackaddr - TLS_TCB_SIZE)
 366             & __static_tls_align_m1;
 367       assert (size > adj + TLS_TCB_SIZE);
 368 #elif defined(TLS_DTV_AT_TP)
 369       adj = ((uintptr_t) attr->stackaddr - __static_tls_size)
 370             & __static_tls_align_m1;
 371       assert (size > adj);
 372 #endif
 373
 374       /* The user provided some memory.  Let's hope it matches the
 375          size...  We do not allocate guard pages if the user provided
 376          the stack.  It is the user's responsibility to do this if it
 377          is wanted.  */
 378 #if defined(TLS_TCB_AT_TP)
 379       pd = (struct pthread *) ((uintptr_t) attr->stackaddr
 380                                - TLS_TCB_SIZE - adj);
 381 #elif defined(TLS_DTV_AT_TP)
 382       pd = (struct pthread *) (((uintptr_t) attr->stackaddr
 383                                 - __static_tls_size - adj)
 384                                - TLS_PRE_TCB_SIZE);
 385 #endif
 386
 387       /* The user provided stack memory needs to be cleared.  Only the
              descriptor itself is zeroed here.  */
 388       memset (pd, '\0', sizeof (struct pthread));
 389
 390       /* The first TSD block is included in the TCB.  */
 391       pd->specific[0] = pd->specific_1stblock;
 392
 393       /* Remember the stack-related values.  attr->stackaddr is treated
              as the high end of the user's memory, so the block starts
              SIZE bytes below it.  */
 394       pd->stackblock = (char *) attr->stackaddr - size;
 395       pd->stackblock_size = size;
 396
 397       /* This is a user-provided stack.  It will not be queued in the
 398          stack cache nor will the memory (except the TLS memory) be freed.  */
 399       pd->user_stack = true;
 400
 401       /* This is at least the second thread.  */
 402       pd->header.multiple_threads = 1;
 403 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
 404       __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
 405 #endif
 406
 407 #ifndef __ASSUME_PRIVATE_FUTEX
 408       /* The thread must know when private futexes are supported.  */
 409       pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
 410                                                 header.private_futex);
 411 #endif
 412
 413 #ifdef NEED_DL_SYSINFO
 414       /* Copy the sysinfo value from the parent.  */
 415       THREAD_SYSINFO(pd) = THREAD_SELF_SYSINFO;
 416 #endif
 417
 418       /* The process ID is also the same as that of the caller.  */
 419       pd->pid = THREAD_GETMEM (THREAD_SELF, pid);
 420
 421       /* Allocate the DTV for this thread.  */
 422       if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
 423         {
 424           /* Something went wrong.  */
 425           assert (errno == ENOMEM);
 426           return EAGAIN;
 427         }
 428
 429
 430       /* Prepare to modify global data.  */
 431       lll_lock (stack_cache_lock, LLL_PRIVATE);
 432
 433       /* And add to the list of stacks in use.  Note that this uses
              plain list_add rather than stack_list_add: the in-flight
              recovery in __reclaim_stacks only inspects stack_used and
              stack_cache, not __stack_user.  */
 434       list_add (&pd->list, &__stack_user);
 435
 436       lll_unlock (stack_cache_lock, LLL_PRIVATE);
 437     }
 438   else
 439     {
 440       /* Allocate some anonymous memory.  If possible use the cache.  */
 441       size_t guardsize;
 442       size_t reqsize;
 443       void *mem = 0;
 444       const int prot = (PROT_READ | PROT_WRITE);
 445
 446 #if defined COLORING_INCREMENT && COLORING_INCREMENT != 0
 447       /* Add one more page for stack coloring.  Don't do it for stacks
 448          with 16 times pagesize or larger.  This might just cause
 449          unnecessary misalignment.  */
 450       if (size <= 16 * pagesize_m1)
 451         size += pagesize_m1 + 1;
 452 #endif
 453
 454       /* Adjust the stack size for alignment.  */
 455       size &= ~__static_tls_align_m1;
 456       assert (size != 0);
 457
 458       /* Make sure the size of the stack is enough for the guard and
 459          eventually the thread descriptor.  The requested guard size is
              first rounded up to a whole number of pages.  */
 460       guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1;
 461       if (__builtin_expect (size < ((guardsize + __static_tls_size
 462                                      + MINIMAL_REST_STACK + pagesize_m1)
 463                                     & ~pagesize_m1),
 464                             0))
 465         /* The stack is too small (or the guard too large).  */
 466         return EINVAL;
 467
 468       /* Try to get a stack from the cache.  On a hit SIZE is replaced
              by the actual size of the cached block, which may be larger
              than requested; REQSIZE keeps the requested size.  */
 469       reqsize = size;
 470       pd = get_cached_stack (&size, &mem);
 471       if (pd == NULL)
 472         {
 473           /* To avoid aliasing effects on a larger scale than pages we
 474              adjust the allocated stack size if necessary.  This way
 475              allocations directly following each other will not have
 476              aliasing problems.  */
 477 #if defined MULTI_PAGE_ALIASING && MULTI_PAGE_ALIASING != 0
 478           if ((size % MULTI_PAGE_ALIASING) == 0)
 479             size += pagesize_m1 + 1;
 480 #endif
 481
 482           mem = mmap (NULL, size, prot,
 483                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
 484
 485           if (__builtin_expect (mem == MAP_FAILED, 0))
 486             {
                   /* Report running out of memory as EAGAIN; any other
                      errno value is passed through unchanged.  */
 487               if (errno == ENOMEM)
 488                 __set_errno (EAGAIN);
 489
 490                return errno;
 491             }
 492
 493           /* SIZE is guaranteed to be greater than zero.
 494              So we can never get a null pointer back from mmap.  */
 495           assert (mem != NULL);
 496
 497 #if defined COLORING_INCREMENT && COLORING_INCREMENT != 0
 498           /* Atomically increment NCREATED.  */
 499           unsigned int ncreated = atomic_increment_val (&nptl_ncreated);
 500
 501           /* We chose the offset for coloring by incrementing it for
 502              every new thread by a fixed amount.  The offset is used
 503              modulo the page size.  Even if coloring would be better
 504              relative to higher alignment values it makes no sense to
 505              do it since the mmap() interface does not allow us to
 506              specify any alignment for the returned memory block.  */
 507           size_t coloring = (ncreated * COLORING_INCREMENT) & pagesize_m1;
 508
 509           /* Make sure the coloring offset does not disturb the alignment
 510              of the TCB and static TLS block.  */
 511           if (__builtin_expect ((coloring & __static_tls_align_m1) != 0, 0))
 512             coloring = (((coloring + __static_tls_align_m1)
 513                          & ~(__static_tls_align_m1))
 514                         & ~pagesize_m1);
 515 #else
 516           /* Unless specified we do not make any adjustments.  */
 517 # define coloring 0
 518 #endif
 519
 520           /* Place the thread descriptor at the end of the stack.  */
 521 #if defined(TLS_TCB_AT_TP)
 522           pd = (struct pthread *) ((char *) mem + size - coloring) - 1;
 523 #elif defined(TLS_DTV_AT_TP)
 524           pd = (struct pthread *) ((((uintptr_t) mem + size - coloring
 525                                     - __static_tls_size)
 526                                     & ~__static_tls_align_m1)
 527                                    - TLS_PRE_TCB_SIZE);
 528 #endif
 529
 530           /* Remember the stack-related values.  */
 531           pd->stackblock = mem;
 532           pd->stackblock_size = size;
 533
 534           /* We allocated the first block thread-specific data array.
 535              This address will not change for the lifetime of this
 536              descriptor.  */
 537           pd->specific[0] = pd->specific_1stblock;
 538
 539           /* This is at least the second thread.  */
 540           pd->header.multiple_threads = 1;
 541 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
 542           __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
 543 #endif
 544
 545 #ifndef __ASSUME_PRIVATE_FUTEX
 546           /* The thread must know when private futexes are supported.  */
 547           pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
 548                                                     header.private_futex);
 549 #endif
 550
 551 #ifdef NEED_DL_SYSINFO
 552           /* Copy the sysinfo value from the parent.  */
 553           THREAD_SYSINFO(pd) = THREAD_SELF_SYSINFO;
 554 #endif
 555
 556           /* The process ID is also the same as that of the caller.  */
 557           pd->pid = THREAD_GETMEM (THREAD_SELF, pid);
 558
 559           /* Allocate the DTV for this thread.  */
 560           if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
 561             {
 562               /* Something went wrong.  */
 563               assert (errno == ENOMEM);
 564
 565               /* Free the stack memory we just allocated.  */
 566               (void) munmap (mem, size);
 567
 568               return EAGAIN;
 569             }
 570
 571
 572           /* Prepare to modify global data.  */
 573           lll_lock (stack_cache_lock, LLL_PRIVATE);
 574
 575           /* And add to the list of stacks in use.  */
 576           stack_list_add (&pd->list, &stack_used);
 577
 578           lll_unlock (stack_cache_lock, LLL_PRIVATE);
 579
 580
 581           /* Note that all of the stack and the thread descriptor is
 582              zeroed.  This means we do not have to initialize fields
 583              with initial value zero.  This is specifically true for
 584              the 'tid' field which is always set back to zero once the
 585              stack is not used anymore and for the 'guardsize' field
 586              which will be read next.  */
 587         }
 588
 589       /* Create or resize the guard area if necessary.  For a freshly
              mapped stack pd->guardsize is zero; for a cached stack it is
              the guard left over from the previous use.  */
 590       if (__builtin_expect (guardsize > pd->guardsize, 0))
 591         {
 592 #ifdef NEED_SEPARATE_REGISTER_STACK
 593           char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
 594 #elif defined _STACK_GROWS_DOWN
 595           char *guard = mem;
 596 #elif defined _STACK_GROWS_UP
 597           char *guard = (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
 598 #endif
 599           if (mprotect (guard, guardsize, PROT_NONE) != 0)
 600             {
 601               int err;
                 /* Shared failure path: the guard-shrinking branch below
                    jumps here as well when one of its mprotect calls
                    fails.  */
 602             mprot_error:
 603               err = errno;
 604
 605               lll_lock (stack_cache_lock, LLL_PRIVATE);
 606
 607               /* Remove the thread from the list.  */
 608               stack_list_del (&pd->list);
 609
 610               lll_unlock (stack_cache_lock, LLL_PRIVATE);
 611
 612               /* Get rid of the TLS block we allocated.  */
 613               _dl_deallocate_tls (TLS_TPADJ (pd), false);
 614
 615               /* Free the stack memory regardless of whether the size
 616                  of the cache is over the limit or not.  If this piece
 617                  of memory caused problems we better do not use it
 618                  anymore.  Uh, and we ignore possible errors.  There
 619                  is nothing we could do.  */
 620               (void) munmap (mem, size);
 621
 622               return err;
 623             }
 624
 625           pd->guardsize = guardsize;
 626         }
 627       else if (__builtin_expect (pd->guardsize - guardsize > size - reqsize,
 628                                  0))
 629         {
 630           /* The old guard area is too large: the excess guard is bigger
                  than the excess of the cached block over the requested
                  size, so part of the old guard is made accessible
                  again.  */
 631
 632 #ifdef NEED_SEPARATE_REGISTER_STACK
 633           char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
 634           char *oldguard = mem + (((size - pd->guardsize) / 2) & ~pagesize_m1);
 635
 636           if (oldguard < guard
 637               && mprotect (oldguard, guard - oldguard, prot) != 0)
 638             goto mprot_error;
 639
 640           if (mprotect (guard + guardsize,
 641                         oldguard + pd->guardsize - guard - guardsize,
 642                         prot) != 0)
 643             goto mprot_error;
 644 #elif defined _STACK_GROWS_DOWN
 645           if (mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
 646                         prot) != 0)
 647             goto mprot_error;
 648 #elif defined _STACK_GROWS_UP
 649           if (mprotect ((char *) pd - pd->guardsize,
 650                         pd->guardsize - guardsize, prot) != 0)
 651             goto mprot_error;
 652 #endif
 653
 654           pd->guardsize = guardsize;
 655         }
 656       /* The pthread_getattr_np() calls need to get passed the size
 657          requested in the attribute, regardless of how large the
 658          actually used guardsize is.  */
 659       pd->reported_guardsize = guardsize;
 660     }
 661
 662   /* Initialize the lock.  We have to do this unconditionally since the
 663      stillborn thread could be canceled while the lock is taken.  */
 664   pd->lock = LLL_LOCK_INITIALIZER;
 665
 666   /* The robust mutex lists also need to be initialized
 667      unconditionally because the cleanup for the previous stack owner
 668      might have happened in the kernel.  */
 669   pd->robust_head.futex_offset = (offsetof (pthread_mutex_t, __data.__lock)
 670                                   - offsetof (pthread_mutex_t,
 671                                               __data.__list.__next));
 672   pd->robust_head.list_op_pending = NULL;
 673 #ifdef __PTHREAD_MUTEX_HAVE_PREV
 674   pd->robust_prev = &pd->robust_head;
 675 #endif
       /* An empty robust list points back at its own head.  */
 676   pd->robust_head.list = &pd->robust_head;
 677
 678   /* We place the thread descriptor at the end of the stack.  */
 679   *pdp = pd;
 680
 681 #if defined(TLS_TCB_AT_TP)
 682   /* The stack begins before the TCB and the static TLS block.  */
 683   stacktop = ((char *) (pd + 1) - __static_tls_size);
 684 #elif defined(TLS_DTV_AT_TP)
 685   stacktop = (char *) (pd - 1);
 686 #endif
 687
       /* Report the stack to the caller in the form matching the stack
          layout of this configuration.  */
 688 #ifdef NEED_SEPARATE_REGISTER_STACK
 689   *stack = pd->stackblock;
 690   *stacksize = stacktop - *stack;
 691 #elif defined _STACK_GROWS_DOWN
 692   *stack = stacktop;
 693 #elif defined _STACK_GROWS_UP
 694   *stack = pd->stackblock;
 695   assert (*stack > 0);
 696 #endif
 697
 698   return 0;
 699 }
 700
 701
 702 void
 703 internal_function
 704 __deallocate_stack (struct pthread *pd)
 705 {
       /* Release the stack of the thread described by PD.  The
          descriptor is unlinked from its current list.  A stack that
          this file allocated is queued in the cache for reuse; for a
          user-provided stack only the TLS memory is freed.  All of this
          happens with stack_cache_lock held.  */
 706   lll_lock (stack_cache_lock, LLL_PRIVATE);
 707
 708   /* Remove the thread from the list it is currently on.  allocate_stack
 709      puts descriptors on stack_used or, for user-provided stacks, on
          __stack_user.  */
 710   stack_list_del (&pd->list);
 711
 712   /* Not much to do.  Just free the mmap()ed memory.  Note that we do
 713      not reset the 'used' flag in the 'tid' field.  This is done by
 714      the kernel.  If no thread has been created yet this field is
 715      still zero.  */
 716   if (__builtin_expect (! pd->user_stack, 1))
 717     (void) queue_stack (pd);
 718   else
 719     /* Free the memory associated with the ELF TLS.  */
 720     _dl_deallocate_tls (TLS_TPADJ (pd), false);
 721
 722   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 723 }
 724
 725
 726 int
 727 internal_function
 728 __make_stacks_executable (void **stack_endp)
 729 {
       /* Intended to make the stacks of all threads executable and to
          return 0 on success or an error number on failure.

          NOTE(review): as written ERR is initialized to the constant
          EPERM, so the check right below always succeeds and the
          function always returns EPERM.  STACK_ENDP is never used and
          everything after the early return is unreachable.  Presumably
          the step that would handle the main thread's stack is not
          available in this build -- confirm before relying on or
          changing this.  */
 730   /* First the main thread's stack.  */
 731   int err = EPERM;
 732   if (err != 0)
 733     return err;
 734
 735 #ifdef NEED_SEPARATE_REGISTER_STACK
 736   const size_t pagemask = ~(__getpagesize () - 1);
 737 #endif
 738
 739   lll_lock (stack_cache_lock, LLL_PRIVATE);
 740
       /* Change the permission of every stack in use, stopping at the
          first failure.  */
 741   list_t *runp;
 742   list_for_each (runp, &stack_used)
 743     {
 744       err = change_stack_perm (list_entry (runp, struct pthread, list)
 745 #ifdef NEED_SEPARATE_REGISTER_STACK
 746                                , pagemask
 747 #endif
 748                                );
 749       if (err != 0)
 750         break;
 751     }
 752
 753   /* Also change the permission for the currently unused stacks.  This
 754      might be wasted time but better spend it here than adding a check
 755      in the fast path.  */
 756   if (err == 0)
 757     list_for_each (runp, &stack_cache)
 758       {
 759         err = change_stack_perm (list_entry (runp, struct pthread, list)
 760 #ifdef NEED_SEPARATE_REGISTER_STACK
 761                                  , pagemask
 762 #endif
 763                                  );
 764         if (err != 0)
 765           break;
 766       }
 767
 768   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 769
 770   return err;
 771 }
 772
 773
 774 /* In case of a fork() call the memory allocation in the child will be
 775    the same but only one thread is running.  All stacks except that of
 776    the one running thread are not used anymore.  We have to recycle
 777    them.  */
 778 void
 779 __reclaim_stacks (void)
 780 {
 781   struct pthread *self = (struct pthread *) THREAD_SELF;
 782
 783   /* No locking necessary.  The caller is the only stack in use.  But
 784      we have to be aware that we might have interrupted a list
 785      operation.  */
 786
 787   if (in_flight_stack != 0)
 788     {
 789       bool add_p = in_flight_stack & 1;
 790       list_t *elem = (list_t *) (in_flight_stack & ~UINTMAX_C (1));
 791
 792       if (add_p)
 793         {
 794           /* We always add at the beginning of the list.  So in this
 795              case we only need to check the beginning of these lists.  */
 796           int check_list (list_t *l)
 797           {
 798             if (l->next->prev != l)
 799               {
 800                 assert (l->next->prev == elem);
 801
 802                 elem->next = l->next;
 803                 elem->prev = l;
 804                 l->next = elem;
 805
 806                 return 1;
 807               }
 808
 809             return 0;
 810           }
 811
 812           if (check_list (&stack_used) == 0)
 813             (void) check_list (&stack_cache);
 814         }
 815       else
 816         {
 817           /* We can simply always replay the delete operation.  */
 818           elem->next->prev = elem->prev;
 819           elem->prev->next = elem->next;
 820         }
 821     }
 822
 823   /* Mark all stacks except the still running one as free.  */
 824   list_t *runp;
 825   list_for_each (runp, &stack_used)
 826     {
 827       struct pthread *curp = list_entry (runp, struct pthread, list);
 828       if (curp != self)
 829         {
 830           /* This marks the stack as free.  */
 831           curp->tid = 0;
 832
 833           /* The PID field must be initialized for the new process.  */
 834           curp->pid = self->pid;
 835
 836           /* Account for the size of the stack.  */
 837           stack_cache_actsize += curp->stackblock_size;
 838
 839           if (curp->specific_used)
 840             {
 841               /* Clear the thread-specific data.  */
 842               memset (curp->specific_1stblock, '\0',
 843                       sizeof (curp->specific_1stblock));
 844
 845               curp->specific_used = false;
 846
 847               size_t cnt;
 848               for (cnt = 1; cnt < PTHREAD_KEY_1STLEVEL_SIZE; ++cnt)
 849                 if (curp->specific[cnt] != NULL)
 850                   {
 851                     memset (curp->specific[cnt], '\0',
 852                             sizeof (curp->specific_1stblock));
 853
 854                     /* We have allocated the block which we do not
 855                        free here so re-set the bit.  */
 856                     curp->specific_used = true;
 857                   }
 858             }
 859         }
 860     }
 861
 862   /* Reset the PIDs in any cached stacks.  */
 863   list_for_each (runp, &stack_cache)
 864     {
 865       struct pthread *curp = list_entry (runp, struct pthread, list);
 866       curp->pid = self->pid;
 867     }
 868
 869   /* Add the stack of all running threads to the cache.  */
 870   list_splice (&stack_used, &stack_cache);
 871
  /* Remove the entry for the current thread from the cache list
     and add it to the list of running threads.  Which of the two
     lists is decided by the user_stack flag.  */
 875   stack_list_del (&self->list);
 876
 877   /* Re-initialize the lists for all the threads.  */
 878   INIT_LIST_HEAD (&stack_used);
 879   INIT_LIST_HEAD (&__stack_user);
 880
 881   if (__builtin_expect (THREAD_GETMEM (self, user_stack), 0))
 882     list_add (&self->list, &__stack_user);
 883   else
 884     list_add (&self->list, &stack_used);
 885
 886   /* There is one thread running.  */
 887   __nptl_nthreads = 1;
 888
 889   in_flight_stack = 0;
 890
 891   /* Initialize the lock.  */
 892   stack_cache_lock = LLL_LOCK_INITIALIZER;
 893 }
 894
 895
 896 #if HP_TIMING_AVAIL
 897 # undef __find_thread_by_id
 898 /* Find a thread given the thread ID.  */
 899 attribute_hidden
 900 struct pthread *
 901 __find_thread_by_id (pid_t tid)
 902 {
 903   struct pthread *result = NULL;
 904
 905   lll_lock (stack_cache_lock, LLL_PRIVATE);
 906
 907   /* Iterate over the list with system-allocated threads first.  */
 908   list_t *runp;
 909   list_for_each (runp, &stack_used)
 910     {
 911       struct pthread *curp;
 912
 913       curp = list_entry (runp, struct pthread, list);
 914
 915       if (curp->tid == tid)
 916         {
 917           result = curp;
 918           goto out;
 919         }
 920     }
 921
 922   /* Now the list with threads using user-allocated stacks.  */
 923   list_for_each (runp, &__stack_user)
 924     {
 925       struct pthread *curp;
 926
 927       curp = list_entry (runp, struct pthread, list);
 928
 929       if (curp->tid == tid)
 930         {
 931           result = curp;
 932           goto out;
 933         }
 934     }
 935
 936  out:
 937   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 938
 939   return result;
 940 }
 941 #endif
 942
 943
 944 static void
 945 internal_function
 946 setxid_mark_thread (struct xid_command *cmdp, struct pthread *t)
 947 {
 948   int ch;
 949
 950   /* Don't let the thread exit before the setxid handler runs.  */
 951   t->setxid_futex = 0;
 952
 953   do
 954     {
 955       ch = t->cancelhandling;
 956
 957       /* If the thread is exiting right now, ignore it.  */
 958       if ((ch & EXITING_BITMASK) != 0)
 959         return;
 960     }
 961   while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
 962                                                ch | SETXID_BITMASK, ch));
 963 }
 964
 965
 966 static void
 967 internal_function
 968 setxid_unmark_thread (struct xid_command *cmdp, struct pthread *t)
 969 {
 970   int ch;
 971
 972   do
 973     {
 974       ch = t->cancelhandling;
 975       if ((ch & SETXID_BITMASK) == 0)
 976         return;
 977     }
 978   while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
 979                                                ch & ~SETXID_BITMASK, ch));
 980
 981   /* Release the futex just in case.  */
 982   t->setxid_futex = 1;
 983   lll_futex_wake (&t->setxid_futex, 1, LLL_PRIVATE);
 984 }
 985
 986
/* Send SIGSETXID to thread T if it is currently marked with
   SETXID_BITMASK in its cancelhandling word.

   When the signal was delivered to the kernel successfully, CMDP->cntr
   is atomically incremented and 1 is returned.  If T is not marked, or
   the signalling system call failed, 0 is returned and CMDP->cntr is
   left untouched.  */
static int
internal_function
setxid_signal_thread (struct xid_command *cmdp, struct pthread *t)
{
  /* Only threads that were marked take part in the operation.  */
  if ((t->cancelhandling & SETXID_BITMASK) == 0)
    return 0;

  int val;
  INTERNAL_SYSCALL_DECL (err);
  /* Three build configurations:
     - __ASSUME_TGKILL set: tgkill is used unconditionally;
     - otherwise, with __NR_tgkill defined: tgkill is tried first and
       tkill is used only if tgkill fails with ENOSYS;
     - otherwise: tkill is used unconditionally (the `if' below is
       compiled out, leaving the bare tkill call).  */
#if defined (__ASSUME_TGKILL) && __ASSUME_TGKILL
  val = INTERNAL_SYSCALL (tgkill, err, 3, THREAD_GETMEM (THREAD_SELF, pid),
                          t->tid, SIGSETXID);
#else
# ifdef __NR_tgkill
  val = INTERNAL_SYSCALL (tgkill, err, 3, THREAD_GETMEM (THREAD_SELF, pid),
                          t->tid, SIGSETXID);
  if (INTERNAL_SYSCALL_ERROR_P (val, err)
      && INTERNAL_SYSCALL_ERRNO (val, err) == ENOSYS)
# endif
    val = INTERNAL_SYSCALL (tkill, err, 2, t->tid, SIGSETXID);
#endif

  /* If this failed, the thread must either not have started yet or
     have exited already.  */
  if (!INTERNAL_SYSCALL_ERROR_P (val, err))
    {
      /* One more thread the caller has to account for.  */
      atomic_increment (&cmdp->cntr);
      return 1;
    }
  else
    return 0;
}
1018
1019
/* Carry out the set*id request described by CMDP across the threads of
   the process.

   With stack_cache_lock held for the whole operation, this function:
     1. publishes CMDP through __xidcmd and zeroes CMDP->cntr;
     2. marks every thread other than the caller (setxid_mark_thread);
     3. signals the marked threads (setxid_signal_thread) and waits on
        the CMDP->cntr futex until it reads zero, repeating until a
        pass signals no thread at all;
     4. clears the marks again (setxid_unmark_thread);
     5. issues system call CMDP->syscall_no with CMDP->id[0..2] in the
        calling thread.
   Each phase walks stack_used first and __stack_user second.

   Returns the result of the caller's own system call, or -1 with errno
   set if that call failed.

   NOTE(review): CMDP->cntr is presumably decremented by the SIGSETXID
   handler running in each signalled thread; that handler is not
   visible in this part of the file -- confirm.  */
int
attribute_hidden
__nptl_setxid (struct xid_command *cmdp)
{
  int signalled;
  int result;
  lll_lock (stack_cache_lock, LLL_PRIVATE);

  /* Publish the command and reset the count of outstanding threads.  */
  __xidcmd = cmdp;
  cmdp->cntr = 0;

  struct pthread *self = THREAD_SELF;

  /* Iterate over the list with system-allocated threads first.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self)
        continue;

      setxid_mark_thread (cmdp, t);
    }

  /* Now the list with threads using user-allocated stacks.  */
  list_for_each (runp, &__stack_user)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self)
        continue;

      setxid_mark_thread (cmdp, t);
    }

  /* Iterate until we don't succeed in signalling anyone.  That means
     we have gotten all running threads, and their children will be
     automatically correct once started.  */
  do
    {
      signalled = 0;

      list_for_each (runp, &stack_used)
        {
          struct pthread *t = list_entry (runp, struct pthread, list);
          if (t == self)
            continue;

          signalled += setxid_signal_thread (cmdp, t);
        }

      list_for_each (runp, &__stack_user)
        {
          struct pthread *t = list_entry (runp, struct pthread, list);
          if (t == self)
            continue;

          signalled += setxid_signal_thread (cmdp, t);
        }

      /* Block until the counter of outstanding threads reads zero.
         The value is re-read after every wake-up because the futex
         wait may return while the counter is still non-zero.  */
      int cur = cmdp->cntr;
      while (cur != 0)
        {
          lll_futex_wait (&cmdp->cntr, cur, LLL_PRIVATE);
          cur = cmdp->cntr;
        }
    }
  while (signalled != 0);

  /* Clean up flags, so that no thread blocks during exit waiting
     for a signal which will never come.  */
  list_for_each (runp, &stack_used)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self)
        continue;

      setxid_unmark_thread (cmdp, t);
    }

  list_for_each (runp, &__stack_user)
    {
      struct pthread *t = list_entry (runp, struct pthread, list);
      if (t == self)
        continue;

      setxid_unmark_thread (cmdp, t);
    }

  /* This must be last, otherwise the current thread might no longer
     have permission to send the SIGSETXID signal to the other
     threads.  */
  INTERNAL_SYSCALL_DECL (err);
  result = INTERNAL_SYSCALL_NCS (cmdp->syscall_no, err, 3,
                                 cmdp->id[0], cmdp->id[1], cmdp->id[2]);
  if (INTERNAL_SYSCALL_ERROR_P (result, err))
    {
      /* Translate the raw kernel error into the errno/-1 convention.  */
      __set_errno (INTERNAL_SYSCALL_ERRNO (result, err));
      result = -1;
    }

  lll_unlock (stack_cache_lock, LLL_PRIVATE);
  return result;
}
1122
1123 static inline void __attribute__((always_inline))
1124 init_one_static_tls (struct pthread *curp, struct link_map *map)
1125 {
1126   dtv_t *dtv = GET_DTV (TLS_TPADJ (curp));
1127 # if defined(TLS_TCB_AT_TP)
1128   void *dest = (char *) curp - map->l_tls_offset;
1129 # elif defined(TLS_DTV_AT_TP)
1130   void *dest = (char *) curp + map->l_tls_offset + TLS_PRE_TCB_SIZE;
1131 # else
1132 #  error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined"
1133 # endif
1134
1135   /* Fill in the DTV slot so that a later LD/GD access will find it.  */
1136   dtv[map->l_tls_modid].pointer.val = dest;
1137   dtv[map->l_tls_modid].pointer.is_static = true;
1138
1139   /* Initialize the memory.  */
1140   memset (mempcpy (dest, map->l_tls_initimage, map->l_tls_initimage_size),
1141           '\0', map->l_tls_blocksize - map->l_tls_initimage_size);
1142 }
1143
1144 void
1145 attribute_hidden
1146 __pthread_init_static_tls (struct link_map *map)
1147 {
1148   lll_lock (stack_cache_lock, LLL_PRIVATE);
1149
1150   /* Iterate over the list with system-allocated threads first.  */
1151   list_t *runp;
1152   list_for_each (runp, &stack_used)
1153     init_one_static_tls (list_entry (runp, struct pthread, list), map);
1154
1155   /* Now the list with threads using user-allocated stacks.  */
1156   list_for_each (runp, &__stack_user)
1157     init_one_static_tls (list_entry (runp, struct pthread, list), map);
1158
1159   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1160 }
1161
1162
1163 void
1164 attribute_hidden
1165 __wait_lookup_done (void)
1166 {
1167   lll_lock (stack_cache_lock, LLL_PRIVATE);
1168
1169   struct pthread *self = THREAD_SELF;
1170
1171   /* Iterate over the list with system-allocated threads first.  */
1172   list_t *runp;
1173   list_for_each (runp, &stack_used)
1174     {
1175       struct pthread *t = list_entry (runp, struct pthread, list);
1176       if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
1177         continue;
1178
1179       int *const gscope_flagp = &t->header.gscope_flag;
1180
1181       /* We have to wait until this thread is done with the global
1182          scope.  First tell the thread that we are waiting and
1183          possibly have to be woken.  */
1184       if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
1185                                                 THREAD_GSCOPE_FLAG_WAIT,
1186                                                 THREAD_GSCOPE_FLAG_USED))
1187         continue;
1188
1189       do
1190         lll_futex_wait (gscope_flagp, THREAD_GSCOPE_FLAG_WAIT, LLL_PRIVATE);
1191       while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
1192     }
1193
1194   /* Now the list with threads using user-allocated stacks.  */
1195   list_for_each (runp, &__stack_user)
1196     {
1197       struct pthread *t = list_entry (runp, struct pthread, list);
1198       if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
1199         continue;
1200
1201       int *const gscope_flagp = &t->header.gscope_flag;
1202
1203       /* We have to wait until this thread is done with the global
1204          scope.  First tell the thread that we are waiting and
1205          possibly have to be woken.  */
1206       if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
1207                                                 THREAD_GSCOPE_FLAG_WAIT,
1208                                                 THREAD_GSCOPE_FLAG_USED))
1209         continue;
1210
1211       do
1212         lll_futex_wait (gscope_flagp, THREAD_GSCOPE_FLAG_WAIT, LLL_PRIVATE);
1213       while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
1214     }
1215
1216   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1217 }