libpthread/nptl/allocatestack.c

   1 /* Copyright (C) 2002-2007, 2009 Free Software Foundation, Inc.
   2    This file is part of the GNU C Library.
   3    Contributed by Ulrich Drepper <drepper@redhat.com>, 2002.
   4
   5    The GNU C Library is free software; you can redistribute it and/or
   6    modify it under the terms of the GNU Lesser General Public
   7    License as published by the Free Software Foundation; either
   8    version 2.1 of the License, or (at your option) any later version.
   9
  10    The GNU C Library is distributed in the hope that it will be useful,
  11    but WITHOUT ANY WARRANTY; without even the implied warranty of
  12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13    Lesser General Public License for more details.
  14
  15    You should have received a copy of the GNU Lesser General Public
  16    License along with the GNU C Library; if not, write to the Free
  17    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  18    02111-1307 USA.  */
  19
  20 #include <assert.h>
  21 #include <errno.h>
  22 #include <signal.h>
  23 #include <stdint.h>
  24 #include <string.h>
  25 #include <unistd.h>
  26 #include <sys/mman.h>
  27 #include <sys/param.h>
  28 #include <tls.h>
  29 #include <lowlevellock.h>
  30 #include <link.h>
  31 #include <bits/kernel-features.h>
  32
  33
/* Helper macros that hide whether a thread needs one stack or two.
   Each macro has one variant per configuration so that the code using
   them does not need its own #ifdefs.  */
#ifndef NEED_SEPARATE_REGISTER_STACK

/* Most architectures have exactly one stack pointer.  Some have more.
   Declares the local variable(s) that receive the stack description.  */
# define STACK_VARIABLES void *stackaddr = NULL

/* How to pass the values to the 'create_thread' function.  */
# define STACK_VARIABLES_ARGS stackaddr

/* How to declare a function which gets these parameters.  */
# define STACK_VARIABLES_PARMS void *stackaddr

/* How to declare allocate_stack.  */
# define ALLOCATE_STACK_PARMS void **stack

/* This is how the function is called.  We do it this way to allow
   other variants of the function to have more parameters.  The
   expansion refers to a variable named 'stackaddr', so STACK_VARIABLES
   must be in scope at the point of use.  */
# define ALLOCATE_STACK(attr, pd) allocate_stack (attr, pd, &stackaddr)

#else

/* We need two stacks.  The kernel will place them but we have to tell
   the kernel about the size of the reserved address space.  */
# define STACK_VARIABLES void *stackaddr = NULL; size_t stacksize = 0

/* How to pass the values to the 'create_thread' function.  */
# define STACK_VARIABLES_ARGS stackaddr, stacksize

/* How to declare a function which gets these parameters.  */
# define STACK_VARIABLES_PARMS void *stackaddr, size_t stacksize

/* How to declare allocate_stack.  */
# define ALLOCATE_STACK_PARMS void **stack, size_t *stacksize

/* This is how the function is called.  We do it this way to allow
   other variants of the function to have more parameters.  The
   expansion refers to variables named 'stackaddr' and 'stacksize', so
   STACK_VARIABLES must be in scope at the point of use.  */
# define ALLOCATE_STACK(attr, pd) \
  allocate_stack (attr, pd, &stackaddr, &stacksize)

#endif
  73
  74
/* Default alignment of stack.  Can be overridden by defining
   STACK_ALIGN before this point.  */
#ifndef STACK_ALIGN
# define STACK_ALIGN __alignof__ (long double)
#endif

/* Default value for minimal stack size after allocating thread
   descriptor and guard.  Can be overridden by defining
   MINIMAL_REST_STACK before this point.  */
#ifndef MINIMAL_REST_STACK
# define MINIMAL_REST_STACK     4096
#endif


/* Newer kernels have the MAP_STACK flag to indicate a mapping is used for
   a stack.  Use it when possible.  When the headers do not provide it
   the flag is defined as 0 so that OR-ing it into the mmap flags is a
   no-op.  */
#ifndef MAP_STACK
# define MAP_STACK 0
#endif

/* This yields the pointer that TLS support code calls the thread pointer.
   With TLS_TCB_AT_TP this is the descriptor itself; with TLS_DTV_AT_TP
   it is the address TLS_PRE_TCB_SIZE bytes past the descriptor.  */
#if defined(TLS_TCB_AT_TP)
# define TLS_TPADJ(pd) (pd)
#elif defined(TLS_DTV_AT_TP)
# define TLS_TPADJ(pd) ((struct pthread *)((char *) (pd) + TLS_PRE_TCB_SIZE))
#endif
  99
/* Cache handling for not-yet free stacks.  */

/* Maximum size of the cache in bytes.  The value is compared against
   stack_cache_actsize, which is the sum of the 'stackblock_size' fields
   of the cached descriptors.  */
static size_t stack_cache_maxsize = 40 * 1024 * 1024; /* 40 MiB by default.  */
/* Current size of the cache, in the same unit as stack_cache_maxsize.  */
static size_t stack_cache_actsize;

/* Mutex protecting the cache size variables and the stack lists below.  */
static int stack_cache_lock = LLL_LOCK_INITIALIZER;

/* List of queued stack frames.  */
static LIST_HEAD (stack_cache);

/* List of the stacks in use.  */
static LIST_HEAD (stack_used);

/* We need to record what list operations we are going to do so that,
   in case of an asynchronous interruption due to a fork() call, we
   can correct for the work.  The value is zero when no operation is
   in progress; otherwise it is the address of the list element being
   worked on, with the lowest bit set for an add and clear for a
   delete.  */
static uintptr_t in_flight_stack;

/* List of the threads with user provided stacks in use.  No need to
   initialize this, since it's done in __pthread_initialize_minimal.  */
list_t __stack_user __attribute__ ((nocommon));
hidden_data_def (__stack_user)

#if COLORING_INCREMENT != 0
/* Number of threads created.  Used to derive the per-thread coloring
   offset.  */
static unsigned int nptl_ncreated;
#endif


/* Check whether the stack is still used or not.  A descriptor whose
   'tid' field is zero or negative is treated as free.  */
#define FREE_P(descr) ((descr)->tid <= 0)
 133
 134
/* Unlink ELEM from the list it is on.  While the unlink is in progress
   the address of ELEM is stored in in_flight_stack (lowest bit clear,
   marking a delete) so that __reclaim_stacks can replay the operation
   if it finds the marker still set.  The statement order matters: the
   marker is written first and cleared last, with a write barrier on
   each side of the list update.  */
static void
stack_list_del (list_t *elem)
{
  /* Announce the pending delete.  */
  in_flight_stack = (uintptr_t) elem;

  atomic_write_barrier ();

  list_del (elem);

  atomic_write_barrier ();

  /* The operation is complete; nothing is in flight anymore.  */
  in_flight_stack = 0;
}
 148
 149
/* Insert ELEM into LIST.  While the insertion is in progress the
   address of ELEM, with the lowest bit set to mark an add, is stored
   in in_flight_stack so that __reclaim_stacks can complete the
   operation if it finds the marker still set.  The statement order
   matters: the marker is written first and cleared last, with a write
   barrier on each side of the list update.  */
static void
stack_list_add (list_t *elem, list_t *list)
{
  /* Announce the pending add; bit 0 distinguishes it from a delete.  */
  in_flight_stack = (uintptr_t) elem | 1;

  atomic_write_barrier ();

  list_add (elem, list);

  atomic_write_barrier ();

  /* The operation is complete; nothing is in flight anymore.  */
  in_flight_stack = 0;
}
 163
 164
 165 /* We create a double linked list of all cache entries.  Double linked
 166    because this allows removing entries from the end.  */
 167
 168
 169 /* Get a stack frame from the cache.  We have to match by size since
 170    some blocks might be too small or far too large.  */
 171 static struct pthread *
 172 get_cached_stack (size_t *sizep, void **memp)
 173 {
 174   size_t size = *sizep;
 175   struct pthread *result = NULL;
 176   list_t *entry;
 177
 178   lll_lock (stack_cache_lock, LLL_PRIVATE);
 179
 180   /* Search the cache for a matching entry.  We search for the
 181      smallest stack which has at least the required size.  Note that
 182      in normal situations the size of all allocated stacks is the
 183      same.  As the very least there are only a few different sizes.
 184      Therefore this loop will exit early most of the time with an
 185      exact match.  */
 186   list_for_each (entry, &stack_cache)
 187     {
 188       struct pthread *curr;
 189
 190       curr = list_entry (entry, struct pthread, list);
 191       if (FREE_P (curr) && curr->stackblock_size >= size)
 192         {
 193           if (curr->stackblock_size == size)
 194             {
 195               result = curr;
 196               break;
 197             }
 198
 199           if (result == NULL
 200               || result->stackblock_size > curr->stackblock_size)
 201             result = curr;
 202         }
 203     }
 204
 205   if (__builtin_expect (result == NULL, 0)
 206       /* Make sure the size difference is not too excessive.  In that
 207          case we do not use the block.  */
 208       || __builtin_expect (result->stackblock_size > 4 * size, 0))
 209     {
 210       /* Release the lock.  */
 211       lll_unlock (stack_cache_lock, LLL_PRIVATE);
 212
 213       return NULL;
 214     }
 215
 216   /* Dequeue the entry.  */
 217   stack_list_del (&result->list);
 218
 219   /* And add to the list of stacks in use.  */
 220   stack_list_add (&result->list, &stack_used);
 221
 222   /* And decrease the cache size.  */
 223   stack_cache_actsize -= result->stackblock_size;
 224
 225   /* Release the lock early.  */
 226   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 227
 228   /* Report size and location of the stack to the caller.  */
 229   *sizep = result->stackblock_size;
 230   *memp = result->stackblock;
 231
 232   /* Cancellation handling is back to the default.  */
 233   result->cancelhandling = 0;
 234   result->cleanup = NULL;
 235
 236   /* No pending event.  */
 237   result->nextevent = NULL;
 238
 239   /* Clear the DTV.  */
 240   dtv_t *dtv = GET_DTV (TLS_TPADJ (result));
 241   memset (dtv, '\0', (dtv[-1].counter + 1) * sizeof (dtv_t));
 242
 243   /* Re-initialize the TLS.  */
 244   _dl_allocate_tls_init (TLS_TPADJ (result));
 245
 246   return result;
 247 }
 248
 249
 250 /* Free stacks until cache size is lower than LIMIT.  */
 251 void
 252 __free_stacks (size_t limit)
 253 {
 254   /* We reduce the size of the cache.  Remove the last entries until
 255      the size is below the limit.  */
 256   list_t *entry;
 257   list_t *prev;
 258
 259   /* Search from the end of the list.  */
 260   list_for_each_prev_safe (entry, prev, &stack_cache)
 261     {
 262       struct pthread *curr;
 263
 264       curr = list_entry (entry, struct pthread, list);
 265       if (FREE_P (curr))
 266         {
 267           /* Unlink the block.  */
 268           stack_list_del (entry);
 269
 270           /* Account for the freed memory.  */
 271           stack_cache_actsize -= curr->stackblock_size;
 272
 273           /* Free the memory associated with the ELF TLS.  */
 274           _dl_deallocate_tls (TLS_TPADJ (curr), false);
 275
 276           /* Remove this block.  This should never fail.  If it does
 277              something is really wrong.  */
 278           if (munmap (curr->stackblock, curr->stackblock_size) != 0)
 279             abort ();
 280
 281           /* Maybe we have freed enough.  */
 282           if (stack_cache_actsize <= limit)
 283             break;
 284         }
 285     }
 286 }
 287
 288
 289 /* Add a stack frame which is not used anymore to the stack.  Must be
 290    called with the cache lock held.  */
 291 static inline void
 292 __attribute ((always_inline))
 293 queue_stack (struct pthread *stack)
 294 {
 295   /* We unconditionally add the stack to the list.  The memory may
 296      still be in use but it will not be reused until the kernel marks
 297      the stack as not used anymore.  */
 298   stack_list_add (&stack->list, &stack_cache);
 299
 300   stack_cache_actsize += stack->stackblock_size;
 301   if (__builtin_expect (stack_cache_actsize > stack_cache_maxsize, 0))
 302     __free_stacks (stack_cache_maxsize);
 303 }
 304
 305
 306 static int
 307 internal_function
 308 change_stack_perm (struct pthread *pd
 309 #ifdef NEED_SEPARATE_REGISTER_STACK
 310                    , size_t pagemask
 311 #endif
 312                    )
 313 {
 314 #ifdef NEED_SEPARATE_REGISTER_STACK
 315   void *stack = (pd->stackblock
 316                  + (((((pd->stackblock_size - pd->guardsize) / 2)
 317                       & pagemask) + pd->guardsize) & pagemask));
 318   size_t len = pd->stackblock + pd->stackblock_size - stack;
 319 #elif _STACK_GROWS_DOWN
 320   void *stack = pd->stackblock + pd->guardsize;
 321   size_t len = pd->stackblock_size - pd->guardsize;
 322 #elif _STACK_GROWS_UP
 323   void *stack = pd->stackblock;
 324   size_t len = (uintptr_t) pd - pd->guardsize - (uintptr_t) pd->stackblock;
 325 #else
 326 # error "Define either _STACK_GROWS_DOWN or _STACK_GROWS_UP"
 327 #endif
 328   if (mprotect (stack, len, PROT_READ | PROT_WRITE | PROT_EXEC) != 0)
 329     return errno;
 330
 331   return 0;
 332 }
 333
 334
 335 static int
 336 allocate_stack (const struct pthread_attr *attr, struct pthread **pdp,
 337                 ALLOCATE_STACK_PARMS)
 338 {
 339   struct pthread *pd;
 340   size_t size;
 341   size_t pagesize_m1 = __getpagesize () - 1;
 342   void *stacktop;
 343
 344   assert (attr != NULL);
 345   assert (powerof2 (pagesize_m1 + 1));
 346   assert (TCB_ALIGNMENT >= STACK_ALIGN);
 347
 348   /* Get the stack size from the attribute if it is set.  Otherwise we
 349      use the default we determined at start time.  */
 350   size = attr->stacksize ?: __default_stacksize;
 351
 352   /* Get memory for the stack.  */
 353   if (__builtin_expect (attr->flags & ATTR_FLAG_STACKADDR, 0))
 354     {
 355       uintptr_t adj;
 356
 357       /* If the user also specified the size of the stack make sure it
 358          is large enough.  */
 359       if (attr->stacksize != 0
 360           && attr->stacksize < (__static_tls_size + MINIMAL_REST_STACK))
 361         return EINVAL;
 362
 363       /* Adjust stack size for alignment of the TLS block.  */
 364 #if defined(TLS_TCB_AT_TP)
 365       adj = ((uintptr_t) attr->stackaddr - TLS_TCB_SIZE)
 366             & __static_tls_align_m1;
 367       assert (size > adj + TLS_TCB_SIZE);
 368 #elif defined(TLS_DTV_AT_TP)
 369       adj = ((uintptr_t) attr->stackaddr - __static_tls_size)
 370             & __static_tls_align_m1;
 371       assert (size > adj);
 372 #endif
 373
 374       /* The user provided some memory.  Let's hope it matches the
 375          size...  We do not allocate guard pages if the user provided
 376          the stack.  It is the user's responsibility to do this if it
 377          is wanted.  */
 378 #if defined(TLS_TCB_AT_TP)
 379       pd = (struct pthread *) ((uintptr_t) attr->stackaddr
 380                                - TLS_TCB_SIZE - adj);
 381 #elif defined(TLS_DTV_AT_TP)
 382       pd = (struct pthread *) (((uintptr_t) attr->stackaddr
 383                                 - __static_tls_size - adj)
 384                                - TLS_PRE_TCB_SIZE);
 385 #endif
 386
 387       /* The user provided stack memory needs to be cleared.  */
 388       memset (pd, '\0', sizeof (struct pthread));
 389
 390       /* The first TSD block is included in the TCB.  */
 391       pd->specific[0] = pd->specific_1stblock;
 392
 393       /* Remember the stack-related values.  */
 394       pd->stackblock = (char *) attr->stackaddr - size;
 395       pd->stackblock_size = size;
 396
 397       /* This is a user-provided stack.  It will not be queued in the
 398          stack cache nor will the memory (except the TLS memory) be freed.  */
 399       pd->user_stack = true;
 400
 401       /* This is at least the second thread.  */
 402       pd->header.multiple_threads = 1;
 403 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
 404       __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
 405 #endif
 406
 407 #ifndef __ASSUME_PRIVATE_FUTEX
 408       /* The thread must know when private futexes are supported.  */
 409       pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
 410                                                 header.private_futex);
 411 #endif
 412
 413 #ifdef NEED_DL_SYSINFO
 414       /* Copy the sysinfo value from the parent.  */
 415       THREAD_SYSINFO(pd) = THREAD_SELF_SYSINFO;
 416 #endif
 417
 418       /* The process ID is also the same as that of the caller.  */
 419       pd->pid = THREAD_GETMEM (THREAD_SELF, pid);
 420
 421       /* Allocate the DTV for this thread.  */
 422       if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
 423         {
 424           /* Something went wrong.  */
 425           assert (errno == ENOMEM);
 426           return EAGAIN;
 427         }
 428
 429
 430       /* Prepare to modify global data.  */
 431       lll_lock (stack_cache_lock, LLL_PRIVATE);
 432
 433       /* And add to the list of stacks in use.  */
 434       list_add (&pd->list, &__stack_user);
 435
 436       lll_unlock (stack_cache_lock, LLL_PRIVATE);
 437     }
 438   else
 439     {
 440       /* Allocate some anonymous memory.  If possible use the cache.  */
 441       size_t guardsize;
 442       size_t reqsize;
 443       void *mem = 0;
 444       const int prot = (PROT_READ | PROT_WRITE);
 445
 446 #if COLORING_INCREMENT != 0
 447       /* Add one more page for stack coloring.  Don't do it for stacks
 448          with 16 times pagesize or larger.  This might just cause
 449          unnecessary misalignment.  */
 450       if (size <= 16 * pagesize_m1)
 451         size += pagesize_m1 + 1;
 452 #endif
 453
 454       /* Adjust the stack size for alignment.  */
 455       size &= ~__static_tls_align_m1;
 456       assert (size != 0);
 457
 458       /* Make sure the size of the stack is enough for the guard and
 459          eventually the thread descriptor.  */
 460       guardsize = (attr->guardsize + pagesize_m1) & ~pagesize_m1;
 461       if (__builtin_expect (size < ((guardsize + __static_tls_size
 462                                      + MINIMAL_REST_STACK + pagesize_m1)
 463                                     & ~pagesize_m1),
 464                             0))
 465         /* The stack is too small (or the guard too large).  */
 466         return EINVAL;
 467
 468       /* Try to get a stack from the cache.  */
 469       reqsize = size;
 470       pd = get_cached_stack (&size, &mem);
 471       if (pd == NULL)
 472         {
 473           /* To avoid aliasing effects on a larger scale than pages we
 474              adjust the allocated stack size if necessary.  This way
 475              allocations directly following each other will not have
 476              aliasing problems.  */
 477 #if MULTI_PAGE_ALIASING != 0
 478           if ((size % MULTI_PAGE_ALIASING) == 0)
 479             size += pagesize_m1 + 1;
 480 #endif
 481
 482           mem = mmap (NULL, size, prot,
 483                       MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
 484
 485           if (__builtin_expect (mem == MAP_FAILED, 0))
 486             {
 487               if (errno == ENOMEM)
 488                 __set_errno (EAGAIN);
 489
 490                return errno;
 491             }
 492
 493           /* SIZE is guaranteed to be greater than zero.
 494              So we can never get a null pointer back from mmap.  */
 495           assert (mem != NULL);
 496
 497 #if COLORING_INCREMENT != 0
 498           /* Atomically increment NCREATED.  */
 499           unsigned int ncreated = atomic_increment_val (&nptl_ncreated);
 500
 501           /* We chose the offset for coloring by incrementing it for
 502              every new thread by a fixed amount.  The offset used
 503              module the page size.  Even if coloring would be better
 504              relative to higher alignment values it makes no sense to
 505              do it since the mmap() interface does not allow us to
 506              specify any alignment for the returned memory block.  */
 507           size_t coloring = (ncreated * COLORING_INCREMENT) & pagesize_m1;
 508
 509           /* Make sure the coloring offsets does not disturb the alignment
 510              of the TCB and static TLS block.  */
 511           if (__builtin_expect ((coloring & __static_tls_align_m1) != 0, 0))
 512             coloring = (((coloring + __static_tls_align_m1)
 513                          & ~(__static_tls_align_m1))
 514                         & ~pagesize_m1);
 515 #else
 516           /* Unless specified we do not make any adjustments.  */
 517 # define coloring 0
 518 #endif
 519
 520           /* Place the thread descriptor at the end of the stack.  */
 521 #if defined(TLS_TCB_AT_TP)
 522           pd = (struct pthread *) ((char *) mem + size - coloring) - 1;
 523 #elif defined(TLS_DTV_AT_TP)
 524           pd = (struct pthread *) ((((uintptr_t) mem + size - coloring
 525                                     - __static_tls_size)
 526                                     & ~__static_tls_align_m1)
 527                                    - TLS_PRE_TCB_SIZE);
 528 #endif
 529
 530           /* Remember the stack-related values.  */
 531           pd->stackblock = mem;
 532           pd->stackblock_size = size;
 533
 534           /* We allocated the first block thread-specific data array.
 535              This address will not change for the lifetime of this
 536              descriptor.  */
 537           pd->specific[0] = pd->specific_1stblock;
 538
 539           /* This is at least the second thread.  */
 540           pd->header.multiple_threads = 1;
 541 #ifndef TLS_MULTIPLE_THREADS_IN_TCB
 542           __pthread_multiple_threads = *__libc_multiple_threads_ptr = 1;
 543 #endif
 544
 545 #ifndef __ASSUME_PRIVATE_FUTEX
 546           /* The thread must know when private futexes are supported.  */
 547           pd->header.private_futex = THREAD_GETMEM (THREAD_SELF,
 548                                                     header.private_futex);
 549 #endif
 550
 551 #ifdef NEED_DL_SYSINFO
 552           /* Copy the sysinfo value from the parent.  */
 553           THREAD_SYSINFO(pd) = THREAD_SELF_SYSINFO;
 554 #endif
 555
 556           /* The process ID is also the same as that of the caller.  */
 557           pd->pid = THREAD_GETMEM (THREAD_SELF, pid);
 558
 559           /* Allocate the DTV for this thread.  */
 560           if (_dl_allocate_tls (TLS_TPADJ (pd)) == NULL)
 561             {
 562               /* Something went wrong.  */
 563               assert (errno == ENOMEM);
 564
 565               /* Free the stack memory we just allocated.  */
 566               (void) munmap (mem, size);
 567
 568               return EAGAIN;
 569             }
 570
 571
 572           /* Prepare to modify global data.  */
 573           lll_lock (stack_cache_lock, LLL_PRIVATE);
 574
 575           /* And add to the list of stacks in use.  */
 576           stack_list_add (&pd->list, &stack_used);
 577
 578           lll_unlock (stack_cache_lock, LLL_PRIVATE);
 579
 580
 581           /* Note that all of the stack and the thread descriptor is
 582              zeroed.  This means we do not have to initialize fields
 583              with initial value zero.  This is specifically true for
 584              the 'tid' field which is always set back to zero once the
 585              stack is not used anymore and for the 'guardsize' field
 586              which will be read next.  */
 587         }
 588
 589       /* Create or resize the guard area if necessary.  */
 590       if (__builtin_expect (guardsize > pd->guardsize, 0))
 591         {
 592 #ifdef NEED_SEPARATE_REGISTER_STACK
 593           char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
 594 #elif _STACK_GROWS_DOWN
 595           char *guard = mem;
 596 # elif _STACK_GROWS_UP
 597           char *guard = (char *) (((uintptr_t) pd - guardsize) & ~pagesize_m1);
 598 #endif
 599           if (mprotect (guard, guardsize, PROT_NONE) != 0)
 600             {
 601               int err;
 602             mprot_error:
 603               err = errno;
 604
 605               lll_lock (stack_cache_lock, LLL_PRIVATE);
 606
 607               /* Remove the thread from the list.  */
 608               stack_list_del (&pd->list);
 609
 610               lll_unlock (stack_cache_lock, LLL_PRIVATE);
 611
 612               /* Get rid of the TLS block we allocated.  */
 613               _dl_deallocate_tls (TLS_TPADJ (pd), false);
 614
 615               /* Free the stack memory regardless of whether the size
 616                  of the cache is over the limit or not.  If this piece
 617                  of memory caused problems we better do not use it
 618                  anymore.  Uh, and we ignore possible errors.  There
 619                  is nothing we could do.  */
 620               (void) munmap (mem, size);
 621
 622               return err;
 623             }
 624
 625           pd->guardsize = guardsize;
 626         }
 627       else if (__builtin_expect (pd->guardsize - guardsize > size - reqsize,
 628                                  0))
 629         {
 630           /* The old guard area is too large.  */
 631
 632 #ifdef NEED_SEPARATE_REGISTER_STACK
 633           char *guard = mem + (((size - guardsize) / 2) & ~pagesize_m1);
 634           char *oldguard = mem + (((size - pd->guardsize) / 2) & ~pagesize_m1);
 635
 636           if (oldguard < guard
 637               && mprotect (oldguard, guard - oldguard, prot) != 0)
 638             goto mprot_error;
 639
 640           if (mprotect (guard + guardsize,
 641                         oldguard + pd->guardsize - guard - guardsize,
 642                         prot) != 0)
 643             goto mprot_error;
 644 #elif _STACK_GROWS_DOWN
 645           if (mprotect ((char *) mem + guardsize, pd->guardsize - guardsize,
 646                         prot) != 0)
 647             goto mprot_error;
 648 #elif _STACK_GROWS_UP
 649           if (mprotect ((char *) pd - pd->guardsize,
 650                         pd->guardsize - guardsize, prot) != 0)
 651             goto mprot_error;
 652 #endif
 653
 654           pd->guardsize = guardsize;
 655         }
 656       /* The pthread_getattr_np() calls need to get passed the size
 657          requested in the attribute, regardless of how large the
 658          actually used guardsize is.  */
 659       pd->reported_guardsize = guardsize;
 660     }
 661
 662   /* Initialize the lock.  We have to do this unconditionally since the
 663      stillborn thread could be canceled while the lock is taken.  */
 664   pd->lock = LLL_LOCK_INITIALIZER;
 665
 666   /* The robust mutex lists also need to be initialized
 667      unconditionally because the cleanup for the previous stack owner
 668      might have happened in the kernel.  */
 669   pd->robust_head.futex_offset = (offsetof (pthread_mutex_t, __data.__lock)
 670                                   - offsetof (pthread_mutex_t,
 671                                               __data.__list.__next));
 672   pd->robust_head.list_op_pending = NULL;
 673 #ifdef __PTHREAD_MUTEX_HAVE_PREV
 674   pd->robust_prev = &pd->robust_head;
 675 #endif
 676   pd->robust_head.list = &pd->robust_head;
 677
 678   /* We place the thread descriptor at the end of the stack.  */
 679   *pdp = pd;
 680
 681 #if defined(TLS_TCB_AT_TP)
 682   /* The stack begins before the TCB and the static TLS block.  */
 683   stacktop = ((char *) (pd + 1) - __static_tls_size);
 684 #elif defined(TLS_DTV_AT_TP)
 685   stacktop = (char *) (pd - 1);
 686 #endif
 687
 688 #ifdef NEED_SEPARATE_REGISTER_STACK
 689   *stack = pd->stackblock;
 690   *stacksize = stacktop - *stack;
 691 #elif _STACK_GROWS_DOWN
 692   *stack = stacktop;
 693 #elif _STACK_GROWS_UP
 694   *stack = pd->stackblock;
 695   assert (*stack > 0);
 696 #endif
 697
 698   return 0;
 699 }
 700
 701
 702 void
 703 internal_function
 704 __deallocate_stack (struct pthread *pd)
 705 {
 706   lll_lock (stack_cache_lock, LLL_PRIVATE);
 707
 708   /* Remove the thread from the list of threads with user defined
 709      stacks.  */
 710   stack_list_del (&pd->list);
 711
 712   /* Not much to do.  Just free the mmap()ed memory.  Note that we do
 713      not reset the 'used' flag in the 'tid' field.  This is done by
 714      the kernel.  If no thread has been created yet this field is
 715      still zero.  */
 716   if (__builtin_expect (! pd->user_stack, 1))
 717     (void) queue_stack (pd);
 718   else
 719     /* Free the memory associated with the ELF TLS.  */
 720     _dl_deallocate_tls (TLS_TPADJ (pd), false);
 721
 722   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 723 }
 724
 725
/* Make the thread stacks executable by calling change_stack_perm on
   every descriptor in stack_used and then in stack_cache, stopping at
   the first failure.

   NOTE(review): as written, ERR is initialized to EPERM and the check
   directly below returns it, so the function always returns EPERM,
   none of the code after that check is executed, and STACK_ENDP is
   never used.  This looks like a deliberate stub for the main-thread
   step -- confirm before changing.  */
int
internal_function
__make_stacks_executable (void **stack_endp)
{
  /* First the main thread's stack.  */
  int err = EPERM;
  if (err != 0)
    return err;

#ifdef NEED_SEPARATE_REGISTER_STACK
  const size_t pagemask = ~(__getpagesize () - 1);
#endif

  lll_lock (stack_cache_lock, LLL_PRIVATE);

  /* Change the permission of every stack currently in use.  */
  list_t *runp;
  list_for_each (runp, &stack_used)
    {
      err = change_stack_perm (list_entry (runp, struct pthread, list)
#ifdef NEED_SEPARATE_REGISTER_STACK
                               , pagemask
#endif
                               );
      if (err != 0)
        break;
    }

  /* Also change the permission for the currently unused stacks.  This
     might be wasted time but better spend it here than adding a check
     in the fast path.  */
  if (err == 0)
    list_for_each (runp, &stack_cache)
      {
        err = change_stack_perm (list_entry (runp, struct pthread, list)
#ifdef NEED_SEPARATE_REGISTER_STACK
                                 , pagemask
#endif
                                 );
        if (err != 0)
          break;
      }

  lll_unlock (stack_cache_lock, LLL_PRIVATE);

  return err;
}
 772
 773
 774 /* In case of a fork() call the memory allocation in the child will be
 775    the same but only one thread is running.  All stacks except that of
 776    the one running thread are not used anymore.  We have to recycle
 777    them.  */
 778 void
 779 __reclaim_stacks (void)
 780 {
 781   struct pthread *self = (struct pthread *) THREAD_SELF;
 782
 783   /* No locking necessary.  The caller is the only stack in use.  But
 784      we have to be aware that we might have interrupted a list
 785      operation.  */
 786
 787   if (in_flight_stack != 0)
 788     {
 789       bool add_p = in_flight_stack & 1;
 790       list_t *elem = (list_t *) (in_flight_stack & ~UINTMAX_C (1));
 791
 792       if (add_p)
 793         {
 794           /* We always add at the beginning of the list.  So in this
 795              case we only need to check the beginning of these lists.  */
 796           int check_list (list_t *l)
 797           {
 798             if (l->next->prev != l)
 799               {
 800                 assert (l->next->prev == elem);
 801
 802                 elem->next = l->next;
 803                 elem->prev = l;
 804                 l->next = elem;
 805
 806                 return 1;
 807               }
 808
 809             return 0;
 810           }
 811
 812           if (check_list (&stack_used) == 0)
 813             (void) check_list (&stack_cache);
 814         }
 815       else
 816         {
 817           /* We can simply always replay the delete operation.  */
 818           elem->next->prev = elem->prev;
 819           elem->prev->next = elem->next;
 820         }
 821     }
 822
 823   /* Mark all stacks except the still running one as free.  */
 824   list_t *runp;
 825   list_for_each (runp, &stack_used)
 826     {
 827       struct pthread *curp = list_entry (runp, struct pthread, list);
 828       if (curp != self)
 829         {
 830           /* This marks the stack as free.  */
 831           curp->tid = 0;
 832
 833           /* The PID field must be initialized for the new process.  */
 834           curp->pid = self->pid;
 835
 836           /* Account for the size of the stack.  */
 837           stack_cache_actsize += curp->stackblock_size;
 838
 839           if (curp->specific_used)
 840             {
 841               /* Clear the thread-specific data.  */
 842               memset (curp->specific_1stblock, '\0',
 843                       sizeof (curp->specific_1stblock));
 844
 845               curp->specific_used = false;
 846
 847               for (size_t cnt = 1; cnt < PTHREAD_KEY_1STLEVEL_SIZE; ++cnt)
 848                 if (curp->specific[cnt] != NULL)
 849                   {
 850                     memset (curp->specific[cnt], '\0',
 851                             sizeof (curp->specific_1stblock));
 852
 853                     /* We have allocated the block which we do not
 854                        free here so re-set the bit.  */
 855                     curp->specific_used = true;
 856                   }
 857             }
 858         }
 859     }
 860
 861   /* Reset the PIDs in any cached stacks.  */
 862   list_for_each (runp, &stack_cache)
 863     {
 864       struct pthread *curp = list_entry (runp, struct pthread, list);
 865       curp->pid = self->pid;
 866     }
 867
 868   /* Add the stack of all running threads to the cache.  */
 869   list_splice (&stack_used, &stack_cache);
 870
 871   /* Remove the entry for the current thread to from the cache list
 872      and add it to the list of running threads.  Which of the two
 873      lists is decided by the user_stack flag.  */
 874   stack_list_del (&self->list);
 875
 876   /* Re-initialize the lists for all the threads.  */
 877   INIT_LIST_HEAD (&stack_used);
 878   INIT_LIST_HEAD (&__stack_user);
 879
 880   if (__builtin_expect (THREAD_GETMEM (self, user_stack), 0))
 881     list_add (&self->list, &__stack_user);
 882   else
 883     list_add (&self->list, &stack_used);
 884
 885   /* There is one thread running.  */
 886   __nptl_nthreads = 1;
 887
 888   in_flight_stack = 0;
 889
 890   /* Initialize the lock.  */
 891   stack_cache_lock = LLL_LOCK_INITIALIZER;
 892 }
 893
 894
 895 #if HP_TIMING_AVAIL
 896 # undef __find_thread_by_id
 897 /* Find a thread given the thread ID.  */
 898 attribute_hidden
 899 struct pthread *
 900 __find_thread_by_id (pid_t tid)
 901 {
 902   struct pthread *result = NULL;
 903
 904   lll_lock (stack_cache_lock, LLL_PRIVATE);
 905
 906   /* Iterate over the list with system-allocated threads first.  */
 907   list_t *runp;
 908   list_for_each (runp, &stack_used)
 909     {
 910       struct pthread *curp;
 911
 912       curp = list_entry (runp, struct pthread, list);
 913
 914       if (curp->tid == tid)
 915         {
 916           result = curp;
 917           goto out;
 918         }
 919     }
 920
 921   /* Now the list with threads using user-allocated stacks.  */
 922   list_for_each (runp, &__stack_user)
 923     {
 924       struct pthread *curp;
 925
 926       curp = list_entry (runp, struct pthread, list);
 927
 928       if (curp->tid == tid)
 929         {
 930           result = curp;
 931           goto out;
 932         }
 933     }
 934
 935  out:
 936   lll_unlock (stack_cache_lock, LLL_PRIVATE);
 937
 938   return result;
 939 }
 940 #endif
 941
 942
 943 static void
 944 internal_function
 945 setxid_mark_thread (struct xid_command *cmdp, struct pthread *t)
 946 {
 947   int ch;
 948
 949   /* Don't let the thread exit before the setxid handler runs.  */
 950   t->setxid_futex = 0;
 951
 952   do
 953     {
 954       ch = t->cancelhandling;
 955
 956       /* If the thread is exiting right now, ignore it.  */
 957       if ((ch & EXITING_BITMASK) != 0)
 958         return;
 959     }
 960   while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
 961                                                ch | SETXID_BITMASK, ch));
 962 }
 963
 964
 965 static void
 966 internal_function
 967 setxid_unmark_thread (struct xid_command *cmdp, struct pthread *t)
 968 {
 969   int ch;
 970
 971   do
 972     {
 973       ch = t->cancelhandling;
 974       if ((ch & SETXID_BITMASK) == 0)
 975         return;
 976     }
 977   while (atomic_compare_and_exchange_bool_acq (&t->cancelhandling,
 978                                                ch & ~SETXID_BITMASK, ch));
 979
 980   /* Release the futex just in case.  */
 981   t->setxid_futex = 1;
 982   lll_futex_wake (&t->setxid_futex, 1, LLL_PRIVATE);
 983 }
 984
 985
 986 static int
 987 internal_function
 988 setxid_signal_thread (struct xid_command *cmdp, struct pthread *t)
 989 {
 990   if ((t->cancelhandling & SETXID_BITMASK) == 0)
 991     return 0;
 992
 993   int val;
 994   INTERNAL_SYSCALL_DECL (err);
 995 #if __ASSUME_TGKILL
 996   val = INTERNAL_SYSCALL (tgkill, err, 3, THREAD_GETMEM (THREAD_SELF, pid),
 997                           t->tid, SIGSETXID);
 998 #else
 999 # ifdef __NR_tgkill
1000   val = INTERNAL_SYSCALL (tgkill, err, 3, THREAD_GETMEM (THREAD_SELF, pid),
1001                           t->tid, SIGSETXID);
1002   if (INTERNAL_SYSCALL_ERROR_P (val, err)
1003       && INTERNAL_SYSCALL_ERRNO (val, err) == ENOSYS)
1004 # endif
1005     val = INTERNAL_SYSCALL (tkill, err, 2, t->tid, SIGSETXID);
1006 #endif
1007
1008   /* If this failed, it must have had not started yet or else exited.  */
1009   if (!INTERNAL_SYSCALL_ERROR_P (val, err))
1010     {
1011       atomic_increment (&cmdp->cntr);
1012       return 1;
1013     }
1014   else
1015     return 0;
1016 }
1017
1018
1019 int
1020 attribute_hidden
1021 __nptl_setxid (struct xid_command *cmdp)
1022 {
1023   int signalled;
1024   int result;
1025   lll_lock (stack_cache_lock, LLL_PRIVATE);
1026
1027   __xidcmd = cmdp;
1028   cmdp->cntr = 0;
1029
1030   struct pthread *self = THREAD_SELF;
1031
1032   /* Iterate over the list with system-allocated threads first.  */
1033   list_t *runp;
1034   list_for_each (runp, &stack_used)
1035     {
1036       struct pthread *t = list_entry (runp, struct pthread, list);
1037       if (t == self)
1038         continue;
1039
1040       setxid_mark_thread (cmdp, t);
1041     }
1042
1043   /* Now the list with threads using user-allocated stacks.  */
1044   list_for_each (runp, &__stack_user)
1045     {
1046       struct pthread *t = list_entry (runp, struct pthread, list);
1047       if (t == self)
1048         continue;
1049
1050       setxid_mark_thread (cmdp, t);
1051     }
1052
1053   /* Iterate until we don't succeed in signalling anyone.  That means
1054      we have gotten all running threads, and their children will be
1055      automatically correct once started.  */
1056   do
1057     {
1058       signalled = 0;
1059
1060       list_for_each (runp, &stack_used)
1061         {
1062           struct pthread *t = list_entry (runp, struct pthread, list);
1063           if (t == self)
1064             continue;
1065
1066           signalled += setxid_signal_thread (cmdp, t);
1067         }
1068
1069       list_for_each (runp, &__stack_user)
1070         {
1071           struct pthread *t = list_entry (runp, struct pthread, list);
1072           if (t == self)
1073             continue;
1074
1075           signalled += setxid_signal_thread (cmdp, t);
1076         }
1077
1078       int cur = cmdp->cntr;
1079       while (cur != 0)
1080         {
1081           lll_futex_wait (&cmdp->cntr, cur, LLL_PRIVATE);
1082           cur = cmdp->cntr;
1083         }
1084     }
1085   while (signalled != 0);
1086
1087   /* Clean up flags, so that no thread blocks during exit waiting
1088      for a signal which will never come.  */
1089   list_for_each (runp, &stack_used)
1090     {
1091       struct pthread *t = list_entry (runp, struct pthread, list);
1092       if (t == self)
1093         continue;
1094
1095       setxid_unmark_thread (cmdp, t);
1096     }
1097
1098   list_for_each (runp, &__stack_user)
1099     {
1100       struct pthread *t = list_entry (runp, struct pthread, list);
1101       if (t == self)
1102         continue;
1103
1104       setxid_unmark_thread (cmdp, t);
1105     }
1106
1107   /* This must be last, otherwise the current thread might not have
1108      permissions to send SIGSETXID syscall to the other threads.  */
1109   INTERNAL_SYSCALL_DECL (err);
1110   result = INTERNAL_SYSCALL_NCS (cmdp->syscall_no, err, 3,
1111                                  cmdp->id[0], cmdp->id[1], cmdp->id[2]);
1112   if (INTERNAL_SYSCALL_ERROR_P (result, err))
1113     {
1114       __set_errno (INTERNAL_SYSCALL_ERRNO (result, err));
1115       result = -1;
1116     }
1117
1118   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1119   return result;
1120 }
1121
1122 static inline void __attribute__((always_inline))
1123 init_one_static_tls (struct pthread *curp, struct link_map *map)
1124 {
1125   dtv_t *dtv = GET_DTV (TLS_TPADJ (curp));
1126 # if defined(TLS_TCB_AT_TP)
1127   void *dest = (char *) curp - map->l_tls_offset;
1128 # elif defined(TLS_DTV_AT_TP)
1129   void *dest = (char *) curp + map->l_tls_offset + TLS_PRE_TCB_SIZE;
1130 # else
1131 #  error "Either TLS_TCB_AT_TP or TLS_DTV_AT_TP must be defined"
1132 # endif
1133
1134   /* Fill in the DTV slot so that a later LD/GD access will find it.  */
1135   dtv[map->l_tls_modid].pointer.val = dest;
1136   dtv[map->l_tls_modid].pointer.is_static = true;
1137
1138   /* Initialize the memory.  */
1139   memset (mempcpy (dest, map->l_tls_initimage, map->l_tls_initimage_size),
1140           '\0', map->l_tls_blocksize - map->l_tls_initimage_size);
1141 }
1142
1143 void
1144 attribute_hidden
1145 __pthread_init_static_tls (struct link_map *map)
1146 {
1147   lll_lock (stack_cache_lock, LLL_PRIVATE);
1148
1149   /* Iterate over the list with system-allocated threads first.  */
1150   list_t *runp;
1151   list_for_each (runp, &stack_used)
1152     init_one_static_tls (list_entry (runp, struct pthread, list), map);
1153
1154   /* Now the list with threads using user-allocated stacks.  */
1155   list_for_each (runp, &__stack_user)
1156     init_one_static_tls (list_entry (runp, struct pthread, list), map);
1157
1158   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1159 }
1160
1161
1162 void
1163 attribute_hidden
1164 __wait_lookup_done (void)
1165 {
1166   lll_lock (stack_cache_lock, LLL_PRIVATE);
1167
1168   struct pthread *self = THREAD_SELF;
1169
1170   /* Iterate over the list with system-allocated threads first.  */
1171   list_t *runp;
1172   list_for_each (runp, &stack_used)
1173     {
1174       struct pthread *t = list_entry (runp, struct pthread, list);
1175       if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
1176         continue;
1177
1178       int *const gscope_flagp = &t->header.gscope_flag;
1179
1180       /* We have to wait until this thread is done with the global
1181          scope.  First tell the thread that we are waiting and
1182          possibly have to be woken.  */
1183       if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
1184                                                 THREAD_GSCOPE_FLAG_WAIT,
1185                                                 THREAD_GSCOPE_FLAG_USED))
1186         continue;
1187
1188       do
1189         lll_futex_wait (gscope_flagp, THREAD_GSCOPE_FLAG_WAIT, LLL_PRIVATE);
1190       while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
1191     }
1192
1193   /* Now the list with threads using user-allocated stacks.  */
1194   list_for_each (runp, &__stack_user)
1195     {
1196       struct pthread *t = list_entry (runp, struct pthread, list);
1197       if (t == self || t->header.gscope_flag == THREAD_GSCOPE_FLAG_UNUSED)
1198         continue;
1199
1200       int *const gscope_flagp = &t->header.gscope_flag;
1201
1202       /* We have to wait until this thread is done with the global
1203          scope.  First tell the thread that we are waiting and
1204          possibly have to be woken.  */
1205       if (atomic_compare_and_exchange_bool_acq (gscope_flagp,
1206                                                 THREAD_GSCOPE_FLAG_WAIT,
1207                                                 THREAD_GSCOPE_FLAG_USED))
1208         continue;
1209
1210       do
1211         lll_futex_wait (gscope_flagp, THREAD_GSCOPE_FLAG_WAIT, LLL_PRIVATE);
1212       while (*gscope_flagp == THREAD_GSCOPE_FLAG_WAIT);
1213     }
1214
1215   lll_unlock (stack_cache_lock, LLL_PRIVATE);
1216 }