/*
 *  linux/mm/page_alloc.c
 *
 *  Manages the free list, the system allocates free pages here.
 *  Note that kmalloc() lives in slab.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 */
#include <linux/config.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/slab.h>
#include <linux/module.h>
int nr_active_pages;
int nr_inactive_pages;
LIST_HEAD(inactive_list);
LIST_HEAD(active_list);
pg_data_t *pgdat_list;
/*
 * The zone_table array is used to look up the address of the
 * struct zone corresponding to a given zone number (ZONE_DMA,
 * ZONE_NORMAL, or ZONE_HIGHMEM).
 */
zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES];
EXPORT_SYMBOL(zone_table);
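/*
 * Illustrative note (not in the original source): entries are indexed
 * as zone_table[nid * MAX_NR_ZONES + zone_idx], the same encoding
 * passed to set_page_zone() in free_area_init_core() below.  E.g. on
 * node 0, ZONE_NORMAL (index 1) lives at zone_table[1].
 */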
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, };
static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20, 20, 20, };
static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255, 255, 255, };
static int lower_zone_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
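/*
 * Worked example (illustrative, not from the original source): for a
 * HighMem classzone of N pages, the Normal zone below it reserves an
 * extra N/32 pages in its watermarks and the DMA zone an extra N/256,
 * so the lower zones are not exhausted by allocations that could have
 * been satisfied from a higher zone.  See the lower_zone_reserve logic
 * in free_area_init_core().
 */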
static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order));

static spinlock_t free_pages_ok_no_irq_lock = SPIN_LOCK_UNLOCKED;
struct page * free_pages_ok_no_irq_head;
static void do_free_pages_ok_no_irq(void * arg)
{
	struct page * page, * __page;

	spin_lock_irq(&free_pages_ok_no_irq_lock);

	page = free_pages_ok_no_irq_head;
	free_pages_ok_no_irq_head = NULL;

	spin_unlock_irq(&free_pages_ok_no_irq_lock);

	while (page) {
		__page = page;
		page = page->next_hash;
		/* the order was stashed in page->index by __free_pages_ok() */
		__free_pages_ok(__page, __page->index);
	}
}

static struct tq_struct free_pages_ok_no_irq_task = {
	.routine = do_free_pages_ok_no_irq,
};
/*
 * Temporary debugging check.
 */
#define BAD_RANGE(zone, page)						\
(									\
	(((page) - mem_map) >= ((zone)->zone_start_mapnr+(zone)->size))	\
	|| (((page) - mem_map) < (zone)->zone_start_mapnr)		\
	|| ((zone) != page_zone(page))					\
)
/*
 * Freeing function for a buddy system allocator.
 * Contrary to prior comments, this is *NOT* hairy, and there
 * is no reason for anyone not to understand it.
 *
 * The concept of a buddy system is to maintain direct-mapped tables
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep one bit for each pair of blocks, which
 * is set to 1 iff only one of the pair is allocated. So when we
 * are allocating or freeing one, we can derive the state of the
 * other. That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 */
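/*
 * Worked example (illustrative, not part of the original comment):
 * freeing an order-0 page at index 12 toggles the bit covering the
 * pair (12,13).  If buddy 13 was already free, the pair coalesces
 * into an order-1 block at index 12, and the same test repeats one
 * level up for the pair (12,14), and so on up to MAX_ORDER-1.
 */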
static void fastcall __free_pages_ok (struct page *page, unsigned int order)
{
	unsigned long index, page_idx, mask, flags;
	free_area_t *area;
	struct page *base;
	zone_t *zone;

	/*
	 * Yes, think what happens when other parts of the kernel take
	 * a reference to a page in order to pin it for io. -ben
	 */
	if (PageLRU(page)) {
		if (unlikely(in_interrupt())) {
			/* defer the LRU removal to process context */
			spin_lock_irqsave(&free_pages_ok_no_irq_lock, flags);
			page->next_hash = free_pages_ok_no_irq_head;
			free_pages_ok_no_irq_head = page;
			page->index = order;

			spin_unlock_irqrestore(&free_pages_ok_no_irq_lock, flags);

			schedule_task(&free_pages_ok_no_irq_task);
			return;
		}

		lru_cache_del(page);
	}

	if (!VALID_PAGE(page))
		BUG();
	if (PageLocked(page))
		BUG();
	if (PageActive(page))
		BUG();
	ClearPageReferenced(page);
	ClearPageDirty(page);

	if (current->flags & PF_FREE_PAGES)
		goto local_freelist;
 back_local_freelist:

	zone = page_zone(page);

	mask = (~0UL) << order;
	base = zone->zone_mem_map;
	page_idx = page - base;
	if (page_idx & ~mask)
		BUG();
	index = page_idx >> (1 + order);

	area = zone->free_area + order;

	spin_lock_irqsave(&zone->lock, flags);

	zone->free_pages -= mask;

	while (mask + (1 << (MAX_ORDER-1))) {
		struct page *buddy1, *buddy2;

		if (area >= zone->free_area + MAX_ORDER)
			BUG();
		if (!__test_and_change_bit(index, area->map))
			/*
			 * the buddy page is still allocated.
			 */
			break;
		/*
		 * Move the buddy up one level.
		 * This code is taking advantage of the identity:
		 *	-mask = 1+~mask
		 */
		buddy1 = base + (page_idx ^ -mask);
		buddy2 = base + page_idx;
		if (BAD_RANGE(zone,buddy1))
			BUG();
		if (BAD_RANGE(zone,buddy2))
			BUG();

		list_del(&buddy1->list);
		mask <<= 1;
		area++;
		index >>= 1;
		page_idx &= mask;
	}
	list_add(&(base + page_idx)->list, &area->free_list);

	spin_unlock_irqrestore(&zone->lock, flags);
	return;

 local_freelist:
	if (current->nr_local_pages)
		goto back_local_freelist;
	if (in_interrupt())
		goto back_local_freelist;

	list_add(&page->list, &current->local_pages);
	page->index = order;
	current->nr_local_pages++;
}
#define MARK_USED(index, order, area) \
	__change_bit((index) >> (1+(order)), (area)->map)
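/*
 * Illustrative note (not in the original source): the bit for a block
 * covers the buddy *pair*, hence the ">> (1+order)".  E.g. order-0
 * pages 12 and 13 share bit 6 of the order-0 map; flipping it records
 * that exactly one of the two is allocated.
 */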
static inline struct page * expand (zone_t *zone, struct page *page,
	 unsigned long index, int low, int high, free_area_t * area)
{
	unsigned long size = 1 << high;

	while (high > low) {
		if (BAD_RANGE(zone,page))
			BUG();
		area--;
		high--;
		size >>= 1;
		list_add(&(page)->list, &(area)->free_list);
		MARK_USED(index, high, area);
		index += size;
		page += size;
	}
	if (BAD_RANGE(zone,page))
		BUG();
	return page;
}
static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order));
static struct page * fastcall rmqueue(zone_t *zone, unsigned int order)
{
	free_area_t * area = zone->free_area + order;
	unsigned int curr_order = order;
	struct list_head *head, *curr;
	unsigned long flags;
	struct page *page;

	spin_lock_irqsave(&zone->lock, flags);
	do {
		head = &area->free_list;
		curr = head->next;

		if (curr != head) {
			unsigned int index;

			page = list_entry(curr, struct page, list);
			if (BAD_RANGE(zone,page))
				BUG();
			list_del(curr);
			index = page - zone->zone_mem_map;
			if (curr_order != MAX_ORDER-1)
				MARK_USED(index, curr_order, area);
			zone->free_pages -= 1UL << order;

			page = expand(zone, page, index, order, curr_order, area);
			spin_unlock_irqrestore(&zone->lock, flags);

			set_page_count(page, 1);
			if (BAD_RANGE(zone,page))
				BUG();
			if (PageActive(page))
				BUG();
			return page;
		}
		curr_order++;
		area++;
	} while (curr_order < MAX_ORDER);
	spin_unlock_irqrestore(&zone->lock, flags);

	return NULL;
}
#ifndef CONFIG_DISCONTIGMEM
struct page * fastcall _alloc_pages(unsigned int gfp_mask, unsigned int order)
{
	return __alloc_pages(gfp_mask, order,
		contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK));
}
#endif
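/*
 * Illustrative note (not in the original source): the low GFP bits
 * select a per-type fallback zonelist, so e.g. a request with
 * __GFP_HIGHMEM set gets the HighMem -> Normal -> DMA list that
 * build_zonelists() constructs below.
 */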
static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *));
static struct page * fastcall balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed)
{
	struct page * page = NULL;
	int __freed;

	if (in_interrupt())
		BUG();

	current->allocation_order = order;
	current->flags |= PF_MEMALLOC | PF_FREE_PAGES;

	__freed = try_to_free_pages_zone(classzone, gfp_mask);

	current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);

	if (current->nr_local_pages) {
		struct list_head * entry, * local_pages;
		struct page * tmp;
		int nr_pages;

		local_pages = &current->local_pages;

		if (likely(__freed)) {
			/* pick from the last inserted so we're lifo */
			entry = local_pages->next;
			do {
				tmp = list_entry(entry, struct page, list);
				if (tmp->index == order && memclass(page_zone(tmp), classzone)) {
					list_del(entry);
					current->nr_local_pages--;
					set_page_count(tmp, 1);
					page = tmp;

					if (!VALID_PAGE(page))
						BUG();
					if (PageLocked(page))
						BUG();
					if (PageActive(page))
						BUG();

					break;
				}
			} while ((entry = entry->next) != local_pages);
		}

		nr_pages = current->nr_local_pages;
		/* free in reverse order so that the global order will be lifo */
		while ((entry = local_pages->prev) != local_pages) {
			list_del(entry);
			tmp = list_entry(entry, struct page, list);
			__free_pages_ok(tmp, tmp->index);
			if (!nr_pages--)
				BUG();
		}
		current->nr_local_pages = 0;
	}

	*freed = __freed;
	return page;
}
static inline unsigned long zone_free_pages(zone_t * zone, unsigned int order)
{
	long free = zone->free_pages - (1UL << order);
	return free >= 0 ? free : 0;
}
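/*
 * Example (illustrative): with zone->free_pages = 40 and an order-3
 * request, this returns 40 - 8 = 32, i.e. the pages that would remain
 * after the allocation, for comparison against the class watermarks.
 */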
/*
 * This is the 'heart' of the zoned buddy allocator:
 */
struct page * fastcall __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
{
	zone_t **zone, * classzone;
	struct page * page;
	int freed, class_idx;

	zone = zonelist->zones;
	classzone = *zone;
	class_idx = zone_idx(classzone);

	for (;;) {
		zone_t *z = *(zone++);
		if (!z)
			break;

		if (zone_free_pages(z, order) > z->watermarks[class_idx].low) {
			page = rmqueue(z, order);
			if (page)
				return page;
		}
	}

	classzone->need_balance = 1;
	mb();
	if (waitqueue_active(&kswapd_wait))
		wake_up_interruptible(&kswapd_wait);

	zone = zonelist->zones;
	for (;;) {
		unsigned long min;
		zone_t *z = *(zone++);
		if (!z)
			break;

		min = z->watermarks[class_idx].min;
		if (!(gfp_mask & __GFP_WAIT))
			min >>= 2;
		if (zone_free_pages(z, order) > min) {
			page = rmqueue(z, order);
			if (page)
				return page;
		}
	}

	/* here we're in the low on memory slow path */

	if ((current->flags & PF_MEMALLOC) &&
			(!in_interrupt() || (current->flags & PF_MEMDIE))) {
		zone = zonelist->zones;
		for (;;) {
			zone_t *z = *(zone++);
			if (!z)
				break;

			page = rmqueue(z, order);
			if (page)
				return page;
		}
		return NULL;
	}

	/* Atomic allocations - we can't balance anything */
	if (!(gfp_mask & __GFP_WAIT))
		goto out;

 rebalance:
	page = balance_classzone(classzone, gfp_mask, order, &freed);
	if (page)
		return page;

	zone = zonelist->zones;
	if (likely(freed)) {
		for (;;) {
			zone_t *z = *(zone++);
			if (!z)
				break;

			if (zone_free_pages(z, order) > z->watermarks[class_idx].min) {
				page = rmqueue(z, order);
				if (page)
					return page;
			}
		}
		goto rebalance;
	} else {
		/*
		 * Check that no other task has been killed in the meanwhile;
		 * in such a case we can succeed the allocation.
		 */
		for (;;) {
			zone_t *z = *(zone++);
			if (!z)
				break;

			if (zone_free_pages(z, order) > z->watermarks[class_idx].high) {
				page = rmqueue(z, order);
				if (page)
					return page;
			}
		}
	}

 out:
	printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i)\n",
	       order, gfp_mask, !!(current->flags & PF_MEMALLOC));
	if (unlikely(vm_gfp_debug))
		dump_stack();
	return NULL;
}
/*
 * Common helper functions.
 */
fastcall unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
{
	struct page * page;

	page = alloc_pages(gfp_mask, order);
	if (!page)
		return 0;
	return (unsigned long) page_address(page);
}

fastcall unsigned long get_zeroed_page(unsigned int gfp_mask)
{
	struct page * page;

	page = alloc_pages(gfp_mask, 0);
	if (page) {
		void *address = page_address(page);
		clear_page(address);
		return (unsigned long) address;
	}
	return 0;
}

fastcall void __free_pages(struct page *page, unsigned int order)
{
	if (!PageReserved(page) && put_page_testzero(page))
		__free_pages_ok(page, order);
}

fastcall void free_pages(unsigned long addr, unsigned int order)
{
	if (addr != 0)
		__free_pages(virt_to_page(addr), order);
}
/*
 * Total amount of free (allocatable) RAM:
 */
unsigned int nr_free_pages (void)
{
	unsigned int sum = 0;
	zone_t *zone;

	for_each_zone(zone)
		sum += zone->free_pages;

	return sum;
}
/*
 * Amount of free RAM allocatable as buffer memory:
 */
unsigned int nr_free_buffer_pages (void)
{
	pg_data_t *pgdat;
	unsigned int sum = 0;
	zonelist_t *zonelist;
	zone_t **zonep, *zone;

	for_each_pgdat(pgdat) {
		int class_idx;
		zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK);
		zonep = zonelist->zones;
		zone = *zonep;
		class_idx = zone_idx(zone);

		sum += zone->nr_cache_pages;
		for (; zone; zone = *zonep++) {
			int free = zone->free_pages - zone->watermarks[class_idx].high;
			if (free <= 0)
				continue;
			sum += free;
		}
	}

	return sum;
}
unsigned int nr_free_highpages (void)
{
	pg_data_t *pgdat;
	unsigned int pages = 0;

	for_each_pgdat(pgdat)
		pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;

	return pages;
}
unsigned int freeable_lowmem(void)
{
	unsigned int pages = 0;
	pg_data_t *pgdat;

	for_each_pgdat(pgdat) {
		pages += pgdat->node_zones[ZONE_DMA].free_pages;
		pages += pgdat->node_zones[ZONE_DMA].nr_active_pages;
		pages += pgdat->node_zones[ZONE_DMA].nr_inactive_pages;
		pages += pgdat->node_zones[ZONE_NORMAL].free_pages;
		pages += pgdat->node_zones[ZONE_NORMAL].nr_active_pages;
		pages += pgdat->node_zones[ZONE_NORMAL].nr_inactive_pages;
	}

	return pages;
}
#define K(x) ((x) << (PAGE_SHIFT-10))

/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 */
void show_free_areas_core(pg_data_t *pgdat)
{
	unsigned int order;
	unsigned type;
	pg_data_t *tmpdat = pgdat;

	printk("Free pages:      %6dkB (%6dkB HighMem)\n",
		K(nr_free_pages()),
		K(nr_free_highpages()));

	while (tmpdat) {
		zone_t *zone;
		for (zone = tmpdat->node_zones;
				zone < tmpdat->node_zones + MAX_NR_ZONES; zone++)
			printk("Zone:%s freepages:%6lukB\n",
					zone->name,
					K(zone->free_pages));

		tmpdat = tmpdat->node_next;
	}

	printk("( Active: %d, inactive: %d, free: %d )\n",
	       nr_active_pages,
	       nr_inactive_pages,
	       nr_free_pages());

	for (type = 0; type < MAX_NR_ZONES; type++) {
		struct list_head *head, *curr;
		zone_t *zone = pgdat->node_zones + type;
		unsigned long nr, total, flags;

		total = 0;
		if (zone->size) {
			spin_lock_irqsave(&zone->lock, flags);
			for (order = 0; order < MAX_ORDER; order++) {
				head = &(zone->free_area + order)->free_list;
				curr = head;
				nr = 0;
				for (;;) {
					if ((curr = curr->next) == head)
						break;
					nr++;
				}
				total += nr * (1 << order);
				printk("%lu*%lukB ", nr, K(1UL) << order);
			}
			spin_unlock_irqrestore(&zone->lock, flags);
		}
		printk("= %lukB)\n", K(total));
	}

#ifdef SWAP_CACHE_INFO
	show_swap_cache_info();
#endif
}

void show_free_areas(void)
{
	show_free_areas_core(pgdat_list);
}
/*
 * Builds allocation fallback zone lists.
 */
static inline void build_zonelists(pg_data_t *pgdat)
{
	int i, j, k;

	for (i = 0; i <= GFP_ZONEMASK; i++) {
		zonelist_t *zonelist;
		zone_t *zone;

		zonelist = pgdat->node_zonelists + i;
		memset(zonelist, 0, sizeof(*zonelist));

		j = 0;
		k = ZONE_NORMAL;
		if (i & __GFP_HIGHMEM)
			k = ZONE_HIGHMEM;
		if (i & __GFP_DMA)
			k = ZONE_DMA;

		switch (k) {
			default:
				BUG();
			/*
			 * fallthrough:
			 */
			case ZONE_HIGHMEM:
				zone = pgdat->node_zones + ZONE_HIGHMEM;
				if (zone->size) {
#ifndef CONFIG_HIGHMEM
					BUG();
#endif
					zonelist->zones[j++] = zone;
				}
			case ZONE_NORMAL:
				zone = pgdat->node_zones + ZONE_NORMAL;
				if (zone->size)
					zonelist->zones[j++] = zone;
			case ZONE_DMA:
				zone = pgdat->node_zones + ZONE_DMA;
				if (zone->size)
					zonelist->zones[j++] = zone;
		}
		zonelist->zones[j++] = NULL;
	}
}
/*
 * Helper functions to size the waitqueue hash table.
 * Essentially these want to choose hash table sizes sufficiently
 * large so that collisions trying to wait on pages are rare.
 * But in fact, the number of active page waitqueues on typical
 * systems is ridiculously low, less than 200. So this is even
 * conservative, even though it seems large.
 *
 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
 * waitqueues, i.e. the size of the waitq table given the number of pages.
 */
#define PAGES_PER_WAITQUEUE	256
static inline unsigned long wait_table_size(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;

	while (size < pages)
		size <<= 1;

	/*
	 * Once we have dozens or even hundreds of threads sleeping
	 * on IO we've got bigger problems than wait queue collision.
	 * Limit the size of the wait table to a reasonable size.
	 */
	size = min(size, 4096UL);

	return size;
}
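/*
 * Example (illustrative, assuming 4KiB pages): a 1GiB zone has 262144
 * pages, giving 262144/256 = 1024, so the table is sized to the next
 * power of two, 1024 entries; anything above 4096 is clamped.
 */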
/*
 * This is an integer logarithm so that shifts can be used later
 * to extract the more random high bits from the multiplicative
 * hash function before the remainder is taken.
 */
static inline unsigned long wait_table_bits(unsigned long size)
{
	return ffz(~size);
}
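/*
 * Example (illustrative): wait_table_bits(1024) == 10, so
 * wait_table_shift becomes BITS_PER_LONG - 10 and the hash keeps the
 * top 10 bits of the multiplicative hash as the table index.
 */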
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))

/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 */
void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
	unsigned long *zones_size, unsigned long zone_start_paddr,
	unsigned long *zholes_size, struct page *lmem_map)
{
	unsigned long i, j;
	unsigned long map_size;
	unsigned long totalpages, offset, realtotalpages;
	const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);

	if (zone_start_paddr & ~PAGE_MASK)
		BUG();

	totalpages = 0;
	for (i = 0; i < MAX_NR_ZONES; i++) {
		unsigned long size = zones_size[i];
		totalpages += size;
	}
	realtotalpages = totalpages;
	if (zholes_size)
		for (i = 0; i < MAX_NR_ZONES; i++)
			realtotalpages -= zholes_size[i];

	printk("On node %d totalpages: %lu\n", nid, realtotalpages);
	/*
	 * Some architectures (with lots of mem and discontiguous memory
	 * maps) have to search for a good mem_map area:
	 * For discontigmem, the conceptual mem map array starts from
	 * PAGE_OFFSET, we need to align the actual array onto a mem map
	 * boundary, so that MAP_NR works.
	 */
	map_size = (totalpages + 1)*sizeof(struct page);
	if (lmem_map == (struct page *)0) {
		lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
		lmem_map = (struct page *)(PAGE_OFFSET +
			MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
	}
	*gmap = pgdat->node_mem_map = lmem_map;
	pgdat->node_size = totalpages;
	pgdat->node_start_paddr = zone_start_paddr;
	pgdat->node_start_mapnr = (lmem_map - mem_map);
	pgdat->nr_zones = 0;
	offset = lmem_map - mem_map;
	for (j = 0; j < MAX_NR_ZONES; j++) {
		zone_t *zone = pgdat->node_zones + j;
		unsigned long mask;
		unsigned long size, realsize;
		int idx;

		zone_table[nid * MAX_NR_ZONES + j] = zone;
		realsize = size = zones_size[j];
		if (zholes_size)
			realsize -= zholes_size[j];

		printk("zone(%lu): %lu pages.\n", j, size);
		zone->size = size;
		zone->realsize = realsize;
		zone->name = zone_names[j];
		zone->lock = SPIN_LOCK_UNLOCKED;
		zone->zone_pgdat = pgdat;
		zone->free_pages = 0;
		zone->need_balance = 0;
		zone->nr_active_pages = zone->nr_inactive_pages = 0;

		if (!size)
			continue;

		/*
		 * The per-page waitqueue mechanism uses hashed waitqueues
		 * per zone.
		 */
		zone->wait_table_size = wait_table_size(size);
		zone->wait_table_shift =
			BITS_PER_LONG - wait_table_bits(zone->wait_table_size);
		zone->wait_table = (wait_queue_head_t *)
			alloc_bootmem_node(pgdat, zone->wait_table_size
						* sizeof(wait_queue_head_t));

		for(i = 0; i < zone->wait_table_size; ++i)
			init_waitqueue_head(zone->wait_table + i);

		pgdat->nr_zones = j+1;

		mask = (realsize / zone_balance_ratio[j]);
		if (mask < zone_balance_min[j])
			mask = zone_balance_min[j];
		else if (mask > zone_balance_max[j])
			mask = zone_balance_max[j];
		zone->watermarks[j].min = mask;
		zone->watermarks[j].low = mask*2;
		zone->watermarks[j].high = mask*3;
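		/*
		 * Worked example (illustrative): a Normal zone with
		 * realsize 16384 pages and ratio 128 gets mask 128,
		 * i.e. min=128, low=256, high=384 pages, with mask
		 * clamped to [zone_balance_min, zone_balance_max].
		 */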
		/* now set the watermarks of the lower zones in the "j" classzone */
		for (idx = j-1; idx >= 0; idx--) {
			zone_t * lower_zone = pgdat->node_zones + idx;
			unsigned long lower_zone_reserve;
			if (!lower_zone->size)
				continue;

			mask = lower_zone->watermarks[idx].min;
			lower_zone->watermarks[j].min = mask;
			lower_zone->watermarks[j].low = mask*2;
			lower_zone->watermarks[j].high = mask*3;

			/* now the trickier part: add the per-classzone reserve */
			lower_zone_reserve = realsize / lower_zone_reserve_ratio[idx];
			lower_zone->watermarks[j].min += lower_zone_reserve;
			lower_zone->watermarks[j].low += lower_zone_reserve;
			lower_zone->watermarks[j].high += lower_zone_reserve;

			realsize += lower_zone->realsize;
		}
		zone->zone_mem_map = mem_map + offset;
		zone->zone_start_mapnr = offset;
		zone->zone_start_paddr = zone_start_paddr;

		if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1))
			printk("BUG: wrong zone alignment, it will crash\n");

		/*
		 * Initially all pages are reserved - free ones are freed
		 * up by free_all_bootmem() once the early boot process is
		 * done. Non-atomic initialization, single-pass.
		 */
		for (i = 0; i < size; i++) {
			struct page *page = mem_map + offset + i;
			set_page_zone(page, nid * MAX_NR_ZONES + j);
			set_page_count(page, 0);
			SetPageReserved(page);
			INIT_LIST_HEAD(&page->list);
			if (j != ZONE_HIGHMEM)
				set_page_address(page, __va(zone_start_paddr));
			zone_start_paddr += PAGE_SIZE;
		}
		offset += size;
		for (i = 0; ; i++) {
			unsigned long bitmap_size;

			INIT_LIST_HEAD(&zone->free_area[i].free_list);
			if (i == MAX_ORDER-1) {
				zone->free_area[i].map = NULL;
				break;
			}

			/*
			 * Page buddy system uses "index >> (i+1)",
			 * where "index" is at most "size-1".
			 *
			 * The extra "+3" is to round down to byte
			 * size (8 bits per byte assumption). Thus
			 * we get "(size-1) >> (i+4)" as the last byte
			 * we can access.
			 *
			 * The "+1" is because we want to round the
			 * byte allocation up rather than down. So
			 * we should have had a "+7" before we shifted
			 * down by three. Also, we have to add one as
			 * we actually _use_ the last bit (it's [0,n]
			 * inclusive, not [0,n[).
			 *
			 * So we actually had +7+1 before we shift
			 * down by 3. But (n+8) >> 3 == (n >> 3) + 1
			 * (modulo overflows, which we do not have).
			 *
			 * Finally, we LONG_ALIGN because all bitmap
			 * operations are on longs.
			 */
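			/*
			 * Worked example (illustrative): for a zone of
			 * 65536 pages at order i=0, the map needs
			 * 65536/2 = 32768 bits; (65536-1) >> 4 = 4095
			 * is the last byte, and LONG_ALIGN(4095+1)
			 * allocates 4096 bytes.
			 */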
			bitmap_size = (size-1) >> (i+4);
			bitmap_size = LONG_ALIGN(bitmap_size+1);
			zone->free_area[i].map =
			  (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
		}
	}
	build_zonelists(pgdat);
}
void __init free_area_init(unsigned long *zones_size)
{
	free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
}
static int __init setup_mem_frac(char *str)
{
	int j = 0;

	while (get_option(&str, &zone_balance_ratio[j++]) == 2);
	printk("setup_mem_frac: ");
	for (j = 0; j < MAX_NR_ZONES; j++) printk("%d  ", zone_balance_ratio[j]);
	printk("\n");
	return 1;
}

__setup("memfrac=", setup_mem_frac);
static int __init setup_lower_zone_reserve(char *str)
{
	int j = 0;

	while (get_option(&str, &lower_zone_reserve_ratio[j++]) == 2);
	printk("setup_lower_zone_reserve: ");
	for (j = 0; j < MAX_NR_ZONES-1; j++) printk("%d  ", lower_zone_reserve_ratio[j]);
	printk("\n");
	return 1;
}

__setup("lower_zone_reserve=", setup_lower_zone_reserve);