/*
 *  linux/mm/page_alloc.c
 *
 *  Manages the free list, the system allocates free pages here.
 *  Note that kmalloc() lives in slab.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 */

#include <linux/config.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/slab.h>
#include <linux/module.h>

int nr_swap_pages;
int nr_active_pages;
int nr_inactive_pages;
LIST_HEAD(inactive_list);
LIST_HEAD(active_list);
pg_data_t *pgdat_list;

/*
 * The zone_table array is used to look up the address of the
 * struct zone corresponding to a given zone number (ZONE_DMA,
 * ZONE_NORMAL, or ZONE_HIGHMEM).
 */
zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES];
EXPORT_SYMBOL(zone_table);

static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, };
static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20, 20, 20, };
static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255, 255, 255, };
static int lower_zone_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };

int vm_gfp_debug = 0;

static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order));

static spinlock_t free_pages_ok_no_irq_lock = SPIN_LOCK_UNLOCKED;
struct page * free_pages_ok_no_irq_head;

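/*
 * Pages still on the LRU cannot be freed from interrupt context
 * (lru_cache_del() must not run there), so __free_pages_ok() parks
 * them on free_pages_ok_no_irq_head and this handler, deferred to
 * process context via schedule_task(), performs the actual free.
 */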
static void do_free_pages_ok_no_irq(void * arg)
{
        struct page * page, * __page;

        spin_lock_irq(&free_pages_ok_no_irq_lock);

        page = free_pages_ok_no_irq_head;
        free_pages_ok_no_irq_head = NULL;

        spin_unlock_irq(&free_pages_ok_no_irq_lock);

        while (page) {
                __page = page;
                page = page->next_hash;
                __free_pages_ok(__page, __page->index);
        }
}

static struct tq_struct free_pages_ok_no_irq_task = {
        .routine        = do_free_pages_ok_no_irq,
};


/*
 * Temporary debugging check.
 */
#define BAD_RANGE(zone, page)                                           \
(                                                                       \
        (((page) - mem_map) >= ((zone)->zone_start_mapnr+(zone)->size)) \
        || (((page) - mem_map) < (zone)->zone_start_mapnr)              \
        || ((zone) != page_zone(page))                                  \
)

/*
 * Freeing function for a buddy system allocator.
 * Contrary to prior comments, this is *NOT* hairy, and there
 * is no reason for anyone not to understand it.
 *
 * The concept of a buddy system is to maintain direct-mapped tables
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep one bit for each pair of blocks, which
 * is set to 1 iff only one of the pair is allocated.  So when we
 * are allocating or freeing one, we can derive the state of the
 * other.  That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 *
 * -- wli
 */

static void fastcall __free_pages_ok (struct page *page, unsigned int order)
{
        unsigned long index, page_idx, mask, flags;
        free_area_t *area;
        struct page *base;
        zone_t *zone;

        /*
         * Yes, think what happens when other parts of the kernel take
         * a reference to a page in order to pin it for io. -ben
         */
        if (PageLRU(page)) {
                if (unlikely(in_interrupt())) {
                        unsigned long flags;

                        spin_lock_irqsave(&free_pages_ok_no_irq_lock, flags);
                        page->next_hash = free_pages_ok_no_irq_head;
                        free_pages_ok_no_irq_head = page;
                        page->index = order;

                        spin_unlock_irqrestore(&free_pages_ok_no_irq_lock, flags);

                        schedule_task(&free_pages_ok_no_irq_task);
                        return;
                }

                lru_cache_del(page);
        }

        if (page->buffers)
                BUG();
        if (page->mapping)
                BUG();
        if (!VALID_PAGE(page))
                BUG();
        if (PageLocked(page))
                BUG();
        if (PageActive(page))
                BUG();
        ClearPageReferenced(page);
        ClearPageDirty(page);

        if (current->flags & PF_FREE_PAGES)
                goto local_freelist;
 back_local_freelist:

        zone = page_zone(page);

        mask = (~0UL) << order;
        base = zone->zone_mem_map;
        page_idx = page - base;
        if (page_idx & ~mask)
                BUG();
        index = page_idx >> (1 + order);

        area = zone->free_area + order;

        spin_lock_irqsave(&zone->lock, flags);

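        /* mask == -(1UL << order), so subtracting it adds 1 << order free pages */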
        zone->free_pages -= mask;

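        /* coalesce upward until the buddy is still allocated or order MAX_ORDER-1 is reached */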
        while (mask + (1 << (MAX_ORDER-1))) {
                struct page *buddy1, *buddy2;

                if (area >= zone->free_area + MAX_ORDER)
                        BUG();
                if (!__test_and_change_bit(index, area->map))
                        /*
                         * the buddy page is still allocated.
                         */
                        break;
                /*
                 * Move the buddy up one level.
                 * This code is taking advantage of the identity:
                 *      -mask = 1+~mask
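                 * so that, at the current level, -mask is exactly the
                 * block size and the buddy of page_idx is page_idx ^ -mask
                 * (the block-size bit flipped).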
                 */
                buddy1 = base + (page_idx ^ -mask);
                buddy2 = base + page_idx;
                if (BAD_RANGE(zone,buddy1))
                        BUG();
                if (BAD_RANGE(zone,buddy2))
                        BUG();

                list_del(&buddy1->list);
                mask <<= 1;
                area++;
                index >>= 1;
                page_idx &= mask;
        }
        list_add(&(base + page_idx)->list, &area->free_list);

        spin_unlock_irqrestore(&zone->lock, flags);
        return;

 local_freelist:
        if (current->nr_local_pages)
                goto back_local_freelist;
        if (in_interrupt())
                goto back_local_freelist;

        list_add(&page->list, &current->local_pages);
        page->index = order;
        current->nr_local_pages++;
}

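/*
 * Flip the buddy-map bit for the order-"order" pair that contains
 * "index": one bit per pair of blocks, set when exactly one of the
 * two buddies is free.
 */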
#define MARK_USED(index, order, area) \
        __change_bit((index) >> (1+(order)), (area)->map)

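/*
 * Split a block of order "high" down to order "low".  At each step the
 * lower half of the current block is handed back to the next-lower
 * free list (with its buddy bit toggled via MARK_USED) and allocation
 * continues in the upper half.  Returns the page that starts the final
 * order-"low" block.
 */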
static inline struct page * expand (zone_t *zone, struct page *page,
         unsigned long index, int low, int high, free_area_t * area)
{
        unsigned long size = 1 << high;

        while (high > low) {
                if (BAD_RANGE(zone,page))
                        BUG();
                area--;
                high--;
                size >>= 1;
                list_add(&(page)->list, &(area)->free_list);
                MARK_USED(index, high, area);
                index += size;
                page += size;
        }
        if (BAD_RANGE(zone,page))
                BUG();
        return page;
}

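/*
 * Take a block of at least the requested order off the zone's free
 * lists: search upward from "order", and if only a larger block is
 * available, split it back down with expand().  Returns NULL when the
 * zone has no block large enough.
 */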
static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order));
static struct page * fastcall rmqueue(zone_t *zone, unsigned int order)
{
        free_area_t * area = zone->free_area + order;
        unsigned int curr_order = order;
        struct list_head *head, *curr;
        unsigned long flags;
        struct page *page;

        spin_lock_irqsave(&zone->lock, flags);
        do {
                head = &area->free_list;
                curr = head->next;

                if (curr != head) {
                        unsigned int index;

                        page = list_entry(curr, struct page, list);
                        if (BAD_RANGE(zone,page))
                                BUG();
                        list_del(curr);
                        index = page - zone->zone_mem_map;
                        if (curr_order != MAX_ORDER-1)
                                MARK_USED(index, curr_order, area);
                        zone->free_pages -= 1UL << order;

                        page = expand(zone, page, index, order, curr_order, area);
                        spin_unlock_irqrestore(&zone->lock, flags);

                        set_page_count(page, 1);
                        if (BAD_RANGE(zone,page))
                                BUG();
                        if (PageLRU(page))
                                BUG();
                        if (PageActive(page))
                                BUG();
                        return page;
                }
                curr_order++;
                area++;
        } while (curr_order < MAX_ORDER);
        spin_unlock_irqrestore(&zone->lock, flags);

        return NULL;
}

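/*
 * With a contiguous memory map there is only one node, so every
 * allocation goes through contig_page_data's zonelist for the
 * requested GFP zone mask.
 */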
#ifndef CONFIG_DISCONTIGMEM
struct page * fastcall _alloc_pages(unsigned int gfp_mask, unsigned int order)
{
        return __alloc_pages(gfp_mask, order,
                contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK));
}
#endif

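/*
 * Synchronous reclaim for "classzone": run try_to_free_pages_zone()
 * with PF_FREE_PAGES set so that pages freed by this task are kept on
 * current->local_pages, then try to pick a block of the requested
 * order from that local list; whatever remains is given back to the
 * buddy allocator.  *freed reports whether reclaim made any progress.
 */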
static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *));
static struct page * fastcall balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed)
{
        struct page * page = NULL;
        int __freed;

        if (in_interrupt())
                BUG();

        current->allocation_order = order;
        current->flags |= PF_MEMALLOC | PF_FREE_PAGES;

        __freed = try_to_free_pages_zone(classzone, gfp_mask);

        current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);

        if (current->nr_local_pages) {
                struct list_head * entry, * local_pages;
                struct page * tmp;
                int nr_pages;

                local_pages = &current->local_pages;

                if (likely(__freed)) {
                        /* pick from the last inserted so we're lifo */
                        entry = local_pages->next;
                        do {
                                tmp = list_entry(entry, struct page, list);
                                if (tmp->index == order && memclass(page_zone(tmp), classzone)) {
                                        list_del(entry);
                                        current->nr_local_pages--;
                                        set_page_count(tmp, 1);
                                        page = tmp;

                                        if (page->buffers)
                                                BUG();
                                        if (page->mapping)
                                                BUG();
                                        if (!VALID_PAGE(page))
                                                BUG();
                                        if (PageLocked(page))
                                                BUG();
                                        if (PageLRU(page))
                                                BUG();
                                        if (PageActive(page))
                                                BUG();
                                        if (PageDirty(page))
                                                BUG();

                                        break;
                                }
                        } while ((entry = entry->next) != local_pages);
                }

                nr_pages = current->nr_local_pages;
                /* free in reverse order so that the global order will be lifo */
                while ((entry = local_pages->prev) != local_pages) {
                        list_del(entry);
                        tmp = list_entry(entry, struct page, list);
                        __free_pages_ok(tmp, tmp->index);
                        if (!nr_pages--)
                                BUG();
                }
                current->nr_local_pages = 0;
        }

        *freed = __freed;
        return page;
}

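/*
 * Free pages the zone would have left after an order-"order"
 * allocation, clamped at zero for the watermark comparisons below.
 */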
static inline unsigned long zone_free_pages(zone_t * zone, unsigned int order)
{
        long free = zone->free_pages - (1UL << order);
        return free >= 0 ? free : 0;
}

/*
 * This is the 'heart' of the zoned buddy allocator:
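 *
 * The fallback zonelist is walked several times, each pass with a
 * weaker requirement: first against the per-classzone "low" watermark,
 * then (after waking kswapd) against "min" (relaxed to a quarter for
 * allocations that cannot sleep), then with no watermark at all for
 * PF_MEMALLOC callers, and finally the caller itself reclaims memory
 * through balance_classzone() and retries.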
 */
struct page * fastcall __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
{
        zone_t **zone, * classzone;
        struct page * page;
        int freed, class_idx;

        zone = zonelist->zones;
        classzone = *zone;
        class_idx = zone_idx(classzone);

        for (;;) {
                zone_t *z = *(zone++);
                if (!z)
                        break;

                if (zone_free_pages(z, order) > z->watermarks[class_idx].low) {
                        page = rmqueue(z, order);
                        if (page)
                                return page;
                }
        }

        classzone->need_balance = 1;
        mb();
        if (waitqueue_active(&kswapd_wait))
                wake_up_interruptible(&kswapd_wait);

        zone = zonelist->zones;
        for (;;) {
                unsigned long min;
                zone_t *z = *(zone++);
                if (!z)
                        break;

                min = z->watermarks[class_idx].min;
                if (!(gfp_mask & __GFP_WAIT))
                        min >>= 2;
                if (zone_free_pages(z, order) > min) {
                        page = rmqueue(z, order);
                        if (page)
                                return page;
                }
        }

        /* here we're in the low on memory slow path */

        if ((current->flags & PF_MEMALLOC) &&
                        (!in_interrupt() || (current->flags & PF_MEMDIE))) {
                zone = zonelist->zones;
                for (;;) {
                        zone_t *z = *(zone++);
                        if (!z)
                                break;

                        page = rmqueue(z, order);
                        if (page)
                                return page;
                }
                return NULL;
        }

        /* Atomic allocations - we can't balance anything */
        if (!(gfp_mask & __GFP_WAIT))
                goto out;

 rebalance:
        page = balance_classzone(classzone, gfp_mask, order, &freed);
        if (page)
                return page;

        zone = zonelist->zones;
        if (likely(freed)) {
                for (;;) {
                        zone_t *z = *(zone++);
                        if (!z)
                                break;

                        if (zone_free_pages(z, order) > z->watermarks[class_idx].min) {
                                page = rmqueue(z, order);
                                if (page)
                                        return page;
                        }
                }
                goto rebalance;
        } else {
                /*
                 * Check whether another task has been killed in the
                 * meantime; in that case the allocation can succeed.
                 */
                for (;;) {
                        zone_t *z = *(zone++);
                        if (!z)
                                break;

                        if (zone_free_pages(z, order) > z->watermarks[class_idx].high) {
                                page = rmqueue(z, order);
                                if (page)
                                        return page;
                        }
                }
        }

 out:
        printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i)\n",
               order, gfp_mask, !!(current->flags & PF_MEMALLOC));
        if (unlikely(vm_gfp_debug))
                dump_stack();
        return NULL;
}

/*
 * Common helper functions.
 */
fastcall unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
{
        struct page * page;

        page = alloc_pages(gfp_mask, order);
        if (!page)
                return 0;
        return (unsigned long) page_address(page);
}

fastcall unsigned long get_zeroed_page(unsigned int gfp_mask)
{
        struct page * page;

        page = alloc_pages(gfp_mask, 0);
        if (page) {
                void *address = page_address(page);
                clear_page(address);
                return (unsigned long) address;
        }
        return 0;
}

fastcall void __free_pages(struct page *page, unsigned int order)
{
        if (!PageReserved(page) && put_page_testzero(page))
                __free_pages_ok(page, order);
}

fastcall void free_pages(unsigned long addr, unsigned int order)
{
        if (addr != 0)
                __free_pages(virt_to_page(addr), order);
}

/*
 * Total amount of free (allocatable) RAM:
 */
unsigned int nr_free_pages (void)
{
        unsigned int sum = 0;
        zone_t *zone;

        for_each_zone(zone)
                sum += zone->free_pages;

        return sum;
}

/*
 * Amount of free RAM allocatable as buffer memory:
 */
unsigned int nr_free_buffer_pages (void)
{
        pg_data_t *pgdat;
        unsigned int sum = 0;
        zonelist_t *zonelist;
        zone_t **zonep, *zone;

        for_each_pgdat(pgdat) {
                int class_idx;
                zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK);
                zonep = zonelist->zones;
                zone = *zonep;
                class_idx = zone_idx(zone);

                sum += zone->nr_cache_pages;
                for (; zone; zone = *zonep++) {
                        int free = zone->free_pages - zone->watermarks[class_idx].high;
                        if (free <= 0)
                                continue;
                        sum += free;
                }
        }

        return sum;
}

#if CONFIG_HIGHMEM
unsigned int nr_free_highpages (void)
{
        pg_data_t *pgdat;
        unsigned int pages = 0;

        for_each_pgdat(pgdat)
                pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;

        return pages;
}

unsigned int freeable_lowmem(void)
{
        unsigned int pages = 0;
        pg_data_t *pgdat;

        for_each_pgdat(pgdat) {
                pages += pgdat->node_zones[ZONE_DMA].free_pages;
                pages += pgdat->node_zones[ZONE_DMA].nr_active_pages;
                pages += pgdat->node_zones[ZONE_DMA].nr_inactive_pages;
                pages += pgdat->node_zones[ZONE_NORMAL].free_pages;
                pages += pgdat->node_zones[ZONE_NORMAL].nr_active_pages;
                pages += pgdat->node_zones[ZONE_NORMAL].nr_inactive_pages;
        }

        return pages;
}
#endif

#define K(x) ((x) << (PAGE_SHIFT-10))

/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 */
void show_free_areas_core(pg_data_t *pgdat)
{
        unsigned int order;
        unsigned type;
        pg_data_t *tmpdat = pgdat;

        printk("Free pages:      %6dkB (%6dkB HighMem)\n",
                K(nr_free_pages()),
                K(nr_free_highpages()));

        while (tmpdat) {
                zone_t *zone;
                for (zone = tmpdat->node_zones;
                                zone < tmpdat->node_zones + MAX_NR_ZONES; zone++)
                        printk("Zone:%s freepages:%6lukB\n",
                                        zone->name,
                                        K(zone->free_pages));

                tmpdat = tmpdat->node_next;
        }

        printk("( Active: %d, inactive: %d, free: %d )\n",
               nr_active_pages,
               nr_inactive_pages,
               nr_free_pages());

        for (type = 0; type < MAX_NR_ZONES; type++) {
                struct list_head *head, *curr;
                zone_t *zone = pgdat->node_zones + type;
                unsigned long nr, total, flags;

                total = 0;
                if (zone->size) {
                        spin_lock_irqsave(&zone->lock, flags);
                        for (order = 0; order < MAX_ORDER; order++) {
                                head = &(zone->free_area + order)->free_list;
                                curr = head;
                                nr = 0;
                                for (;;) {
                                        if ((curr = curr->next) == head)
                                                break;
                                        nr++;
                                }
                                total += nr * (1 << order);
                                printk("%lu*%lukB ", nr, K(1UL) << order);
                        }
                        spin_unlock_irqrestore(&zone->lock, flags);
                }
                printk("= %lukB)\n", K(total));
        }

#ifdef SWAP_CACHE_INFO
        show_swap_cache_info();
#endif
}

void show_free_areas(void)
{
        show_free_areas_core(pgdat_list);
}

/*
 * Builds allocation fallback zone lists.
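 * For each GFP zone mask the list runs from the requested class down
 * through ZONE_NORMAL to ZONE_DMA, skipping zones that have no pages,
 * and is terminated by a NULL entry.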
 */
static inline void build_zonelists(pg_data_t *pgdat)
{
        int i, j, k;

        for (i = 0; i <= GFP_ZONEMASK; i++) {
                zonelist_t *zonelist;
                zone_t *zone;

                zonelist = pgdat->node_zonelists + i;
                memset(zonelist, 0, sizeof(*zonelist));

                j = 0;
                k = ZONE_NORMAL;
                if (i & __GFP_HIGHMEM)
                        k = ZONE_HIGHMEM;
                if (i & __GFP_DMA)
                        k = ZONE_DMA;

                switch (k) {
                        default:
                                BUG();
                        /*
                         * fallthrough:
                         */
                        case ZONE_HIGHMEM:
                                zone = pgdat->node_zones + ZONE_HIGHMEM;
                                if (zone->size) {
#ifndef CONFIG_HIGHMEM
                                        BUG();
#endif
                                        zonelist->zones[j++] = zone;
                                }
                        case ZONE_NORMAL:
                                zone = pgdat->node_zones + ZONE_NORMAL;
                                if (zone->size)
                                        zonelist->zones[j++] = zone;
                        case ZONE_DMA:
                                zone = pgdat->node_zones + ZONE_DMA;
                                if (zone->size)
                                        zonelist->zones[j++] = zone;
                }
                zonelist->zones[j++] = NULL;
        }
}

/*
 * Helper functions to size the waitqueue hash table.
 * Essentially these want to choose hash table sizes sufficiently
 * large so that collisions trying to wait on pages are rare.
 * But in fact, the number of active page waitqueues on typical
 * systems is ridiculously low, less than 200. So this is even
 * conservative, even though it seems large.
 *
 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
 * waitqueues, i.e. the size of the waitq table given the number of pages.
 */
#define PAGES_PER_WAITQUEUE     256

static inline unsigned long wait_table_size(unsigned long pages)
{
        unsigned long size = 1;

        pages /= PAGES_PER_WAITQUEUE;

        while (size < pages)
                size <<= 1;

        /*
         * Once we have dozens or even hundreds of threads sleeping
         * on IO we've got bigger problems than wait queue collision.
         * Limit the size of the wait table to a reasonable size.
         */
        size = min(size, 4096UL);

        return size;
}

/*
 * This is an integer logarithm so that shifts can be used later
 * to extract the more random high bits from the multiplicative
 * hash function before the remainder is taken.
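 * For the power-of-two sizes produced by wait_table_size(),
 * ffz(~size) is simply log2(size).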
 */
static inline unsigned long wait_table_bits(unsigned long size)
{
        return ffz(~size);
}

#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))

/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 */
void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
        unsigned long *zones_size, unsigned long zone_start_paddr,
        unsigned long *zholes_size, struct page *lmem_map)
{
        unsigned long i, j;
        unsigned long map_size;
        unsigned long totalpages, offset, realtotalpages;
        const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);

        if (zone_start_paddr & ~PAGE_MASK)
                BUG();

        totalpages = 0;
        for (i = 0; i < MAX_NR_ZONES; i++) {
                unsigned long size = zones_size[i];
                totalpages += size;
        }
        realtotalpages = totalpages;
        if (zholes_size)
                for (i = 0; i < MAX_NR_ZONES; i++)
                        realtotalpages -= zholes_size[i];

        printk("On node %d totalpages: %lu\n", nid, realtotalpages);

        /*
         * Some architectures (with lots of memory and discontiguous memory
         * maps) have to search for a good mem_map area:
         * For discontigmem, the conceptual mem map array starts from
         * PAGE_OFFSET, we need to align the actual array onto a mem map
         * boundary, so that MAP_NR works.
         */
        map_size = (totalpages + 1)*sizeof(struct page);
        if (lmem_map == (struct page *)0) {
                lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
                lmem_map = (struct page *)(PAGE_OFFSET +
                        MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
        }
        *gmap = pgdat->node_mem_map = lmem_map;
        pgdat->node_size = totalpages;
        pgdat->node_start_paddr = zone_start_paddr;
        pgdat->node_start_mapnr = (lmem_map - mem_map);
        pgdat->nr_zones = 0;

        offset = lmem_map - mem_map;
        for (j = 0; j < MAX_NR_ZONES; j++) {
                zone_t *zone = pgdat->node_zones + j;
                unsigned long mask;
                unsigned long size, realsize;
                int idx;

                zone_table[nid * MAX_NR_ZONES + j] = zone;
                realsize = size = zones_size[j];
                if (zholes_size)
                        realsize -= zholes_size[j];

                printk("zone(%lu): %lu pages.\n", j, size);
                zone->size = size;
                zone->realsize = realsize;
                zone->name = zone_names[j];
                zone->lock = SPIN_LOCK_UNLOCKED;
                zone->zone_pgdat = pgdat;
                zone->free_pages = 0;
                zone->need_balance = 0;
                zone->nr_active_pages = zone->nr_inactive_pages = 0;

                if (!size)
                        continue;

                /*
                 * The per-page waitqueue mechanism uses hashed waitqueues
                 * per zone.
                 */
                zone->wait_table_size = wait_table_size(size);
                zone->wait_table_shift =
                        BITS_PER_LONG - wait_table_bits(zone->wait_table_size);
                zone->wait_table = (wait_queue_head_t *)
                        alloc_bootmem_node(pgdat, zone->wait_table_size
                                                * sizeof(wait_queue_head_t));

                for(i = 0; i < zone->wait_table_size; ++i)
                        init_waitqueue_head(zone->wait_table + i);

                pgdat->nr_zones = j+1;

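                /*
                 * Per-zone base watermark: realsize / zone_balance_ratio,
                 * clamped to [zone_balance_min, zone_balance_max].  The
                 * low and high marks are fixed at 2x and 3x the min mark.
                 */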
                mask = (realsize / zone_balance_ratio[j]);
                if (mask < zone_balance_min[j])
                        mask = zone_balance_min[j];
                else if (mask > zone_balance_max[j])
                        mask = zone_balance_max[j];
                zone->watermarks[j].min = mask;
                zone->watermarks[j].low = mask*2;
                zone->watermarks[j].high = mask*3;
                /* now set the watermarks of the lower zones in the "j" classzone */
                for (idx = j-1; idx >= 0; idx--) {
                        zone_t * lower_zone = pgdat->node_zones + idx;
                        unsigned long lower_zone_reserve;
                        if (!lower_zone->size)
                                continue;

                        mask = lower_zone->watermarks[idx].min;
                        lower_zone->watermarks[j].min = mask;
                        lower_zone->watermarks[j].low = mask*2;
                        lower_zone->watermarks[j].high = mask*3;

                        /*
                         * Now the subtle part: add a reserve so that
                         * allocations from higher classzones cannot
                         * exhaust this lower zone.
                         */
                        lower_zone_reserve = realsize / lower_zone_reserve_ratio[idx];
                        lower_zone->watermarks[j].min += lower_zone_reserve;
                        lower_zone->watermarks[j].low += lower_zone_reserve;
                        lower_zone->watermarks[j].high += lower_zone_reserve;

                        realsize += lower_zone->realsize;
                }

                zone->zone_mem_map = mem_map + offset;
                zone->zone_start_mapnr = offset;
                zone->zone_start_paddr = zone_start_paddr;

                if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1))
                        printk("BUG: wrong zone alignment, it will crash\n");

                /*
                 * Initially all pages are reserved - free ones are freed
                 * up by free_all_bootmem() once the early boot process is
                 * done. Non-atomic initialization, single-pass.
                 */
                for (i = 0; i < size; i++) {
                        struct page *page = mem_map + offset + i;
                        set_page_zone(page, nid * MAX_NR_ZONES + j);
                        set_page_count(page, 0);
                        SetPageReserved(page);
                        INIT_LIST_HEAD(&page->list);
                        if (j != ZONE_HIGHMEM)
                                set_page_address(page, __va(zone_start_paddr));
                        zone_start_paddr += PAGE_SIZE;
                }

                offset += size;
                for (i = 0; ; i++) {
                        unsigned long bitmap_size;

                        INIT_LIST_HEAD(&zone->free_area[i].free_list);
                        if (i == MAX_ORDER-1) {
                                zone->free_area[i].map = NULL;
                                break;
                        }

                        /*
                         * Page buddy system uses "index >> (i+1)",
                         * where "index" is at most "size-1".
                         *
                         * The extra "+3" is to round down to byte
                         * size (8 bits per byte assumption). Thus
                         * we get "(size-1) >> (i+4)" as the last byte
                         * we can access.
                         *
                         * The "+1" is because we want to round the
                         * byte allocation up rather than down. So
                         * we should have had a "+7" before we shifted
                         * down by three. Also, we have to add one as
                         * we actually _use_ the last bit (it's [0,n]
                         * inclusive, not [0,n[).
                         *
                         * So we actually had +7+1 before we shift
                         * down by 3. But (n+8) >> 3 == (n >> 3) + 1
                         * (modulo overflows, which we do not have).
                         *
                         * Finally, we LONG_ALIGN because all bitmap
                         * operations are on longs.
                         */
                        bitmap_size = (size-1) >> (i+4);
                        bitmap_size = LONG_ALIGN(bitmap_size+1);
                        zone->free_area[i].map =
                          (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
                }
        }
        build_zonelists(pgdat);
}

void __init free_area_init(unsigned long *zones_size)
{
        free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
}

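/*
 * "memfrac=" boot option: a comma-separated list of integers that
 * overrides zone_balance_ratio[] for each zone.
 */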
static int __init setup_mem_frac(char *str)
{
        int j = 0;

        while (get_option(&str, &zone_balance_ratio[j++]) == 2);
        printk("setup_mem_frac: ");
        for (j = 0; j < MAX_NR_ZONES; j++) printk("%d  ", zone_balance_ratio[j]);
        printk("\n");
        return 1;
}

__setup("memfrac=", setup_mem_frac);

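/*
 * "lower_zone_reserve=" boot option: a comma-separated list of
 * integers that overrides lower_zone_reserve_ratio[].
 */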
static int __init setup_lower_zone_reserve(char *str)
{
        int j = 0;

        while (get_option(&str, &lower_zone_reserve_ratio[j++]) == 2);
        printk("setup_lower_zone_reserve: ");
        for (j = 0; j < MAX_NR_ZONES-1; j++) printk("%d  ", lower_zone_reserve_ratio[j]);
        printk("\n");
        return 1;
}

__setup("lower_zone_reserve=", setup_lower_zone_reserve);