/*
 *  linux/mm/page_alloc.c
 *
 *  Manages the free list, the system allocates free pages here.
 *  Note that kmalloc() lives in slab.c
 *
 *  Copyright (C) 1991, 1992, 1993, 1994  Linus Torvalds
 *  Swap reorganised 29.12.95, Stephen Tweedie
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
 *  Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
 *  Zone balancing, Kanoj Sarcar, SGI, Jan 2000
 */
#include <linux/config.h>
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/swapctl.h>
#include <linux/interrupt.h>
#include <linux/pagemap.h>
#include <linux/bootmem.h>
#include <linux/slab.h>
#include <linux/module.h>
int nr_active_pages;
int nr_inactive_pages;
LIST_HEAD(inactive_list);
LIST_HEAD(active_list);
pg_data_t *pgdat_list;
/*
 * The zone_table array is used to look up the address of the
 * struct zone corresponding to a given zone number (ZONE_DMA,
 * ZONE_NORMAL, or ZONE_HIGHMEM).
 */
zone_t *zone_table[MAX_NR_ZONES*MAX_NR_NODES];
EXPORT_SYMBOL(zone_table);
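/*
 * Illustrative note (not in the original source): entries are indexed
 * as zone_table[nid * MAX_NR_ZONES + zone_idx], the same encoding
 * passed to set_page_zone() in free_area_init_core() below.  E.g. on
 * node 0, ZONE_NORMAL (index 1) lives at zone_table[1].
 */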
static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
static int zone_balance_ratio[MAX_NR_ZONES] __initdata = { 128, 128, 128, };
static int zone_balance_min[MAX_NR_ZONES] __initdata = { 20, 20, 20, };
static int zone_balance_max[MAX_NR_ZONES] __initdata = { 255, 255, 255, };
static int lower_zone_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
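/*
 * Worked example (illustrative, not from the original source): for a
 * HighMem classzone of N pages, the Normal zone below it reserves an
 * extra N/32 pages in its watermarks and the DMA zone an extra N/256,
 * so the lower zones are not exhausted by allocations that could have
 * been satisfied from a higher zone.  See the lower_zone_reserve logic
 * in free_area_init_core().
 */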
static void FASTCALL(__free_pages_ok (struct page *page, unsigned int order));

static spinlock_t free_pages_ok_no_irq_lock = SPIN_LOCK_UNLOCKED;
struct page * free_pages_ok_no_irq_head;
static void do_free_pages_ok_no_irq(void * arg)
{
	struct page * page, * __page;

	spin_lock_irq(&free_pages_ok_no_irq_lock);

	page = free_pages_ok_no_irq_head;
	free_pages_ok_no_irq_head = NULL;

	spin_unlock_irq(&free_pages_ok_no_irq_lock);

	while (page) {
		__page = page;
		page = page->next_hash;
		/* the order was stashed in page->index by __free_pages_ok() */
		__free_pages_ok(__page, __page->index);
	}
}

static struct tq_struct free_pages_ok_no_irq_task = {
	.routine = do_free_pages_ok_no_irq,
};
/*
 * Temporary debugging check.
 */
#define BAD_RANGE(zone, page)						\
(									\
	(((page) - mem_map) >= ((zone)->zone_start_mapnr+(zone)->size))	\
	|| (((page) - mem_map) < (zone)->zone_start_mapnr)		\
	|| ((zone) != page_zone(page))					\
)
/*
 * Freeing function for a buddy system allocator.
 * Contrary to prior comments, this is *NOT* hairy, and there
 * is no reason for anyone not to understand it.
 *
 * The concept of a buddy system is to maintain direct-mapped tables
 * (containing bit values) for memory blocks of various "orders".
 * The bottom level table contains the map for the smallest allocatable
 * units of memory (here, pages), and each level above it describes
 * pairs of units from the levels below, hence, "buddies".
 * At a high level, all that happens here is marking the table entry
 * at the bottom level available, and propagating the changes upward
 * as necessary, plus some accounting needed to play nicely with other
 * parts of the VM system.
 * At each level, we keep one bit for each pair of blocks, which
 * is set to 1 iff only one of the pair is allocated. So when we
 * are allocating or freeing one, we can derive the state of the
 * other. That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
 * If a block is freed, and its buddy is also free, then this
 * triggers coalescing into a block of larger size.
 */
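/*
 * Worked example (illustrative, not part of the original comment):
 * freeing an order-0 page at index 12 toggles the bit covering the
 * pair (12,13).  If buddy 13 was already free, the pair coalesces
 * into an order-1 block at index 12, and the same test repeats one
 * level up for the pair (12,14), and so on up to MAX_ORDER-1.
 */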
static void fastcall __free_pages_ok (struct page *page, unsigned int order)
{
	unsigned long index, page_idx, mask, flags;
	free_area_t *area;
	struct page *base;
	zone_t *zone;

	/*
	 * Yes, think what happens when other parts of the kernel take
	 * a reference to a page in order to pin it for io. -ben
	 */
	if (PageLRU(page)) {
		if (unlikely(in_interrupt())) {
			/* defer the LRU removal to process context */
			spin_lock_irqsave(&free_pages_ok_no_irq_lock, flags);
			page->next_hash = free_pages_ok_no_irq_head;
			free_pages_ok_no_irq_head = page;
			page->index = order;

			spin_unlock_irqrestore(&free_pages_ok_no_irq_lock, flags);

			schedule_task(&free_pages_ok_no_irq_task);
			return;
		}

		lru_cache_del(page);
	}

	if (!VALID_PAGE(page))
		BUG();
	if (PageLocked(page))
		BUG();
	if (PageActive(page))
		BUG();
	ClearPageReferenced(page);
	ClearPageDirty(page);

	if (current->flags & PF_FREE_PAGES)
		goto local_freelist;
 back_local_freelist:

	zone = page_zone(page);

	mask = (~0UL) << order;
	base = zone->zone_mem_map;
	page_idx = page - base;
	if (page_idx & ~mask)
		BUG();
	index = page_idx >> (1 + order);

	area = zone->free_area + order;

	spin_lock_irqsave(&zone->lock, flags);

	zone->free_pages -= mask;

	while (mask + (1 << (MAX_ORDER-1))) {
		struct page *buddy1, *buddy2;

		if (area >= zone->free_area + MAX_ORDER)
			BUG();
		if (!__test_and_change_bit(index, area->map))
			/*
			 * the buddy page is still allocated.
			 */
			break;
		/*
		 * Move the buddy up one level.
		 * This code is taking advantage of the identity:
		 *	-mask = 1+~mask
		 */
		buddy1 = base + (page_idx ^ -mask);
		buddy2 = base + page_idx;
		if (BAD_RANGE(zone,buddy1))
			BUG();
		if (BAD_RANGE(zone,buddy2))
			BUG();

		list_del(&buddy1->list);
		mask <<= 1;
		area++;
		index >>= 1;
		page_idx &= mask;
	}
	list_add(&(base + page_idx)->list, &area->free_list);

	spin_unlock_irqrestore(&zone->lock, flags);
	return;

 local_freelist:
	if (current->nr_local_pages)
		goto back_local_freelist;
	if (in_interrupt())
		goto back_local_freelist;

	list_add(&page->list, &current->local_pages);
	page->index = order;
	current->nr_local_pages++;
}
#define MARK_USED(index, order, area) \
	__change_bit((index) >> (1+(order)), (area)->map)
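/*
 * Illustrative note (not in the original source): the bit for a block
 * covers the buddy *pair*, hence the ">> (1+order)".  E.g. order-0
 * pages 12 and 13 share bit 6 of the order-0 map; flipping it records
 * that exactly one of the two is allocated.
 */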
static inline struct page * expand (zone_t *zone, struct page *page,
	 unsigned long index, int low, int high, free_area_t * area)
{
	unsigned long size = 1 << high;

	while (high > low) {
		if (BAD_RANGE(zone,page))
			BUG();
		area--;
		high--;
		size >>= 1;
		list_add(&(page)->list, &(area)->free_list);
		MARK_USED(index, high, area);
		index += size;
		page += size;
	}
	if (BAD_RANGE(zone,page))
		BUG();
	return page;
}
static FASTCALL(struct page * rmqueue(zone_t *zone, unsigned int order));
static struct page * fastcall rmqueue(zone_t *zone, unsigned int order)
{
	free_area_t * area = zone->free_area + order;
	unsigned int curr_order = order;
	struct list_head *head, *curr;
	unsigned long flags;
	struct page *page;

	spin_lock_irqsave(&zone->lock, flags);
	do {
		head = &area->free_list;
		curr = head->next;

		if (curr != head) {
			unsigned int index;

			page = list_entry(curr, struct page, list);
			if (BAD_RANGE(zone,page))
				BUG();
			list_del(curr);
			index = page - zone->zone_mem_map;
			if (curr_order != MAX_ORDER-1)
				MARK_USED(index, curr_order, area);
			zone->free_pages -= 1UL << order;

			page = expand(zone, page, index, order, curr_order, area);
			spin_unlock_irqrestore(&zone->lock, flags);

			set_page_count(page, 1);
			if (BAD_RANGE(zone,page))
				BUG();
			if (PageActive(page))
				BUG();
			return page;
		}
		curr_order++;
		area++;
	} while (curr_order < MAX_ORDER);
	spin_unlock_irqrestore(&zone->lock, flags);

	return NULL;
}
#ifndef CONFIG_DISCONTIGMEM
struct page * fastcall _alloc_pages(unsigned int gfp_mask, unsigned int order)
{
	return __alloc_pages(gfp_mask, order,
		contig_page_data.node_zonelists+(gfp_mask & GFP_ZONEMASK));
}
#endif
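/*
 * Illustrative note (not in the original source): the low GFP bits
 * select a per-type fallback zonelist, so e.g. a request with
 * __GFP_HIGHMEM set gets the HighMem -> Normal -> DMA list that
 * build_zonelists() constructs below.
 */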
static struct page * FASTCALL(balance_classzone(zone_t *, unsigned int, unsigned int, int *));
static struct page * fastcall balance_classzone(zone_t * classzone, unsigned int gfp_mask, unsigned int order, int * freed)
{
	struct page * page = NULL;
	int __freed;

	if (in_interrupt())
		BUG();

	current->allocation_order = order;
	current->flags |= PF_MEMALLOC | PF_FREE_PAGES;

	__freed = try_to_free_pages_zone(classzone, gfp_mask);

	current->flags &= ~(PF_MEMALLOC | PF_FREE_PAGES);

	if (current->nr_local_pages) {
		struct list_head * entry, * local_pages;
		struct page * tmp;
		int nr_pages;

		local_pages = &current->local_pages;

		if (likely(__freed)) {
			/* pick from the last inserted so we're lifo */
			entry = local_pages->next;
			do {
				tmp = list_entry(entry, struct page, list);
				if (tmp->index == order && memclass(page_zone(tmp), classzone)) {
					list_del(entry);
					current->nr_local_pages--;
					set_page_count(tmp, 1);
					page = tmp;

					if (!VALID_PAGE(page))
						BUG();
					if (PageLocked(page))
						BUG();
					if (PageActive(page))
						BUG();

					break;
				}
			} while ((entry = entry->next) != local_pages);
		}

		nr_pages = current->nr_local_pages;
		/* free in reverse order so that the global order will be lifo */
		while ((entry = local_pages->prev) != local_pages) {
			list_del(entry);
			tmp = list_entry(entry, struct page, list);
			__free_pages_ok(tmp, tmp->index);
			if (!nr_pages--)
				BUG();
		}
		current->nr_local_pages = 0;
	}

	*freed = __freed;
	return page;
}
static inline unsigned long zone_free_pages(zone_t * zone, unsigned int order)
{
	long free = zone->free_pages - (1UL << order);
	return free >= 0 ? free : 0;
}
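/*
 * Example (illustrative): with zone->free_pages = 40 and an order-3
 * request, this returns 40 - 8 = 32, i.e. the pages that would remain
 * after the allocation, for comparison against the class watermarks.
 */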
/*
 * This is the 'heart' of the zoned buddy allocator:
 */
struct page * fastcall __alloc_pages(unsigned int gfp_mask, unsigned int order, zonelist_t *zonelist)
{
	zone_t **zone, * classzone;
	struct page * page;
	int freed, class_idx;

	zone = zonelist->zones;
	classzone = *zone;
	class_idx = zone_idx(classzone);

	for (;;) {
		zone_t *z = *(zone++);
		if (!z)
			break;

		if (zone_free_pages(z, order) > z->watermarks[class_idx].low) {
			page = rmqueue(z, order);
			if (page)
				return page;
		}
	}

	classzone->need_balance = 1;
	mb();
	if (waitqueue_active(&kswapd_wait))
		wake_up_interruptible(&kswapd_wait);

	zone = zonelist->zones;
	for (;;) {
		unsigned long min;
		zone_t *z = *(zone++);
		if (!z)
			break;

		min = z->watermarks[class_idx].min;
		if (!(gfp_mask & __GFP_WAIT))
			min >>= 2;
		if (zone_free_pages(z, order) > min) {
			page = rmqueue(z, order);
			if (page)
				return page;
		}
	}

	/* here we're in the low on memory slow path */

	if ((current->flags & PF_MEMALLOC) &&
			(!in_interrupt() || (current->flags & PF_MEMDIE))) {
		zone = zonelist->zones;
		for (;;) {
			zone_t *z = *(zone++);
			if (!z)
				break;

			page = rmqueue(z, order);
			if (page)
				return page;
		}
		return NULL;
	}

	/* Atomic allocations - we can't balance anything */
	if (!(gfp_mask & __GFP_WAIT))
		goto out;

 rebalance:
	page = balance_classzone(classzone, gfp_mask, order, &freed);
	if (page)
		return page;

	zone = zonelist->zones;
	if (likely(freed)) {
		for (;;) {
			zone_t *z = *(zone++);
			if (!z)
				break;

			if (zone_free_pages(z, order) > z->watermarks[class_idx].min) {
				page = rmqueue(z, order);
				if (page)
					return page;
			}
		}
		goto rebalance;
	} else {
		/*
		 * Check that no other task has been killed in the meanwhile;
		 * in such a case we can succeed the allocation.
		 */
		for (;;) {
			zone_t *z = *(zone++);
			if (!z)
				break;

			if (zone_free_pages(z, order) > z->watermarks[class_idx].high) {
				page = rmqueue(z, order);
				if (page)
					return page;
			}
		}
	}

 out:
	printk(KERN_NOTICE "__alloc_pages: %u-order allocation failed (gfp=0x%x/%i)\n",
	       order, gfp_mask, !!(current->flags & PF_MEMALLOC));
	if (unlikely(vm_gfp_debug))
		dump_stack();
	return NULL;
}
/*
 * Common helper functions.
 */
fastcall unsigned long __get_free_pages(unsigned int gfp_mask, unsigned int order)
{
	struct page * page;

	page = alloc_pages(gfp_mask, order);
	if (!page)
		return 0;
	return (unsigned long) page_address(page);
}

fastcall unsigned long get_zeroed_page(unsigned int gfp_mask)
{
	struct page * page;

	page = alloc_pages(gfp_mask, 0);
	if (page) {
		void *address = page_address(page);
		clear_page(address);
		return (unsigned long) address;
	}
	return 0;
}

fastcall void __free_pages(struct page *page, unsigned int order)
{
	if (!PageReserved(page) && put_page_testzero(page))
		__free_pages_ok(page, order);
}

fastcall void free_pages(unsigned long addr, unsigned int order)
{
	if (addr != 0)
		__free_pages(virt_to_page(addr), order);
}
/*
 * Total amount of free (allocatable) RAM:
 */
unsigned int nr_free_pages (void)
{
	unsigned int sum = 0;
	zone_t *zone;

	for_each_zone(zone)
		sum += zone->free_pages;

	return sum;
}
/*
 * Amount of free RAM allocatable as buffer memory:
 */
unsigned int nr_free_buffer_pages (void)
{
	pg_data_t *pgdat;
	unsigned int sum = 0;
	zonelist_t *zonelist;
	zone_t **zonep, *zone;

	for_each_pgdat(pgdat) {
		int class_idx;
		zonelist = pgdat->node_zonelists + (GFP_USER & GFP_ZONEMASK);
		zonep = zonelist->zones;
		zone = *zonep;
		class_idx = zone_idx(zone);

		sum += zone->nr_cache_pages;
		for (; zone; zone = *zonep++) {
			int free = zone->free_pages - zone->watermarks[class_idx].high;
			if (free <= 0)
				continue;
			sum += free;
		}
	}

	return sum;
}
unsigned int nr_free_highpages (void)
{
	pg_data_t *pgdat;
	unsigned int pages = 0;

	for_each_pgdat(pgdat)
		pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;

	return pages;
}
unsigned int freeable_lowmem(void)
{
	unsigned int pages = 0;
	pg_data_t *pgdat;

	for_each_pgdat(pgdat) {
		pages += pgdat->node_zones[ZONE_DMA].free_pages;
		pages += pgdat->node_zones[ZONE_DMA].nr_active_pages;
		pages += pgdat->node_zones[ZONE_DMA].nr_inactive_pages;
		pages += pgdat->node_zones[ZONE_NORMAL].free_pages;
		pages += pgdat->node_zones[ZONE_NORMAL].nr_active_pages;
		pages += pgdat->node_zones[ZONE_NORMAL].nr_inactive_pages;
	}

	return pages;
}
#define K(x) ((x) << (PAGE_SHIFT-10))

/*
 * Show free area list (used inside shift_scroll-lock stuff)
 * We also calculate the percentage fragmentation. We do this by counting the
 * memory on each free list with the exception of the first item on the list.
 */
void show_free_areas_core(pg_data_t *pgdat)
{
	unsigned int order;
	unsigned type;
	pg_data_t *tmpdat = pgdat;

	printk("Free pages:      %6dkB (%6dkB HighMem)\n",
		K(nr_free_pages()),
		K(nr_free_highpages()));

	while (tmpdat) {
		zone_t *zone;
		for (zone = tmpdat->node_zones;
				zone < tmpdat->node_zones + MAX_NR_ZONES; zone++)
			printk("Zone:%s freepages:%6lukB\n",
					zone->name,
					K(zone->free_pages));

		tmpdat = tmpdat->node_next;
	}

	printk("( Active: %d, inactive: %d, free: %d )\n",
	       nr_active_pages,
	       nr_inactive_pages,
	       nr_free_pages());

	for (type = 0; type < MAX_NR_ZONES; type++) {
		struct list_head *head, *curr;
		zone_t *zone = pgdat->node_zones + type;
		unsigned long nr, total, flags;

		total = 0;
		if (zone->size) {
			spin_lock_irqsave(&zone->lock, flags);
			for (order = 0; order < MAX_ORDER; order++) {
				head = &(zone->free_area + order)->free_list;
				curr = head;
				nr = 0;
				for (;;) {
					if ((curr = curr->next) == head)
						break;
					nr++;
				}
				total += nr * (1 << order);
				printk("%lu*%lukB ", nr, K(1UL) << order);
			}
			spin_unlock_irqrestore(&zone->lock, flags);
		}
		printk("= %lukB)\n", K(total));
	}

#ifdef SWAP_CACHE_INFO
	show_swap_cache_info();
#endif
}

void show_free_areas(void)
{
	show_free_areas_core(pgdat_list);
}
/*
 * Builds allocation fallback zone lists.
 */
static inline void build_zonelists(pg_data_t *pgdat)
{
	int i, j, k;

	for (i = 0; i <= GFP_ZONEMASK; i++) {
		zonelist_t *zonelist;
		zone_t *zone;

		zonelist = pgdat->node_zonelists + i;
		memset(zonelist, 0, sizeof(*zonelist));

		j = 0;
		k = ZONE_NORMAL;
		if (i & __GFP_HIGHMEM)
			k = ZONE_HIGHMEM;
		if (i & __GFP_DMA)
			k = ZONE_DMA;

		switch (k) {
			default:
				BUG();
			/*
			 * fallthrough:
			 */
			case ZONE_HIGHMEM:
				zone = pgdat->node_zones + ZONE_HIGHMEM;
				if (zone->size) {
#ifndef CONFIG_HIGHMEM
					BUG();
#endif
					zonelist->zones[j++] = zone;
				}
			case ZONE_NORMAL:
				zone = pgdat->node_zones + ZONE_NORMAL;
				if (zone->size)
					zonelist->zones[j++] = zone;
			case ZONE_DMA:
				zone = pgdat->node_zones + ZONE_DMA;
				if (zone->size)
					zonelist->zones[j++] = zone;
		}
		zonelist->zones[j++] = NULL;
	}
}
/*
 * Helper functions to size the waitqueue hash table.
 * Essentially these want to choose hash table sizes sufficiently
 * large so that collisions trying to wait on pages are rare.
 * But in fact, the number of active page waitqueues on typical
 * systems is ridiculously low, less than 200. So this is even
 * conservative, even though it seems large.
 *
 * The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
 * waitqueues, i.e. the size of the waitq table given the number of pages.
 */
#define PAGES_PER_WAITQUEUE	256
static inline unsigned long wait_table_size(unsigned long pages)
{
	unsigned long size = 1;

	pages /= PAGES_PER_WAITQUEUE;

	while (size < pages)
		size <<= 1;

	/*
	 * Once we have dozens or even hundreds of threads sleeping
	 * on IO we've got bigger problems than wait queue collision.
	 * Limit the size of the wait table to a reasonable size.
	 */
	size = min(size, 4096UL);

	return size;
}
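/*
 * Example (illustrative, assuming 4KiB pages): a 1GiB zone has 262144
 * pages, giving 262144/256 = 1024, so the table is sized to the next
 * power of two, 1024 entries; anything above 4096 is clamped.
 */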
/*
 * This is an integer logarithm so that shifts can be used later
 * to extract the more random high bits from the multiplicative
 * hash function before the remainder is taken.
 */
static inline unsigned long wait_table_bits(unsigned long size)
{
	return ffz(~size);
}
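/*
 * Example (illustrative): wait_table_bits(1024) == 10, so
 * wait_table_shift becomes BITS_PER_LONG - 10 and the hash keeps the
 * top 10 bits of the multiplicative hash as the table index.
 */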
#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))

/*
 * Set up the zone data structures:
 *   - mark all pages reserved
 *   - mark all memory queues empty
 *   - clear the memory bitmaps
 */
void __init free_area_init_core(int nid, pg_data_t *pgdat, struct page **gmap,
	unsigned long *zones_size, unsigned long zone_start_paddr,
	unsigned long *zholes_size, struct page *lmem_map)
{
	unsigned long i, j;
	unsigned long map_size;
	unsigned long totalpages, offset, realtotalpages;
	const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1);

	if (zone_start_paddr & ~PAGE_MASK)
		BUG();

	totalpages = 0;
	for (i = 0; i < MAX_NR_ZONES; i++) {
		unsigned long size = zones_size[i];
		totalpages += size;
	}
	realtotalpages = totalpages;
	if (zholes_size)
		for (i = 0; i < MAX_NR_ZONES; i++)
			realtotalpages -= zholes_size[i];

	printk("On node %d totalpages: %lu\n", nid, realtotalpages);
	/*
	 * Some architectures (with lots of mem and discontiguous memory
	 * maps) have to search for a good mem_map area:
	 * For discontigmem, the conceptual mem map array starts from
	 * PAGE_OFFSET, we need to align the actual array onto a mem map
	 * boundary, so that MAP_NR works.
	 */
	map_size = (totalpages + 1)*sizeof(struct page);
	if (lmem_map == (struct page *)0) {
		lmem_map = (struct page *) alloc_bootmem_node(pgdat, map_size);
		lmem_map = (struct page *)(PAGE_OFFSET +
			MAP_ALIGN((unsigned long)lmem_map - PAGE_OFFSET));
	}
	*gmap = pgdat->node_mem_map = lmem_map;
	pgdat->node_size = totalpages;
	pgdat->node_start_paddr = zone_start_paddr;
	pgdat->node_start_mapnr = (lmem_map - mem_map);
	pgdat->nr_zones = 0;
	offset = lmem_map - mem_map;
	for (j = 0; j < MAX_NR_ZONES; j++) {
		zone_t *zone = pgdat->node_zones + j;
		unsigned long mask;
		unsigned long size, realsize;
		int idx;

		zone_table[nid * MAX_NR_ZONES + j] = zone;
		realsize = size = zones_size[j];
		if (zholes_size)
			realsize -= zholes_size[j];

		printk("zone(%lu): %lu pages.\n", j, size);
		zone->size = size;
		zone->realsize = realsize;
		zone->name = zone_names[j];
		zone->lock = SPIN_LOCK_UNLOCKED;
		zone->zone_pgdat = pgdat;
		zone->free_pages = 0;
		zone->need_balance = 0;
		zone->nr_active_pages = zone->nr_inactive_pages = 0;

		if (!size)
			continue;

		/*
		 * The per-page waitqueue mechanism uses hashed waitqueues
		 * per zone.
		 */
		zone->wait_table_size = wait_table_size(size);
		zone->wait_table_shift =
			BITS_PER_LONG - wait_table_bits(zone->wait_table_size);
		zone->wait_table = (wait_queue_head_t *)
			alloc_bootmem_node(pgdat, zone->wait_table_size
						* sizeof(wait_queue_head_t));

		for(i = 0; i < zone->wait_table_size; ++i)
			init_waitqueue_head(zone->wait_table + i);

		pgdat->nr_zones = j+1;

		mask = (realsize / zone_balance_ratio[j]);
		if (mask < zone_balance_min[j])
			mask = zone_balance_min[j];
		else if (mask > zone_balance_max[j])
			mask = zone_balance_max[j];
		zone->watermarks[j].min = mask;
		zone->watermarks[j].low = mask*2;
		zone->watermarks[j].high = mask*3;
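		/*
		 * Worked example (illustrative): a Normal zone with
		 * realsize 16384 pages and ratio 128 gets mask 128,
		 * i.e. min=128, low=256, high=384 pages, with mask
		 * clamped to [zone_balance_min, zone_balance_max].
		 */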
		/* now set the watermarks of the lower zones in the "j" classzone */
		for (idx = j-1; idx >= 0; idx--) {
			zone_t * lower_zone = pgdat->node_zones + idx;
			unsigned long lower_zone_reserve;
			if (!lower_zone->size)
				continue;

			mask = lower_zone->watermarks[idx].min;
			lower_zone->watermarks[j].min = mask;
			lower_zone->watermarks[j].low = mask*2;
			lower_zone->watermarks[j].high = mask*3;

			/* now the trickier part: add the per-classzone reserve */
			lower_zone_reserve = realsize / lower_zone_reserve_ratio[idx];
			lower_zone->watermarks[j].min += lower_zone_reserve;
			lower_zone->watermarks[j].low += lower_zone_reserve;
			lower_zone->watermarks[j].high += lower_zone_reserve;

			realsize += lower_zone->realsize;
		}
		zone->zone_mem_map = mem_map + offset;
		zone->zone_start_mapnr = offset;
		zone->zone_start_paddr = zone_start_paddr;

		if ((zone_start_paddr >> PAGE_SHIFT) & (zone_required_alignment-1))
			printk("BUG: wrong zone alignment, it will crash\n");

		/*
		 * Initially all pages are reserved - free ones are freed
		 * up by free_all_bootmem() once the early boot process is
		 * done. Non-atomic initialization, single-pass.
		 */
		for (i = 0; i < size; i++) {
			struct page *page = mem_map + offset + i;
			set_page_zone(page, nid * MAX_NR_ZONES + j);
			set_page_count(page, 0);
			SetPageReserved(page);
			INIT_LIST_HEAD(&page->list);
			if (j != ZONE_HIGHMEM)
				set_page_address(page, __va(zone_start_paddr));
			zone_start_paddr += PAGE_SIZE;
		}
		offset += size;
		for (i = 0; ; i++) {
			unsigned long bitmap_size;

			INIT_LIST_HEAD(&zone->free_area[i].free_list);
			if (i == MAX_ORDER-1) {
				zone->free_area[i].map = NULL;
				break;
			}

			/*
			 * Page buddy system uses "index >> (i+1)",
			 * where "index" is at most "size-1".
			 *
			 * The extra "+3" is to round down to byte
			 * size (8 bits per byte assumption). Thus
			 * we get "(size-1) >> (i+4)" as the last byte
			 * we can access.
			 *
			 * The "+1" is because we want to round the
			 * byte allocation up rather than down. So
			 * we should have had a "+7" before we shifted
			 * down by three. Also, we have to add one as
			 * we actually _use_ the last bit (it's [0,n]
			 * inclusive, not [0,n[).
			 *
			 * So we actually had +7+1 before we shift
			 * down by 3. But (n+8) >> 3 == (n >> 3) + 1
			 * (modulo overflows, which we do not have).
			 *
			 * Finally, we LONG_ALIGN because all bitmap
			 * operations are on longs.
			 */
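			/*
			 * Worked example (illustrative): for a zone of
			 * 65536 pages at order i=0, the map needs
			 * 65536/2 = 32768 bits; (65536-1) >> 4 = 4095
			 * is the last byte, and LONG_ALIGN(4095+1)
			 * allocates 4096 bytes.
			 */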
			bitmap_size = (size-1) >> (i+4);
			bitmap_size = LONG_ALIGN(bitmap_size+1);
			zone->free_area[i].map =
			  (unsigned long *) alloc_bootmem_node(pgdat, bitmap_size);
		}
	}
	build_zonelists(pgdat);
}
void __init free_area_init(unsigned long *zones_size)
{
	free_area_init_core(0, &contig_page_data, &mem_map, zones_size, 0, 0, 0);
}
static int __init setup_mem_frac(char *str)
{
	int j = 0;

	while (get_option(&str, &zone_balance_ratio[j++]) == 2);
	printk("setup_mem_frac: ");
	for (j = 0; j < MAX_NR_ZONES; j++) printk("%d  ", zone_balance_ratio[j]);
	printk("\n");
	return 1;
}

__setup("memfrac=", setup_mem_frac);
static int __init setup_lower_zone_reserve(char *str)
{
	int j = 0;

	while (get_option(&str, &lower_zone_reserve_ratio[j++]) == 2);
	printk("setup_lower_zone_reserve: ");
	for (j = 0; j < MAX_NR_ZONES-1; j++) printk("%d  ", lower_zone_reserve_ratio[j]);
	printk("\n");
	return 1;
}

__setup("lower_zone_reserve=", setup_lower_zone_reserve);