4 * Copyright (C) 1994-2006 Linus Torvalds
8 * This file handles the generic file mmap semantics used by
9 * most "normal" filesystems (but you don't /have/ to use this:
10 * the NFS filesystem used to do this differently, for example)
12 #include <linux/module.h>
13 #include <linux/slab.h>
14 #include <linux/shm.h>
15 #include <linux/mman.h>
16 #include <linux/locks.h>
17 #include <linux/pagemap.h>
18 #include <linux/swap.h>
19 #include <linux/smp_lock.h>
20 #include <linux/blkdev.h>
21 #include <linux/file.h>
22 #include <linux/swapctl.h>
23 #include <linux/init.h>
25 #include <linux/iobuf.h>
27 #include <asm/pgalloc.h>
28 #include <asm/uaccess.h>
31 #include <linux/highmem.h>
34 * Shared mappings implemented 30.11.1994. It's not fully working yet,
37 * Shared mappings now work. 15.8.1995 Bruno.
39 * finished 'unifying' the page and buffer cache and SMP-threaded the
40 * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
42 * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
/* Page-cache globals; page_hash_table is an array of struct page pointers. */
45 unsigned long page_cache_size;
46 unsigned int page_hash_bits;
47 struct page **page_hash_table;
/*
 * Read-ahead tunables, exported to modules. vm_max_readahead is the value
 * get_max_readahead() falls back to in this file.
 */
49 int vm_max_readahead = 31;
50 int vm_min_readahead = 3;
51 EXPORT_SYMBOL(vm_max_readahead);
52 EXPORT_SYMBOL(vm_min_readahead);
/*
 * Spinlock object, statically initialised with SPIN_LOCK_UNLOCKED.
 * NOTE(review): the pagecache_lock used throughout this file is presumably
 * an accessor for this object - confirm in the headers.
 */
55 spinlock_cacheline_t pagecache_lock_cacheline = {SPIN_LOCK_UNLOCKED};
57 * NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock
58 * with the pagecache_lock held.
/* Spinlock object, statically initialised with SPIN_LOCK_UNLOCKED. */
65 spinlock_cacheline_t pagemap_lru_lock_cacheline = {SPIN_LOCK_UNLOCKED};
/* Number of pages in one cluster: 2^page_cluster. */
67 #define CLUSTER_PAGES (1 << page_cluster)
/* Round x down to a multiple of CLUSTER_PAGES by clearing its low page_cluster bits. */
68 #define CLUSTER_OFFSET(x) (((x) >> page_cluster) << page_cluster)
/*
 * Link @page in front of the current first entry of the hash chain
 * addressed by @p: the old first entry becomes page->next_hash and its
 * pprev_hash back-pointer is redirected to &page->next_hash.
 */
70 static void FASTCALL(add_page_to_hash_queue(struct page * page, struct page **p));
71 static void fastcall add_page_to_hash_queue(struct page * page, struct page **p)
73 struct page *next = *p;
76 page->next_hash = next;
79 next->pprev_hash = &page->next_hash;
/* Update the cache-page accounting through inc_nr_cache_pages(). */
82 inc_nr_cache_pages(page);
/*
 * Attach @page to @mapping: link it onto the mapping's clean_pages list
 * and record the owner in page->mapping.
 */
85 static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page)
87 struct list_head *head = &mapping->clean_pages;
90 list_add(&page->list, head);
91 page->mapping = mapping;
/*
 * Detach @page from its address_space: run the mapping's optional
 * removepage operation, unlink the page from the mapping list it is on,
 * and clear page->mapping.
 */
94 static inline void remove_page_from_inode_queue(struct page * page)
96 struct address_space * mapping = page->mapping;
98 if (mapping->a_ops->removepage)
99 mapping->a_ops->removepage(page);
101 list_del(&page->list);
102 page->mapping = NULL;
/* Once the mapping's nrpages is zero, hand its inode to refile_inode(). */
105 if (!mapping->nrpages)
106 refile_inode(mapping->host);
/*
 * Unlink @page from its hash chain using its next_hash/pprev_hash
 * pointers, clear pprev_hash, and update the cache-page accounting
 * through dec_nr_cache_pages().
 */
109 static inline void remove_page_from_hash_queue(struct page * page)
111 struct page *next = page->next_hash;
112 struct page **pprev = page->pprev_hash;
/* The successor inherits this page's back-pointer. */
115 next->pprev_hash = pprev;
117 page->pprev_hash = NULL;
118 dec_nr_cache_pages(page);
122 * Remove a page from the page cache and free it. Caller has to make
123 * sure the page is locked and that nobody else uses it - or that usage
/*
 * Unlink @page from both the mapping list and the hash chain. No lock is
 * taken here; remove_inode_page() calls this under pagecache_lock.
 */
126 void __remove_inode_page(struct page *page)
128 remove_page_from_inode_queue(page);
129 remove_page_from_hash_queue(page);
/*
 * Locked wrapper around __remove_inode_page(): the page's locked state is
 * checked first, then the removal runs with pagecache_lock held.
 */
132 void remove_inode_page(struct page *page)
134 if (!PageLocked(page))
137 spin_lock(&pagecache_lock);
138 __remove_inode_page(page);
139 spin_unlock(&pagecache_lock);
/*
 * Call the mapping's sync_page operation for @page and return its result,
 * provided the page has a mapping, the mapping has an operations table,
 * and that table supplies sync_page.
 */
142 static inline int sync_page(struct page *page)
144 struct address_space *mapping = page->mapping;
146 if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
147 return mapping->a_ops->sync_page(page);
152 * Add a page to the dirty page list.
/*
 * Mark @page dirty. Only the caller that actually flips PG_dirty from
 * clear to set performs the list work below.
 */
154 void fastcall set_page_dirty(struct page *page)
156 if (!test_and_set_bit(PG_dirty, &page->flags)) {
157 struct address_space *mapping = page->mapping;
160 spin_lock(&pagecache_lock);
/* Re-read the mapping now that pagecache_lock is held. */
161 mapping = page->mapping;
162 if (mapping) { /* may have been truncated */
/* Move the page onto the mapping's dirty_pages list. */
163 list_del(&page->list);
164 list_add(&page->list, &mapping->dirty_pages);
166 spin_unlock(&pagecache_lock);
/* After unlocking, tell the owning inode (if any) it has dirty pages. */
168 if (mapping && mapping->host)
169 mark_inode_dirty_pages(mapping->host);
/* Debug trace naming the task that dirtied the page. */
171 printk(KERN_DEBUG "%s: dirtied page\n", current->comm);
177 * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
178 * @inode: the inode whose pages we want to invalidate
180 * This function only removes the unlocked pages, if you want to
181 * remove all the pages of one inode, you must call truncate_inode_pages.
/*
 * Walk the clean_pages list of @inode's mapping and drop the pages that
 * pass every check below.
 */
184 void invalidate_inode_pages(struct inode * inode)
186 struct list_head *head, *curr;
189 head = &inode->i_mapping->clean_pages;
/* Lock order: pagemap_lru_lock first, then pagecache_lock. */
191 spin_lock(&pagemap_lru_lock);
192 spin_lock(&pagecache_lock);
195 while (curr != head) {
196 page = list_entry(curr, struct page, list);
199 /* We cannot invalidate something in dirty.. */
/*
 * Gate checks: the page lock must be obtainable without sleeping, any
 * attached buffers must be freeable, and the reference count must be 1.
 */
204 if (TryLockPage(page))
207 if (page->buffers && !try_to_free_buffers(page, 0))
210 if (page_count(page) != 1)
/* Take the page off the LRU and out of the page cache, then drop a reference. */
213 __lru_cache_del(page);
214 __remove_inode_page(page);
216 page_cache_release(page);
/* Release in the reverse of the acquisition order. */
223 spin_unlock(&pagecache_lock);
224 spin_unlock(&pagemap_lru_lock);
/*
 * Flush @page from @offset, using the mapping's own flushpage operation or
 * block_flushpage().
 * NOTE(review): block_flushpage() is presumably the fallback when the
 * mapping supplies no flushpage operation - confirm the selecting condition.
 */
227 static int do_flushpage(struct page *page, unsigned long offset)
229 int (*flushpage) (struct page *, unsigned long);
230 flushpage = page->mapping->a_ops->flushpage;
232 return (*flushpage)(page, offset);
233 return block_flushpage(page, offset);
/*
 * Handle the page that straddles the truncation point: clear the
 * PAGE_CACHE_SIZE - @partial bytes starting at byte @partial, and flush
 * from that same offset through do_flushpage().
 */
236 static inline void truncate_partial_page(struct page *page, unsigned partial)
238 memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
240 do_flushpage(page, partial);
/*
 * Drop a whole page from the page cache: flush its buffers (if any) from
 * offset 0, clear the dirty and uptodate flags, remove it from the cache
 * and release a reference.
 */
243 static void truncate_complete_page(struct page *page)
245 /* Leave it on the LRU if it gets converted into anonymous buffers */
246 if (!page->buffers || do_flushpage(page, 0))
250 * We remove the page from the page cache _after_ we have
251 * destroyed all buffer-cache references to it. Otherwise some
252 * other process might think this inode page is not in the
253 * page cache and creates a buffer-cache alias to it causing
254 * all sorts of fun problems ...
256 ClearPageDirty(page);
257 ClearPageUptodate(page);
258 remove_inode_page(page);
259 page_cache_release(page);
/*
 * Scan one mapping list for pages at or beyond index @start, plus the page
 * just before @start when *@partial is non-zero, and truncate them.
 * pagecache_lock is dropped while a page is processed and retaken
 * afterwards (truncate_inode_pages() takes it before calling).
 */
262 static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *));
263 static int fastcall truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial)
265 struct list_head *curr;
271 while (curr != head) {
272 unsigned long offset;
274 page = list_entry(curr, struct page, list);
275 offset = page->index;
277 /* Is one of the pages to truncate? */
278 if ((offset >= start) || (*partial && (offset + 1) == start)) {
/* Pin the page, then try to lock it without sleeping. */
281 page_cache_get(page);
282 failed = TryLockPage(page);
/* The list head is re-inserted next to curr to mark where the scan resumes. */
286 /* Restart after this page */
287 list_add_tail(head, curr);
289 /* Restart on this page */
290 list_add(head, curr);
292 spin_unlock(&pagecache_lock);
/* The page preceding @start is the partially truncated one. */
296 if (*partial && (offset + 1) == start) {
297 truncate_partial_page(page, *partial);
300 truncate_complete_page(page);
/* Drop the pin taken above. */
306 page_cache_release(page);
308 if (current->need_resched) {
309 __set_current_state(TASK_RUNNING);
313 spin_lock(&pagecache_lock);
323 * truncate_inode_pages - truncate *all* the pages from an offset
324 * @mapping: mapping to truncate
325 * @lstart: offset from which to truncate
327 * Truncate the page cache at a set offset, removing the pages
328 * that are beyond that offset (and zeroing out partial pages).
329 * If any page is locked we wait for it to become unlocked.
/*
 * Truncate @mapping's page cache from byte offset @lstart by running
 * truncate_list_pages() over the clean, dirty and locked lists.
 */
331 void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
/* start: index of the first page lying wholly at or beyond lstart (rounded up). */
333 unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
/* partial: byte offset of lstart within its page; 0 when page-aligned. */
334 unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
337 spin_lock(&pagecache_lock);
/* OR together the per-list results. */
339 unlocked = truncate_list_pages(&mapping->clean_pages, start, &partial);
340 unlocked |= truncate_list_pages(&mapping->dirty_pages, start, &partial);
341 unlocked |= truncate_list_pages(&mapping->locked_pages, start, &partial);
343 /* Traversed all three lists without dropping the lock */
344 spin_unlock(&pagecache_lock);
/*
 * Invalidate one locked page on behalf of invalidate_list_pages2().
 * If the only references are the cache's own (one, plus one more when
 * buffers are attached) the page is truncated outright; the other visible
 * path invalidates its buffers and clears the dirty and uptodate flags.
 * Both paths pin the page and drop pagecache_lock before doing the work.
 */
347 static inline int invalidate_this_page2(struct page * page,
348 struct list_head * curr,
349 struct list_head * head)
354 * The page is locked and we hold the pagecache_lock as well
355 * so both page_count(page) and page->buffers stays constant here.
357 if (page_count(page) == 1 + !!page->buffers) {
358 /* Restart after this page */
360 list_add_tail(head, curr);
362 page_cache_get(page);
363 spin_unlock(&pagecache_lock);
364 truncate_complete_page(page);
367 /* Restart after this page */
369 list_add_tail(head, curr);
371 page_cache_get(page);
372 spin_unlock(&pagecache_lock);
373 block_invalidate_page(page);
377 ClearPageDirty(page);
378 ClearPageUptodate(page);
/*
 * Run invalidate_this_page2() over one mapping list. A page whose lock is
 * obtained without sleeping is handled directly; on the other visible path
 * the page is pinned and pagecache_lock is dropped, then retaken after the
 * pin is released.
 */
384 static int FASTCALL(invalidate_list_pages2(struct list_head *));
385 static int fastcall invalidate_list_pages2(struct list_head *head)
387 struct list_head *curr;
393 while (curr != head) {
394 page = list_entry(curr, struct page, list);
396 if (!TryLockPage(page)) {
399 __unlocked = invalidate_this_page2(page, curr, head);
/* Accumulate the helper's result across pages. */
401 unlocked |= __unlocked;
407 /* Restart on this page */
409 list_add(head, curr);
411 page_cache_get(page);
412 spin_unlock(&pagecache_lock);
417 page_cache_release(page);
418 if (current->need_resched) {
419 __set_current_state(TASK_RUNNING);
423 spin_lock(&pagecache_lock);
430 * invalidate_inode_pages2 - Clear all the dirty bits around if it can't
431 * free the pages because they're mapped.
432 * @mapping: the address_space whose pages we want to invalidate
/*
 * Apply invalidate_list_pages2() to the clean, dirty and locked lists of
 * @mapping, with pagecache_lock taken around the calls.
 */
434 void invalidate_inode_pages2(struct address_space * mapping)
438 spin_lock(&pagecache_lock);
440 unlocked = invalidate_list_pages2(&mapping->clean_pages);
441 unlocked |= invalidate_list_pages2(&mapping->dirty_pages);
442 unlocked |= invalidate_list_pages2(&mapping->locked_pages);
444 spin_unlock(&pagecache_lock);
/*
 * Follow a hash chain through next_hash, starting from @page, looking for
 * the entry whose mapping is @mapping and whose index is @offset.
 * Takes no lock itself; the callers in this file hold pagecache_lock.
 */
447 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
452 page = page->next_hash;
456 if (page->mapping != mapping)
458 if (page->index == offset)
/*
 * Walk the page list @head under pagecache_lock, comparing each page's
 * index against @start and @end. A selected page is pinned and the lock
 * is dropped around the per-page work; the walk then continues from
 * page->list.next once the lock is held again.
 * NOTE(review): @fn is presumably invoked on each selected page - the call
 * site should be confirmed.
 */
466 static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *))
468 struct list_head *curr;
472 spin_lock(&pagecache_lock);
474 while (curr != head) {
475 page = list_entry(curr, struct page, list);
479 if (page->index >= end)
481 if (page->index < start)
484 page_cache_get(page);
485 spin_unlock(&pagecache_lock);
488 /* The buffers could have been free'd while we waited for the page lock */
493 spin_lock(&pagecache_lock);
/* Read the successor before dropping our reference on this page. */
494 curr = page->list.next;
495 page_cache_release(page);
497 spin_unlock(&pagecache_lock);
503 * Two-stage data sync: first start the IO, then go back and
504 * collect the information..
/*
 * Two passes over the dirty, clean and locked lists of @inode's mapping
 * for page indices @start_idx..@end_idx: the first pass uses
 * writeout_one_page, the second waitfor_one_page. The results of all six
 * do_buffer_fdatasync() calls are OR-ed into retval.
 */
506 int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
510 /* writeout dirty buffers on pages from both clean and dirty lists */
511 retval = do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, writeout_one_page);
512 retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, writeout_one_page);
513 retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, writeout_one_page);
515 /* now wait for locked buffers on pages from both clean and dirty lists */
516 retval |= do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, waitfor_one_page);
517 retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, waitfor_one_page);
518 retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, waitfor_one_page);
524 * In-memory filesystems have to fail their
525 * writepage function - and this has to be
526 * worked around in the VM layer..
529 * - mark the page dirty again (but do NOT
530 * add it back to the inode dirty list, as
531 * that would livelock in fdatasync)
532 * - activate the page so that the page stealer
533 * doesn't try to write it out over and over
/*
 * writepage implementation that declines to write the page.
 * When the page carries the Launder flag it is marked referenced.
 * Exported for use by modules.
 */
536 int fail_writepage(struct page *page)
538 /* Only activate on memory-pressure, not fsync.. */
539 if (PageLaunder(page)) {
541 SetPageReferenced(page);
544 /* Set the page dirty again, unlock */
550 EXPORT_SYMBOL(fail_writepage);
553 * filemap_fdatawrite - walk the list of dirty pages of the given address space
554 * and writepage() each unlocked page (does not wait on locked pages).
556 * @mapping: address space structure to write
/*
 * Drain @mapping's dirty_pages list from its tail: each page is moved to
 * locked_pages, and pages that are still dirty and can be locked without
 * sleeping are handed to the mapping's writepage operation.
 */
559 int filemap_fdatawrite(struct address_space * mapping)
562 int (*writepage)(struct page *) = mapping->a_ops->writepage;
564 spin_lock(&pagecache_lock);
566 while (!list_empty(&mapping->dirty_pages)) {
567 struct page *page = list_entry(mapping->dirty_pages.prev, struct page, list);
/* Moving the page off dirty_pages is what makes the loop terminate. */
569 list_del(&page->list);
570 list_add(&page->list, &mapping->locked_pages);
572 if (!PageDirty(page))
/* Pin the page so it stays valid once the lock is dropped. */
575 page_cache_get(page);
576 spin_unlock(&pagecache_lock);
578 if (!TryLockPage(page)) {
/* Re-check dirtiness now that we own the page lock. */
579 if (PageDirty(page)) {
581 ClearPageDirty(page);
582 err = writepage(page);
588 page_cache_release(page);
589 spin_lock(&pagecache_lock);
591 spin_unlock(&pagecache_lock);
596 * filemap_fdatasync - walk the list of dirty pages of the given address space
597 * and writepage() all of them.
599 * @mapping: address space structure to write
/*
 * Drain @mapping's dirty_pages list from its tail: each page is moved to
 * locked_pages and, if still dirty, has its dirty flag cleared and is
 * handed to the mapping's writepage operation.
 */
602 int filemap_fdatasync(struct address_space * mapping)
605 int (*writepage)(struct page *) = mapping->a_ops->writepage;
607 spin_lock(&pagecache_lock);
609 while (!list_empty(&mapping->dirty_pages)) {
610 struct page *page = list_entry(mapping->dirty_pages.prev, struct page, list);
/* Moving the page off dirty_pages is what makes the loop terminate. */
612 list_del(&page->list);
613 list_add(&page->list, &mapping->locked_pages);
615 if (!PageDirty(page))
/* Pin the page so it stays valid once the lock is dropped. */
618 page_cache_get(page);
619 spin_unlock(&pagecache_lock);
623 if (PageDirty(page)) {
625 ClearPageDirty(page);
626 err = writepage(page);
632 page_cache_release(page);
633 spin_lock(&pagecache_lock);
635 spin_unlock(&pagecache_lock);
640 * filemap_fdatawait - walk the list of locked pages of the given address space
641 * and wait for all of them.
643 * @mapping: address space structure to wait for
/*
 * Drain @mapping's locked_pages list from its head: each page is moved to
 * clean_pages, and pages that are still locked are waited on with
 * pagecache_lock dropped.
 */
646 int filemap_fdatawait(struct address_space * mapping)
650 spin_lock(&pagecache_lock);
652 while (!list_empty(&mapping->locked_pages)) {
653 struct page *page = list_entry(mapping->locked_pages.next, struct page, list);
655 list_del(&page->list);
656 list_add(&page->list, &mapping->clean_pages);
658 if (!PageLocked(page))
/* Pin the page across the unlocked wait. */
661 page_cache_get(page);
662 spin_unlock(&pagecache_lock);
664 ___wait_on_page(page);
668 page_cache_release(page);
669 spin_lock(&pagecache_lock);
671 spin_unlock(&pagecache_lock);
676 * Add a page to the inode page cache.
678 * The caller must have locked the page and
679 * set all the page flags correctly..
/*
 * Insert an already-locked @page into @mapping at @index: take a page
 * reference, then link the page onto the mapping list and the hash chain
 * under pagecache_lock.
 */
681 void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
683 if (!PageLocked(page))
687 page_cache_get(page);
688 spin_lock(&pagecache_lock);
689 add_page_to_inode_queue(mapping, page);
690 add_page_to_hash_queue(page, page_hash(mapping, index));
691 spin_unlock(&pagecache_lock);
697 * This adds a page to the page cache, starting out as locked,
698 * owned by us, but unreferenced, not uptodate and with no errors.
/*
 * Core insertion: clear the page's state flags one by one, take a
 * reference, set page->index to @offset and link the page onto the mapping
 * list and the supplied hash chain. Takes no lock itself; the callers in
 * this file hold pagecache_lock.
 */
700 static inline void __add_to_page_cache(struct page * page,
701 struct address_space *mapping, unsigned long offset,
705 * Yes this is inefficient, however it is needed. The problem
706 * is that we could be adding a page to the swap cache while
707 * another CPU is also modifying page->flags, so the updates
708 * really do need to be atomic. -- Rik
710 ClearPageUptodate(page);
711 ClearPageError(page);
712 ClearPageDirty(page);
713 ClearPageReferenced(page);
714 ClearPageArch1(page);
715 ClearPageChecked(page);
717 page_cache_get(page);
718 page->index = offset;
719 add_page_to_inode_queue(mapping, page);
720 add_page_to_hash_queue(page, hash);
/*
 * Insert @page into @mapping at @offset: computes the hash chain with
 * page_hash() and calls __add_to_page_cache() under pagecache_lock.
 */
723 void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
725 spin_lock(&pagecache_lock);
726 __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
727 spin_unlock(&pagecache_lock);
/*
 * Insert @page at @offset after looking up any existing entry for
 * (@mapping, @offset). The lookup and the insertion both happen under
 * pagecache_lock. Callers in this file treat a zero return as "our page
 * was inserted".
 */
731 int add_to_page_cache_unique(struct page * page,
732 struct address_space *mapping, unsigned long offset,
738 spin_lock(&pagecache_lock);
739 alias = __find_page_nolock(mapping, offset, *hash);
743 __add_to_page_cache(page,mapping,offset,hash);
747 spin_unlock(&pagecache_lock);
754 * This adds the requested page to the page cache if it isn't already there,
755 * and schedules an I/O to read in its contents from disk.
/*
 * Start a read of page @offset of @file's mapping: look the page up under
 * pagecache_lock, allocate a fresh page, and if that page is inserted
 * successfully call the mapping's readpage operation on it.
 */
757 static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
758 static int fastcall page_cache_read(struct file * file, unsigned long offset)
760 struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
761 struct page **hash = page_hash(mapping, offset);
764 spin_lock(&pagecache_lock);
765 page = __find_page_nolock(mapping, offset, *hash);
766 spin_unlock(&pagecache_lock);
770 page = page_cache_alloc(mapping);
/* Zero from add_to_page_cache_unique() means our page went in. */
774 if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
775 int error = mapping->a_ops->readpage(file, page);
/* Drop our reference after starting the read. */
776 page_cache_release(page);
780 * We arrive here in the unlikely event that someone
781 * raced with us and added our page to the cache first.
783 page_cache_release(page);
788 * Read in an entire cluster at once. A cluster is usually a 64k-
789 * aligned block that includes the page requested in "offset."
/*
 * Call page_cache_read() for the cluster containing @offset: @offset is
 * first rounded down with CLUSTER_OFFSET(), and the loop runs for at most
 * CLUSTER_PAGES iterations while offset stays below @filesize.
 */
791 static int FASTCALL(read_cluster_nonblocking(struct file * file, unsigned long offset,
792 unsigned long filesize));
793 static int fastcall read_cluster_nonblocking(struct file * file, unsigned long offset,
794 unsigned long filesize)
796 unsigned long pages = CLUSTER_PAGES;
798 offset = CLUSTER_OFFSET(offset);
799 while ((pages-- > 0) && (offset < filesize)) {
800 int error = page_cache_read(file, offset);
810 * Knuth recommends primes in approximately golden ratio to the maximum
811 * integer representable by a machine word for multiplicative hashing.
812 * Chuck Lever verified the effectiveness of this technique:
813 * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
815 * These primes are chosen to be bit-sparse, that is operations on
816 * them can use shifts and additions instead of multiplications for
817 * machines where multiplications are slow.
/*
 * Multiplier used by page_waitqueue() for multiplicative hashing, chosen
 * per word size; any other word size is a compile-time error.
 */
819 #if BITS_PER_LONG == 32
820 /* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
821 #define GOLDEN_RATIO_PRIME 0x9e370001UL
822 #elif BITS_PER_LONG == 64
823 /* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
824 #define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
826 #error Define GOLDEN_RATIO_PRIME for your wordsize.
830 * In order to wait for pages to become available there must be
831 * waitqueues associated with pages. By using a hash table of
832 * waitqueues where the bucket discipline is to maintain all
833 * waiters on the same queue and wake all when any of the pages
834 * become available, and for the woken contexts to check to be
835 * sure the appropriate page became available, this saves space
836 * at a cost of "thundering herd" phenomena during rare hash
/*
 * Select the wait queue for @page from its zone's wait_table. The hash
 * starts from the struct page address, is multiplied by
 * GOLDEN_RATIO_PRIME and shifted right by the zone's wait_table_shift.
 * NOTE(review): the result presumably indexes wait_table - confirm.
 */
839 static inline wait_queue_head_t *page_waitqueue(struct page *page)
841 const zone_t *zone = page_zone(page);
842 wait_queue_head_t *wait = zone->wait_table;
843 unsigned long hash = (unsigned long)page;
845 #if BITS_PER_LONG == 64
846 /* Sigh, gcc can't optimise this alone like it does for 32 bits. */
847 unsigned long n = hash;
861 /* On some cpus multiply is faster, on others gcc will do shifts */
862 hash *= GOLDEN_RATIO_PRIME;
864 hash >>= zone->wait_table_shift;
870 * This must be called after every submit_bh with end_io
871 * callbacks that would result in the blkdev layer waking
872 * up the page after a queue unplug.
/*
 * Look up the wait queue for @page and act only when it has waiters
 * (waitqueue_active()).
 */
874 void fastcall wakeup_page_waiters(struct page * page)
876 wait_queue_head_t * head;
878 head = page_waitqueue(page);
879 if (waitqueue_active(head))
884 * Wait for a page to get unlocked.
886 * This must be called with the caller "holding" the page,
887 * ie with increased "page->count" so that the page won't
888 * go away during the wait..
890 * The waiting strategy is to get on a waitqueue determined
891 * by hashing. Waiters will then collide, and the newly woken
892 * task must then determine whether it was woken for the page
893 * it really wanted, and go back to sleep on the waitqueue if
894 * that wasn't it. With the waitqueue semantics, it never leaves
895 * the waitqueue unless it calls, so the loop moves forward one
896 * iteration every time there is
899 * (2) one of the colliding pages is woken
901 * This is the thundering herd problem, but it is expected to
902 * be very rare due to the few pages that are actually being
903 * waited on at any given time and the quality of the hash function.
/*
 * Sleep until @page is no longer locked. The task queues itself on the
 * page's hashed wait queue and loops in TASK_UNINTERRUPTIBLE while
 * PageLocked() holds, then restores TASK_RUNNING and dequeues itself.
 */
905 void ___wait_on_page(struct page *page)
907 wait_queue_head_t *waitqueue = page_waitqueue(page);
908 struct task_struct *tsk = current;
909 DECLARE_WAITQUEUE(wait, tsk);
911 add_wait_queue(waitqueue, &wait);
/* Set the task state before testing the lock bit. */
913 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
914 if (!PageLocked(page))
918 } while (PageLocked(page));
919 __set_task_state(tsk, TASK_RUNNING);
920 remove_wait_queue(waitqueue, &wait);
924 * unlock_page() is the other half of the story just above
925 * __wait_on_page(). Here a couple of quick checks are done
926 * and a couple of flags are set on the page, and then all
927 * of the waiters for all of the pages in the appropriate
928 * wait queue are woken.
/*
 * Unlock @page: clear the Launder flag, atomically clear PG_locked with
 * memory barriers on both sides, and wake every waiter on the page's
 * hashed wait queue if there are any.
 */
930 void fastcall unlock_page(struct page *page)
932 wait_queue_head_t *waitqueue = page_waitqueue(page);
933 ClearPageLaunder(page);
934 smp_mb__before_clear_bit();
/* test_and_clear also reports whether the page was locked at all. */
935 if (!test_and_clear_bit(PG_locked, &(page)->flags))
937 smp_mb__after_clear_bit();
940 * Although the default semantics of wake_up() are
941 * to wake all, here the specific function is used
942 * to make it even more explicit that a number of
943 * pages are being waited on here.
945 if (waitqueue_active(waitqueue))
946 wake_up_all(waitqueue);
950 * Get a lock on the page, assuming we need to sleep
/*
 * Slow path of lock_page(): queue as an exclusive waiter on the page's
 * hashed wait queue, retrying TryLockPage() in TASK_UNINTERRUPTIBLE, then
 * restore TASK_RUNNING and dequeue.
 */
953 static void __lock_page(struct page *page)
955 wait_queue_head_t *waitqueue = page_waitqueue(page);
956 struct task_struct *tsk = current;
957 DECLARE_WAITQUEUE(wait, tsk);
959 add_wait_queue_exclusive(waitqueue, &wait);
961 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
962 if (PageLocked(page)) {
/* A zero return from TryLockPage() means we now own the lock. */
966 if (!TryLockPage(page))
969 __set_task_state(tsk, TASK_RUNNING);
970 remove_wait_queue(waitqueue, &wait);
974 * Get an exclusive lock on the page, optimistically
975 * assuming it's not locked..
/*
 * Lock @page. The non-sleeping TryLockPage() is attempted first; a
 * non-zero return means the page was already locked.
 */
977 void fastcall lock_page(struct page *page)
979 if (TryLockPage(page))
984 * a rather lightweight function, finding and getting a reference to a
985 * hashed page atomically.
/*
 * Look up (@mapping, @offset) on the chain *@hash and take a reference on
 * the page, all under pagecache_lock.
 */
987 struct page * __find_get_page(struct address_space *mapping,
988 unsigned long offset, struct page **hash)
993 * We scan the hash list read-only. Addition to and removal from
994 * the hash-list needs a held write-lock.
996 spin_lock(&pagecache_lock);
997 page = __find_page_nolock(mapping, offset, *hash);
999 page_cache_get(page);
1000 spin_unlock(&pagecache_lock);
1005 * Same as above, but trylock it instead of incrementing the count.
/*
 * Look up (@mapping, @offset) and try to lock the page without sleeping,
 * all under pagecache_lock. No page_cache_get() is done on this path.
 */
1007 struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
1010 struct page **hash = page_hash(mapping, offset);
1012 spin_lock(&pagecache_lock);
1013 page = __find_page_nolock(mapping, offset, *hash);
1015 if (TryLockPage(page))
1018 spin_unlock(&pagecache_lock);
1023 * Must be called with the pagecache lock held,
1024 * will return with it held (but it may be dropped
1025 * during blocking operations..
/*
 * Look up (@mapping, @offset) and take a reference on the page. If the
 * page lock cannot be taken immediately, pagecache_lock is dropped and
 * later retaken; the page's mapping and index are then re-checked and the
 * reference is released if either no longer matches.
 */
1027 static struct page * FASTCALL(__find_lock_page_helper(struct address_space *, unsigned long, struct page *));
1028 static struct page * fastcall __find_lock_page_helper(struct address_space *mapping,
1029 unsigned long offset, struct page *hash)
1034 * We scan the hash list read-only. Addition to and removal from
1035 * the hash-list needs a held write-lock.
1038 page = __find_page_nolock(mapping, offset, hash);
1040 page_cache_get(page);
1041 if (TryLockPage(page)) {
1042 spin_unlock(&pagecache_lock);
1044 spin_lock(&pagecache_lock);
1046 /* Has the page been re-allocated while we slept? */
1047 if (page->mapping != mapping || page->index != offset) {
1049 page_cache_release(page);
1058 * Same as the above, but lock the page too, verifying that
1059 * it's still valid once we own it.
/*
 * Locked wrapper: runs __find_lock_page_helper() on the chain *@hash with
 * pagecache_lock taken around the call.
 */
1061 struct page * __find_lock_page (struct address_space *mapping,
1062 unsigned long offset, struct page **hash)
1066 spin_lock(&pagecache_lock);
1067 page = __find_lock_page_helper(mapping, offset, *hash);
1068 spin_unlock(&pagecache_lock);
1073 * Same as above, but create the page if required..
/*
 * Find and lock the page at @index of @mapping, or insert a new one.
 * A new page is allocated with @gfp_mask while pagecache_lock is not held,
 * so the lookup is repeated under the lock before inserting.
 */
1075 struct page * find_or_create_page(struct address_space *mapping, unsigned long index, unsigned int gfp_mask)
1078 struct page **hash = page_hash(mapping, index);
1080 spin_lock(&pagecache_lock);
1081 page = __find_lock_page_helper(mapping, index, *hash);
1082 spin_unlock(&pagecache_lock);
1084 struct page *newpage = alloc_page(gfp_mask);
1086 spin_lock(&pagecache_lock);
/* Second lookup: the page may have appeared while we were allocating. */
1087 page = __find_lock_page_helper(mapping, index, *hash);
1088 if (likely(!page)) {
1090 __add_to_page_cache(page, mapping, index, hash);
1093 spin_unlock(&pagecache_lock);
/* newpage == NULL here leads to lru_cache_add(page); the other visible call releases newpage. */
1094 if (newpage == NULL)
1095 lru_cache_add(page);
1097 page_cache_release(newpage);
1104 * Same as grab_cache_page, but do not wait if the page is unavailable.
1105 * This is intended for speculative data generators, where the data can
1106 * be regenerated if the page couldn't be grabbed. This routine should
1107 * be safe to call while holding the lock for another page.
/*
 * Non-sleeping page grab: look up the page at @index and try to lock it
 * without waiting; on the allocation path a new page is allocated and
 * inserted with add_to_page_cache_unique(). NULL is returned when the
 * allocation fails.
 */
1109 struct page *grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
1111 struct page *page, **hash;
1113 hash = page_hash(mapping, index);
1114 page = __find_get_page(mapping, index, hash);
1117 if ( !TryLockPage(page) ) {
1118 /* Page found and locked */
1119 /* This test is overly paranoid, but what the heck... */
1120 if ( unlikely(page->mapping != mapping || page->index != index) ) {
1121 /* Someone reallocated this page under us. */
1123 page_cache_release(page);
1129 /* Page locked by someone else */
1130 page_cache_release(page);
1135 page = page_cache_alloc(mapping);
1136 if ( unlikely(!page) )
1137 return NULL; /* Failed to allocate a page */
1139 if ( unlikely(add_to_page_cache_unique(page, mapping, index, hash)) ) {
1140 /* Someone else grabbed the page already. */
1141 page_cache_release(page);
1149 #define PROFILE_READAHEAD
1150 #define DEBUG_READAHEAD
1154 * Read-ahead profiling information
1155 * --------------------------------
1156 * Every PROFILE_MAXREADCOUNT, the following information is written
1158 * Percentage of asynchronous read-ahead.
1159 * Average of read-ahead fields context value.
1160 * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
/*
 * Read-ahead statistics, compiled only when PROFILE_READAHEAD is defined.
 * profile_readahead() adds the file's f_ramax/f_ralen/f_rawin to running
 * totals and, once total_reada exceeds PROFILE_MAXREADCOUNT, prints the
 * per-call averages and the asynchronous percentage.
 */
1164 #ifdef PROFILE_READAHEAD
1166 #define PROFILE_MAXREADCOUNT 1000
1168 static unsigned long total_reada;
1169 static unsigned long total_async;
1170 static unsigned long total_ramax;
1171 static unsigned long total_ralen;
1172 static unsigned long total_rawin;
1174 static void profile_readahead(int async, struct file *filp)
1176 unsigned long flags;
1182 total_ramax += filp->f_ramax;
1183 total_ralen += filp->f_ralen;
1184 total_rawin += filp->f_rawin;
1186 if (total_reada > PROFILE_MAXREADCOUNT) {
/* The threshold is tested a second time before printing. */
1189 if (!(total_reada > PROFILE_MAXREADCOUNT)) {
1190 restore_flags(flags);
1194 printk("Readahead average: max=%ld, len=%ld, win=%ld, async=%ld%%\n",
1195 total_ramax/total_reada,
1196 total_ralen/total_reada,
1197 total_rawin/total_reada,
1198 (total_async*100)/total_reada);
1199 #ifdef DEBUG_READAHEAD
1200 printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
1201 filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
1210 restore_flags(flags);
1213 #endif /* defined PROFILE_READAHEAD */
1216 * Read-ahead context:
1217 * -------------------
1218 * The read ahead context fields of the "struct file" are the following:
1219 * - f_raend : position of the first byte after the last page we tried to
1221 * - f_ramax : current read-ahead maximum size.
1222 * - f_ralen : length of the current IO read block we tried to read-ahead.
1223 * - f_rawin : length of the current read-ahead window.
1224 * if last read-ahead was synchronous then
1226 * otherwise (was asynchronous)
1227 * f_rawin = previous value of f_ralen + f_ralen
1229 * Read-ahead limits:
1230 * ------------------
1231 * MIN_READAHEAD : minimum read-ahead size when read-ahead.
1232 * MAX_READAHEAD : maximum read-ahead size when read-ahead.
1234 * Synchronous read-ahead benefits:
1235 * --------------------------------
1236 * Using reasonable IO xfer length from peripheral devices increase system
1238 * Reasonable means, in this context, not too large but not too small.
1239 * The actual maximum value is:
1240 * MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
1241 * and 32K if defined (4K page size assumed).
1243 * Asynchronous read-ahead benefits:
1244 * ---------------------------------
1245 * Overlapping next read request and user process execution increase system
1250 * We have to guess which further data are needed by the user process.
1251 * If these data are often not really needed, it's bad for system
1253 * However, we know that files are often accessed sequentially by
1254 * application programs and it seems that it is possible to have some good
1255 * strategy in that guessing.
1256 * We only try to read-ahead files that seem to be read sequentially.
1258 * Asynchronous read-ahead risks:
1259 * ------------------------------
1260 * In order to maximize overlapping, we must start some asynchronous read
1261 * request from the device, as soon as possible.
1262 * We must be very careful about:
1263 * - The number of effective pending IO read requests.
1264 * ONE seems to be the only reasonable value.
1265 * - The total memory pool usage for the file access stream.
1266 * This maximum memory usage is implicitly 2 IO read chunks:
1267 * 2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
1268 * 64k if defined (4K page size assumed).
/*
 * Read-ahead limit for @inode: the entry from the max_readahead table,
 * indexed by the major and minor number of i_dev, or vm_max_readahead when
 * i_dev is zero or the table has no row for that major number.
 */
1271 static inline int get_max_readahead(struct inode * inode)
1273 if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
1274 return vm_max_readahead;
1275 return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
1278 static void generic_file_readahead(int reada_ok,
1279 struct file * filp, struct inode * inode,
1282 unsigned long end_index;
1283 unsigned long index = page->index;
1284 unsigned long max_ahead, ahead;
1285 unsigned long raend;
1286 int max_readahead = get_max_readahead(inode);
1288 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1290 raend = filp->f_raend;
1294 * The current page is locked.
1295 * If the current position is inside the previous read IO request, do not
1296 * try to reread previously read ahead pages.
1297 * Otherwise decide or not to read ahead some pages synchronously.
1298 * If we are not going to read ahead, set the read ahead context for this
1301 if (PageLocked(page)) {
1302 if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) {
1304 if (raend < end_index)
1305 max_ahead = filp->f_ramax;
1309 filp->f_raend = index + filp->f_ralen;
1310 filp->f_rawin += filp->f_ralen;
1315 * The current page is not locked.
1316 * If we were reading ahead and,
1317 * if the current max read ahead size is not zero and,
1318 * if the current position is inside the last read-ahead IO request,
1319 * it is the moment to try to read ahead asynchronously.
1320 * We will later force unplug device in order to force asynchronous read IO.
1322 else if (reada_ok && filp->f_ramax && raend >= 1 &&
1323 index <= raend && index + filp->f_ralen >= raend) {
1325 * Add ONE page to max_ahead in order to try to have about the same IO max size
1326 * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
1327 * Compute the position of the last page we have tried to read in order to
1328 * begin to read ahead just at the next page.
1331 if (raend < end_index)
1332 max_ahead = filp->f_ramax + 1;
1335 filp->f_rawin = filp->f_ralen;
1341 * Try to read ahead pages.
1342 * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the
1343 * scheduler, will work enough for us to avoid too bad actuals IO requests.
1346 while (ahead < max_ahead) {
1347 unsigned long ra_index = raend + ahead + 1;
1349 if (ra_index >= end_index)
1351 if (page_cache_read(filp, ra_index) < 0)
1357 * If we tried to read ahead some pages,
1358 * If we tried to read ahead asynchronously,
1359 * Try to force unplug of the device in order to start an asynchronous
1361 * Update the read-ahead context.
1362 * Store the length of the current read-ahead window.
1363 * Double the current max read ahead size.
1364 * That heuristic avoids doing large IO for files that are not really
1365 * accessed sequentially.
1368 filp->f_ralen += ahead;
1369 filp->f_rawin += filp->f_ralen;
1370 filp->f_raend = raend + ahead + 1;
1372 filp->f_ramax += filp->f_ramax;
1374 if (filp->f_ramax > max_readahead)
1375 filp->f_ramax = max_readahead;
1377 #ifdef PROFILE_READAHEAD
1378 profile_readahead((reada_ok == 2), filp);
1386 * Mark a page as having seen activity.
1388 * If it was already so marked, move it to the active queue and drop
1389 * the referenced bit. Otherwise, just mark it for future action..
/*
 * mark_page_accessed() - note a reference to @page.
 *
 * Visible logic: a page that is not on the active list but already has
 * its referenced bit set is handed to activate_page() and the bit is
 * cleared; SetPageReferenced() is the other action taken here.
 * NOTE(review): the braces and any else/return separating the two
 * actions are not present in this listing - confirm against the full file.
 */
1391 void fastcall mark_page_accessed(struct page *page)
1393 if (!PageActive(page) && PageReferenced(page)) {
1394 activate_page(page);
1395 ClearPageReferenced(page);
1397 SetPageReferenced(page);
1401 * This is a generic file read routine, and uses the
1402 * inode->i_op->readpage() function for the actual low-level
1405 * This is really ugly. But the goto's actually try to clarify some
1406 * of the logic when it comes to error handling etc.
/*
 * do_generic_file_read() - page-cache read loop used by generic_file_read()
 * and common_sendfile() in this file.
 * @filp:  file being read; its f_raend/f_rawin/f_ramax/f_reada read-ahead
 *         fields are consulted and f_ramax is adjusted here.
 * @ppos:  file position.  Split into a page index and an in-page offset on
 *         entry and rebuilt from them before returning.
 * @desc:  read descriptor.  desc->count bounds the transfer; desc->error is
 *         set to -ENOMEM when page_cache_alloc() is the failing step, or to
 *         the value returned by ->readpage().
 * @actor: invoked as actor(desc, page, offset, nr) on each up-to-date page;
 *         its return value is treated as the number of bytes consumed.
 *
 * Flow visible in this listing:
 *  - f_ramax is raised to cover the request, then clamped between
 *    vm_min_readahead (when reada_ok) and get_max_readahead(inode).
 *  - each page is looked up with __find_page_nolock() under pagecache_lock;
 *    a page that is not up to date goes through generic_file_readahead()
 *    and, if still needed, mapping->a_ops->readpage().
 *  - a missing page is allocated with the lock dropped, the lookup is
 *    repeated under the lock, and the new page is added with
 *    __add_to_page_cache() + lru_cache_add().
 *  - on exit *ppos is updated, cached_page is released and the inode's
 *    atime is updated.
 * NOTE(review): labels, gotos, lock/unlock of the page and several
 * condition lines are missing from this listing; the order above is taken
 * only from the lines that are present.
 */
1408 void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
1410 struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
1411 struct inode *inode = mapping->host;
1412 unsigned long index, offset;
1413 struct page *cached_page;
1416 int max_readahead = get_max_readahead(inode);
1419 index = *ppos >> PAGE_CACHE_SHIFT;
1420 offset = *ppos & ~PAGE_CACHE_MASK;
1423 * If the current position is outside the previous read-ahead window,
1424 * we reset the current read-ahead context and set read ahead max to zero
1425 * (will be set to just needed value later),
1426 * otherwise, we assume that the file accesses are sequential enough to
1427 * continue read-ahead.
1429 if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
1439 * Adjust the current value of read-ahead max.
1440 * If the read operation stay in the first half page, force no readahead.
1441 * Otherwise try to increase read ahead max just enough to do the read request.
1442 * Then, at least MIN_READAHEAD if read ahead is ok,
1443 * and at most MAX_READAHEAD in all cases.
1445 if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
1448 unsigned long needed;
1450 needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;
1452 if (filp->f_ramax < needed)
1453 filp->f_ramax = needed;
1455 if (reada_ok && filp->f_ramax < vm_min_readahead)
1456 filp->f_ramax = vm_min_readahead;
1457 if (filp->f_ramax > max_readahead)
1458 filp->f_ramax = max_readahead;
1462 struct page *page, **hash;
1463 unsigned long end_index, nr, ret;
1465 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1467 if (index > end_index)
1469 nr = PAGE_CACHE_SIZE;
1470 if (index == end_index) {
1471 nr = inode->i_size & ~PAGE_CACHE_MASK;
1479 * Try to find the data in the page cache..
1481 hash = page_hash(mapping, index);
1483 spin_lock(&pagecache_lock);
1484 page = __find_page_nolock(mapping, index, *hash);
1486 goto no_cached_page;
1488 page_cache_get(page);
1489 spin_unlock(&pagecache_lock);
1491 if (!Page_Uptodate(page))
1492 goto page_not_up_to_date;
1493 generic_file_readahead(reada_ok, filp, inode, page);
1495 /* If users can be writing to this page using arbitrary
1496 * virtual addresses, take care about potential aliasing
1497 * before reading the page on the kernel side.
1499 if (mapping->i_mmap_shared != NULL)
1500 flush_dcache_page(page);
1503 * Mark the page accessed if we read the
1504 * beginning or we just did an lseek.
1506 if (!offset || !filp->f_reada)
1507 mark_page_accessed(page);
1510 * Ok, we have the page, and it's up-to-date, so
1511 * now we can copy it to user space...
1513 * The actor routine returns how many bytes were actually used..
1514 * NOTE! This may not be the same as how much of a user buffer
1515 * we filled up (we may be padding etc), so we can only update
1516 * "pos" here (the actor routine has to update the user buffer
1517 * pointers and the remaining count).
1519 ret = actor(desc, page, offset, nr);
1521 index += offset >> PAGE_CACHE_SHIFT;
1522 offset &= ~PAGE_CACHE_MASK;
1524 page_cache_release(page);
1525 if (ret == nr && desc->count)
1530 * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
1532 page_not_up_to_date:
1533 generic_file_readahead(reada_ok, filp, inode, page);
1535 if (Page_Uptodate(page))
1538 /* Get exclusive access to the page ... */
1541 /* Did it get unhashed before we got the lock? */
1542 if (!page->mapping) {
1544 page_cache_release(page);
1548 /* Did somebody else fill it already? */
1549 if (Page_Uptodate(page)) {
1555 /* ... and start the actual read. The read will unlock the page. */
1556 error = mapping->a_ops->readpage(filp, page);
1559 if (Page_Uptodate(page))
1562 /* Again, try some read-ahead while waiting for the page to finish.. */
1563 generic_file_readahead(reada_ok, filp, inode, page);
1565 if (Page_Uptodate(page))
1570 /* UHHUH! A synchronous read error occurred. Report it */
1571 desc->error = error;
1572 page_cache_release(page);
1577 * Ok, it wasn't cached, so we need to create a new
1580 * We get here with the page cache lock held.
1583 spin_unlock(&pagecache_lock);
1584 cached_page = page_cache_alloc(mapping);
1586 desc->error = -ENOMEM;
1591 * Somebody may have added the page while we
1592 * dropped the page cache lock. Check for that.
1594 spin_lock(&pagecache_lock);
1595 page = __find_page_nolock(mapping, index, *hash);
1601 * Ok, add the new page to the hash-queues...
1604 __add_to_page_cache(page, mapping, index, hash);
1605 spin_unlock(&pagecache_lock);
1606 lru_cache_add(page);
1612 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1615 page_cache_release(cached_page);
1616 UPDATE_ATIME(inode);
/*
 * have_mapping_directIO() - report whether @mapping can do direct I/O.
 * Returns nonzero when mapping->a_ops provides either a direct_IO or a
 * direct_fileIO method.
 */
1619 static inline int have_mapping_directIO(struct address_space * mapping)
1621 return mapping->a_ops->direct_IO || mapping->a_ops->direct_fileIO;
1624 /* Switch between old and new directIO formats */
/*
 * do_call_directIO() - dispatch a direct I/O request for @filp's mapping.
 * If a_ops->direct_fileIO is set it is called with the struct file;
 * otherwise a_ops->direct_IO is called with mapping->host (the inode).
 * @rw, @iobuf, @offset and @blocksize are forwarded unchanged and the
 * method's return value is returned as-is.
 */
1625 static inline int do_call_directIO(int rw, struct file *filp, struct kiobuf *iobuf, unsigned long offset, int blocksize)
1627 struct address_space * mapping = filp->f_dentry->d_inode->i_mapping;
1629 if (mapping->a_ops->direct_fileIO)
1630 return mapping->a_ops->direct_fileIO(rw, filp, iobuf, offset, blocksize);
1631 return mapping->a_ops->direct_IO(rw, mapping->host, iobuf, offset, blocksize);
1635 * i_sem and i_alloc_sem should be held already. i_sem may be dropped
1636 * later once we've mapped the new IO. i_alloc_sem is kept until the IO
/*
 * generic_file_direct_IO() - O_DIRECT transfer between a user buffer and
 * the file, bypassing the page cache.
 * @rw:     READ or WRITE (READ gets the EOF clamp and mark_dirty_kiobuf()).
 * @filp:   file; its preallocated f_iobuf is used when bit 0 of
 *          f_iobuf_lock can be taken, otherwise a kiovec is allocated.
 * @buf:    user buffer address.
 * @count:  byte count.
 * @offset: byte offset in the file.
 *
 * Steps visible in this listing:
 *  - @offset, @count and @buf are each tested against the inode block
 *    size mask (1 << i_blkbits) - 1.
 *  - the mapping must pass have_mapping_directIO().
 *  - a READ extending past i_size is shortened to end at i_size.
 *  - data is flushed first: filemap_fdatasync(), then
 *    fsync_inode_data_buffers(), then filemap_fdatawait().
 *  - the transfer is done in pieces of at most KIO_MAX_ATOMIC_IO << 10
 *    bytes: map_user_kiobuf(), do_call_directIO() with the block number
 *    (offset + progress) >> blocksize_bits, unmap_kiobuf().
 *  - at the end the f_iobuf_lock bit is cleared or the private kiovec is
 *    freed.
 * NOTE(review): the outcome of each failed check, the loop head and the
 * return statement are not present in this listing.
 */
1640 static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset)
1642 ssize_t retval, progress;
1643 int new_iobuf, chunk_size, blocksize_mask, blocksize, blocksize_bits;
1645 struct kiobuf * iobuf;
1646 struct address_space * mapping = filp->f_dentry->d_inode->i_mapping;
1647 struct inode * inode = mapping->host;
1648 loff_t size = inode->i_size;
1651 iobuf = filp->f_iobuf;
1652 if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
1654 * A parallel read/write is using the preallocated iobuf
1655 * so just run slow and allocate a new one.
1657 retval = alloc_kiovec(1, &iobuf);
1663 blocksize = 1 << inode->i_blkbits;
1664 blocksize_bits = inode->i_blkbits;
1665 blocksize_mask = blocksize - 1;
1666 chunk_size = KIO_MAX_ATOMIC_IO << 10;
1669 if ((offset & blocksize_mask) || (count & blocksize_mask) || ((unsigned long) buf & blocksize_mask))
1671 if (!have_mapping_directIO(mapping))
1674 if ((rw == READ) && (offset + count > size))
1675 count = size - offset;
1678 * Flush to disk exclusively the _data_, metadata must remain
1679 * completly asynchronous or performance will go to /dev/null.
1681 retval = filemap_fdatasync(mapping);
1683 retval = fsync_inode_data_buffers(inode);
1685 retval = filemap_fdatawait(mapping);
1689 progress = retval = 0;
1692 if (iosize > chunk_size)
1693 iosize = chunk_size;
1695 retval = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
1699 retval = do_call_directIO(rw, filp, iobuf, (offset+progress) >> blocksize_bits, blocksize);
1701 if (rw == READ && retval > 0)
1702 mark_dirty_kiobuf(iobuf, retval);
1707 /* warning: weird semantics here, we're reporting a read behind the end of the file */
1711 unmap_kiobuf(iobuf);
1713 if (retval != iosize)
1722 clear_bit(0, &filp->f_iobuf_lock);
1724 free_kiovec(1, &iobuf);
/*
 * file_read_actor() - read actor that copies page data to user space.
 * @desc:   read descriptor; desc->buf is the user destination,
 *          desc->count the bytes still wanted.
 * @page:   source page (its mapping to kaddr is not visible in this listing).
 * @offset: byte offset inside the page.
 * @size:   number of bytes to copy.
 *
 * Copies with __copy_to_user(); `left` holds its return value and
 * desc->error can be set to -EFAULT.  desc->count is reduced by, and
 * desc->written increased by, `size`.
 * NOTE(review): the condition guarding the -EFAULT assignment, any
 * adjustment of `size`, and the return statement are not visible here.
 */
1729 int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1732 unsigned long left, count = desc->count;
1738 left = __copy_to_user(desc->buf, kaddr + offset, size);
1743 desc->error = -EFAULT;
1745 desc->count = count - size;
1746 desc->written += size;
/*
 * do_generic_direct_read() - direct-I/O read helper.
 * Calls generic_file_direct_IO(READ, ...) at position `pos` and advances
 * *ppos to pos + retval.
 * NOTE(review): the initialisation of `pos`, the condition guarding the
 * *ppos update, and the return are not visible in this listing.
 */
1751 inline ssize_t do_generic_direct_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1756 retval = generic_file_direct_IO(READ, filp, buf, count, pos);
1758 *ppos = pos + retval;
1763 * This is the "read()" routine for all filesystems
1764 * that can use the page cache directly.
/*
 * generic_file_read() - read() implementation built on the page cache.
 * @filp:  file to read.
 * @buf:   user destination buffer.
 * @count: bytes requested; tested for being negative when cast to ssize_t.
 * @ppos:  file position, updated by the helpers called here.
 *
 * Two paths are visible:
 *  - buffered: after access_ok(VERIFY_WRITE, buf, count), a
 *    read_descriptor_t is passed to do_generic_file_read() with
 *    file_read_actor; the result comes from desc.written or desc.error.
 *  - O_DIRECT: i_alloc_sem is taken for reading and i_sem is taken,
 *    i_size is sampled, do_generic_direct_read() runs, i_alloc_sem is
 *    released and the inode's atime is updated.  One exit skips the
 *    atime update ("goto out").
 * NOTE(review): the descriptor setup, the release of i_sem, the choice
 * between desc.written and desc.error, and the returns are not present
 * in this listing.
 */
1766 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1770 if ((ssize_t) count < 0)
1773 if (filp->f_flags & O_DIRECT)
1777 if (access_ok(VERIFY_WRITE, buf, count)) {
1781 read_descriptor_t desc;
1787 do_generic_file_read(filp, ppos, &desc, file_read_actor);
1789 retval = desc.written;
1791 retval = desc.error;
1800 struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
1801 struct inode *inode = mapping->host;
1805 goto out; /* skip atime */
1806 down_read(&inode->i_alloc_sem);
1807 down(&inode->i_sem);
1808 size = inode->i_size;
1810 retval = do_generic_direct_read(filp, buf, count, ppos);
1812 up_read(&inode->i_alloc_sem);
1813 UPDATE_ATIME(filp->f_dentry->d_inode);
/*
 * file_send_actor() - read actor used by sendfile: pushes page data to
 * the output file instead of a user buffer.
 * @desc:   read descriptor; desc->buf carries the output struct file
 *          (cast back from char *), desc->count the bytes still wanted.
 * @page:   source page.
 * @offset: byte offset inside the page.
 * @size:   number of bytes to send.
 *
 * Uses the output file's f_op->sendpage() when present, passing
 * `size<count` as the final argument; otherwise f_op->write() is called
 * on kaddr + offset.  `written` can be stored in desc->error, and it is
 * subtracted from desc->count and added to desc->written.
 * NOTE(review): the use of old_fs, the mapping of the page to kaddr and
 * the condition around the desc->error assignment are not visible here.
 */
1818 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
1821 unsigned long count = desc->count;
1822 struct file *file = (struct file *) desc->buf;
1827 if (file->f_op->sendpage) {
1828 written = file->f_op->sendpage(file, page, offset,
1829 size, &file->f_pos, size<count);
1832 mm_segment_t old_fs;
1838 written = file->f_op->write(file, kaddr + offset, size, &file->f_pos);
1844 desc->error = written;
1847 desc->count = count - written;
1848 desc->written += written;
/*
 * common_sendfile() - shared body of sys_sendfile() and sys_sendfile64().
 * @out_fd: descriptor written to.
 * @in_fd:  descriptor read from.
 * @offset: position to read from; &in_file->f_pos can be substituted
 *          (the guarding condition is not visible in this listing).
 * @count:  number of bytes to transfer.
 *
 * Checks visible here: the input file must be open for reading, its
 * mapping must have ->readpage, and rw_verify_area(READ, ...) is
 * consulted; the output file must be open for writing, must have
 * f_op->write, and rw_verify_area(WRITE, ...) is consulted.  The copy is
 * do_generic_file_read() with file_send_actor, with the output file
 * passed through desc.buf.  The result is taken from desc.written or
 * desc.error.
 * NOTE(review): the error codes, the fput() calls and the returns are
 * not present in this listing.
 */
1852 static ssize_t common_sendfile(int out_fd, int in_fd, loff_t *offset, size_t count)
1855 struct file * in_file, * out_file;
1856 struct inode * in_inode, * out_inode;
1859 * Get input file, and verify that it is ok..
1862 in_file = fget(in_fd);
1865 if (!(in_file->f_mode & FMODE_READ))
1868 in_inode = in_file->f_dentry->d_inode;
1871 if (!in_inode->i_mapping->a_ops->readpage)
1873 retval = rw_verify_area(READ, in_file, &in_file->f_pos, count);
1878 * Get output file, and verify that it is ok..
1881 out_file = fget(out_fd);
1884 if (!(out_file->f_mode & FMODE_WRITE))
1887 if (!out_file->f_op || !out_file->f_op->write)
1889 out_inode = out_file->f_dentry->d_inode;
1890 retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
1896 read_descriptor_t desc;
1899 offset = &in_file->f_pos;
1903 desc.buf = (char *) out_file;
1905 do_generic_file_read(in_file, offset, &desc, file_send_actor);
1907 retval = desc.written;
1909 retval = desc.error;
/*
 * sys_sendfile() - sendfile system call with an off_t offset.
 * The user offset is fetched with get_user(), the transfer is done by
 * common_sendfile(), and `pos` is written back narrowed to off_t via
 * put_user().
 * NOTE(review): in the visible line the put_user() result is not
 * captured; whether a fault there is reported cannot be told from this
 * listing.  The conditions around the fetch and the write-back are also
 * missing.
 */
1920 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
1922 loff_t pos, *ppos = NULL;
1926 if (unlikely(get_user(off, offset)))
1931 ret = common_sendfile(out_fd, in_fd, ppos, count);
1933 put_user((off_t)pos, offset);
/*
 * sys_sendfile64() - sendfile system call with a loff_t offset.
 * The user offset is copied in with copy_from_user(), the transfer is
 * done by common_sendfile(), and `pos` is written back with put_user().
 * NOTE(review): in the visible line the put_user() result is not
 * captured; the conditions around the copy-in and the write-back are
 * missing from this listing.
 */
1937 asmlinkage ssize_t sys_sendfile64(int out_fd, int in_fd, loff_t *offset, size_t count)
1939 loff_t pos, *ppos = NULL;
1942 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1946 ret = common_sendfile(out_fd, in_fd, ppos, count);
1948 put_user(pos, offset);
/*
 * do_readahead() - start reads for @nr pages of @file beginning at page
 * @index.
 * Requires a mapping with a_ops->readpage.  Two limits are computed into
 * `max`: the file size rounded up to whole pages, and half of
 * nr_free_pages() + nr_inactive_pages.  Pages are requested with
 * page_cache_read().
 * NOTE(review): how `max` is applied to @index/@nr, the loop around
 * page_cache_read() and the return values are not visible in this
 * listing.
 */
1952 static ssize_t do_readahead(struct file *file, unsigned long index, unsigned long nr)
1954 struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
1957 if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1960 /* Limit it to the size of the file.. */
1961 max = (mapping->host->i_size + ~PAGE_CACHE_MASK) >> PAGE_CACHE_SHIFT;
1968 /* And limit it to a sane percentage of the inactive list.. */
1969 max = (nr_free_pages() + nr_inactive_pages) / 2;
1974 page_cache_read(file, index);
/*
 * sys_readahead() - readahead system call.
 * For a file opened for reading, converts the byte range into a first
 * page (offset >> PAGE_CACHE_SHIFT) and a page count (count plus the
 * in-page part of offset, shifted down by PAGE_CACHE_SHIFT) and passes
 * them to do_readahead().
 * NOTE(review): the fget()/fput() handling and the return are not
 * visible in this listing.
 */
1981 asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1989 if (file->f_mode & FMODE_READ) {
1990 unsigned long start = offset >> PAGE_CACHE_SHIFT;
1991 unsigned long len = (count + ((long)offset & ~PAGE_CACHE_MASK)) >> PAGE_CACHE_SHIFT;
1992 ret = do_readahead(file, start, len);
2000 * Read-ahead and flush behind for MADV_SEQUENTIAL areas. Since we are
2001 * sure this is sequential access, we don't need a flexible read-ahead
2002 * window size -- we can always use a large fixed size window.
/*
 * nopage_sequential_readahead() - fixed-window read-ahead for areas with
 * the sequential read hint.
 * @vma:      area being faulted; vma->vm_raend tracks the window end.
 * @pgoff:    page offset of the faulting page.
 * @filesize: file size in pages (upper bound for the read loop).
 *
 * The window is get_max_readahead() rounded up to a multiple of
 * CLUSTER_PAGES.  When the fault lands half a window before vm_raend,
 * clusters for the next window are requested with
 * read_cluster_nonblocking(), tq_disk is run, and - once vm_raend is more
 * than two windows past vm_pgoff - an older window is passed to
 * filemap_sync() with MS_INVALIDATE.  vm_raend then advances by one
 * window.
 *
 * NOTE(review): vm_raend is initialised as vm_pgoff + ra_window, yet
 * `start` is computed as vm_pgoff + vm_raend, which appears to count
 * vm_pgoff twice - confirm against the full file.
 * NOTE(review): the `end` clamp uses (vm_end >> PAGE_SHIFT) + vm_pgoff
 * without subtracting vm_start, unlike the endoff computation in
 * filemap_nopage() - confirm this is intended.
 */
2004 static void nopage_sequential_readahead(struct vm_area_struct * vma,
2005 unsigned long pgoff, unsigned long filesize)
2007 unsigned long ra_window;
2009 ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
2010 ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);
2012 /* vm_raend is zero if we haven't read ahead in this area yet. */
2013 if (vma->vm_raend == 0)
2014 vma->vm_raend = vma->vm_pgoff + ra_window;
2017 * If we've just faulted the page half-way through our window,
2018 * then schedule reads for the next window, and release the
2019 * pages in the previous window.
2021 if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
2022 unsigned long start = vma->vm_pgoff + vma->vm_raend;
2023 unsigned long end = start + ra_window;
2025 if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
2026 end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;
2030 while ((start < end) && (start < filesize)) {
2031 if (read_cluster_nonblocking(vma->vm_file,
2032 start, filesize) < 0)
2034 start += CLUSTER_PAGES;
2036 run_task_queue(&tq_disk);
2038 /* if we're far enough past the beginning of this area,
2039 recycle pages that are in the previous window. */
2040 if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
2041 unsigned long window = ra_window << PAGE_SHIFT;
2043 end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
2044 end -= window + window;
2045 filemap_sync(vma, end - window, window, MS_INVALIDATE);
2048 vma->vm_raend += ra_window;
2055 * filemap_nopage() is invoked via the vma operations vector for a
2056 * mapped memory region to read in file data during a page fault.
2058 * The goto's are kind of ugly, but this streamlines the normal case of having
2059 * it in the page cache, and handles the special cases reasonably without
2060 * having a lot of duplicated code.
/*
 * filemap_nopage() - nopage handler for file mappings (installed through
 * generic_file_vm_ops in this file).
 * @area:    faulting vm area; area->vm_file supplies the mapping.
 * @address: faulting virtual address.
 * @unused:  not referenced in the visible lines.
 *
 * Visible flow:
 *  - pgoff is the file page offset of @address; endoff is the offset of
 *    the area's end.
 *  - the file size in pages is computed; pgoff >= size is checked
 *    together with area->vm_mm == current->mm.
 *  - the page is looked up with __find_get_page(); a page that is not up
 *    to date is diverted to page_not_uptodate.
 *  - on success the sequential read hint triggers
 *    nopage_sequential_readahead(), then mark_page_accessed() and
 *    flush_page_to_ram() are applied.
 *  - a missing page is read with read_cluster_nonblocking() (inside the
 *    file and not a random-hint area) or page_cache_read(); -ENOMEM is
 *    singled out among the errors.
 *  - a page that is not up to date is passed to ->readpage(), and again
 *    after ClearPageError() on the second attempt; a page whose ->mapping
 *    has gone is released.
 * NOTE(review): labels, page locking/waiting, the retry gotos and all
 * return values are missing from this listing.
 */
2062 struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int unused)
2065 struct file *file = area->vm_file;
2066 struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
2067 struct inode *inode = mapping->host;
2068 struct page *page, **hash;
2069 unsigned long size, pgoff, endoff;
2071 pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
2072 endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
2076 * An external ptracer can access pages that normally aren't
2079 size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
2080 if ((pgoff >= size) && (area->vm_mm == current->mm))
2083 /* The "size" of the file, as far as mmap is concerned, isn't bigger than the mapping */
2088 * Do we have something in the page cache already?
2090 hash = page_hash(mapping, pgoff);
2092 page = __find_get_page(mapping, pgoff, hash);
2094 goto no_cached_page;
2097 * Ok, found a page in the page cache, now we need to check
2098 * that it's up-to-date.
2100 if (!Page_Uptodate(page))
2101 goto page_not_uptodate;
2105 * Try read-ahead for sequential areas.
2107 if (VM_SequentialReadHint(area))
2108 nopage_sequential_readahead(area, pgoff, size);
2111 * Found the page and have a reference on it, need to check sharing
2112 * and possibly copy it over to another page..
2114 mark_page_accessed(page);
2115 flush_page_to_ram(page);
2120 * If the requested offset is within our file, try to read a whole
2121 * cluster of pages at once.
2123 * Otherwise, we're off the end of a privately mapped file,
2124 * so we need to map a zero page.
2126 if ((pgoff < size) && !VM_RandomReadHint(area))
2127 error = read_cluster_nonblocking(file, pgoff, size);
2129 error = page_cache_read(file, pgoff);
2132 * The page we want has now been added to the page cache.
2133 * In the unlikely event that someone removed it in the
2134 * meantime, we'll just come back here and read it again.
2140 * An error return from page_cache_read can result if the
2141 * system is low on memory, or a problem occurs while trying
2144 if (error == -ENOMEM)
2151 /* Did it get unhashed while we waited for it? */
2152 if (!page->mapping) {
2154 page_cache_release(page);
2158 /* Did somebody else get it up-to-date? */
2159 if (Page_Uptodate(page)) {
2164 if (!mapping->a_ops->readpage(file, page)) {
2166 if (Page_Uptodate(page))
2171 * Umm, take care of errors if the page isn't up-to-date.
2172 * Try to re-read it _once_. We do this synchronously,
2173 * because there really aren't any performance issues here
2174 * and we need to check for errors.
2178 /* Somebody truncated the page on us? */
2179 if (!page->mapping) {
2181 page_cache_release(page);
2185 /* Somebody else successfully read it in? */
2186 if (Page_Uptodate(page)) {
2190 ClearPageError(page);
2191 if (!mapping->a_ops->readpage(file, page)) {
2193 if (Page_Uptodate(page))
2198 * Things didn't work out. Return zero to tell the
2199 * mm layer so, possibly freeing the page cache page first.
2201 page_cache_release(page);
2205 /* Called with mm->page_table_lock held to protect against other
2206 * threads/the swapper from ripping pte's out from under us.
/*
 * filemap_sync_pte() - move the dirty state of one pte to its page.
 * For a present pte whose page passes VALID_PAGE(), is not reserved, and
 * whose dirty bit was set (tested and cleared atomically by
 * ptep_test_and_clear_dirty()), the TLB entry for @address is flushed
 * and the page is marked dirty with set_page_dirty().
 * @flags is not referenced in the visible lines.
 * NOTE(review): the load of `pte` from @ptep and the return value are
 * not present in this listing.
 */
2208 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
2209 unsigned long address, unsigned int flags)
2213 if (pte_present(pte)) {
2214 struct page *page = pte_page(pte);
2215 if (VALID_PAGE(page) && !PageReserved(page) && ptep_test_and_clear_dirty(ptep)) {
2216 flush_tlb_page(vma, address);
2217 set_page_dirty(page);
/*
 * filemap_sync_pte_range() - apply filemap_sync_pte() to each pte of one
 * pmd entry that falls inside [address, address + size).
 * The pmd-aligned part of @address is folded into @offset and @address
 * is reduced to its offset within the pmd, so the full virtual address
 * handed to filemap_sync_pte() is address + offset.  The loop advances
 * by PAGE_SIZE and stops at `end` or when @address wraps to 0; results
 * are OR-ed into `error`.
 * NOTE(review): the handling of a bad pmd, the clamp of `end`, the pte
 * increment and the return are not visible in this listing.
 */
2223 static inline int filemap_sync_pte_range(pmd_t * pmd,
2224 unsigned long address, unsigned long size,
2225 struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
2233 if (pmd_bad(*pmd)) {
2238 pte = pte_offset(pmd, address);
2239 offset += address & PMD_MASK;
2240 address &= ~PMD_MASK;
2241 end = address + size;
2246 error |= filemap_sync_pte(pte, vma, address + offset, flags);
2247 address += PAGE_SIZE;
2249 } while (address && (address < end));
/*
 * filemap_sync_pmd_range() - apply filemap_sync_pte_range() to each pmd
 * of one pgd entry that falls inside [address, address + size).
 * `offset` keeps the pgd-aligned part of @address while @address is
 * reduced to its offset within the pgd; `end` is compared against
 * PGDIR_SIZE.  The loop steps to the next PMD_SIZE boundary and stops at
 * `end` or when @address wraps to 0; results are OR-ed into `error`.
 * NOTE(review): the handling of a bad pgd, the effect of the PGDIR_SIZE
 * comparison, the pmd increment and the return are not visible in this
 * listing.
 */
2253 static inline int filemap_sync_pmd_range(pgd_t * pgd,
2254 unsigned long address, unsigned long size,
2255 struct vm_area_struct *vma, unsigned int flags)
2258 unsigned long offset, end;
2263 if (pgd_bad(*pgd)) {
2268 pmd = pmd_offset(pgd, address);
2269 offset = address & PGDIR_MASK;
2270 address &= ~PGDIR_MASK;
2271 end = address + size;
2272 if (end > PGDIR_SIZE)
2276 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
2277 address = (address + PMD_SIZE) & PMD_MASK;
2279 } while (address && (address < end));
/*
 * filemap_sync() - walk the page tables for [address, address + size) of
 * @vma and transfer pte dirty bits to the pages.
 * @vma:     area whose mm's page tables are walked.
 * @address: start virtual address.
 * @size:    length in bytes.
 * @flags:   passed down unchanged to the per-level helpers.
 *
 * Holds vma->vm_mm->page_table_lock for the whole walk.  The cache is
 * flushed for the range before the walk and the TLB after it.  Each pgd
 * entry is handled by filemap_sync_pmd_range(); the loop steps to the
 * next PGDIR_SIZE boundary and stops at `end` or when @address wraps to
 * 0.  Results are OR-ed into `error`.
 * NOTE(review): the pgd increment and the return statement are not
 * visible in this listing.
 */
2283 int filemap_sync(struct vm_area_struct * vma, unsigned long address,
2284 size_t size, unsigned int flags)
2287 unsigned long end = address + size;
2290 /* Aquire the lock early; it may be possible to avoid dropping
2291 * and reaquiring it repeatedly.
2293 spin_lock(&vma->vm_mm->page_table_lock);
2295 dir = pgd_offset(vma->vm_mm, address);
2296 flush_cache_range(vma->vm_mm, end - size, end);
2300 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
2301 address = (address + PGDIR_SIZE) & PGDIR_MASK;
2303 } while (address && (address < end));
2304 flush_tlb_range(vma->vm_mm, end - size, end);
2306 spin_unlock(&vma->vm_mm->page_table_lock);
/*
 * vm operations installed by generic_file_mmap(): page faults are
 * served by filemap_nopage().  Only the nopage member is set in the
 * visible lines.
 */
2311 static struct vm_operations_struct generic_file_vm_ops = {
2312 nopage: filemap_nopage,
2315 /* This is used for a general mmap of a disk file */
/*
 * generic_file_mmap() - mmap method for files backed by the page cache.
 * A mapping that is both VM_SHARED and VM_MAYWRITE is checked for
 * a_ops->writepage; every mapping is checked for a_ops->readpage.  The
 * inode's atime is then updated and vma->vm_ops is pointed at
 * generic_file_vm_ops.
 * NOTE(review): the values returned on the failed checks and on success
 * are not visible in this listing.
 */
2317 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
2319 struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
2320 struct inode *inode = mapping->host;
2322 if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
2323 if (!mapping->a_ops->writepage)
2326 if (!mapping->a_ops->readpage)
2328 UPDATE_ATIME(inode);
2329 vma->vm_ops = &generic_file_vm_ops;
2334 * The msync() system call.
2338 * MS_SYNC syncs the entire file - including mappings.
2340 * MS_ASYNC initiates writeout of just the dirty mapped data.
2341 * This provides no guarantee of file integrity - things like indirect
2342 * blocks may not have started writeout. MS_ASYNC is primarily useful
2343 * where the application knows that it has finished with the data and
2344 * wishes to intelligently schedule its own I/O traffic.
/*
 * msync_interval() - msync one range [start, end) lying inside @vma.
 * MS_INVALIDATE on a VM_LOCKED area is tested first.  For a shared file
 * mapping, filemap_sync() transfers pte dirty bits; if that succeeds and
 * MS_SYNC or MS_ASYNC is set, i_sem is taken and filemap_fdatasync() is
 * called.  With MS_SYNC the file's f_op->fsync() (when present, called
 * with a third argument of 1) and filemap_fdatawait() follow.
 * NOTE(review): the result of the VM_LOCKED test, the merging of `err`
 * into `ret`, the release of i_sem and the return are not visible in
 * this listing.
 */
2346 static int msync_interval(struct vm_area_struct * vma,
2347 unsigned long start, unsigned long end, int flags)
2350 struct file * file = vma->vm_file;
2352 if ( (flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED) )
2355 if (file && (vma->vm_flags & VM_SHARED)) {
2356 ret = filemap_sync(vma, start, end-start, flags);
2358 if (!ret && (flags & (MS_SYNC|MS_ASYNC))) {
2359 struct inode * inode = file->f_dentry->d_inode;
2361 down(&inode->i_sem);
2362 ret = filemap_fdatasync(inode->i_mapping);
2363 if (flags & MS_SYNC) {
2366 if (file->f_op && file->f_op->fsync) {
2367 err = file->f_op->fsync(file, file->f_dentry, 1);
2371 err = filemap_fdatawait(inode->i_mapping);
/*
 * sys_msync() - the msync(2) system call.
 * @start: start address; tested for page alignment.
 * @len:   length in bytes, rounded up to a whole number of pages.
 * @flags: only MS_ASYNC, MS_INVALIDATE and MS_SYNC are accepted, and
 *         MS_ASYNC together with MS_SYNC is tested for.
 *
 * Runs with current->mm->mmap_sem held for reading.  Walks the vmas
 * covering [start, end), calling msync_interval() on each covered piece;
 * a hole in the range is remembered as -ENOMEM in unmapped_error.
 * NOTE(review): the computation of `end`, the gotos/returns and the vma
 * advance are not present in this listing.
 */
2381 asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
2384 struct vm_area_struct * vma;
2385 int unmapped_error, error = -EINVAL;
2387 down_read(&current->mm->mmap_sem);
2388 if (start & ~PAGE_MASK)
2390 len = (len + ~PAGE_MASK) & PAGE_MASK;
2394 if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
2396 if ((flags & MS_ASYNC) && (flags & MS_SYNC))
2403 * If the interval [start,end) covers some unmapped address ranges,
2404 * just ignore them, but return -ENOMEM at the end.
2406 vma = find_vma(current->mm, start);
2409 /* Still start < end. */
2413 /* Here start < vma->vm_end. */
2414 if (start < vma->vm_start) {
2415 unmapped_error = -ENOMEM;
2416 start = vma->vm_start;
2418 /* Here vma->vm_start <= start < vma->vm_end. */
2419 if (end <= vma->vm_end) {
2421 error = msync_interval(vma, start, end, flags);
2425 error = unmapped_error;
2428 /* Here vma->vm_start <= start < vma->vm_end < end. */
2429 error = msync_interval(vma, start, vma->vm_end, flags);
2432 start = vma->vm_end;
2436 up_read(&current->mm->mmap_sem);
/*
 * setup_read_behavior() - replace the read hint bits of @vma.
 * Clears any existing hint with VM_ClearReadHint(), then sets
 * VM_SEQ_READ for MADV_SEQUENTIAL; VM_RAND_READ is the other flag set
 * here (presumably for MADV_RANDOM - its case label is not visible in
 * this listing).
 */
2440 static inline void setup_read_behavior(struct vm_area_struct * vma,
2443 VM_ClearReadHint(vma);
2445 case MADV_SEQUENTIAL:
2446 vma->vm_flags |= VM_SEQ_READ;
2449 vma->vm_flags |= VM_RAND_READ;
/*
 * madvise_fixup_start() - split @vma so that its leading part, up to
 * @end, becomes a separate area carrying @behavior.
 * A new vm_area_struct `n` is allocated from vm_area_cachep, given the
 * read hint, takes a file reference via get_file() and is checked for a
 * vm_ops->open hook.  @vma's vm_pgoff is advanced by the pages being cut
 * off; then, under lock_vma_mappings() and mm->page_table_lock,
 * vma->vm_start is moved to @end and `n` is inserted.
 * NOTE(review): the allocation-failure path, the initialisation of `n`
 * (copy of *vma, n->vm_end), the open() call itself and the return are
 * not visible in this listing.
 */
2457 static long madvise_fixup_start(struct vm_area_struct * vma,
2458 unsigned long end, int behavior)
2460 struct vm_area_struct * n;
2461 struct mm_struct * mm = vma->vm_mm;
2463 n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2468 setup_read_behavior(n, behavior);
2471 get_file(n->vm_file);
2472 if (n->vm_ops && n->vm_ops->open)
2474 vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
2475 lock_vma_mappings(vma);
2476 spin_lock(&mm->page_table_lock);
2477 vma->vm_start = end;
2478 __insert_vm_struct(mm, n);
2479 spin_unlock(&mm->page_table_lock);
2480 unlock_vma_mappings(vma);
/*
 * madvise_fixup_end() - split @vma so that its trailing part, from
 * @start, becomes a separate area carrying @behavior.
 * A new vm_area_struct `n` is allocated from vm_area_cachep, starts at
 * @start with vm_pgoff advanced by the pages before @start, is given the
 * read hint, takes a file reference via get_file() and is checked for a
 * vm_ops->open hook.  Under lock_vma_mappings() and mm->page_table_lock,
 * vma->vm_end is pulled back to @start and `n` is inserted.
 * NOTE(review): the allocation-failure path, the copy of *vma into `n`,
 * the open() call itself and the return are not visible in this listing.
 */
2484 static long madvise_fixup_end(struct vm_area_struct * vma,
2485 unsigned long start, int behavior)
2487 struct vm_area_struct * n;
2488 struct mm_struct * mm = vma->vm_mm;
2490 n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2494 n->vm_start = start;
2495 n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
2496 setup_read_behavior(n, behavior);
2499 get_file(n->vm_file);
2500 if (n->vm_ops && n->vm_ops->open)
2502 lock_vma_mappings(vma);
2503 spin_lock(&mm->page_table_lock);
2504 vma->vm_end = start;
2505 __insert_vm_struct(mm, n);
2506 spin_unlock(&mm->page_table_lock);
2507 unlock_vma_mappings(vma);
/*
 * madvise_fixup_middle() - split @vma into three so that only
 * [start, end) carries @behavior.
 * Two vm_area_structs are allocated from vm_area_cachep (`left` is freed
 * again on one path following the allocation of `right`).  `left` ends
 * at @start; `right` begins at @end with vm_pgoff advanced accordingly
 * and vm_raend reset to 0.  The file's f_count is raised by 2 and
 * vm_ops->open, when present, is called for both new areas.  @vma's
 * vm_pgoff is advanced; then, under lock_vma_mappings() and
 * mm->page_table_lock, vma->vm_start becomes @start, @vma gets the read
 * hint, and `left` and `right` are inserted.
 * NOTE(review): the copies of *vma into left/right, the update of
 * vma->vm_end, the condition around the f_count increment, the failure
 * checks and the return are not visible in this listing.
 */
2511 static long madvise_fixup_middle(struct vm_area_struct * vma,
2512 unsigned long start, unsigned long end, int behavior)
2514 struct vm_area_struct * left, * right;
2515 struct mm_struct * mm = vma->vm_mm;
2517 left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2520 right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2522 kmem_cache_free(vm_area_cachep, left);
2527 left->vm_end = start;
2528 right->vm_start = end;
2529 right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
2531 right->vm_raend = 0;
2533 atomic_add(2, &vma->vm_file->f_count);
2535 if (vma->vm_ops && vma->vm_ops->open) {
2536 vma->vm_ops->open(left);
2537 vma->vm_ops->open(right);
2539 vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
2541 lock_vma_mappings(vma);
2542 spin_lock(&mm->page_table_lock);
2543 vma->vm_start = start;
2545 setup_read_behavior(vma, behavior);
2546 __insert_vm_struct(mm, left);
2547 __insert_vm_struct(mm, right);
2548 spin_unlock(&mm->page_table_lock);
2549 unlock_vma_mappings(vma);
2554 * We can potentially split a vm area into separate
2555 * areas, each area with its own behavior.
/*
 * madvise_behavior() - apply a read-hint @behavior to [start, end) of
 * @vma, splitting the area when the range does not cover it entirely.
 * The mm's map_count is first compared with max_map_count.  Dispatch:
 *   whole area           -> setup_read_behavior() in place
 *   range at the start   -> madvise_fixup_start()
 *   range at the end     -> madvise_fixup_end()
 *   range in the middle  -> madvise_fixup_middle()
 * NOTE(review): the value returned when the map_count test fires, the
 * else keywords and the final return are not visible in this listing.
 */
2557 static long madvise_behavior(struct vm_area_struct * vma,
2558 unsigned long start, unsigned long end, int behavior)
2562 /* This caps the number of vma's this process can own */
2563 if (vma->vm_mm->map_count > max_map_count)
2566 if (start == vma->vm_start) {
2567 if (end == vma->vm_end) {
2568 setup_read_behavior(vma, behavior);
2571 error = madvise_fixup_start(vma, end, behavior);
2573 if (end == vma->vm_end)
2574 error = madvise_fixup_end(vma, start, behavior);
2576 error = madvise_fixup_middle(vma, start, end, behavior);
2583 * Schedule all required I/O operations, then run the disk queue
2584 * to make sure they are started. Do not wait for completion.
/*
 * madvise_willneed() - start reading the file pages behind
 * [start, end) of @vma without waiting for them.
 * `error` starts as -EBADF.  The mapping must provide ->readpage.  The
 * file size is converted to pages; @start and @end are converted from
 * virtual addresses to file page offsets, with @end compared against
 * vma->vm_end first.  Unless the area has the random read hint, the
 * range is widened to cluster boundaries and read with
 * read_cluster_nonblocking(); a second loop reads with
 * page_cache_read().  Both loops stop at `end` or at the file size.
 * Finally tq_disk is run so the queued requests are started.
 * NOTE(review): the test for a missing file, the loop exits on error,
 * the start increment in the second loop and the return are not visible
 * in this listing.
 */
2586 static long madvise_willneed(struct vm_area_struct * vma,
2587 unsigned long start, unsigned long end)
2589 long error = -EBADF;
2591 struct inode * inode;
2594 /* Doesn't work if there's no mapped file. */
2597 file = vma->vm_file;
2598 inode = file->f_dentry->d_inode;
2599 if (!inode->i_mapping->a_ops->readpage)
2601 size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
2603 start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2604 if (end > vma->vm_end)
2606 end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2610 /* round to cluster boundaries if this isn't a "random" area. */
2611 if (!VM_RandomReadHint(vma)) {
2612 start = CLUSTER_OFFSET(start);
2613 end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);
2615 while ((start < end) && (start < size)) {
2616 error = read_cluster_nonblocking(file, start, size);
2617 start += CLUSTER_PAGES;
2622 while ((start < end) && (start < size)) {
2623 error = page_cache_read(file, start);
2630 /* Don't wait for someone else to push these requests. */
2631 run_task_queue(&tq_disk);
2637 * Application no longer needs these pages. If the pages are dirty,
2638 * it's OK to just throw them away. The app will be more careful about
2639 * data it wants to keep. Be sure to free swap resources too. The
2640 * zap_page_range call sets things up for refill_inactive to actually free
2641 * these pages later if no one else has touched them in the meantime,
2642 * although we could add these pages to a global reuse list for
2643 * refill_inactive to pick up before reclaiming other pages.
2645 * NB: This interface discards data rather than pushes it out to swap,
2646 * as some implementations do. This has performance implications for
2647 * applications like large transactional databases which want to discard
2648 * pages in anonymous maps after committing to backing store the data
2649 * that was kept in them. There is no reason to write this data out to
2650 * the swap area if the application is discarding it.
2652 * An interface that causes the system to free clean pages and flush
2653 * dirty pages is already available as msync(MS_INVALIDATE).
/*
 * madvise_dontneed() - drop the pages mapped in [start, end) of @vma.
 * A VM_LOCKED area is tested for first; the range is then passed to
 * zap_page_range() on the area's mm.
 * NOTE(review): the value returned for a locked area and the final
 * return are not visible in this listing.
 */
2655 static long madvise_dontneed(struct vm_area_struct * vma,
2656 unsigned long start, unsigned long end)
2658 if (vma->vm_flags & VM_LOCKED)
2661 zap_page_range(vma->vm_mm, start, end - start);
/*
 * madvise_vma() - apply @behavior to [start, end) within a single @vma.
 * `error` starts as -EBADF.  MADV_SEQUENTIAL (and whichever other case
 * labels share its branch - they are not visible in this listing) go to
 * madvise_behavior(); the other two visible calls are
 * madvise_willneed() and madvise_dontneed().
 * NOTE(review): most case labels, the default branch and the return are
 * missing from this listing.
 */
2665 static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
2666 unsigned long end, int behavior)
2668 long error = -EBADF;
2672 case MADV_SEQUENTIAL:
2674 error = madvise_behavior(vma, start, end, behavior);
2678 error = madvise_willneed(vma, start, end);
2682 error = madvise_dontneed(vma, start, end);
2694 * The madvise(2) system call.
2696 * Applications can use madvise() to advise the kernel how it should
2697 * handle paging I/O in this VM area. The idea is to help the kernel
2698 * use appropriate read-ahead and caching techniques. The information
2699 * provided is advisory only, and can be safely disregarded by the
2700 * kernel without affecting the correct operation of the application.
2703 * MADV_NORMAL - the default behavior is to read clusters. This
2704 * results in some read-ahead and read-behind.
2705 * MADV_RANDOM - the system should read the minimum amount of data
2706 * on any access, since it is unlikely that the appli-
2707 * cation will need more than what it asks for.
2708 * MADV_SEQUENTIAL - pages in the given range will probably be accessed
2709 * once, so they can be aggressively read ahead, and
2710 * can be freed soon after they are accessed.
2711 * MADV_WILLNEED - the application is notifying the system to read
2713 * MADV_DONTNEED - the application is finished with the given range,
2714 * so the kernel can free resources associated with it.
2718 * -EINVAL - start + len < 0, start is not page-aligned,
2719 * "behavior" is not a valid value, or application
2720 * is attempting to release locked or shared pages.
2721 * -ENOMEM - addresses in the specified range are not currently
2722 * mapped, or are outside the AS of the process.
2723 * -EIO - an I/O error occurred while paging in data.
2724 * -EBADF - map exists, but area maps something that isn't a file.
2725 * -EAGAIN - a kernel resource was temporarily unavailable.
/*
 * sys_madvise() - the madvise(2) system call.
 * @start:    start address; tested for page alignment.
 * @len:      length in bytes, rounded up to a whole number of pages.
 * @behavior: advice value handed to madvise_vma().
 *
 * Runs with current->mm->mmap_sem held for writing (the helpers may
 * split vmas).  Walks the vmas covering [start, end), calling
 * madvise_vma() on each covered piece; a hole in the range is remembered
 * as -ENOMEM in unmapped_error.
 * NOTE(review): the computation of `end`, the gotos/returns and the vma
 * advance are not present in this listing.
 */
2727 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
2730 struct vm_area_struct * vma;
2731 int unmapped_error = 0;
2732 int error = -EINVAL;
2734 down_write(&current->mm->mmap_sem);
2736 if (start & ~PAGE_MASK)
2738 len = (len + ~PAGE_MASK) & PAGE_MASK;
2748 * If the interval [start,end) covers some unmapped address
2749 * ranges, just ignore them, but return -ENOMEM at the end.
2751 vma = find_vma(current->mm, start);
2753 /* Still start < end. */
2758 /* Here start < vma->vm_end. */
2759 if (start < vma->vm_start) {
2760 unmapped_error = -ENOMEM;
2761 start = vma->vm_start;
2764 /* Here vma->vm_start <= start < vma->vm_end. */
2765 if (end <= vma->vm_end) {
2767 error = madvise_vma(vma, start, end,
2772 error = unmapped_error;
2776 /* Here vma->vm_start <= start < vma->vm_end < end. */
2777 error = madvise_vma(vma, start, vma->vm_end, behavior);
2780 start = vma->vm_end;
2785 up_write(&current->mm->mmap_sem);
2790 * Later we can get more picky about what "in core" means precisely.
2791 * For now, simply check to see if the page is in the page cache,
2792 * and is up to date; i.e. that no page-in operation would be required
2793 * at this time if an application were to map and access this page.
/*
 * mincore_page() - residency check for one file page.
 * Looks up page @pgoff of the file mapped by @vma in the page cache with
 * __find_page_nolock() under pagecache_lock and tests that a page was
 * found and is up to date.  `present` starts as 0.
 * NOTE(review): the assignment made when the test succeeds and the
 * return statement are not visible in this listing.
 */
2795 static unsigned char mincore_page(struct vm_area_struct * vma,
2796 unsigned long pgoff)
2798 unsigned char present = 0;
2799 struct address_space * as = vma->vm_file->f_dentry->d_inode->i_mapping;
2800 struct page * page, ** hash = page_hash(as, pgoff);
2802 spin_lock(&pagecache_lock);
2803 page = __find_page_nolock(as, pgoff, *hash);
2804 if ((page) && (Page_Uptodate(page)))
2806 spin_unlock(&pagecache_lock);
2812 * Do a chunk of "sys_mincore()". We've already checked
2813 * all the arguments, we hold the mmap semaphore: we should
2814 * just return the amount of info we're asked for.
2816 static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages)
2818 unsigned long i, nr, pgoff;
2819 struct vm_area_struct *vma = find_vma(current->mm, addr);
2822 * find_vma() didn't find anything above us, or we're
2823 * in an unmapped hole in the address space: ENOMEM.
2825 if (!vma || addr < vma->vm_start)
2829 * Ok, got it. But check whether it's a segment we support
2830 * mincore() on. Right now, we don't do any anonymous mappings.
2832 * FIXME: This is just stupid. And returning ENOMEM is
2833 * stupid too. We should just look at the page tables. But
2834 * this is what we've traditionally done, so we'll just
2835 * continue doing it.
2841 * Calculate how many pages there are left in the vma, and
2842 * what the pgoff is for our address.
2844 nr = (vma->vm_end - addr) >> PAGE_SHIFT;
2848 pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
2849 pgoff += vma->vm_pgoff;
2851 /* And then we just fill the sucker in.. */
2852 for (i = 0 ; i < nr; i++, pgoff++)
2853 vec[i] = mincore_page(vma, pgoff);
2859 * The mincore(2) system call.
2861 * mincore() returns the memory residency status of the pages in the
2862 * current process's address space specified by [addr, addr + len).
2863 * The status is returned in a vector of bytes. The least significant
2864 * bit of each byte is 1 if the referenced page is in memory, otherwise
2867 * Because the status of a page can change after mincore() checks it
2868 * but before it returns to the application, the returned vector may
2869 * contain stale information. Only locked pages are guaranteed to
2874 * -EFAULT - vec points to an illegal address
2875 * -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE
2876 * -ENOMEM - Addresses in the range [addr, addr + len] are
2877 * invalid for the address space of this process, or
2878 * specify one or more pages which are not currently
2880 * -EAGAIN - A kernel resource was temporarily unavailable.
2882 asmlinkage long sys_mincore(unsigned long start, size_t len, unsigned char *vec)
2885 unsigned long pages;
2888 /* Check the start address: needs to be page-aligned.. */
2889 if (start & ~PAGE_CACHE_MASK)
2892 /* ..and we need to be passed a valid user-space range */
2893 if (!access_ok(VERIFY_READ, (void *) start, len))
2896 /* This also avoids any overflows on PAGE_CACHE_ALIGN */
2897 pages = len >> PAGE_SHIFT;
2898 pages += (len & ~PAGE_MASK) != 0;
2900 if (!access_ok(VERIFY_WRITE, vec, pages))
2903 tmp = (void *) __get_free_page(GFP_USER);
2910 * Do at most PAGE_SIZE entries per iteration, due to
2911 * the temporary buffer size.
2913 down_read(¤t->mm->mmap_sem);
2914 retval = do_mincore(start, tmp, min(pages, PAGE_SIZE));
2915 up_read(¤t->mm->mmap_sem);
2919 if (copy_to_user(vec, tmp, retval)) {
2925 start += retval << PAGE_SHIFT;
2928 free_page((unsigned long) tmp);
2933 struct page *__read_cache_page(struct address_space *mapping,
2934 unsigned long index,
2935 int (*filler)(void *,struct page*),
2938 struct page **hash = page_hash(mapping, index);
2939 struct page *page, *cached_page = NULL;
2942 page = __find_get_page(mapping, index, hash);
2945 cached_page = page_cache_alloc(mapping);
2947 return ERR_PTR(-ENOMEM);
2950 if (add_to_page_cache_unique(page, mapping, index, hash))
2953 err = filler(data, page);
2955 page_cache_release(page);
2956 page = ERR_PTR(err);
2960 page_cache_release(cached_page);
2965 * Read into the page cache. If a page already exists,
2966 * and Page_Uptodate() is not set, try to fill the page.
2968 struct page *read_cache_page(struct address_space *mapping,
2969 unsigned long index,
2970 int (*filler)(void *,struct page*),
2977 page = __read_cache_page(mapping, index, filler, data);
2980 mark_page_accessed(page);
2981 if (Page_Uptodate(page))
2985 if (!page->mapping) {
2987 page_cache_release(page);
2990 if (Page_Uptodate(page)) {
2994 err = filler(data, page);
2996 page_cache_release(page);
2997 page = ERR_PTR(err);
3003 static inline struct page * __grab_cache_page(struct address_space *mapping,
3004 unsigned long index, struct page **cached_page)
3006 struct page *page, **hash = page_hash(mapping, index);
3008 page = __find_lock_page(mapping, index, hash);
3010 if (!*cached_page) {
3011 *cached_page = page_cache_alloc(mapping);
3015 page = *cached_page;
3016 if (add_to_page_cache_unique(page, mapping, index, hash))
3018 *cached_page = NULL;
3023 inline void remove_suid(struct inode *inode)
3027 /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
3028 mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
3030 /* was any of the uid bits set? */
3031 mode &= inode->i_mode;
3032 if (mode && !capable(CAP_FSETID)) {
3033 inode->i_mode &= ~mode;
3034 mark_inode_dirty(inode);
3039 * precheck_file_write():
3040 * Check the conditions on a file descriptor prior to beginning a write
3041 * on it. Contains the common precheck code for both buffered and direct
3044 int precheck_file_write(struct file *file, struct inode *inode,
3045 size_t *count, loff_t *ppos)
3048 unsigned long limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
3055 err = file->f_error;
3061 /* FIXME: this is for backwards compatibility with 2.4 */
3062 if (!S_ISBLK(inode->i_mode) && (file->f_flags & O_APPEND))
3063 *ppos = pos = inode->i_size;
3066 * Check whether we've reached the file size limit.
3070 if (!S_ISBLK(inode->i_mode) && limit != RLIM_INFINITY) {
3072 send_sig(SIGXFSZ, current, 0);
3075 if (pos > 0xFFFFFFFFULL || *count > limit - (u32)pos) {
3076 /* send_sig(SIGXFSZ, current, 0); */
3077 *count = limit - (u32)pos;
3084 if ( pos + *count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
3085 if (pos >= MAX_NON_LFS) {
3086 send_sig(SIGXFSZ, current, 0);
3089 if (*count > MAX_NON_LFS - (u32)pos) {
3090 /* send_sig(SIGXFSZ, current, 0); */
3091 *count = MAX_NON_LFS - (u32)pos;
3096 * Are we about to exceed the fs block limit ?
3098 * If we have written data it becomes a short write
3099 * If we have exceeded without writing data we send
3100 * a signal and give them an EFBIG.
3102 * Linus frestrict idea will clean these up nicely..
3105 if (!S_ISBLK(inode->i_mode)) {
3106 if (pos >= inode->i_sb->s_maxbytes)
3108 if (*count || pos > inode->i_sb->s_maxbytes) {
3109 send_sig(SIGXFSZ, current, 0);
3113 /* zero-length writes at ->s_maxbytes are OK */
3116 if (pos + *count > inode->i_sb->s_maxbytes)
3117 *count = inode->i_sb->s_maxbytes - pos;
3119 if (is_read_only(inode->i_rdev)) {
3123 if (pos >= inode->i_size) {
3124 if (*count || pos > inode->i_size) {
3130 if (pos + *count > inode->i_size)
3131 *count = inode->i_size - pos;
3140 * Write to a file through the page cache.
3142 * We currently put everything into the page cache prior to writing it.
3143 * This is not a problem when writing full pages. With partial pages,
3144 * however, we first have to read the data into the cache, then
3145 * dirty the page, and finally schedule it for writing. Alternatively, we
3146 * could write-through just the portion of data that would go into that
3147 * page, but that would kill performance for applications that write data
3148 * line by line, and it's prone to race conditions.
3150 * Note that this routine doesn't try to keep track of dirty pages. Each
3151 * file system has to do this all by itself, unfortunately.
3155 do_generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
3157 struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
3158 struct inode *inode = mapping->host;
3160 struct page *page, *cached_page;
3170 err = precheck_file_write(file, inode, &count, &pos);
3171 if (err != 0 || count == 0)
3175 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
3176 mark_inode_dirty_sync(inode);
3179 unsigned long index, offset;
3184 * Try to find the page in the cache. If it isn't there,
3185 * allocate a free page.
3187 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
3188 index = pos >> PAGE_CACHE_SHIFT;
3189 bytes = PAGE_CACHE_SIZE - offset;
3194 * Bring in the user page that we will copy from _first_.
3195 * Otherwise there's a nasty deadlock on copying from the
3196 * same page as we're writing to, without it being marked
3199 { volatile unsigned char dummy;
3200 __get_user(dummy, buf);
3201 __get_user(dummy, buf+bytes-1);
3204 status = -ENOMEM; /* we'll assign it later anyway */
3205 page = __grab_cache_page(mapping, index, &cached_page);
3209 /* We have exclusive IO access to the page.. */
3210 if (!PageLocked(page)) {
3215 status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
3218 page_fault = __copy_from_user(kaddr+offset, buf, bytes);
3219 flush_dcache_page(page);
3220 status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
3234 /* Mark it unlocked again and drop the page.. */
3235 SetPageReferenced(page);
3237 page_cache_release(page);
3246 page_cache_release(cached_page);
3248 /* For now, when the user asks for O_SYNC, we'll actually
3249 * provide O_DSYNC. */
3251 if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
3252 status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA);
3255 err = written ? written : status;
3265 * If blocksize < pagesize, prepare_write() may have instantiated a
3266 * few blocks outside i_size. Trim these off again.
3270 page_cache_release(page);
3271 if (pos + bytes > inode->i_size)
3272 vmtruncate(inode, inode->i_size);
3277 do_generic_direct_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
3279 struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
3280 struct inode *inode = mapping->host;
3289 err = precheck_file_write(file, inode, &count, &pos);
3290 if (err != 0 || count == 0)
3293 if (!(file->f_flags & O_DIRECT))
3297 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
3298 mark_inode_dirty_sync(inode);
3300 written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos);
3302 loff_t end = pos + written;
3303 if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
3304 inode->i_size = end;
3305 mark_inode_dirty(inode);
3308 invalidate_inode_pages2(mapping);
3311 * Sync the fs metadata but not the minor inode changes and
3312 * of course not the data as we did direct DMA for the IO.
3314 if (written >= 0 && (file->f_flags & O_SYNC))
3315 status = generic_osync_inode(inode, OSYNC_METADATA);
3317 err = written ? written : status;
3322 static int do_odirect_fallback(struct file *file, struct inode *inode,
3323 const char *buf, size_t count, loff_t *ppos)
3328 down(&inode->i_sem);
3329 ret = do_generic_file_write(file, buf, count, ppos);
3331 err = do_fdatasync(file);
3340 generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
3342 struct inode *inode = file->f_dentry->d_inode->i_mapping->host;
3345 if ((ssize_t) count < 0)
3348 if (!access_ok(VERIFY_READ, buf, count))
3351 if (file->f_flags & O_DIRECT) {
3352 /* do_generic_direct_write may drop i_sem during the
3354 down_read(&inode->i_alloc_sem);
3355 down(&inode->i_sem);
3356 err = do_generic_direct_write(file, buf, count, ppos);
3358 up_read(&inode->i_alloc_sem);
3359 if (unlikely(err == -ENOTBLK))
3360 err = do_odirect_fallback(file, inode, buf, count, ppos);
3362 down(&inode->i_sem);
3363 err = do_generic_file_write(file, buf, count, ppos);
3370 void __init page_cache_init(unsigned long mempages)
3372 unsigned long htable_size, order;
3374 htable_size = mempages;
3375 htable_size *= sizeof(struct page *);
3376 for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
3380 unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
3383 while((tmp >>= 1UL) != 0UL)
3386 page_hash_table = (struct page **)
3387 __get_free_pages(GFP_ATOMIC, order);
3388 } while(page_hash_table == NULL && --order > 0);
3390 printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
3391 (1 << page_hash_bits), order, (PAGE_SIZE << order));
3392 if (!page_hash_table)
3393 panic("Failed to allocate page hash table\n");
3394 memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));