mm/filemap.c

   1 /*
   2  *      linux/mm/filemap.c
   3  *
   4  * Copyright (C) 1994-2006  Linus Torvalds
   5  */
   6
   7 /*
   8  * This file handles the generic file mmap semantics used by
   9  * most "normal" filesystems (but you don't /have/ to use this:
  10  * the NFS filesystem used to do this differently, for example)
  11  */
  12 #include <linux/module.h>
  13 #include <linux/slab.h>
  14 #include <linux/shm.h>
  15 #include <linux/mman.h>
  16 #include <linux/locks.h>
  17 #include <linux/pagemap.h>
  18 #include <linux/swap.h>
  19 #include <linux/smp_lock.h>
  20 #include <linux/blkdev.h>
  21 #include <linux/file.h>
  22 #include <linux/swapctl.h>
  23 #include <linux/init.h>
  24 #include <linux/mm.h>
  25 #include <linux/iobuf.h>
  26
  27 #include <asm/pgalloc.h>
  28 #include <asm/uaccess.h>
  29 #include <asm/mman.h>
  30
  31 #include <linux/highmem.h>
  32
  33 /*
  34  * Shared mappings implemented 30.11.1994. It's not fully working yet,
  35  * though.
  36  *
  37  * Shared mappings now work. 15.8.1995  Bruno.
  38  *
  39  * finished 'unifying' the page and buffer cache and SMP-threaded the
  40  * page-cache, 21.05.1999, Ingo Molnar <mingo@redhat.com>
  41  *
  42  * SMP-threaded pagemap-LRU 1999, Andrea Arcangeli <andrea@suse.de>
  43  */
  44
/*
 * Global page-cache state.  None of these three is touched by name in
 * this part of the file.
 * NOTE(review): presumably page_cache_size is maintained through
 * inc_nr_cache_pages()/dec_nr_cache_pages() and the hash table is reached
 * through page_hash() -- confirm against the headers.
 */
unsigned long page_cache_size;
unsigned int page_hash_bits;
struct page **page_hash_table;

/*
 * Read-ahead limits, exported so that modules can see them.
 * NOTE(review): the unit (pages?) and the users of these values are not
 * visible in this part of the file -- confirm before relying on them.
 */
int vm_max_readahead = 31;
int vm_min_readahead = 3;
EXPORT_SYMBOL(vm_max_readahead);
EXPORT_SYMBOL(vm_min_readahead);
  53
  54
/*
 * Storage for the lock the code below takes as "pagecache_lock".
 * NOTE(review): presumably pagecache_lock is a header macro that names
 * the spinlock inside this wrapper -- confirm.
 */
spinlock_cacheline_t pagecache_lock_cacheline  = {SPIN_LOCK_UNLOCKED};
/*
 * NOTE: to avoid deadlocking you must never acquire the pagemap_lru_lock
 *      with the pagecache_lock held.
 *
 * Ordering:
 *      swap_lock ->
 *              pagemap_lru_lock ->
 *                      pagecache_lock
 *
 * invalidate_inode_pages() is an example of the correct nesting: it takes
 * pagemap_lru_lock first and pagecache_lock second.
 */
spinlock_cacheline_t pagemap_lru_lock_cacheline = {SPIN_LOCK_UNLOCKED};
  66
/*
 * Read clustering helpers, both driven by the external page_cluster
 * shift: CLUSTER_PAGES is the number of pages in one cluster and
 * CLUSTER_OFFSET(x) rounds page index x down to the start of its cluster.
 */
#define CLUSTER_PAGES           (1 << page_cluster)
#define CLUSTER_OFFSET(x)       (((x) >> page_cluster) << page_cluster)
  69
  70 static void FASTCALL(add_page_to_hash_queue(struct page * page, struct page **p));
  71 static void fastcall add_page_to_hash_queue(struct page * page, struct page **p)
  72 {
  73         struct page *next = *p;
  74
  75         *p = page;
  76         page->next_hash = next;
  77         page->pprev_hash = p;
  78         if (next)
  79                 next->pprev_hash = &page->next_hash;
  80         if (page->buffers)
  81                 PAGE_BUG(page);
  82         inc_nr_cache_pages(page);
  83 }
  84
  85 static inline void add_page_to_inode_queue(struct address_space *mapping, struct page * page)
  86 {
  87         struct list_head *head = &mapping->clean_pages;
  88
  89         mapping->nrpages++;
  90         list_add(&page->list, head);
  91         page->mapping = mapping;
  92 }
  93
/*
 * Detach @page from the address_space that owns it.
 *
 * Gives the mapping's optional a_ops->removepage hook a chance to run,
 * unlinks the page from whichever of the mapping's page lists it is on,
 * clears page->mapping and drops the mapping's page count.  When the
 * count reaches zero the host inode is handed to refile_inode().
 *
 * The callers in this file reach this via __remove_inode_page() with
 * pagecache_lock held.
 */
static inline void remove_page_from_inode_queue(struct page * page)
{
        struct address_space * mapping = page->mapping;

        if (mapping->a_ops->removepage)
                mapping->a_ops->removepage(page);

        list_del(&page->list);
        page->mapping = NULL;
        /*
         * Write barrier between clearing page->mapping and updating
         * nrpages.  NOTE(review): presumably for the benefit of readers
         * that look at nrpages without the lock -- confirm.
         */
        wmb();
        mapping->nrpages--;
        if (!mapping->nrpages)
                refile_inode(mapping->host);
}
 108
 109 static inline void remove_page_from_hash_queue(struct page * page)
 110 {
 111         struct page *next = page->next_hash;
 112         struct page **pprev = page->pprev_hash;
 113
 114         if (next)
 115                 next->pprev_hash = pprev;
 116         *pprev = next;
 117         page->pprev_hash = NULL;
 118         dec_nr_cache_pages(page);
 119 }
 120
/*
 * Unlink a page from the page cache: take it off its mapping's page
 * list and out of the page hash.  This does not drop a reference or free
 * the page; the callers in this file do that themselves afterwards with
 * page_cache_release().
 *
 * Caller has to make sure the page is locked and that nobody else uses
 * it - or that usage is safe.  The callers in this file hold
 * pagecache_lock across the call.
 */
void __remove_inode_page(struct page *page)
{
        remove_page_from_inode_queue(page);
        remove_page_from_hash_queue(page);
}
 131
/*
 * Locked front end for __remove_inode_page(): takes pagecache_lock
 * around the unlink.  The page must already be locked by the caller;
 * an unlocked page is reported through PAGE_BUG().
 */
void remove_inode_page(struct page *page)
{
        if (!PageLocked(page))
                PAGE_BUG(page);

        spin_lock(&pagecache_lock);
        __remove_inode_page(page);
        spin_unlock(&pagecache_lock);
}
 141
 142 static inline int sync_page(struct page *page)
 143 {
 144         struct address_space *mapping = page->mapping;
 145
 146         if (mapping && mapping->a_ops && mapping->a_ops->sync_page)
 147                 return mapping->a_ops->sync_page(page);
 148         return 0;
 149 }
 150
 151 /*
 152  * Add a page to the dirty page list.
 153  */
 154 void fastcall set_page_dirty(struct page *page)
 155 {
 156         if (!test_and_set_bit(PG_dirty, &page->flags)) {
 157                 struct address_space *mapping = page->mapping;
 158
 159                 if (mapping) {
 160                         spin_lock(&pagecache_lock);
 161                         mapping = page->mapping;
 162                         if (mapping) {  /* may have been truncated */
 163                                 list_del(&page->list);
 164                                 list_add(&page->list, &mapping->dirty_pages);
 165                         }
 166                         spin_unlock(&pagecache_lock);
 167
 168                         if (mapping && mapping->host)
 169                                 mark_inode_dirty_pages(mapping->host);
 170                         if (block_dump)
 171                                 printk(KERN_DEBUG "%s: dirtied page\n", current->comm);
 172                 }
 173         }
 174 }
 175
 176 /**
 177  * invalidate_inode_pages - Invalidate all the unlocked pages of one inode
 178  * @inode: the inode which pages we want to invalidate
 179  *
 180  * This function only removes the unlocked pages, if you want to
 181  * remove all the pages of one inode, you must call truncate_inode_pages.
 182  */
 183
 184 void invalidate_inode_pages(struct inode * inode)
 185 {
 186         struct list_head *head, *curr;
 187         struct page * page;
 188
 189         head = &inode->i_mapping->clean_pages;
 190
 191         spin_lock(&pagemap_lru_lock);
 192         spin_lock(&pagecache_lock);
 193         curr = head->next;
 194
 195         while (curr != head) {
 196                 page = list_entry(curr, struct page, list);
 197                 curr = curr->next;
 198
 199                 /* We cannot invalidate something in dirty.. */
 200                 if (PageDirty(page))
 201                         continue;
 202
 203                 /* ..or locked */
 204                 if (TryLockPage(page))
 205                         continue;
 206
 207                 if (page->buffers && !try_to_free_buffers(page, 0))
 208                         goto unlock;
 209
 210                 if (page_count(page) != 1)
 211                         goto unlock;
 212
 213                 __lru_cache_del(page);
 214                 __remove_inode_page(page);
 215                 UnlockPage(page);
 216                 page_cache_release(page);
 217                 continue;
 218 unlock:
 219                 UnlockPage(page);
 220                 continue;
 221         }
 222
 223         spin_unlock(&pagecache_lock);
 224         spin_unlock(&pagemap_lru_lock);
 225 }
 226
 227 static int do_flushpage(struct page *page, unsigned long offset)
 228 {
 229         int (*flushpage) (struct page *, unsigned long);
 230         flushpage = page->mapping->a_ops->flushpage;
 231         if (flushpage)
 232                 return (*flushpage)(page, offset);
 233         return block_flushpage(page, offset);
 234 }
 235
/*
 * Handle the page that straddles the new end of file: clear it from
 * byte offset @partial up to the end of the page and, if the page has
 * buffers attached, flush them from the same offset via do_flushpage().
 */
static inline void truncate_partial_page(struct page *page, unsigned partial)
{
        memclear_highpage_flush(page, partial, PAGE_CACHE_SIZE-partial);
        if (page->buffers)
                do_flushpage(page, partial);
}
 242
/*
 * Throw a whole page out of the page cache.
 *
 * The page must be locked (remove_inode_page() checks this).  On return
 * the page is off its mapping and hash chain, its dirty and uptodate
 * bits are clear, and one reference has been dropped with
 * page_cache_release().
 */
static void truncate_complete_page(struct page *page)
{
        /* Leave it on the LRU if it gets converted into anonymous buffers */
        /* i.e. take it off the LRU when it has no buffers, or when
         * do_flushpage() returns non-zero. */
        if (!page->buffers || do_flushpage(page, 0))
                lru_cache_del(page);

        /*
         * We remove the page from the page cache _after_ we have
         * destroyed all buffer-cache references to it. Otherwise some
         * other process might think this inode page is not in the
         * page cache and creates a buffer-cache alias to it causing
         * all sorts of fun problems ...
         */
        ClearPageDirty(page);
        ClearPageUptodate(page);
        remove_inode_page(page);
        page_cache_release(page);
}
 261
/*
 * Truncate the pages on one mapping list.
 *
 * @head:    list to scan (walked from the tail towards the head)
 * @start:   first page index that lies entirely beyond the new size
 * @partial: in/out; while non-zero, the page at index @start - 1 is
 *           partially cleared from this byte offset, after which the
 *           value is reset to 0
 *
 * Called with pagecache_lock held and returns with it held, but the lock
 * is dropped and re-taken around every page that is processed.  Because
 * the list may change while the lock is dropped, no cursor is kept:
 * instead the list head itself is moved next to the current page, and
 * the scan restarts from head->prev.
 *
 * Returns non-zero if the lock was dropped at least once, in which case
 * the caller has to scan again.
 */
static int FASTCALL(truncate_list_pages(struct list_head *, unsigned long, unsigned *));
static int fastcall truncate_list_pages(struct list_head *head, unsigned long start, unsigned *partial)
{
        struct list_head *curr;
        struct page * page;
        int unlocked = 0;

 restart:
        curr = head->prev;
        while (curr != head) {
                unsigned long offset;

                page = list_entry(curr, struct page, list);
                offset = page->index;

                /* Is one of the pages to truncate? */
                if ((offset >= start) || (*partial && (offset + 1) == start)) {
                        int failed;

                        /* Pin the page so it survives dropping the lock. */
                        page_cache_get(page);
                        failed = TryLockPage(page);

                        /*
                         * Re-insert the list head adjacent to this page so
                         * that head->prev is the right place to resume:
                         * before the page if we got the page lock, after
                         * it (so we see this page again) if we did not.
                         */
                        list_del(head);
                        if (!failed)
                                /* Restart after this page */
                                list_add_tail(head, curr);
                        else
                                /* Restart on this page */
                                list_add(head, curr);

                        spin_unlock(&pagecache_lock);
                        unlocked = 1;

                        if (!failed) {
                                if (*partial && (offset + 1) == start) {
                                        truncate_partial_page(page, *partial);
                                        *partial = 0;
                                } else
                                        truncate_complete_page(page);

                                UnlockPage(page);
                        } else
                                /* Somebody else holds the page lock: wait. */
                                wait_on_page(page);

                        /* Drop the pin taken above. */
                        page_cache_release(page);

                        /* Give up the CPU if a reschedule is pending. */
                        if (current->need_resched) {
                                __set_current_state(TASK_RUNNING);
                                schedule();
                        }

                        spin_lock(&pagecache_lock);
                        goto restart;
                }
                curr = curr->prev;
        }
        return unlocked;
}
 320
 321
/**
 * truncate_inode_pages - truncate *all* the pages from an offset
 * @mapping: mapping to truncate
 * @lstart: offset from which to truncate
 *
 * Truncate the page cache at a set offset, removing the pages
 * that are beyond that offset (and zeroing out partial pages).
 * If any page is locked we wait for it to become unlocked.
 *
 * The clean, dirty and locked lists are rescanned until one complete
 * pass over all three finishes without pagecache_lock being dropped.
 */
void truncate_inode_pages(struct address_space * mapping, loff_t lstart)
{
        /* Index of the first page lying entirely at or beyond lstart. */
        unsigned long start = (lstart + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        /* Byte offset of lstart inside its page; 0 if page aligned. */
        unsigned partial = lstart & (PAGE_CACHE_SIZE - 1);
        int unlocked;

        spin_lock(&pagecache_lock);
        do {
                unlocked = truncate_list_pages(&mapping->clean_pages, start, &partial);
                unlocked |= truncate_list_pages(&mapping->dirty_pages, start, &partial);
                unlocked |= truncate_list_pages(&mapping->locked_pages, start, &partial);
        } while (unlocked);
        /* Traversed all three lists without dropping the lock */
        spin_unlock(&pagecache_lock);
}
 346
/*
 * Invalidate a single page on behalf of invalidate_list_pages2().
 *
 * Entered with pagecache_lock held and @page locked.  @curr is the
 * page's list entry and @head the list being scanned.
 *
 * Returns 1 if pagecache_lock was dropped; in that case an extra page
 * reference has been taken which the caller must release.  Returns 0 if
 * the lock is still held and no reference was taken.
 */
static inline int invalidate_this_page2(struct page * page,
                                        struct list_head * curr,
                                        struct list_head * head)
{
        int unlocked = 1;

        /*
         * The page is locked and we hold the pagecache_lock as well
         * so both page_count(page) and page->buffers stays constant here.
         */
        /* Only the page cache (plus the buffers, if any) holds the page:
         * it can be removed outright. */
        if (page_count(page) == 1 + !!page->buffers) {
                /* Restart after this page */
                list_del(head);
                list_add_tail(head, curr);

                page_cache_get(page);
                spin_unlock(&pagecache_lock);
                truncate_complete_page(page);
        } else {
                /* Somebody else still holds a reference: the page stays
                 * in the cache, but its contents are invalidated. */
                if (page->buffers) {
                        /* Restart after this page */
                        list_del(head);
                        list_add_tail(head, curr);

                        page_cache_get(page);
                        spin_unlock(&pagecache_lock);
                        block_invalidate_page(page);
                } else
                        unlocked = 0;

                ClearPageDirty(page);
                ClearPageUptodate(page);
        }

        return unlocked;
}
 383
/*
 * Run invalidate_this_page2() over every page of one mapping list,
 * walking from the tail towards the head.
 *
 * Called with pagecache_lock held and returns with it held.  Whenever
 * the lock has to be dropped for a page, the list head is first moved
 * next to that page and the scan restarts from head->prev afterwards.
 * Pages whose lock cannot be taken immediately are waited for and then
 * visited again.
 *
 * Returns non-zero if the lock was dropped at least once, in which case
 * the caller has to scan again.
 */
static int FASTCALL(invalidate_list_pages2(struct list_head *));
static int fastcall invalidate_list_pages2(struct list_head *head)
{
        struct list_head *curr;
        struct page * page;
        int unlocked = 0;

 restart:
        curr = head->prev;
        while (curr != head) {
                page = list_entry(curr, struct page, list);

                if (!TryLockPage(page)) {
                        int __unlocked;

                        __unlocked = invalidate_this_page2(page, curr, head);
                        UnlockPage(page);
                        unlocked |= __unlocked;
                        /* Lock never dropped: just step to the next page. */
                        if (!__unlocked) {
                                curr = curr->prev;
                                continue;
                        }
                } else {
                        /* Restart on this page */
                        list_del(head);
                        list_add(head, curr);

                        page_cache_get(page);
                        spin_unlock(&pagecache_lock);
                        unlocked = 1;
                        wait_on_page(page);
                }

                /*
                 * Only reached with pagecache_lock dropped.  Releases the
                 * reference taken either just above or inside
                 * invalidate_this_page2().
                 */
                page_cache_release(page);
                if (current->need_resched) {
                        __set_current_state(TASK_RUNNING);
                        schedule();
                }

                spin_lock(&pagecache_lock);
                goto restart;
        }
        return unlocked;
}
 428
/**
 * invalidate_inode_pages2 - invalidate every page of a mapping
 * @mapping: the address_space whose pages we want to invalidate
 *
 * Pages that nobody else references are removed from the page cache.
 * Pages that cannot be freed because they are still in use (e.g. mapped)
 * stay in the cache but have their dirty and uptodate bits cleared.
 *
 * The clean, dirty and locked lists are rescanned until one complete
 * pass over all three finishes without pagecache_lock being dropped.
 */
void invalidate_inode_pages2(struct address_space * mapping)
{
        int unlocked;

        spin_lock(&pagecache_lock);
        do {
                unlocked = invalidate_list_pages2(&mapping->clean_pages);
                unlocked |= invalidate_list_pages2(&mapping->dirty_pages);
                unlocked |= invalidate_list_pages2(&mapping->locked_pages);
        } while (unlocked);
        spin_unlock(&pagecache_lock);
}
 446
 447 static inline struct page * __find_page_nolock(struct address_space *mapping, unsigned long offset, struct page *page)
 448 {
 449         goto inside;
 450
 451         for (;;) {
 452                 page = page->next_hash;
 453 inside:
 454                 if (!page)
 455                         goto not_found;
 456                 if (page->mapping != mapping)
 457                         continue;
 458                 if (page->index == offset)
 459                         break;
 460         }
 461
 462 not_found:
 463         return page;
 464 }
 465
/*
 * Apply @fn to every page on list @head that has buffers attached and
 * whose index lies in the range [@start, @end).
 *
 * pagecache_lock is taken here and dropped around each call to @fn, with
 * the page pinned and locked for the duration of the call.
 *
 * Returns the bitwise OR of all @fn return values (0 if @fn was never
 * called).
 */
static int do_buffer_fdatasync(struct list_head *head, unsigned long start, unsigned long end, int (*fn)(struct page *))
{
        struct list_head *curr;
        struct page *page;
        int retval = 0;

        spin_lock(&pagecache_lock);
        curr = head->next;
        while (curr != head) {
                page = list_entry(curr, struct page, list);
                curr = curr->next;
                if (!page->buffers)
                        continue;
                if (page->index >= end)
                        continue;
                if (page->index < start)
                        continue;

                /* Pin the page so it cannot go away while we sleep. */
                page_cache_get(page);
                spin_unlock(&pagecache_lock);
                lock_page(page);

                /* The buffers could have been free'd while we waited for the page lock */
                if (page->buffers)
                        retval |= fn(page);

                UnlockPage(page);
                spin_lock(&pagecache_lock);
                /*
                 * The successor is read again now that the lock is held,
                 * replacing the value sampled before the lock was dropped.
                 * The pin is released only after that read.
                 */
                curr = page->list.next;
                page_cache_release(page);
        }
        spin_unlock(&pagecache_lock);

        return retval;
}
 501
/*
 * Two-stage data sync: first start the IO, then go back and
 * collect the information..
 *
 * Covers pages with index in [@start_idx, @end_idx) on all three lists of
 * the inode's mapping.  Returns the bitwise OR of the per-page results.
 */
int generic_buffer_fdatasync(struct inode *inode, unsigned long start_idx, unsigned long end_idx)
{
        int retval;

        /* writeout dirty buffers on pages from the dirty, clean and locked lists */
        retval = do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, writeout_one_page);
        retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, writeout_one_page);
        retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, writeout_one_page);

        /* now wait for locked buffers on pages from the same three lists */
        retval |= do_buffer_fdatasync(&inode->i_mapping->dirty_pages, start_idx, end_idx, waitfor_one_page);
        retval |= do_buffer_fdatasync(&inode->i_mapping->clean_pages, start_idx, end_idx, waitfor_one_page);
        retval |= do_buffer_fdatasync(&inode->i_mapping->locked_pages, start_idx, end_idx, waitfor_one_page);

        return retval;
}
 522
/*
 * In-memory filesystems have to fail their
 * writepage function - and this has to be
 * worked around in the VM layer..
 *
 * We
 *  - mark the page dirty again (but do NOT
 *    add it back to the inode dirty list, as
 *    that would livelock in fdatasync)
 *  - activate the page so that the page stealer
 *    doesn't try to write it out over and over
 *    again.
 *
 * The page is unlocked before returning.  Always returns 0.
 */
int fail_writepage(struct page *page)
{
        /* Only activate on memory-pressure, not fsync.. */
        if (PageLaunder(page)) {
                activate_page(page);
                SetPageReferenced(page);
        }

        /* Set the page dirty again, unlock */
        /* (SetPageDirty only sets the flag; unlike set_page_dirty() in
         * this file it does not move the page onto the dirty list.) */
        SetPageDirty(page);
        UnlockPage(page);
        return 0;
}

EXPORT_SYMBOL(fail_writepage);
 551
/**
 *      filemap_fdatawrite - walk the list of dirty pages of the given address space
 *      and writepage() each unlocked page (does not wait on locked pages).
 *
 *      @mapping: address space structure to write
 *
 *      Every page is moved from the dirty list to the locked list before
 *      anything else is done with it.  A page whose lock cannot be taken
 *      immediately is skipped; it stays on the locked list with its dirty
 *      bit untouched.  The page is not unlocked here after writepage(),
 *      so writepage() is expected to do that (as fail_writepage() does).
 *
 *      Returns 0, or the first non-zero value returned by writepage().
 */
int filemap_fdatawrite(struct address_space * mapping)
{
        int ret = 0;
        int (*writepage)(struct page *) = mapping->a_ops->writepage;

        spin_lock(&pagecache_lock);

        while (!list_empty(&mapping->dirty_pages)) {
                /* Always take the page at the tail of the dirty list. */
                struct page *page = list_entry(mapping->dirty_pages.prev, struct page, list);

                list_del(&page->list);
                list_add(&page->list, &mapping->locked_pages);

                if (!PageDirty(page))
                        continue;

                /* Pin the page across the unlocked section. */
                page_cache_get(page);
                spin_unlock(&pagecache_lock);

                if (!TryLockPage(page)) {
                        /* Re-check: the dirty bit was last tested before
                         * the page lock was taken. */
                        if (PageDirty(page)) {
                                int err;
                                ClearPageDirty(page);
                                err = writepage(page);
                                if (err && !ret)
                                        ret = err;
                        } else
                                UnlockPage(page);
                }
                page_cache_release(page);
                spin_lock(&pagecache_lock);
        }
        spin_unlock(&pagecache_lock);
        return ret;
}
 594
/**
 *      filemap_fdatasync - walk the list of dirty pages of the given address space
 *      and writepage() all of them.
 *
 *      @mapping: address space structure to write
 *
 *      Every page is moved from the dirty list to the locked list before
 *      anything else is done with it.  Unlike filemap_fdatawrite(), this
 *      uses lock_page() rather than TryLockPage(), so no dirty page is
 *      skipped.  The page is not unlocked here after writepage(), so
 *      writepage() is expected to do that (as fail_writepage() does).
 *
 *      Returns 0, or the first non-zero value returned by writepage().
 */
int filemap_fdatasync(struct address_space * mapping)
{
        int ret = 0;
        int (*writepage)(struct page *) = mapping->a_ops->writepage;

        spin_lock(&pagecache_lock);

        while (!list_empty(&mapping->dirty_pages)) {
                /* Always take the page at the tail of the dirty list. */
                struct page *page = list_entry(mapping->dirty_pages.prev, struct page, list);

                list_del(&page->list);
                list_add(&page->list, &mapping->locked_pages);

                if (!PageDirty(page))
                        continue;

                /* Pin the page across the unlocked section. */
                page_cache_get(page);
                spin_unlock(&pagecache_lock);

                lock_page(page);

                /* Re-check: the dirty bit was last tested before the
                 * page lock was taken. */
                if (PageDirty(page)) {
                        int err;
                        ClearPageDirty(page);
                        err = writepage(page);
                        if (err && !ret)
                                ret = err;
                } else
                        UnlockPage(page);

                page_cache_release(page);
                spin_lock(&pagecache_lock);
        }
        spin_unlock(&pagecache_lock);
        return ret;
}
 638
/**
 *      filemap_fdatawait - walk the list of locked pages of the given address space
 *      and wait for all of them.
 *
 *      @mapping: address space structure to wait for
 *
 *      Every page is moved from the locked list to the clean list; pages
 *      that are still locked at that point are then waited for.
 *
 *      Returns 0, or -EIO if any page that was waited for has its error
 *      flag set afterwards.
 */
int filemap_fdatawait(struct address_space * mapping)
{
        int ret = 0;

        spin_lock(&pagecache_lock);

        while (!list_empty(&mapping->locked_pages)) {
                struct page *page = list_entry(mapping->locked_pages.next, struct page, list);

                list_del(&page->list);
                list_add(&page->list, &mapping->clean_pages);

                if (!PageLocked(page))
                        continue;

                /* Pin the page so it cannot go away while we sleep. */
                page_cache_get(page);
                spin_unlock(&pagecache_lock);

                ___wait_on_page(page);
                if (PageError(page))
                        ret = -EIO;

                page_cache_release(page);
                spin_lock(&pagecache_lock);
        }
        spin_unlock(&pagecache_lock);
        return ret;
}
 674
/*
 * Add a page to the inode page cache.
 *
 * The caller must have locked the page and
 * set all the page flags correctly..
 *
 * An unlocked page triggers BUG().  Unlike __add_to_page_cache(), no
 * page flags are modified here.  The page cache takes its own reference
 * on the page, and the page is put on the LRU after pagecache_lock has
 * been released.
 */
void add_to_page_cache_locked(struct page * page, struct address_space *mapping, unsigned long index)
{
        if (!PageLocked(page))
                BUG();

        page->index = index;
        page_cache_get(page);
        spin_lock(&pagecache_lock);
        add_page_to_inode_queue(mapping, page);
        add_page_to_hash_queue(page, page_hash(mapping, index));
        spin_unlock(&pagecache_lock);

        lru_cache_add(page);
}
 695
/*
 * This adds a page to the page cache, starting out as locked,
 * owned by us, but unreferenced, not uptodate and with no errors.
 *
 * @hash is the head of the hash chain the page belongs on.  The page
 * cache takes its own reference on the page.  Both callers in this file
 * hold pagecache_lock around the call; the page is not put on the LRU
 * here.
 */
static inline void __add_to_page_cache(struct page * page,
        struct address_space *mapping, unsigned long offset,
        struct page **hash)
{
        /*
         * Yes this is inefficient, however it is needed.  The problem
         * is that we could be adding a page to the swap cache while
         * another CPU is also modifying page->flags, so the updates
         * really do need to be atomic.  -- Rik
         */
        ClearPageUptodate(page);
        ClearPageError(page);
        ClearPageDirty(page);
        ClearPageReferenced(page);
        ClearPageArch1(page);
        ClearPageChecked(page);
        LockPage(page);
        page_cache_get(page);
        page->index = offset;
        add_page_to_inode_queue(mapping, page);
        add_page_to_hash_queue(page, hash);
}
 722
/*
 * Insert @page into the page cache of @mapping at index @offset without
 * checking whether a page already exists there.  The flag reset, locking
 * and reference counting are done by __add_to_page_cache() under
 * pagecache_lock; the page is put on the LRU after the lock is released.
 */
void add_to_page_cache(struct page * page, struct address_space * mapping, unsigned long offset)
{
        spin_lock(&pagecache_lock);
        __add_to_page_cache(page, mapping, offset, page_hash(mapping, offset));
        spin_unlock(&pagecache_lock);
        lru_cache_add(page);
}
 730
 731 int add_to_page_cache_unique(struct page * page,
 732         struct address_space *mapping, unsigned long offset,
 733         struct page **hash)
 734 {
 735         int err;
 736         struct page *alias;
 737
 738         spin_lock(&pagecache_lock);
 739         alias = __find_page_nolock(mapping, offset, *hash);
 740
 741         err = 1;
 742         if (!alias) {
 743                 __add_to_page_cache(page,mapping,offset,hash);
 744                 err = 0;
 745         }
 746
 747         spin_unlock(&pagecache_lock);
 748         if (!err)
 749                 lru_cache_add(page);
 750         return err;
 751 }
 752
 753 /*
 754  * This adds the requested page to the page cache if it isn't already there,
 755  * and schedules an I/O to read in its contents from disk.
 756  */
 757 static int FASTCALL(page_cache_read(struct file * file, unsigned long offset));
 758 static int fastcall page_cache_read(struct file * file, unsigned long offset)
 759 {
 760         struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
 761         struct page **hash = page_hash(mapping, offset);
 762         struct page *page;
 763
 764         spin_lock(&pagecache_lock);
 765         page = __find_page_nolock(mapping, offset, *hash);
 766         spin_unlock(&pagecache_lock);
 767         if (page)
 768                 return 0;
 769
 770         page = page_cache_alloc(mapping);
 771         if (!page)
 772                 return -ENOMEM;
 773
 774         if (!add_to_page_cache_unique(page, mapping, offset, hash)) {
 775                 int error = mapping->a_ops->readpage(file, page);
 776                 page_cache_release(page);
 777                 return error;
 778         }
 779         /*
 780          * We arrive here in the unlikely event that someone
 781          * raced with us and added our page to the cache first.
 782          */
 783         page_cache_release(page);
 784         return 0;
 785 }
 786
 787 /*
 788  * Read in an entire cluster at once.  A cluster is usually a 64k-
 789  * aligned block that includes the page requested in "offset."
 790  */
 791 static int FASTCALL(read_cluster_nonblocking(struct file * file, unsigned long offset,
 792                                              unsigned long filesize));
 793 static int fastcall read_cluster_nonblocking(struct file * file, unsigned long offset,
 794         unsigned long filesize)
 795 {
 796         unsigned long pages = CLUSTER_PAGES;
 797
 798         offset = CLUSTER_OFFSET(offset);
 799         while ((pages-- > 0) && (offset < filesize)) {
 800                 int error = page_cache_read(file, offset);
 801                 if (error < 0)
 802                         return error;
 803                 offset ++;
 804         }
 805
 806         return 0;
 807 }
 808
/*
 * Knuth recommends primes in approximately golden ratio to the maximum
 * integer representable by a machine word for multiplicative hashing.
 * Chuck Lever verified the effectiveness of this technique:
 * http://www.citi.umich.edu/techreports/reports/citi-tr-00-1.pdf
 *
 * These primes are chosen to be bit-sparse, that is operations on
 * them can use shifts and additions instead of multiplications for
 * machines where multiplications are slow.
 *
 * page_waitqueue() uses the constant directly on 32-bit builds; on
 * 64-bit builds it open-codes the same multiplication as the shift/add
 * sequence spelled out in the comment next to the 64-bit value.
 */
#if BITS_PER_LONG == 32
/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */
#define GOLDEN_RATIO_PRIME 0x9e370001UL
#elif BITS_PER_LONG == 64
/*  2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */
#define GOLDEN_RATIO_PRIME 0x9e37fffffffc0001UL
#else
#error Define GOLDEN_RATIO_PRIME for your wordsize.
#endif
 828
/*
 * In order to wait for pages to become available there must be
 * waitqueues associated with pages. By using a hash table of
 * waitqueues where the bucket discipline is to maintain all
 * waiters on the same queue and wake all when any of the pages
 * become available, and for the woken contexts to check to be
 * sure the appropriate page became available, this saves space
 * at a cost of "thundering herd" phenomena during rare hash
 * collisions.
 *
 * Returns the wait queue head for @page: the address of the struct page
 * is hashed multiplicatively and used to index the wait_table of the
 * page's zone.
 */
static inline wait_queue_head_t *page_waitqueue(struct page *page)
{
        const zone_t *zone = page_zone(page);
        wait_queue_head_t *wait = zone->wait_table;
        unsigned long hash = (unsigned long)page;

#if BITS_PER_LONG == 64
        /*  Sigh, gcc can't optimise this alone like it does for 32 bits. */
        /*
         * Open-coded hash *= GOLDEN_RATIO_PRIME: n steps through
         * hash << 18, 51, 54, 57, 61 and 63, giving
         * hash * (1 - 2^18 - 2^51 + 2^54 - 2^57 + 2^61 + 2^63).
         */
        unsigned long n = hash;
        n <<= 18;
        hash -= n;
        n <<= 33;
        hash -= n;
        n <<= 3;
        hash += n;
        n <<= 3;
        hash -= n;
        n <<= 4;
        hash += n;
        n <<= 2;
        hash += n;
#else
        /* On some cpus multiply is faster, on others gcc will do shifts */
        hash *= GOLDEN_RATIO_PRIME;
#endif
        /* Keep only the high-order bits of the product as table index. */
        hash >>= zone->wait_table_shift;

        return &wait[hash];
}
 868
 869 /*
 870  * This must be called after every submit_bh with end_io
 871  * callbacks that would result into the blkdev layer waking
 872  * up the page after a queue unplug.
 873  */
 874 void fastcall wakeup_page_waiters(struct page * page)
 875 {
 876         wait_queue_head_t * head;
 877
 878         head = page_waitqueue(page);
 879         if (waitqueue_active(head))
 880                 wake_up(head);
 881 }
 882
/*
 * Wait for a page to get unlocked.
 *
 * This must be called with the caller "holding" the page,
 * ie with increased "page->count" so that the page won't
 * go away during the wait..
 *
 * The waiting strategy is to get on a waitqueue determined
 * by hashing. Waiters will then collide, and the newly woken
 * task must then determine whether it was woken for the page
 * it really wanted, and go back to sleep on the waitqueue if
 * that wasn't it. The task stays on the waitqueue for the whole
 * loop and only removes itself once its own page is unlocked, so
 * the loop moves forward one iteration every time there is
 * (1) a collision
 * and
 * (2) one of the colliding pages is woken
 *
 * This is the thundering herd problem, but it is expected to
 * be very rare due to the few pages that are actually being
 * waited on at any given time and the quality of the hash function.
 */
void ___wait_on_page(struct page *page)
{
        wait_queue_head_t *waitqueue = page_waitqueue(page);
        struct task_struct *tsk = current;
        DECLARE_WAITQUEUE(wait, tsk);

        add_wait_queue(waitqueue, &wait);
        do {
                /* Set the sleeping state before testing the lock bit. */
                set_task_state(tsk, TASK_UNINTERRUPTIBLE);
                if (!PageLocked(page))
                        break;
                /* Let the mapping's sync_page hook (if any) run first. */
                sync_page(page);
                schedule();
        } while (PageLocked(page));
        __set_task_state(tsk, TASK_RUNNING);
        remove_wait_queue(waitqueue, &wait);
}
 922
 923 /*
 924  * unlock_page() is the other half of the story just above
 925  * __wait_on_page(). Here a couple of quick checks are done
 926  * and a couple of flags are set on the page, and then all
 927  * of the waiters for all of the pages in the appropriate
 928  * wait queue are woken.
 929  */
 930 void fastcall unlock_page(struct page *page)
 931 {
 932         wait_queue_head_t *waitqueue = page_waitqueue(page);
 933         ClearPageLaunder(page);
 934         smp_mb__before_clear_bit();
 935         if (!test_and_clear_bit(PG_locked, &(page)->flags))
 936                 BUG();
 937         smp_mb__after_clear_bit();
 938
 939         /*
 940          * Although the default semantics of wake_up() are
 941          * to wake all, here the specific function is used
 942          * to make it even more explicit that a number of
 943          * pages are being waited on here.
 944          */
 945         if (waitqueue_active(waitqueue))
 946                 wake_up_all(waitqueue);
 947 }
 948
 949 /*
 950  * Get a lock on the page, assuming we need to sleep
 951  * to get it..
 952  */
 953 static void __lock_page(struct page *page)
 954 {
 955         wait_queue_head_t *waitqueue = page_waitqueue(page);
 956         struct task_struct *tsk = current;
 957         DECLARE_WAITQUEUE(wait, tsk);
 958
 959         add_wait_queue_exclusive(waitqueue, &wait);
 960         for (;;) {
 961                 set_task_state(tsk, TASK_UNINTERRUPTIBLE);
 962                 if (PageLocked(page)) {
 963                         sync_page(page);
 964                         schedule();
 965                 }
 966                 if (!TryLockPage(page))
 967                         break;
 968         }
 969         __set_task_state(tsk, TASK_RUNNING);
 970         remove_wait_queue(waitqueue, &wait);
 971 }
 972
 973 /*
 974  * Get an exclusive lock on the page, optimistically
 975  * assuming it's not locked..
 976  */
 977 void fastcall lock_page(struct page *page)
 978 {
 979         if (TryLockPage(page))
 980                 __lock_page(page);
 981 }
 982
 983 /*
 984  * a rather lightweight function, finding and getting a reference to a
 985  * hashed page atomically.
 986  */
 987 struct page * __find_get_page(struct address_space *mapping,
 988                               unsigned long offset, struct page **hash)
 989 {
 990         struct page *page;
 991
 992         /*
 993          * We scan the hash list read-only. Addition to and removal from
 994          * the hash-list needs a held write-lock.
 995          */
 996         spin_lock(&pagecache_lock);
 997         page = __find_page_nolock(mapping, offset, *hash);
 998         if (page)
 999                 page_cache_get(page);
1000         spin_unlock(&pagecache_lock);
1001         return page;
1002 }
1003
1004 /*
1005  * Same as above, but trylock it instead of incrementing the count.
1006  */
1007 struct page *find_trylock_page(struct address_space *mapping, unsigned long offset)
1008 {
1009         struct page *page;
1010         struct page **hash = page_hash(mapping, offset);
1011
1012         spin_lock(&pagecache_lock);
1013         page = __find_page_nolock(mapping, offset, *hash);
1014         if (page) {
1015                 if (TryLockPage(page))
1016                         page = NULL;
1017         }
1018         spin_unlock(&pagecache_lock);
1019         return page;
1020 }
1021
1022 /*
1023  * Must be called with the pagecache lock held,
1024  * will return with it held (but it may be dropped
1025  * during blocking operations..
1026  */
1027 static struct page * FASTCALL(__find_lock_page_helper(struct address_space *, unsigned long, struct page *));
1028 static struct page * fastcall __find_lock_page_helper(struct address_space *mapping,
1029                                         unsigned long offset, struct page *hash)
1030 {
1031         struct page *page;
1032
1033         /*
1034          * We scan the hash list read-only. Addition to and removal from
1035          * the hash-list needs a held write-lock.
1036          */
1037 repeat:
1038         page = __find_page_nolock(mapping, offset, hash);
1039         if (page) {
1040                 page_cache_get(page);
1041                 if (TryLockPage(page)) {
1042                         spin_unlock(&pagecache_lock);
1043                         lock_page(page);
1044                         spin_lock(&pagecache_lock);
1045
1046                         /* Has the page been re-allocated while we slept? */
1047                         if (page->mapping != mapping || page->index != offset) {
1048                                 UnlockPage(page);
1049                                 page_cache_release(page);
1050                                 goto repeat;
1051                         }
1052                 }
1053         }
1054         return page;
1055 }
1056
1057 /*
1058  * Same as the above, but lock the page too, verifying that
1059  * it's still valid once we own it.
1060  */
1061 struct page * __find_lock_page (struct address_space *mapping,
1062                                 unsigned long offset, struct page **hash)
1063 {
1064         struct page *page;
1065
1066         spin_lock(&pagecache_lock);
1067         page = __find_lock_page_helper(mapping, offset, *hash);
1068         spin_unlock(&pagecache_lock);
1069         return page;
1070 }
1071
1072 /*
1073  * Same as above, but create the page if required..
1074  */
1075 struct page * find_or_create_page(struct address_space *mapping, unsigned long index, unsigned int gfp_mask)
1076 {
1077         struct page *page;
1078         struct page **hash = page_hash(mapping, index);
1079
1080         spin_lock(&pagecache_lock);
1081         page = __find_lock_page_helper(mapping, index, *hash);
1082         spin_unlock(&pagecache_lock);
1083         if (!page) {
1084                 struct page *newpage = alloc_page(gfp_mask);
1085                 if (newpage) {
1086                         spin_lock(&pagecache_lock);
1087                         page = __find_lock_page_helper(mapping, index, *hash);
1088                         if (likely(!page)) {
1089                                 page = newpage;
1090                                 __add_to_page_cache(page, mapping, index, hash);
1091                                 newpage = NULL;
1092                         }
1093                         spin_unlock(&pagecache_lock);
1094                         if (newpage == NULL)
1095                                 lru_cache_add(page);
1096                         else
1097                                 page_cache_release(newpage);
1098                 }
1099         }
1100         return page;
1101 }
1102
1103 /*
1104  * Same as grab_cache_page, but do not wait if the page is unavailable.
1105  * This is intended for speculative data generators, where the data can
1106  * be regenerated if the page couldn't be grabbed.  This routine should
1107  * be safe to call while holding the lock for another page.
1108  */
1109 struct page *grab_cache_page_nowait(struct address_space *mapping, unsigned long index)
1110 {
1111         struct page *page, **hash;
1112
1113         hash = page_hash(mapping, index);
1114         page = __find_get_page(mapping, index, hash);
1115
1116         if ( page ) {
1117                 if ( !TryLockPage(page) ) {
1118                         /* Page found and locked */
1119                         /* This test is overly paranoid, but what the heck... */
1120                         if ( unlikely(page->mapping != mapping || page->index != index) ) {
1121                                 /* Someone reallocated this page under us. */
1122                                 UnlockPage(page);
1123                                 page_cache_release(page);
1124                                 return NULL;
1125                         } else {
1126                                 return page;
1127                         }
1128                 } else {
1129                         /* Page locked by someone else */
1130                         page_cache_release(page);
1131                         return NULL;
1132                 }
1133         }
1134
1135         page = page_cache_alloc(mapping);
1136         if ( unlikely(!page) )
1137                 return NULL;    /* Failed to allocate a page */
1138
1139         if ( unlikely(add_to_page_cache_unique(page, mapping, index, hash)) ) {
1140                 /* Someone else grabbed the page already. */
1141                 page_cache_release(page);
1142                 return NULL;
1143         }
1144
1145         return page;
1146 }
1147
1148 #if 0
1149 #define PROFILE_READAHEAD
1150 #define DEBUG_READAHEAD
1151 #endif
1152
1153 /*
1154  * Read-ahead profiling information
1155  * --------------------------------
1156  * Every PROFILE_MAXREADCOUNT, the following information is written
1157  * to the syslog:
1158  *   Percentage of asynchronous read-ahead.
1159  *   Average of read-ahead fields context value.
1160  * If DEBUG_READAHEAD is defined, a snapshot of these fields is written
1161  * to the syslog.
1162  */
1163
#ifdef PROFILE_READAHEAD

/* Number of read-ahead samples accumulated before the averages are printed. */
#define PROFILE_MAXREADCOUNT 1000

/* Running totals since the last report. */
static unsigned long total_reada;       /* read-ahead operations profiled */
static unsigned long total_async;       /* ... of which were asynchronous */
static unsigned long total_ramax;       /* sum of filp->f_ramax */
static unsigned long total_ralen;       /* sum of filp->f_ralen */
static unsigned long total_rawin;       /* sum of filp->f_rawin */

/*
 * Account one read-ahead operation on filp; "async" is non-zero when
 * it was an asynchronous one. Once more than PROFILE_MAXREADCOUNT
 * operations have been accumulated, the averages are written to the
 * syslog and the totals are reset.
 */
static void profile_readahead(int async, struct file *filp)
{
        unsigned long flags;

        ++total_reada;
        if (async)
                ++total_async;

        total_ramax     += filp->f_ramax;
        total_ralen     += filp->f_ralen;
        total_rawin     += filp->f_rawin;

        if (total_reada > PROFILE_MAXREADCOUNT) {
                save_flags(flags);
                cli();
                /* Re-check with interrupts off: the totals may have been reset meanwhile. */
                if (!(total_reada > PROFILE_MAXREADCOUNT)) {
                        restore_flags(flags);
                        return;
                }

                /* The totals are unsigned long, so print them with %lu. */
                printk("Readahead average:  max=%lu, len=%lu, win=%lu, async=%lu%%\n",
                        total_ramax/total_reada,
                        total_ralen/total_reada,
                        total_rawin/total_reada,
                        (total_async*100)/total_reada);
#ifdef DEBUG_READAHEAD
                /*
                 * NOTE(review): the types of the f_ra* fields are not
                 * visible here - confirm that these format specifiers
                 * match their declarations in struct file.
                 */
                printk("Readahead snapshot: max=%ld, len=%ld, win=%ld, raend=%Ld\n",
                        filp->f_ramax, filp->f_ralen, filp->f_rawin, filp->f_raend);
#endif

                total_reada     = 0;
                total_async     = 0;
                total_ramax     = 0;
                total_ralen     = 0;
                total_rawin     = 0;

                restore_flags(flags);
        }
}
#endif  /* defined PROFILE_READAHEAD */
1214
1215 /*
1216  * Read-ahead context:
1217  * -------------------
1218  * The read ahead context fields of the "struct file" are the following:
1219  * - f_raend : position of the first byte after the last page we tried to
1220  *             read ahead.
1221  * - f_ramax : current read-ahead maximum size.
1222  * - f_ralen : length of the current IO read block we tried to read-ahead.
1223  * - f_rawin : length of the current read-ahead window.
1224  *              if last read-ahead was synchronous then
1225  *                      f_rawin = f_ralen
1226  *              otherwise (was asynchronous)
1227  *                      f_rawin = previous value of f_ralen + f_ralen
1228  *
1229  * Read-ahead limits:
1230  * ------------------
1231  * MIN_READAHEAD   : minimum read-ahead size when read-ahead.
1232  * MAX_READAHEAD   : maximum read-ahead size when read-ahead.
1233  *
1234  * Synchronous read-ahead benefits:
1235  * --------------------------------
1236  * Using a reasonable IO transfer length for peripheral devices increases system
1237  * performance.
1238  * Reasonable means, in this context, not too large but not too small.
1239  * The actual maximum value is:
1240  *      MAX_READAHEAD + PAGE_CACHE_SIZE = 76k if CONFIG_READA_SMALL is undefined
1241  *      and 32K if defined (4K page size assumed).
1242  *
1243  * Asynchronous read-ahead benefits:
1244  * ---------------------------------
1245  * Overlapping the next read request with user process execution increases system
1246  * performance.
1247  *
1248  * Read-ahead risks:
1249  * -----------------
1250  * We have to guess which further data are needed by the user process.
1251  * If these data are often not really needed, it's bad for system
1252  * performance.
1253  * However, we know that files are often accessed sequentially by
1254  * application programs and it seems that it is possible to have some good
1255  * strategy in that guessing.
1256  * We only try to read ahead files that seem to be read sequentially.
1257  *
1258  * Asynchronous read-ahead risks:
1259  * ------------------------------
1260  * In order to maximize overlapping, we must start some asynchronous read
1261  * request from the device, as soon as possible.
1262  * We must be very careful about:
1263  * - The number of effective pending IO read requests.
1264  *   ONE seems to be the only reasonable value.
1265  * - The total memory pool usage for the file access stream.
1266  *   This maximum memory usage is implicitly 2 IO read chunks:
1267  *   2*(MAX_READAHEAD + PAGE_CACHE_SIZE) = 156K if CONFIG_READA_SMALL is undefined,
1268  *   64k if defined (4K page size assumed).
1269  */
1270
1271 static inline int get_max_readahead(struct inode * inode)
1272 {
1273         if (!inode->i_dev || !max_readahead[MAJOR(inode->i_dev)])
1274                 return vm_max_readahead;
1275         return max_readahead[MAJOR(inode->i_dev)][MINOR(inode->i_dev)];
1276 }
1277
1278 static void generic_file_readahead(int reada_ok,
1279         struct file * filp, struct inode * inode,
1280         struct page * page)
1281 {
1282         unsigned long end_index;
1283         unsigned long index = page->index;
1284         unsigned long max_ahead, ahead;
1285         unsigned long raend;
1286         int max_readahead = get_max_readahead(inode);
1287
1288         end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1289
1290         raend = filp->f_raend;
1291         max_ahead = 0;
1292
1293 /*
1294  * The current page is locked.
1295  * If the current position is inside the previous read IO request, do not
1296  * try to reread previously read ahead pages.
1297  * Otherwise decide or not to read ahead some pages synchronously.
1298  * If we are not going to read ahead, set the read ahead context for this
1299  * page only.
1300  */
1301         if (PageLocked(page)) {
1302                 if (!filp->f_ralen || index >= raend || index + filp->f_rawin < raend) {
1303                         raend = index;
1304                         if (raend < end_index)
1305                                 max_ahead = filp->f_ramax;
1306                         filp->f_rawin = 0;
1307                         filp->f_ralen = 1;
1308                         if (!max_ahead) {
1309                                 filp->f_raend  = index + filp->f_ralen;
1310                                 filp->f_rawin += filp->f_ralen;
1311                         }
1312                 }
1313         }
1314 /*
1315  * The current page is not locked.
1316  * If we were reading ahead and,
1317  * if the current max read ahead size is not zero and,
1318  * if the current position is inside the last read-ahead IO request,
1319  *   it is the moment to try to read ahead asynchronously.
1320  * We will later force unplug device in order to force asynchronous read IO.
1321  */
1322         else if (reada_ok && filp->f_ramax && raend >= 1 &&
1323                  index <= raend && index + filp->f_ralen >= raend) {
1324 /*
1325  * Add ONE page to max_ahead in order to try to have about the same IO max size
1326  * as synchronous read-ahead (MAX_READAHEAD + 1)*PAGE_CACHE_SIZE.
1327  * Compute the position of the last page we have tried to read in order to
1328  * begin to read ahead just at the next page.
1329  */
1330                 raend -= 1;
1331                 if (raend < end_index)
1332                         max_ahead = filp->f_ramax + 1;
1333
1334                 if (max_ahead) {
1335                         filp->f_rawin = filp->f_ralen;
1336                         filp->f_ralen = 0;
1337                         reada_ok      = 2;
1338                 }
1339         }
1340 /*
1341  * Try to read ahead pages.
1342  * We hope that ll_rw_blk() plug/unplug, coalescence, requests sort and the
1343  * scheduler, will work enough for us to avoid too bad actuals IO requests.
1344  */
1345         ahead = 0;
1346         while (ahead < max_ahead) {
1347                 unsigned long ra_index = raend + ahead + 1;
1348
1349                 if (ra_index >= end_index)
1350                         break;
1351                 if (page_cache_read(filp, ra_index) < 0)
1352                         break;
1353
1354                 ahead++;
1355         }
1356 /*
1357  * If we tried to read ahead some pages,
1358  * If we tried to read ahead asynchronously,
1359  *   Try to force unplug of the device in order to start an asynchronous
1360  *   read IO request.
1361  * Update the read-ahead context.
1362  * Store the length of the current read-ahead window.
1363  * Double the current max read ahead size.
1364  *   That heuristic avoid to do some large IO for files that are not really
1365  *   accessed sequentially.
1366  */
1367         if (ahead) {
1368                 filp->f_ralen += ahead;
1369                 filp->f_rawin += filp->f_ralen;
1370                 filp->f_raend = raend + ahead + 1;
1371
1372                 filp->f_ramax += filp->f_ramax;
1373
1374                 if (filp->f_ramax > max_readahead)
1375                         filp->f_ramax = max_readahead;
1376
1377 #ifdef PROFILE_READAHEAD
1378                 profile_readahead((reada_ok == 2), filp);
1379 #endif
1380         }
1381
1382         return;
1383 }
1384
1385 /*
1386  * Mark a page as having seen activity.
1387  *
1388  * If it was already so marked, move it to the active queue and drop
1389  * the referenced bit.  Otherwise, just mark it for future action..
1390  */
1391 void fastcall mark_page_accessed(struct page *page)
1392 {
1393         if (!PageActive(page) && PageReferenced(page)) {
1394                 activate_page(page);
1395                 ClearPageReferenced(page);
1396         } else
1397                 SetPageReferenced(page);
1398 }
1399
1400 /*
1401  * This is a generic file read routine, and uses the
1402  * inode->i_op->readpage() function for the actual low-level
1403  * stuff.
1404  *
1405  * This is really ugly. But the goto's actually try to clarify some
1406  * of the logic when it comes to error handling etc.
1407  */
1408 void do_generic_file_read(struct file * filp, loff_t *ppos, read_descriptor_t * desc, read_actor_t actor)
1409 {
1410         struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
1411         struct inode *inode = mapping->host;
1412         unsigned long index, offset;
1413         struct page *cached_page;
1414         int reada_ok;
1415         int error;
1416         int max_readahead = get_max_readahead(inode);
1417
1418         cached_page = NULL;
1419         index = *ppos >> PAGE_CACHE_SHIFT;
1420         offset = *ppos & ~PAGE_CACHE_MASK;
1421
1422 /*
1423  * If the current position is outside the previous read-ahead window,
1424  * we reset the current read-ahead context and set read ahead max to zero
1425  * (will be set to just needed value later),
1426  * otherwise, we assume that the file accesses are sequential enough to
1427  * continue read-ahead.
1428  */
1429         if (index > filp->f_raend || index + filp->f_rawin < filp->f_raend) {
1430                 reada_ok = 0;
1431                 filp->f_raend = 0;
1432                 filp->f_ralen = 0;
1433                 filp->f_ramax = 0;
1434                 filp->f_rawin = 0;
1435         } else {
1436                 reada_ok = 1;
1437         }
1438 /*
1439  * Adjust the current value of read-ahead max.
1440  * If the read operation stay in the first half page, force no readahead.
1441  * Otherwise try to increase read ahead max just enough to do the read request.
1442  * Then, at least MIN_READAHEAD if read ahead is ok,
1443  * and at most MAX_READAHEAD in all cases.
1444  */
1445         if (!index && offset + desc->count <= (PAGE_CACHE_SIZE >> 1)) {
1446                 filp->f_ramax = 0;
1447         } else {
1448                 unsigned long needed;
1449
1450                 needed = ((offset + desc->count) >> PAGE_CACHE_SHIFT) + 1;
1451
1452                 if (filp->f_ramax < needed)
1453                         filp->f_ramax = needed;
1454
1455                 if (reada_ok && filp->f_ramax < vm_min_readahead)
1456                                 filp->f_ramax = vm_min_readahead;
1457                 if (filp->f_ramax > max_readahead)
1458                         filp->f_ramax = max_readahead;
1459         }
1460
1461         for (;;) {
1462                 struct page *page, **hash;
1463                 unsigned long end_index, nr, ret;
1464
1465                 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1466
1467                 if (index > end_index)
1468                         break;
1469                 nr = PAGE_CACHE_SIZE;
1470                 if (index == end_index) {
1471                         nr = inode->i_size & ~PAGE_CACHE_MASK;
1472                         if (nr <= offset)
1473                                 break;
1474                 }
1475
1476                 nr = nr - offset;
1477
1478                 /*
1479                  * Try to find the data in the page cache..
1480                  */
1481                 hash = page_hash(mapping, index);
1482
1483                 spin_lock(&pagecache_lock);
1484                 page = __find_page_nolock(mapping, index, *hash);
1485                 if (!page)
1486                         goto no_cached_page;
1487 found_page:
1488                 page_cache_get(page);
1489                 spin_unlock(&pagecache_lock);
1490
1491                 if (!Page_Uptodate(page))
1492                         goto page_not_up_to_date;
1493                 generic_file_readahead(reada_ok, filp, inode, page);
1494 page_ok:
1495                 /* If users can be writing to this page using arbitrary
1496                  * virtual addresses, take care about potential aliasing
1497                  * before reading the page on the kernel side.
1498                  */
1499                 if (mapping->i_mmap_shared != NULL)
1500                         flush_dcache_page(page);
1501
1502                 /*
1503                  * Mark the page accessed if we read the
1504                  * beginning or we just did an lseek.
1505                  */
1506                 if (!offset || !filp->f_reada)
1507                         mark_page_accessed(page);
1508
1509                 /*
1510                  * Ok, we have the page, and it's up-to-date, so
1511                  * now we can copy it to user space...
1512                  *
1513                  * The actor routine returns how many bytes were actually used..
1514                  * NOTE! This may not be the same as how much of a user buffer
1515                  * we filled up (we may be padding etc), so we can only update
1516                  * "pos" here (the actor routine has to update the user buffer
1517                  * pointers and the remaining count).
1518                  */
1519                 ret = actor(desc, page, offset, nr);
1520                 offset += ret;
1521                 index += offset >> PAGE_CACHE_SHIFT;
1522                 offset &= ~PAGE_CACHE_MASK;
1523
1524                 page_cache_release(page);
1525                 if (ret == nr && desc->count)
1526                         continue;
1527                 break;
1528
1529 /*
1530  * Ok, the page was not immediately readable, so let's try to read ahead while we're at it..
1531  */
1532 page_not_up_to_date:
1533                 generic_file_readahead(reada_ok, filp, inode, page);
1534
1535                 if (Page_Uptodate(page))
1536                         goto page_ok;
1537
1538                 /* Get exclusive access to the page ... */
1539                 lock_page(page);
1540
1541                 /* Did it get unhashed before we got the lock? */
1542                 if (!page->mapping) {
1543                         UnlockPage(page);
1544                         page_cache_release(page);
1545                         continue;
1546                 }
1547
1548                 /* Did somebody else fill it already? */
1549                 if (Page_Uptodate(page)) {
1550                         UnlockPage(page);
1551                         goto page_ok;
1552                 }
1553
1554 readpage:
1555                 /* ... and start the actual read. The read will unlock the page. */
1556                 error = mapping->a_ops->readpage(filp, page);
1557
1558                 if (!error) {
1559                         if (Page_Uptodate(page))
1560                                 goto page_ok;
1561
1562                         /* Again, try some read-ahead while waiting for the page to finish.. */
1563                         generic_file_readahead(reada_ok, filp, inode, page);
1564                         wait_on_page(page);
1565                         if (Page_Uptodate(page))
1566                                 goto page_ok;
1567                         error = -EIO;
1568                 }
1569
1570                 /* UHHUH! A synchronous read error occurred. Report it */
1571                 desc->error = error;
1572                 page_cache_release(page);
1573                 break;
1574
1575 no_cached_page:
1576                 /*
1577                  * Ok, it wasn't cached, so we need to create a new
1578                  * page..
1579                  *
1580                  * We get here with the page cache lock held.
1581                  */
1582                 if (!cached_page) {
1583                         spin_unlock(&pagecache_lock);
1584                         cached_page = page_cache_alloc(mapping);
1585                         if (!cached_page) {
1586                                 desc->error = -ENOMEM;
1587                                 break;
1588                         }
1589
1590                         /*
1591                          * Somebody may have added the page while we
1592                          * dropped the page cache lock. Check for that.
1593                          */
1594                         spin_lock(&pagecache_lock);
1595                         page = __find_page_nolock(mapping, index, *hash);
1596                         if (page)
1597                                 goto found_page;
1598                 }
1599
1600                 /*
1601                  * Ok, add the new page to the hash-queues...
1602                  */
1603                 page = cached_page;
1604                 __add_to_page_cache(page, mapping, index, hash);
1605                 spin_unlock(&pagecache_lock);
1606                 lru_cache_add(page);
1607                 cached_page = NULL;
1608
1609                 goto readpage;
1610         }
1611
1612         *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1613         filp->f_reada = 1;
1614         if (cached_page)
1615                 page_cache_release(cached_page);
1616         UPDATE_ATIME(inode);
1617 }
1618
1619 static inline int have_mapping_directIO(struct address_space * mapping)
1620 {
1621         return mapping->a_ops->direct_IO || mapping->a_ops->direct_fileIO;
1622 }
1623
1624 /* Switch between old and new directIO formats */
1625 static inline int do_call_directIO(int rw, struct file *filp, struct kiobuf *iobuf, unsigned long offset, int blocksize)
1626 {
1627         struct address_space * mapping = filp->f_dentry->d_inode->i_mapping;
1628
1629         if (mapping->a_ops->direct_fileIO)
1630                 return mapping->a_ops->direct_fileIO(rw, filp, iobuf, offset, blocksize);
1631         return mapping->a_ops->direct_IO(rw, mapping->host, iobuf, offset, blocksize);
1632 }
1633
1634 /*
1635  * i_sem and i_alloc_sem should be held already.  i_sem may be dropped
1636  * later once we've mapped the new IO.  i_alloc_sem is kept until the IO
1637  * completes.
1638  */
1639
1640 static ssize_t generic_file_direct_IO(int rw, struct file * filp, char * buf, size_t count, loff_t offset)
1641 {
1642         ssize_t retval, progress;
1643         int new_iobuf, chunk_size, blocksize_mask, blocksize, blocksize_bits;
1644         ssize_t iosize;
1645         struct kiobuf * iobuf;
1646         struct address_space * mapping = filp->f_dentry->d_inode->i_mapping;
1647         struct inode * inode = mapping->host;
1648         loff_t size = inode->i_size;
1649
1650         new_iobuf = 0;
1651         iobuf = filp->f_iobuf;
1652         if (test_and_set_bit(0, &filp->f_iobuf_lock)) {
1653                 /*
1654                  * A parallel read/write is using the preallocated iobuf
1655                  * so just run slow and allocate a new one.
1656                  */
1657                 retval = alloc_kiovec(1, &iobuf);
1658                 if (retval)
1659                         goto out;
1660                 new_iobuf = 1;
1661         }
1662
1663         blocksize = 1 << inode->i_blkbits;
1664         blocksize_bits = inode->i_blkbits;
1665         blocksize_mask = blocksize - 1;
1666         chunk_size = KIO_MAX_ATOMIC_IO << 10;
1667
1668         retval = -EINVAL;
1669         if ((offset & blocksize_mask) || (count & blocksize_mask) || ((unsigned long) buf & blocksize_mask))
1670                 goto out_free;
1671         if (!have_mapping_directIO(mapping))
1672                 goto out_free;
1673
1674         if ((rw == READ) && (offset + count > size))
1675                 count = size - offset;
1676
1677         /*
1678          * Flush to disk exclusively the _data_, metadata must remain
1679          * completly asynchronous or performance will go to /dev/null.
1680          */
1681         retval = filemap_fdatasync(mapping);
1682         if (retval == 0)
1683                 retval = fsync_inode_data_buffers(inode);
1684         if (retval == 0)
1685                 retval = filemap_fdatawait(mapping);
1686         if (retval < 0)
1687                 goto out_free;
1688
1689         progress = retval = 0;
1690         while (count > 0) {
1691                 iosize = count;
1692                 if (iosize > chunk_size)
1693                         iosize = chunk_size;
1694
1695                 retval = map_user_kiobuf(rw, iobuf, (unsigned long) buf, iosize);
1696                 if (retval)
1697                         break;
1698
1699                 retval = do_call_directIO(rw, filp, iobuf, (offset+progress) >> blocksize_bits, blocksize);
1700
1701                 if (rw == READ && retval > 0)
1702                         mark_dirty_kiobuf(iobuf, retval);
1703
1704                 if (retval >= 0) {
1705                         count -= retval;
1706                         buf += retval;
1707                         /* warning: weird semantics here, we're reporting a read behind the end of the file */
1708                         progress += retval;
1709                 }
1710
1711                 unmap_kiobuf(iobuf);
1712
1713                 if (retval != iosize)
1714                         break;
1715         }
1716
1717         if (progress)
1718                 retval = progress;
1719
1720  out_free:
1721         if (!new_iobuf)
1722                 clear_bit(0, &filp->f_iobuf_lock);
1723         else
1724                 free_kiovec(1, &iobuf);
1725  out:
1726         return retval;
1727 }
1728
1729 int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size)
1730 {
1731         char *kaddr;
1732         unsigned long left, count = desc->count;
1733
1734         if (size > count)
1735                 size = count;
1736
1737         kaddr = kmap(page);
1738         left = __copy_to_user(desc->buf, kaddr + offset, size);
1739         kunmap(page);
1740
1741         if (left) {
1742                 size -= left;
1743                 desc->error = -EFAULT;
1744         }
1745         desc->count = count - size;
1746         desc->written += size;
1747         desc->buf += size;
1748         return size;
1749 }
1750
1751 inline ssize_t do_generic_direct_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1752 {
1753         ssize_t retval;
1754         loff_t pos = *ppos;
1755
1756         retval = generic_file_direct_IO(READ, filp, buf, count, pos);
1757         if (retval > 0)
1758                 *ppos = pos + retval;
1759         return retval;
1760 }
1761
1762 /*
1763  * This is the "read()" routine for all filesystems
1764  * that can use the page cache directly.
1765  */
1766 ssize_t generic_file_read(struct file * filp, char * buf, size_t count, loff_t *ppos)
1767 {
1768         ssize_t retval;
1769
1770         if ((ssize_t) count < 0)
1771                 return -EINVAL;
1772
1773         if (filp->f_flags & O_DIRECT)
1774                 goto o_direct;
1775
1776         retval = -EFAULT;
1777         if (access_ok(VERIFY_WRITE, buf, count)) {
1778                 retval = 0;
1779
1780                 if (count) {
1781                         read_descriptor_t desc;
1782
1783                         desc.written = 0;
1784                         desc.count = count;
1785                         desc.buf = buf;
1786                         desc.error = 0;
1787                         do_generic_file_read(filp, ppos, &desc, file_read_actor);
1788
1789                         retval = desc.written;
1790                         if (!retval)
1791                                 retval = desc.error;
1792                 }
1793         }
1794  out:
1795         return retval;
1796
1797  o_direct:
1798         {
1799                 loff_t size;
1800                 struct address_space *mapping = filp->f_dentry->d_inode->i_mapping;
1801                 struct inode *inode = mapping->host;
1802
1803                 retval = 0;
1804                 if (!count)
1805                         goto out; /* skip atime */
1806                 down_read(&inode->i_alloc_sem);
1807                 down(&inode->i_sem);
1808                 size = inode->i_size;
1809                 if (*ppos < size)
1810                         retval = do_generic_direct_read(filp, buf, count, ppos);
1811                 up(&inode->i_sem);
1812                 up_read(&inode->i_alloc_sem);
1813                 UPDATE_ATIME(filp->f_dentry->d_inode);
1814                 goto out;
1815         }
1816 }
1817
1818 static int file_send_actor(read_descriptor_t * desc, struct page *page, unsigned long offset , unsigned long size)
1819 {
1820         ssize_t written;
1821         unsigned long count = desc->count;
1822         struct file *file = (struct file *) desc->buf;
1823
1824         if (size > count)
1825                 size = count;
1826
1827         if (file->f_op->sendpage) {
1828                 written = file->f_op->sendpage(file, page, offset,
1829                                                size, &file->f_pos, size<count);
1830         } else {
1831                 char *kaddr;
1832                 mm_segment_t old_fs;
1833
1834                 old_fs = get_fs();
1835                 set_fs(KERNEL_DS);
1836
1837                 kaddr = kmap(page);
1838                 written = file->f_op->write(file, kaddr + offset, size, &file->f_pos);
1839                 kunmap(page);
1840
1841                 set_fs(old_fs);
1842         }
1843         if (written < 0) {
1844                 desc->error = written;
1845                 written = 0;
1846         }
1847         desc->count = count - written;
1848         desc->written += written;
1849         return written;
1850 }
1851
1852 static ssize_t common_sendfile(int out_fd, int in_fd, loff_t *offset, size_t count)
1853 {
1854         ssize_t retval;
1855         struct file * in_file, * out_file;
1856         struct inode * in_inode, * out_inode;
1857
1858         /*
1859          * Get input file, and verify that it is ok..
1860          */
1861         retval = -EBADF;
1862         in_file = fget(in_fd);
1863         if (!in_file)
1864                 goto out;
1865         if (!(in_file->f_mode & FMODE_READ))
1866                 goto fput_in;
1867         retval = -EINVAL;
1868         in_inode = in_file->f_dentry->d_inode;
1869         if (!in_inode)
1870                 goto fput_in;
1871         if (!in_inode->i_mapping->a_ops->readpage)
1872                 goto fput_in;
1873         retval = rw_verify_area(READ, in_file, &in_file->f_pos, count);
1874         if (retval)
1875                 goto fput_in;
1876
1877         /*
1878          * Get output file, and verify that it is ok..
1879          */
1880         retval = -EBADF;
1881         out_file = fget(out_fd);
1882         if (!out_file)
1883                 goto fput_in;
1884         if (!(out_file->f_mode & FMODE_WRITE))
1885                 goto fput_out;
1886         retval = -EINVAL;
1887         if (!out_file->f_op || !out_file->f_op->write)
1888                 goto fput_out;
1889         out_inode = out_file->f_dentry->d_inode;
1890         retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count);
1891         if (retval)
1892                 goto fput_out;
1893
1894         retval = 0;
1895         if (count) {
1896                 read_descriptor_t desc;
1897
1898                 if (!offset)
1899                         offset = &in_file->f_pos;
1900
1901                 desc.written = 0;
1902                 desc.count = count;
1903                 desc.buf = (char *) out_file;
1904                 desc.error = 0;
1905                 do_generic_file_read(in_file, offset, &desc, file_send_actor);
1906
1907                 retval = desc.written;
1908                 if (!retval)
1909                         retval = desc.error;
1910         }
1911
1912 fput_out:
1913         fput(out_file);
1914 fput_in:
1915         fput(in_file);
1916 out:
1917         return retval;
1918 }
1919
1920 asmlinkage ssize_t sys_sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
1921 {
1922         loff_t pos, *ppos = NULL;
1923         ssize_t ret;
1924         if (offset) {
1925                 off_t off;
1926                 if (unlikely(get_user(off, offset)))
1927                         return -EFAULT;
1928                 pos = off;
1929                 ppos = &pos;
1930         }
1931         ret = common_sendfile(out_fd, in_fd, ppos, count);
1932         if (offset)
1933                 put_user((off_t)pos, offset);
1934         return ret;
1935 }
1936
1937 asmlinkage ssize_t sys_sendfile64(int out_fd, int in_fd, loff_t *offset, size_t count)
1938 {
1939         loff_t pos, *ppos = NULL;
1940         ssize_t ret;
1941         if (offset) {
1942                 if (unlikely(copy_from_user(&pos, offset, sizeof(loff_t))))
1943                         return -EFAULT;
1944                 ppos = &pos;
1945         }
1946         ret = common_sendfile(out_fd, in_fd, ppos, count);
1947         if (offset)
1948                 put_user(pos, offset);
1949         return ret;
1950 }
1951
1952 static ssize_t do_readahead(struct file *file, unsigned long index, unsigned long nr)
1953 {
1954         struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
1955         unsigned long max;
1956
1957         if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
1958                 return -EINVAL;
1959
1960         /* Limit it to the size of the file.. */
1961         max = (mapping->host->i_size + ~PAGE_CACHE_MASK) >> PAGE_CACHE_SHIFT;
1962         if (index > max)
1963                 return 0;
1964         max -= index;
1965         if (nr > max)
1966                 nr = max;
1967
1968         /* And limit it to a sane percentage of the inactive list.. */
1969         max = (nr_free_pages() + nr_inactive_pages) / 2;
1970         if (nr > max)
1971                 nr = max;
1972
1973         while (nr) {
1974                 page_cache_read(file, index);
1975                 index++;
1976                 nr--;
1977         }
1978         return 0;
1979 }
1980
1981 asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count)
1982 {
1983         ssize_t ret;
1984         struct file *file;
1985
1986         ret = -EBADF;
1987         file = fget(fd);
1988         if (file) {
1989                 if (file->f_mode & FMODE_READ) {
1990                         unsigned long start = offset >> PAGE_CACHE_SHIFT;
1991                         unsigned long len = (count + ((long)offset & ~PAGE_CACHE_MASK)) >> PAGE_CACHE_SHIFT;
1992                         ret = do_readahead(file, start, len);
1993                 }
1994                 fput(file);
1995         }
1996         return ret;
1997 }
1998
1999 /*
2000  * Read-ahead and flush behind for MADV_SEQUENTIAL areas.  Since we are
2001  * sure this is sequential access, we don't need a flexible read-ahead
2002  * window size -- we can always use a large fixed size window.
2003  */
2004 static void nopage_sequential_readahead(struct vm_area_struct * vma,
2005         unsigned long pgoff, unsigned long filesize)
2006 {
2007         unsigned long ra_window;
2008
2009         ra_window = get_max_readahead(vma->vm_file->f_dentry->d_inode);
2010         ra_window = CLUSTER_OFFSET(ra_window + CLUSTER_PAGES - 1);
2011
2012         /* vm_raend is zero if we haven't read ahead in this area yet.  */
2013         if (vma->vm_raend == 0)
2014                 vma->vm_raend = vma->vm_pgoff + ra_window;
2015
2016         /*
2017          * If we've just faulted the page half-way through our window,
2018          * then schedule reads for the next window, and release the
2019          * pages in the previous window.
2020          */
2021         if ((pgoff + (ra_window >> 1)) == vma->vm_raend) {
2022                 unsigned long start = vma->vm_pgoff + vma->vm_raend;
2023                 unsigned long end = start + ra_window;
2024
2025                 if (end > ((vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff))
2026                         end = (vma->vm_end >> PAGE_SHIFT) + vma->vm_pgoff;
2027                 if (start > end)
2028                         return;
2029
2030                 while ((start < end) && (start < filesize)) {
2031                         if (read_cluster_nonblocking(vma->vm_file,
2032                                                         start, filesize) < 0)
2033                                 break;
2034                         start += CLUSTER_PAGES;
2035                 }
2036                 run_task_queue(&tq_disk);
2037
2038                 /* if we're far enough past the beginning of this area,
2039                    recycle pages that are in the previous window. */
2040                 if (vma->vm_raend > (vma->vm_pgoff + ra_window + ra_window)) {
2041                         unsigned long window = ra_window << PAGE_SHIFT;
2042
2043                         end = vma->vm_start + (vma->vm_raend << PAGE_SHIFT);
2044                         end -= window + window;
2045                         filemap_sync(vma, end - window, window, MS_INVALIDATE);
2046                 }
2047
2048                 vma->vm_raend += ra_window;
2049         }
2050
2051         return;
2052 }
2053
2054 /*
2055  * filemap_nopage() is invoked via the vma operations vector for a
2056  * mapped memory region to read in file data during a page fault.
2057  *
2058  * The goto's are kind of ugly, but this streamlines the normal case of having
2059  * it in the page cache, and handles the special cases reasonably without
2060  * having a lot of duplicated code.
2061  */
2062 struct page * filemap_nopage(struct vm_area_struct * area, unsigned long address, int unused)
2063 {
2064         int error;
2065         struct file *file = area->vm_file;
2066         struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
2067         struct inode *inode = mapping->host;
2068         struct page *page, **hash;
2069         unsigned long size, pgoff, endoff;
2070
2071         pgoff = ((address - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
2072         endoff = ((area->vm_end - area->vm_start) >> PAGE_CACHE_SHIFT) + area->vm_pgoff;
2073
2074 retry_all:
2075         /*
2076          * An external ptracer can access pages that normally aren't
2077          * accessible..
2078          */
2079         size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
2080         if ((pgoff >= size) && (area->vm_mm == current->mm))
2081                 return NULL;
2082
2083         /* The "size" of the file, as far as mmap is concerned, isn't bigger than the mapping */
2084         if (size > endoff)
2085                 size = endoff;
2086
2087         /*
2088          * Do we have something in the page cache already?
2089          */
2090         hash = page_hash(mapping, pgoff);
2091 retry_find:
2092         page = __find_get_page(mapping, pgoff, hash);
2093         if (!page)
2094                 goto no_cached_page;
2095
2096         /*
2097          * Ok, found a page in the page cache, now we need to check
2098          * that it's up-to-date.
2099          */
2100         if (!Page_Uptodate(page))
2101                 goto page_not_uptodate;
2102
2103 success:
2104         /*
2105          * Try read-ahead for sequential areas.
2106          */
2107         if (VM_SequentialReadHint(area))
2108                 nopage_sequential_readahead(area, pgoff, size);
2109
2110         /*
2111          * Found the page and have a reference on it, need to check sharing
2112          * and possibly copy it over to another page..
2113          */
2114         mark_page_accessed(page);
2115         flush_page_to_ram(page);
2116         return page;
2117
2118 no_cached_page:
2119         /*
2120          * If the requested offset is within our file, try to read a whole
2121          * cluster of pages at once.
2122          *
2123          * Otherwise, we're off the end of a privately mapped file,
2124          * so we need to map a zero page.
2125          */
2126         if ((pgoff < size) && !VM_RandomReadHint(area))
2127                 error = read_cluster_nonblocking(file, pgoff, size);
2128         else
2129                 error = page_cache_read(file, pgoff);
2130
2131         /*
2132          * The page we want has now been added to the page cache.
2133          * In the unlikely event that someone removed it in the
2134          * meantime, we'll just come back here and read it again.
2135          */
2136         if (error >= 0)
2137                 goto retry_find;
2138
2139         /*
2140          * An error return from page_cache_read can result if the
2141          * system is low on memory, or a problem occurs while trying
2142          * to schedule I/O.
2143          */
2144         if (error == -ENOMEM)
2145                 return NOPAGE_OOM;
2146         return NULL;
2147
2148 page_not_uptodate:
2149         lock_page(page);
2150
2151         /* Did it get unhashed while we waited for it? */
2152         if (!page->mapping) {
2153                 UnlockPage(page);
2154                 page_cache_release(page);
2155                 goto retry_all;
2156         }
2157
2158         /* Did somebody else get it up-to-date? */
2159         if (Page_Uptodate(page)) {
2160                 UnlockPage(page);
2161                 goto success;
2162         }
2163
2164         if (!mapping->a_ops->readpage(file, page)) {
2165                 wait_on_page(page);
2166                 if (Page_Uptodate(page))
2167                         goto success;
2168         }
2169
2170         /*
2171          * Umm, take care of errors if the page isn't up-to-date.
2172          * Try to re-read it _once_. We do this synchronously,
2173          * because there really aren't any performance issues here
2174          * and we need to check for errors.
2175          */
2176         lock_page(page);
2177
2178         /* Somebody truncated the page on us? */
2179         if (!page->mapping) {
2180                 UnlockPage(page);
2181                 page_cache_release(page);
2182                 goto retry_all;
2183         }
2184
2185         /* Somebody else successfully read it in? */
2186         if (Page_Uptodate(page)) {
2187                 UnlockPage(page);
2188                 goto success;
2189         }
2190         ClearPageError(page);
2191         if (!mapping->a_ops->readpage(file, page)) {
2192                 wait_on_page(page);
2193                 if (Page_Uptodate(page))
2194                         goto success;
2195         }
2196
2197         /*
2198          * Things didn't work out. Return zero to tell the
2199          * mm layer so, possibly freeing the page cache page first.
2200          */
2201         page_cache_release(page);
2202         return NULL;
2203 }
2204
2205 /* Called with mm->page_table_lock held to protect against other
2206  * threads/the swapper from ripping pte's out from under us.
2207  */
2208 static inline int filemap_sync_pte(pte_t * ptep, struct vm_area_struct *vma,
2209         unsigned long address, unsigned int flags)
2210 {
2211         pte_t pte = *ptep;
2212
2213         if (pte_present(pte)) {
2214                 struct page *page = pte_page(pte);
2215                 if (VALID_PAGE(page) && !PageReserved(page) && ptep_test_and_clear_dirty(ptep)) {
2216                         flush_tlb_page(vma, address);
2217                         set_page_dirty(page);
2218                 }
2219         }
2220         return 0;
2221 }
2222
2223 static inline int filemap_sync_pte_range(pmd_t * pmd,
2224         unsigned long address, unsigned long size,
2225         struct vm_area_struct *vma, unsigned long offset, unsigned int flags)
2226 {
2227         pte_t * pte;
2228         unsigned long end;
2229         int error;
2230
2231         if (pmd_none(*pmd))
2232                 return 0;
2233         if (pmd_bad(*pmd)) {
2234                 pmd_ERROR(*pmd);
2235                 pmd_clear(pmd);
2236                 return 0;
2237         }
2238         pte = pte_offset(pmd, address);
2239         offset += address & PMD_MASK;
2240         address &= ~PMD_MASK;
2241         end = address + size;
2242         if (end > PMD_SIZE)
2243                 end = PMD_SIZE;
2244         error = 0;
2245         do {
2246                 error |= filemap_sync_pte(pte, vma, address + offset, flags);
2247                 address += PAGE_SIZE;
2248                 pte++;
2249         } while (address && (address < end));
2250         return error;
2251 }
2252
2253 static inline int filemap_sync_pmd_range(pgd_t * pgd,
2254         unsigned long address, unsigned long size,
2255         struct vm_area_struct *vma, unsigned int flags)
2256 {
2257         pmd_t * pmd;
2258         unsigned long offset, end;
2259         int error;
2260
2261         if (pgd_none(*pgd))
2262                 return 0;
2263         if (pgd_bad(*pgd)) {
2264                 pgd_ERROR(*pgd);
2265                 pgd_clear(pgd);
2266                 return 0;
2267         }
2268         pmd = pmd_offset(pgd, address);
2269         offset = address & PGDIR_MASK;
2270         address &= ~PGDIR_MASK;
2271         end = address + size;
2272         if (end > PGDIR_SIZE)
2273                 end = PGDIR_SIZE;
2274         error = 0;
2275         do {
2276                 error |= filemap_sync_pte_range(pmd, address, end - address, vma, offset, flags);
2277                 address = (address + PMD_SIZE) & PMD_MASK;
2278                 pmd++;
2279         } while (address && (address < end));
2280         return error;
2281 }
2282
2283 int filemap_sync(struct vm_area_struct * vma, unsigned long address,
2284         size_t size, unsigned int flags)
2285 {
2286         pgd_t * dir;
2287         unsigned long end = address + size;
2288         int error = 0;
2289
2290         /* Aquire the lock early; it may be possible to avoid dropping
2291          * and reaquiring it repeatedly.
2292          */
2293         spin_lock(&vma->vm_mm->page_table_lock);
2294
2295         dir = pgd_offset(vma->vm_mm, address);
2296         flush_cache_range(vma->vm_mm, end - size, end);
2297         if (address >= end)
2298                 BUG();
2299         do {
2300                 error |= filemap_sync_pmd_range(dir, address, end - address, vma, flags);
2301                 address = (address + PGDIR_SIZE) & PGDIR_MASK;
2302                 dir++;
2303         } while (address && (address < end));
2304         flush_tlb_range(vma->vm_mm, end - size, end);
2305
2306         spin_unlock(&vma->vm_mm->page_table_lock);
2307
2308         return error;
2309 }
2310
2311 static struct vm_operations_struct generic_file_vm_ops = {
2312         nopage:         filemap_nopage,
2313 };
2314
2315 /* This is used for a general mmap of a disk file */
2316
2317 int generic_file_mmap(struct file * file, struct vm_area_struct * vma)
2318 {
2319         struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
2320         struct inode *inode = mapping->host;
2321
2322         if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
2323                 if (!mapping->a_ops->writepage)
2324                         return -EINVAL;
2325         }
2326         if (!mapping->a_ops->readpage)
2327                 return -ENOEXEC;
2328         UPDATE_ATIME(inode);
2329         vma->vm_ops = &generic_file_vm_ops;
2330         return 0;
2331 }
2332
2333 /*
2334  * The msync() system call.
2335  */
2336
2337 /*
2338  * MS_SYNC syncs the entire file - including mappings.
2339  *
2340  * MS_ASYNC initiates writeout of just the dirty mapped data.
2341  * This provides no guarantee of file integrity - things like indirect
2342  * blocks may not have started writeout.  MS_ASYNC is primarily useful
2343  * where the application knows that it has finished with the data and
2344  * wishes to intelligently schedule its own I/O traffic.
2345  */
2346 static int msync_interval(struct vm_area_struct * vma,
2347         unsigned long start, unsigned long end, int flags)
2348 {
2349         int ret = 0;
2350         struct file * file = vma->vm_file;
2351
2352         if ( (flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED) )
2353                 return -EBUSY;
2354
2355         if (file && (vma->vm_flags & VM_SHARED)) {
2356                 ret = filemap_sync(vma, start, end-start, flags);
2357
2358                 if (!ret && (flags & (MS_SYNC|MS_ASYNC))) {
2359                         struct inode * inode = file->f_dentry->d_inode;
2360
2361                         down(&inode->i_sem);
2362                         ret = filemap_fdatasync(inode->i_mapping);
2363                         if (flags & MS_SYNC) {
2364                                 int err;
2365
2366                                 if (file->f_op && file->f_op->fsync) {
2367                                         err = file->f_op->fsync(file, file->f_dentry, 1);
2368                                         if (err && !ret)
2369                                                 ret = err;
2370                                 }
2371                                 err = filemap_fdatawait(inode->i_mapping);
2372                                 if (err && !ret)
2373                                         ret = err;
2374                         }
2375                         up(&inode->i_sem);
2376                 }
2377         }
2378         return ret;
2379 }
2380
2381 asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
2382 {
2383         unsigned long end;
2384         struct vm_area_struct * vma;
2385         int unmapped_error, error = -EINVAL;
2386
2387         down_read(&current->mm->mmap_sem);
2388         if (start & ~PAGE_MASK)
2389                 goto out;
2390         len = (len + ~PAGE_MASK) & PAGE_MASK;
2391         end = start + len;
2392         if (end < start)
2393                 goto out;
2394         if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
2395                 goto out;
2396         if ((flags & MS_ASYNC) && (flags & MS_SYNC))
2397                 goto out;
2398
2399         error = 0;
2400         if (end == start)
2401                 goto out;
2402         /*
2403          * If the interval [start,end) covers some unmapped address ranges,
2404          * just ignore them, but return -ENOMEM at the end.
2405          */
2406         vma = find_vma(current->mm, start);
2407         unmapped_error = 0;
2408         for (;;) {
2409                 /* Still start < end. */
2410                 error = -ENOMEM;
2411                 if (!vma)
2412                         goto out;
2413                 /* Here start < vma->vm_end. */
2414                 if (start < vma->vm_start) {
2415                         unmapped_error = -ENOMEM;
2416                         start = vma->vm_start;
2417                 }
2418                 /* Here vma->vm_start <= start < vma->vm_end. */
2419                 if (end <= vma->vm_end) {
2420                         if (start < end) {
2421                                 error = msync_interval(vma, start, end, flags);
2422                                 if (error)
2423                                         goto out;
2424                         }
2425                         error = unmapped_error;
2426                         goto out;
2427                 }
2428                 /* Here vma->vm_start <= start < vma->vm_end < end. */
2429                 error = msync_interval(vma, start, vma->vm_end, flags);
2430                 if (error)
2431                         goto out;
2432                 start = vma->vm_end;
2433                 vma = vma->vm_next;
2434         }
2435 out:
2436         up_read(&current->mm->mmap_sem);
2437         return error;
2438 }
2439
2440 static inline void setup_read_behavior(struct vm_area_struct * vma,
2441         int behavior)
2442 {
2443         VM_ClearReadHint(vma);
2444         switch(behavior) {
2445                 case MADV_SEQUENTIAL:
2446                         vma->vm_flags |= VM_SEQ_READ;
2447                         break;
2448                 case MADV_RANDOM:
2449                         vma->vm_flags |= VM_RAND_READ;
2450                         break;
2451                 default:
2452                         break;
2453         }
2454         return;
2455 }
2456
2457 static long madvise_fixup_start(struct vm_area_struct * vma,
2458         unsigned long end, int behavior)
2459 {
2460         struct vm_area_struct * n;
2461         struct mm_struct * mm = vma->vm_mm;
2462
2463         n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2464         if (!n)
2465                 return -EAGAIN;
2466         *n = *vma;
2467         n->vm_end = end;
2468         setup_read_behavior(n, behavior);
2469         n->vm_raend = 0;
2470         if (n->vm_file)
2471                 get_file(n->vm_file);
2472         if (n->vm_ops && n->vm_ops->open)
2473                 n->vm_ops->open(n);
2474         vma->vm_pgoff += (end - vma->vm_start) >> PAGE_SHIFT;
2475         lock_vma_mappings(vma);
2476         spin_lock(&mm->page_table_lock);
2477         vma->vm_start = end;
2478         __insert_vm_struct(mm, n);
2479         spin_unlock(&mm->page_table_lock);
2480         unlock_vma_mappings(vma);
2481         return 0;
2482 }
2483
2484 static long madvise_fixup_end(struct vm_area_struct * vma,
2485         unsigned long start, int behavior)
2486 {
2487         struct vm_area_struct * n;
2488         struct mm_struct * mm = vma->vm_mm;
2489
2490         n = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2491         if (!n)
2492                 return -EAGAIN;
2493         *n = *vma;
2494         n->vm_start = start;
2495         n->vm_pgoff += (n->vm_start - vma->vm_start) >> PAGE_SHIFT;
2496         setup_read_behavior(n, behavior);
2497         n->vm_raend = 0;
2498         if (n->vm_file)
2499                 get_file(n->vm_file);
2500         if (n->vm_ops && n->vm_ops->open)
2501                 n->vm_ops->open(n);
2502         lock_vma_mappings(vma);
2503         spin_lock(&mm->page_table_lock);
2504         vma->vm_end = start;
2505         __insert_vm_struct(mm, n);
2506         spin_unlock(&mm->page_table_lock);
2507         unlock_vma_mappings(vma);
2508         return 0;
2509 }
2510
2511 static long madvise_fixup_middle(struct vm_area_struct * vma,
2512         unsigned long start, unsigned long end, int behavior)
2513 {
2514         struct vm_area_struct * left, * right;
2515         struct mm_struct * mm = vma->vm_mm;
2516
2517         left = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2518         if (!left)
2519                 return -EAGAIN;
2520         right = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL);
2521         if (!right) {
2522                 kmem_cache_free(vm_area_cachep, left);
2523                 return -EAGAIN;
2524         }
2525         *left = *vma;
2526         *right = *vma;
2527         left->vm_end = start;
2528         right->vm_start = end;
2529         right->vm_pgoff += (right->vm_start - left->vm_start) >> PAGE_SHIFT;
2530         left->vm_raend = 0;
2531         right->vm_raend = 0;
2532         if (vma->vm_file)
2533                 atomic_add(2, &vma->vm_file->f_count);
2534
2535         if (vma->vm_ops && vma->vm_ops->open) {
2536                 vma->vm_ops->open(left);
2537                 vma->vm_ops->open(right);
2538         }
2539         vma->vm_pgoff += (start - vma->vm_start) >> PAGE_SHIFT;
2540         vma->vm_raend = 0;
2541         lock_vma_mappings(vma);
2542         spin_lock(&mm->page_table_lock);
2543         vma->vm_start = start;
2544         vma->vm_end = end;
2545         setup_read_behavior(vma, behavior);
2546         __insert_vm_struct(mm, left);
2547         __insert_vm_struct(mm, right);
2548         spin_unlock(&mm->page_table_lock);
2549         unlock_vma_mappings(vma);
2550         return 0;
2551 }
2552
2553 /*
2554  * We can potentially split a vm area into separate
2555  * areas, each area with its own behavior.
2556  */
2557 static long madvise_behavior(struct vm_area_struct * vma,
2558         unsigned long start, unsigned long end, int behavior)
2559 {
2560         int error = 0;
2561
2562         /* This caps the number of vma's this process can own */
2563         if (vma->vm_mm->map_count > max_map_count)
2564                 return -ENOMEM;
2565
2566         if (start == vma->vm_start) {
2567                 if (end == vma->vm_end) {
2568                         setup_read_behavior(vma, behavior);
2569                         vma->vm_raend = 0;
2570                 } else
2571                         error = madvise_fixup_start(vma, end, behavior);
2572         } else {
2573                 if (end == vma->vm_end)
2574                         error = madvise_fixup_end(vma, start, behavior);
2575                 else
2576                         error = madvise_fixup_middle(vma, start, end, behavior);
2577         }
2578
2579         return error;
2580 }
2581
2582 /*
2583  * Schedule all required I/O operations, then run the disk queue
2584  * to make sure they are started.  Do not wait for completion.
2585  */
2586 static long madvise_willneed(struct vm_area_struct * vma,
2587         unsigned long start, unsigned long end)
2588 {
2589         long error = -EBADF;
2590         struct file * file;
2591         struct inode * inode;
2592         unsigned long size;
2593
2594         /* Doesn't work if there's no mapped file. */
2595         if (!vma->vm_file)
2596                 return error;
2597         file = vma->vm_file;
2598         inode = file->f_dentry->d_inode;
2599         if (!inode->i_mapping->a_ops->readpage)
2600                 return error;
2601         size = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
2602
2603         start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2604         if (end > vma->vm_end)
2605                 end = vma->vm_end;
2606         end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
2607
2608         error = -EIO;
2609
2610         /* round to cluster boundaries if this isn't a "random" area. */
2611         if (!VM_RandomReadHint(vma)) {
2612                 start = CLUSTER_OFFSET(start);
2613                 end = CLUSTER_OFFSET(end + CLUSTER_PAGES - 1);
2614
2615                 while ((start < end) && (start < size)) {
2616                         error = read_cluster_nonblocking(file, start, size);
2617                         start += CLUSTER_PAGES;
2618                         if (error < 0)
2619                                 break;
2620                 }
2621         } else {
2622                 while ((start < end) && (start < size)) {
2623                         error = page_cache_read(file, start);
2624                         start++;
2625                         if (error < 0)
2626                                 break;
2627                 }
2628         }
2629
2630         /* Don't wait for someone else to push these requests. */
2631         run_task_queue(&tq_disk);
2632
2633         return error;
2634 }
2635
2636 /*
2637  * Application no longer needs these pages.  If the pages are dirty,
2638  * it's OK to just throw them away.  The app will be more careful about
2639  * data it wants to keep.  Be sure to free swap resources too.  The
2640  * zap_page_range call sets things up for refill_inactive to actually free
2641  * these pages later if no one else has touched them in the meantime,
2642  * although we could add these pages to a global reuse list for
2643  * refill_inactive to pick up before reclaiming other pages.
2644  *
2645  * NB: This interface discards data rather than pushes it out to swap,
2646  * as some implementations do.  This has performance implications for
2647  * applications like large transactional databases which want to discard
2648  * pages in anonymous maps after committing to backing store the data
2649  * that was kept in them.  There is no reason to write this data out to
2650  * the swap area if the application is discarding it.
2651  *
2652  * An interface that causes the system to free clean pages and flush
2653  * dirty pages is already available as msync(MS_INVALIDATE).
2654  */
2655 static long madvise_dontneed(struct vm_area_struct * vma,
2656         unsigned long start, unsigned long end)
2657 {
2658         if (vma->vm_flags & VM_LOCKED)
2659                 return -EINVAL;
2660
2661         zap_page_range(vma->vm_mm, start, end - start);
2662         return 0;
2663 }
2664
2665 static long madvise_vma(struct vm_area_struct * vma, unsigned long start,
2666         unsigned long end, int behavior)
2667 {
2668         long error = -EBADF;
2669
2670         switch (behavior) {
2671         case MADV_NORMAL:
2672         case MADV_SEQUENTIAL:
2673         case MADV_RANDOM:
2674                 error = madvise_behavior(vma, start, end, behavior);
2675                 break;
2676
2677         case MADV_WILLNEED:
2678                 error = madvise_willneed(vma, start, end);
2679                 break;
2680
2681         case MADV_DONTNEED:
2682                 error = madvise_dontneed(vma, start, end);
2683                 break;
2684
2685         default:
2686                 error = -EINVAL;
2687                 break;
2688         }
2689
2690         return error;
2691 }
2692
2693 /*
2694  * The madvise(2) system call.
2695  *
2696  * Applications can use madvise() to advise the kernel how it should
2697  * handle paging I/O in this VM area.  The idea is to help the kernel
2698  * use appropriate read-ahead and caching techniques.  The information
2699  * provided is advisory only, and can be safely disregarded by the
2700  * kernel without affecting the correct operation of the application.
2701  *
2702  * behavior values:
2703  *  MADV_NORMAL - the default behavior is to read clusters.  This
2704  *              results in some read-ahead and read-behind.
2705  *  MADV_RANDOM - the system should read the minimum amount of data
2706  *              on any access, since it is unlikely that the appli-
2707  *              cation will need more than what it asks for.
2708  *  MADV_SEQUENTIAL - pages in the given range will probably be accessed
2709  *              once, so they can be aggressively read ahead, and
2710  *              can be freed soon after they are accessed.
2711  *  MADV_WILLNEED - the application is notifying the system to read
2712  *              some pages ahead.
2713  *  MADV_DONTNEED - the application is finished with the given range,
2714  *              so the kernel can free resources associated with it.
2715  *
2716  * return values:
2717  *  zero    - success
2718  *  -EINVAL - start + len < 0, start is not page-aligned,
2719  *              "behavior" is not a valid value, or application
2720  *              is attempting to release locked or shared pages.
2721  *  -ENOMEM - addresses in the specified range are not currently
2722  *              mapped, or are outside the AS of the process.
2723  *  -EIO    - an I/O error occurred while paging in data.
2724  *  -EBADF  - map exists, but area maps something that isn't a file.
2725  *  -EAGAIN - a kernel resource was temporarily unavailable.
2726  */
2727 asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior)
2728 {
2729         unsigned long end;
2730         struct vm_area_struct * vma;
2731         int unmapped_error = 0;
2732         int error = -EINVAL;
2733
2734         down_write(&current->mm->mmap_sem);
2735
2736         if (start & ~PAGE_MASK)
2737                 goto out;
2738         len = (len + ~PAGE_MASK) & PAGE_MASK;
2739         end = start + len;
2740         if (end < start)
2741                 goto out;
2742
2743         error = 0;
2744         if (end == start)
2745                 goto out;
2746
2747         /*
2748          * If the interval [start,end) covers some unmapped address
2749          * ranges, just ignore them, but return -ENOMEM at the end.
2750          */
2751         vma = find_vma(current->mm, start);
2752         for (;;) {
2753                 /* Still start < end. */
2754                 error = -ENOMEM;
2755                 if (!vma)
2756                         goto out;
2757
2758                 /* Here start < vma->vm_end. */
2759                 if (start < vma->vm_start) {
2760                         unmapped_error = -ENOMEM;
2761                         start = vma->vm_start;
2762                 }
2763
2764                 /* Here vma->vm_start <= start < vma->vm_end. */
2765                 if (end <= vma->vm_end) {
2766                         if (start < end) {
2767                                 error = madvise_vma(vma, start, end,
2768                                                         behavior);
2769                                 if (error)
2770                                         goto out;
2771                         }
2772                         error = unmapped_error;
2773                         goto out;
2774                 }
2775
2776                 /* Here vma->vm_start <= start < vma->vm_end < end. */
2777                 error = madvise_vma(vma, start, vma->vm_end, behavior);
2778                 if (error)
2779                         goto out;
2780                 start = vma->vm_end;
2781                 vma = vma->vm_next;
2782         }
2783
2784 out:
2785         up_write(&current->mm->mmap_sem);
2786         return error;
2787 }
2788
2789 /*
2790  * Later we can get more picky about what "in core" means precisely.
2791  * For now, simply check to see if the page is in the page cache,
2792  * and is up to date; i.e. that no page-in operation would be required
2793  * at this time if an application were to map and access this page.
2794  */
2795 static unsigned char mincore_page(struct vm_area_struct * vma,
2796         unsigned long pgoff)
2797 {
2798         unsigned char present = 0;
2799         struct address_space * as = vma->vm_file->f_dentry->d_inode->i_mapping;
2800         struct page * page, ** hash = page_hash(as, pgoff);
2801
2802         spin_lock(&pagecache_lock);
2803         page = __find_page_nolock(as, pgoff, *hash);
2804         if ((page) && (Page_Uptodate(page)))
2805                 present = 1;
2806         spin_unlock(&pagecache_lock);
2807
2808         return present;
2809 }
2810
2811 /*
2812  * Do a chunk of "sys_mincore()". We've already checked
2813  * all the arguments, we hold the mmap semaphore: we should
2814  * just return the amount of info we're asked for.
2815  */
2816 static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages)
2817 {
2818         unsigned long i, nr, pgoff;
2819         struct vm_area_struct *vma = find_vma(current->mm, addr);
2820
2821         /*
2822          * find_vma() didn't find anything above us, or we're
2823          * in an unmapped hole in the address space: ENOMEM.
2824          */
2825         if (!vma || addr < vma->vm_start)
2826                 return -ENOMEM;
2827
2828         /*
2829          * Ok, got it. But check whether it's a segment we support
2830          * mincore() on. Right now, we don't do any anonymous mappings.
2831          *
2832          * FIXME: This is just stupid. And returning ENOMEM is
2833          * stupid too. We should just look at the page tables. But
2834          * this is what we've traditionally done, so we'll just
2835          * continue doing it.
2836          */
2837         if (!vma->vm_file)
2838                 return -ENOMEM;
2839
2840         /*
2841          * Calculate how many pages there are left in the vma, and
2842          * what the pgoff is for our address.
2843          */
2844         nr = (vma->vm_end - addr) >> PAGE_SHIFT;
2845         if (nr > pages)
2846                 nr = pages;
2847
2848         pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
2849         pgoff += vma->vm_pgoff;
2850
2851         /* And then we just fill the sucker in.. */
2852         for (i = 0 ; i < nr; i++, pgoff++)
2853                 vec[i] = mincore_page(vma, pgoff);
2854
2855         return nr;
2856 }
2857
2858 /*
2859  * The mincore(2) system call.
2860  *
2861  * mincore() returns the memory residency status of the pages in the
2862  * current process's address space specified by [addr, addr + len).
2863  * The status is returned in a vector of bytes.  The least significant
2864  * bit of each byte is 1 if the referenced page is in memory, otherwise
2865  * it is zero.
2866  *
2867  * Because the status of a page can change after mincore() checks it
2868  * but before it returns to the application, the returned vector may
2869  * contain stale information.  Only locked pages are guaranteed to
2870  * remain in memory.
2871  *
2872  * return values:
2873  *  zero    - success
2874  *  -EFAULT - vec points to an illegal address
2875  *  -EINVAL - addr is not a multiple of PAGE_CACHE_SIZE
2876  *  -ENOMEM - Addresses in the range [addr, addr + len] are
2877  *              invalid for the address space of this process, or
2878  *              specify one or more pages which are not currently
2879  *              mapped
2880  *  -EAGAIN - A kernel resource was temporarily unavailable.
2881  */
2882 asmlinkage long sys_mincore(unsigned long start, size_t len, unsigned char *vec)
2883 {
2884         long retval;
2885         unsigned long pages;
2886         unsigned char *tmp;
2887
2888         /* Check the start address: needs to be page-aligned.. */
2889         if (start & ~PAGE_CACHE_MASK)
2890                 return -EINVAL;
2891
2892         /* ..and we need to be passed a valid user-space range */
2893         if (!access_ok(VERIFY_READ, (void *) start, len))
2894                 return -ENOMEM;
2895
2896         /* This also avoids any overflows on PAGE_CACHE_ALIGN */
2897         pages = len >> PAGE_SHIFT;
2898         pages += (len & ~PAGE_MASK) != 0;
2899
2900         if (!access_ok(VERIFY_WRITE, vec, pages))
2901                 return -EFAULT;
2902
2903         tmp = (void *) __get_free_page(GFP_USER);
2904         if (!tmp)
2905                 return -EAGAIN;
2906
2907         retval = 0;
2908         while (pages) {
2909                 /*
2910                  * Do at most PAGE_SIZE entries per iteration, due to
2911                  * the temporary buffer size.
2912                  */
2913                 down_read(&current->mm->mmap_sem);
2914                 retval = do_mincore(start, tmp, min(pages, PAGE_SIZE));
2915                 up_read(&current->mm->mmap_sem);
2916
2917                 if (retval <= 0)
2918                         break;
2919                 if (copy_to_user(vec, tmp, retval)) {
2920                         retval = -EFAULT;
2921                         break;
2922                 }
2923                 pages -= retval;
2924                 vec += retval;
2925                 start += retval << PAGE_SHIFT;
2926                 retval = 0;
2927         }
2928         free_page((unsigned long) tmp);
2929         return retval;
2930 }
2931
2932 static inline
2933 struct page *__read_cache_page(struct address_space *mapping,
2934                                 unsigned long index,
2935                                 int (*filler)(void *,struct page*),
2936                                 void *data)
2937 {
2938         struct page **hash = page_hash(mapping, index);
2939         struct page *page, *cached_page = NULL;
2940         int err;
2941 repeat:
2942         page = __find_get_page(mapping, index, hash);
2943         if (!page) {
2944                 if (!cached_page) {
2945                         cached_page = page_cache_alloc(mapping);
2946                         if (!cached_page)
2947                                 return ERR_PTR(-ENOMEM);
2948                 }
2949                 page = cached_page;
2950                 if (add_to_page_cache_unique(page, mapping, index, hash))
2951                         goto repeat;
2952                 cached_page = NULL;
2953                 err = filler(data, page);
2954                 if (err < 0) {
2955                         page_cache_release(page);
2956                         page = ERR_PTR(err);
2957                 }
2958         }
2959         if (cached_page)
2960                 page_cache_release(cached_page);
2961         return page;
2962 }
2963
2964 /*
2965  * Read into the page cache. If a page already exists,
2966  * and Page_Uptodate() is not set, try to fill the page.
2967  */
2968 struct page *read_cache_page(struct address_space *mapping,
2969                                 unsigned long index,
2970                                 int (*filler)(void *,struct page*),
2971                                 void *data)
2972 {
2973         struct page *page;
2974         int err;
2975
2976 retry:
2977         page = __read_cache_page(mapping, index, filler, data);
2978         if (IS_ERR(page))
2979                 goto out;
2980         mark_page_accessed(page);
2981         if (Page_Uptodate(page))
2982                 goto out;
2983
2984         lock_page(page);
2985         if (!page->mapping) {
2986                 UnlockPage(page);
2987                 page_cache_release(page);
2988                 goto retry;
2989         }
2990         if (Page_Uptodate(page)) {
2991                 UnlockPage(page);
2992                 goto out;
2993         }
2994         err = filler(data, page);
2995         if (err < 0) {
2996                 page_cache_release(page);
2997                 page = ERR_PTR(err);
2998         }
2999  out:
3000         return page;
3001 }
3002
3003 static inline struct page * __grab_cache_page(struct address_space *mapping,
3004                                 unsigned long index, struct page **cached_page)
3005 {
3006         struct page *page, **hash = page_hash(mapping, index);
3007 repeat:
3008         page = __find_lock_page(mapping, index, hash);
3009         if (!page) {
3010                 if (!*cached_page) {
3011                         *cached_page = page_cache_alloc(mapping);
3012                         if (!*cached_page)
3013                                 return NULL;
3014                 }
3015                 page = *cached_page;
3016                 if (add_to_page_cache_unique(page, mapping, index, hash))
3017                         goto repeat;
3018                 *cached_page = NULL;
3019         }
3020         return page;
3021 }
3022
3023 inline void remove_suid(struct inode *inode)
3024 {
3025         unsigned int mode;
3026
3027         /* set S_IGID if S_IXGRP is set, and always set S_ISUID */
3028         mode = (inode->i_mode & S_IXGRP)*(S_ISGID/S_IXGRP) | S_ISUID;
3029
3030         /* was any of the uid bits set? */
3031         mode &= inode->i_mode;
3032         if (mode && !capable(CAP_FSETID)) {
3033                 inode->i_mode &= ~mode;
3034                 mark_inode_dirty(inode);
3035         }
3036 }
3037
3038 /*
3039  * precheck_file_write():
3040  * Check the conditions on a file descriptor prior to beginning a write
3041  * on it.  Contains the common precheck code for both buffered and direct
3042  * IO.
3043  */
3044 int precheck_file_write(struct file *file, struct inode *inode,
3045                         size_t *count, loff_t *ppos)
3046 {
3047         ssize_t         err;
3048         unsigned long   limit = current->rlim[RLIMIT_FSIZE].rlim_cur;
3049         loff_t          pos = *ppos;
3050
3051         err = -EINVAL;
3052         if (pos < 0)
3053                 goto out;
3054
3055         err = file->f_error;
3056         if (err) {
3057                 file->f_error = 0;
3058                 goto out;
3059         }
3060
3061         /* FIXME: this is for backwards compatibility with 2.4 */
3062         if (!S_ISBLK(inode->i_mode) && (file->f_flags & O_APPEND))
3063                 *ppos = pos = inode->i_size;
3064
3065         /*
3066          * Check whether we've reached the file size limit.
3067          */
3068         err = -EFBIG;
3069
3070         if (!S_ISBLK(inode->i_mode) && limit != RLIM_INFINITY) {
3071                 if (pos >= limit) {
3072                         send_sig(SIGXFSZ, current, 0);
3073                         goto out;
3074                 }
3075                 if (pos > 0xFFFFFFFFULL || *count > limit - (u32)pos) {
3076                         /* send_sig(SIGXFSZ, current, 0); */
3077                         *count = limit - (u32)pos;
3078                 }
3079         }
3080
3081         /*
3082          *      LFS rule
3083          */
3084         if ( pos + *count > MAX_NON_LFS && !(file->f_flags&O_LARGEFILE)) {
3085                 if (pos >= MAX_NON_LFS) {
3086                         send_sig(SIGXFSZ, current, 0);
3087                         goto out;
3088                 }
3089                 if (*count > MAX_NON_LFS - (u32)pos) {
3090                         /* send_sig(SIGXFSZ, current, 0); */
3091                         *count = MAX_NON_LFS - (u32)pos;
3092                 }
3093         }
3094
3095         /*
3096          *      Are we about to exceed the fs block limit ?
3097          *
3098          *      If we have written data it becomes a short write
3099          *      If we have exceeded without writing data we send
3100          *      a signal and give them an EFBIG.
3101          *
3102          *      Linus frestrict idea will clean these up nicely..
3103          */
3104
3105         if (!S_ISBLK(inode->i_mode)) {
3106                 if (pos >= inode->i_sb->s_maxbytes)
3107                 {
3108                         if (*count || pos > inode->i_sb->s_maxbytes) {
3109                                 send_sig(SIGXFSZ, current, 0);
3110                                 err = -EFBIG;
3111                                 goto out;
3112                         }
3113                         /* zero-length writes at ->s_maxbytes are OK */
3114                 }
3115
3116                 if (pos + *count > inode->i_sb->s_maxbytes)
3117                         *count = inode->i_sb->s_maxbytes - pos;
3118         } else {
3119                 if (is_read_only(inode->i_rdev)) {
3120                         err = -EPERM;
3121                         goto out;
3122                 }
3123                 if (pos >= inode->i_size) {
3124                         if (*count || pos > inode->i_size) {
3125                                 err = -ENOSPC;
3126                                 goto out;
3127                         }
3128                 }
3129
3130                 if (pos + *count > inode->i_size)
3131                         *count = inode->i_size - pos;
3132         }
3133
3134         err = 0;
3135 out:
3136         return err;
3137 }
3138
3139 /*
3140  * Write to a file through the page cache.
3141  *
3142  * We currently put everything into the page cache prior to writing it.
3143  * This is not a problem when writing full pages. With partial pages,
3144  * however, we first have to read the data into the cache, then
3145  * dirty the page, and finally schedule it for writing. Alternatively, we
3146  * could write-through just the portion of data that would go into that
3147  * page, but that would kill performance for applications that write data
3148  * line by line, and it's prone to race conditions.
3149  *
3150  * Note that this routine doesn't try to keep track of dirty pages. Each
3151  * file system has to do this all by itself, unfortunately.
3152  *                                                      okir@monad.swb.de
3153  */
3154 ssize_t
3155 do_generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
3156 {
3157         struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
3158         struct inode    *inode = mapping->host;
3159         loff_t          pos;
3160         struct page     *page, *cached_page;
3161         ssize_t         written;
3162         long            status = 0;
3163         ssize_t         err;
3164         unsigned        bytes;
3165
3166         cached_page = NULL;
3167         pos = *ppos;
3168         written = 0;
3169
3170         err = precheck_file_write(file, inode, &count, &pos);
3171         if (err != 0 || count == 0)
3172                 goto out;
3173
3174         remove_suid(inode);
3175         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
3176         mark_inode_dirty_sync(inode);
3177
3178         do {
3179                 unsigned long index, offset;
3180                 long page_fault;
3181                 char *kaddr;
3182
3183                 /*
3184                  * Try to find the page in the cache. If it isn't there,
3185                  * allocate a free page.
3186                  */
3187                 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
3188                 index = pos >> PAGE_CACHE_SHIFT;
3189                 bytes = PAGE_CACHE_SIZE - offset;
3190                 if (bytes > count)
3191                         bytes = count;
3192
3193                 /*
3194                  * Bring in the user page that we will copy from _first_.
3195                  * Otherwise there's a nasty deadlock on copying from the
3196                  * same page as we're writing to, without it being marked
3197                  * up-to-date.
3198                  */
3199                 { volatile unsigned char dummy;
3200                         __get_user(dummy, buf);
3201                         __get_user(dummy, buf+bytes-1);
3202                 }
3203
3204                 status = -ENOMEM;       /* we'll assign it later anyway */
3205                 page = __grab_cache_page(mapping, index, &cached_page);
3206                 if (!page)
3207                         break;
3208
3209                 /* We have exclusive IO access to the page.. */
3210                 if (!PageLocked(page)) {
3211                         PAGE_BUG(page);
3212                 }
3213
3214                 kaddr = kmap(page);
3215                 status = mapping->a_ops->prepare_write(file, page, offset, offset+bytes);
3216                 if (status)
3217                         goto sync_failure;
3218                 page_fault = __copy_from_user(kaddr+offset, buf, bytes);
3219                 flush_dcache_page(page);
3220                 status = mapping->a_ops->commit_write(file, page, offset, offset+bytes);
3221                 if (page_fault)
3222                         goto fail_write;
3223                 if (!status)
3224                         status = bytes;
3225
3226                 if (status >= 0) {
3227                         written += status;
3228                         count -= status;
3229                         pos += status;
3230                         buf += status;
3231                 }
3232 unlock:
3233                 kunmap(page);
3234                 /* Mark it unlocked again and drop the page.. */
3235                 SetPageReferenced(page);
3236                 UnlockPage(page);
3237                 page_cache_release(page);
3238
3239                 if (status < 0)
3240                         break;
3241         } while (count);
3242 done:
3243         *ppos = pos;
3244
3245         if (cached_page)
3246                 page_cache_release(cached_page);
3247
3248         /* For now, when the user asks for O_SYNC, we'll actually
3249          * provide O_DSYNC. */
3250         if (status >= 0) {
3251                 if ((file->f_flags & O_SYNC) || IS_SYNC(inode))
3252                         status = generic_osync_inode(inode, OSYNC_METADATA|OSYNC_DATA);
3253         }
3254
3255         err = written ? written : status;
3256 out:
3257
3258         return err;
3259 fail_write:
3260         status = -EFAULT;
3261         goto unlock;
3262
3263 sync_failure:
3264         /*
3265          * If blocksize < pagesize, prepare_write() may have instantiated a
3266          * few blocks outside i_size.  Trim these off again.
3267          */
3268         kunmap(page);
3269         UnlockPage(page);
3270         page_cache_release(page);
3271         if (pos + bytes > inode->i_size)
3272                 vmtruncate(inode, inode->i_size);
3273         goto done;
3274 }
3275
3276 ssize_t
3277 do_generic_direct_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
3278 {
3279         struct address_space *mapping = file->f_dentry->d_inode->i_mapping;
3280         struct inode    *inode = mapping->host;
3281         loff_t          pos;
3282         ssize_t         written;
3283         long            status = 0;
3284         ssize_t         err;
3285
3286         pos = *ppos;
3287         written = 0;
3288
3289         err = precheck_file_write(file, inode, &count, &pos);
3290         if (err != 0 || count == 0)
3291                 goto out;
3292
3293         if (!(file->f_flags & O_DIRECT))
3294                 BUG();
3295
3296         remove_suid(inode);
3297         inode->i_ctime = inode->i_mtime = CURRENT_TIME;
3298         mark_inode_dirty_sync(inode);
3299
3300         written = generic_file_direct_IO(WRITE, file, (char *) buf, count, pos);
3301         if (written > 0) {
3302                 loff_t end = pos + written;
3303                 if (end > inode->i_size && !S_ISBLK(inode->i_mode)) {
3304                         inode->i_size = end;
3305                         mark_inode_dirty(inode);
3306                 }
3307                 *ppos = end;
3308                 invalidate_inode_pages2(mapping);
3309         }
3310         /*
3311          * Sync the fs metadata but not the minor inode changes and
3312          * of course not the data as we did direct DMA for the IO.
3313          */
3314         if (written >= 0 && (file->f_flags & O_SYNC))
3315                 status = generic_osync_inode(inode, OSYNC_METADATA);
3316
3317         err = written ? written : status;
3318 out:
3319         return err;
3320 }
3321
3322 static int do_odirect_fallback(struct file *file, struct inode *inode,
3323                                const char *buf, size_t count, loff_t *ppos)
3324 {
3325         ssize_t ret;
3326         int err;
3327
3328         down(&inode->i_sem);
3329         ret = do_generic_file_write(file, buf, count, ppos);
3330         if (ret > 0) {
3331                 err = do_fdatasync(file);
3332                 if (err)
3333                         ret = err;
3334         }
3335         up(&inode->i_sem);
3336         return ret;
3337 }
3338
3339 ssize_t
3340 generic_file_write(struct file *file,const char *buf,size_t count, loff_t *ppos)
3341 {
3342         struct inode    *inode = file->f_dentry->d_inode->i_mapping->host;
3343         ssize_t         err;
3344
3345         if ((ssize_t) count < 0)
3346                 return -EINVAL;
3347
3348         if (!access_ok(VERIFY_READ, buf, count))
3349                 return -EFAULT;
3350
3351         if (file->f_flags & O_DIRECT) {
3352                 /* do_generic_direct_write may drop i_sem during the
3353                    actual IO */
3354                 down_read(&inode->i_alloc_sem);
3355                 down(&inode->i_sem);
3356                 err = do_generic_direct_write(file, buf, count, ppos);
3357                 up(&inode->i_sem);
3358                 up_read(&inode->i_alloc_sem);
3359                 if (unlikely(err == -ENOTBLK))
3360                         err = do_odirect_fallback(file, inode, buf, count, ppos);
3361         } else {
3362                 down(&inode->i_sem);
3363                 err = do_generic_file_write(file, buf, count, ppos);
3364                 up(&inode->i_sem);
3365         }
3366
3367         return err;
3368 }
3369
3370 void __init page_cache_init(unsigned long mempages)
3371 {
3372         unsigned long htable_size, order;
3373
3374         htable_size = mempages;
3375         htable_size *= sizeof(struct page *);
3376         for(order = 0; (PAGE_SIZE << order) < htable_size; order++)
3377                 ;
3378
3379         do {
3380                 unsigned long tmp = (PAGE_SIZE << order) / sizeof(struct page *);
3381
3382                 page_hash_bits = 0;
3383                 while((tmp >>= 1UL) != 0UL)
3384                         page_hash_bits++;
3385
3386                 page_hash_table = (struct page **)
3387                         __get_free_pages(GFP_ATOMIC, order);
3388         } while(page_hash_table == NULL && --order > 0);
3389
3390         printk("Page-cache hash table entries: %d (order: %ld, %ld bytes)\n",
3391                (1 << page_hash_bits), order, (PAGE_SIZE << order));
3392         if (!page_hash_table)
3393                 panic("Failed to allocate page hash table\n");
3394         memset((void *)page_hash_table, 0, PAGE_HASH_SIZE * sizeof(struct page *));
3395 }