2 * Resizable virtual memory filesystem for Linux.
4 * Copyright (C) 2000 Linus Torvalds.
6 * 2000-2001 Christoph Rohland
9 * Copyright (C) 2002-2003 Hugh Dickins.
10 * Copyright (C) 2002-2003 VERITAS Software Corporation.
12 * This file is released under the GPL.
16 * This virtual memory filesystem is heavily based on the ramfs. It
17 * extends ramfs by the ability to use swap and honor resource limits
18 * which makes it a completely usable filesystem.
21 #include <linux/config.h>
22 #include <linux/module.h>
23 #include <linux/init.h>
24 #include <linux/devfs_fs_kernel.h>
27 #include <linux/file.h>
28 #include <linux/swap.h>
29 #include <linux/pagemap.h>
30 #include <linux/string.h>
31 #include <linux/locks.h>
32 #include <linux/smp_lock.h>
34 #include <asm/uaccess.h>
35 #include <asm/div64.h>
37 /* This magic number is used in glibc for posix shared memory */
38 #define TMPFS_MAGIC 0x01021994
40 #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long))
41 #define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE)
42 #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512)
44 #define SHMEM_MAX_INDEX (SHMEM_NR_DIRECT + (ENTRIES_PER_PAGEPAGE/2) * (ENTRIES_PER_PAGE+1))
45 #define SHMEM_MAX_BYTES ((unsigned long long)SHMEM_MAX_INDEX << PAGE_CACHE_SHIFT)
47 #define VM_ACCT(size) (PAGE_CACHE_ALIGN(size) >> PAGE_SHIFT)
49 /* info->flags needs VM_flags to handle pagein/truncate race efficiently */
50 #define SHMEM_PAGEIN VM_READ
51 #define SHMEM_TRUNCATE VM_WRITE
53 /* Pretend that each entry is of this size in directory's i_size */
54 #define BOGO_DIRENT_SIZE 20
56 #define SHMEM_SB(sb) (&sb->u.shmem_sb)
58 /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
60 SGP_READ, /* don't exceed i_size, don't allocate page */
61 SGP_CACHE, /* don't exceed i_size, may allocate page */
62 SGP_WRITE, /* may exceed i_size, may allocate page */
65 static int shmem_getpage(struct inode *inode, unsigned long idx,
66 struct page **pagep, enum sgp_type sgp);
68 static struct super_operations shmem_ops;
69 static struct address_space_operations shmem_aops;
70 static struct file_operations shmem_file_operations;
71 static struct inode_operations shmem_inode_operations;
72 static struct inode_operations shmem_dir_inode_operations;
73 static struct vm_operations_struct shmem_vm_ops;
75 LIST_HEAD(shmem_inodes);
76 static spinlock_t shmem_ilock = SPIN_LOCK_UNLOCKED;
/*
 * shmem_free_block - return one page's worth of block accounting:
 * bump the superblock's free_blocks and debit the inode's i_blocks.
 * Counters are protected by sbinfo->stat_lock.
 */
78 static void shmem_free_block(struct inode *inode)
80 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
81 spin_lock(&sbinfo->stat_lock);
82 sbinfo->free_blocks++;
83 inode->i_blocks -= BLOCKS_PER_PAGE;
84 spin_unlock(&sbinfo->stat_lock);
/*
 * shmem_removepage - address_space callback when a page leaves the
 * page cache: give its block accounting back, except on the launder
 * and error paths (NOTE(review): presumably those paths settle the
 * accounting themselves — confirm against shmem_writepage).
 */
87 static void shmem_removepage(struct page *page)
89 if (!PageLaunder(page) && !PageError(page))
90 shmem_free_block(page->mapping->host);
94 * shmem_swp_entry - find the swap vector position in the info structure
96 * @info: info structure for the inode
97 * @index: index of the page to find
98 * @page: optional page to add to the structure. Has to be preset to
101 * If there is no space allocated yet it will return NULL when
102 * page is 0, else it will use the page for the needed block,
103 * setting it to 0 on return to indicate that it has been used.
105 * The swap vector is organized the following way:
107 * There are SHMEM_NR_DIRECT entries directly stored in the
108 * shmem_inode_info structure. So small files do not need an additional
111 * For pages with index > SHMEM_NR_DIRECT there is the pointer
112 * i_indirect which points to a page which holds in the first half
113 * doubly indirect blocks, in the second half triple indirect blocks:
115 * For an artificial ENTRIES_PER_PAGE = 4 this would lead to the
116 * following layout (for SHMEM_NR_DIRECT == 16):
118 * i_indirect -> dir --> 16-19
131 static swp_entry_t *shmem_swp_entry(struct shmem_inode_info *info, unsigned long index, unsigned long *page)
133 unsigned long offset;
/* Fast path: small indices live directly in the inode's info. */
136 if (index < SHMEM_NR_DIRECT)
137 return info->i_direct+index;
138 if (!info->i_indirect) {
/* Consume the caller-supplied page as the top indirect block. */
140 info->i_indirect = (void **) *page;
143 return NULL; /* need another page */
/* Translate to an (indirect-block, offset) pair past the direct area. */
146 index -= SHMEM_NR_DIRECT;
147 offset = index % ENTRIES_PER_PAGE;
148 index /= ENTRIES_PER_PAGE;
149 dir = info->i_indirect;
/* Second half of the top page holds triple-indirect pointers. */
151 if (index >= ENTRIES_PER_PAGE/2) {
152 index -= ENTRIES_PER_PAGE/2;
153 dir += ENTRIES_PER_PAGE/2 + index/ENTRIES_PER_PAGE;
154 index %= ENTRIES_PER_PAGE;
157 *dir = (void *) *page;
160 return NULL; /* need another page */
162 dir = (void **) *dir;
168 return NULL; /* need a page */
169 *dir = (void *) *page;
/* Final level: the page of swp_entry_t's holding this index's slot. */
172 return (swp_entry_t *) *dir + offset;
176 * shmem_swp_alloc - get the position of the swap entry for the page.
177 * If it does not exist allocate the entry.
179 * @info: info structure for the inode
180 * @index: index of the page to find
181 * @sgp: check and recheck i_size? skip allocation?
/*
 * shmem_swp_alloc - find (and if necessary allocate index pages for)
 * the swap entry slot of @index.  Called with info->lock held; drops
 * and retakes it around page allocation, so i_size must be rechecked.
 * Returns an ERR_PTR on failure.
 */
183 static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp)
185 struct inode *inode = info->inode;
186 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
187 unsigned long page = 0;
/* Shared dummy slot returned when no index page exists and none is wanted. */
189 static const swp_entry_t unswapped = {0};
191 if (sgp != SGP_WRITE &&
192 ((loff_t) index << PAGE_CACHE_SHIFT) >= inode->i_size)
193 return ERR_PTR(-EINVAL);
/* Loop: shmem_swp_entry returns NULL while it still needs index pages. */
195 while (!(entry = shmem_swp_entry(info, index, &page))) {
197 return (swp_entry_t *) &unswapped;
199 * Test free_blocks against 1 not 0, since we have 1 data
200 * page (and perhaps indirect index pages) yet to allocate:
201 * a waste to allocate index if we cannot allocate data.
203 spin_lock(&sbinfo->stat_lock);
204 if (sbinfo->free_blocks <= 1) {
205 spin_unlock(&sbinfo->stat_lock);
206 return ERR_PTR(-ENOSPC);
208 sbinfo->free_blocks--;
209 inode->i_blocks += BLOCKS_PER_PAGE;
210 spin_unlock(&sbinfo->stat_lock);
/* Must drop info->lock to allocate; racers may truncate or fill in. */
212 spin_unlock(&info->lock);
213 page = get_zeroed_page(GFP_USER);
214 spin_lock(&info->lock);
217 shmem_free_block(inode);
218 return ERR_PTR(-ENOMEM);
/* Recheck i_size: file may have been truncated while unlocked. */
220 if (sgp != SGP_WRITE &&
221 ((loff_t) index << PAGE_CACHE_SHIFT) >= inode->i_size) {
222 entry = ERR_PTR(-EINVAL);
225 if (info->next_index <= index)
226 info->next_index = index + 1;
229 /* another task gave its page, or truncated the file */
230 shmem_free_block(inode);
233 if (info->next_index <= index && !IS_ERR(entry))
234 info->next_index = index + 1;
239 * shmem_free_swp - free some swap entries in a directory
241 * @dir: pointer to the directory
242 * @edir: pointer after last entry of the directory
/*
 * shmem_free_swp - release every swap entry in [dir, edir), clearing
 * the slots; returns the number of entries actually freed.
 */
244 static int shmem_free_swp(swp_entry_t *dir, swp_entry_t *edir)
249 for (ptr = dir; ptr < edir; ptr++) {
251 free_swap_and_cache(*ptr);
252 *ptr = (swp_entry_t){0};
260 * shmem_truncate_direct - free the swap entries of a whole doubly
263 * @info: the info structure of the inode
264 * @dir: pointer to the pointer to the block
265 * @start: offset to start from (in pages)
266 * @len: how many pages are stored in this block
268 static inline unsigned long
269 shmem_truncate_direct(struct shmem_inode_info *info, swp_entry_t ***dir, unsigned long start, unsigned long len)
271 swp_entry_t **last, **ptr;
272 unsigned long off, freed_swp, freed = 0;
/* Walk each entry page in the block; only the first keeps an offset. */
274 last = *dir + (len + ENTRIES_PER_PAGE - 1) / ENTRIES_PER_PAGE;
275 off = start % ENTRIES_PER_PAGE;
277 for (ptr = *dir + start/ENTRIES_PER_PAGE; ptr < last; ptr++, off = 0) {
282 freed_swp = shmem_free_swp(*ptr + off,
283 *ptr + ENTRIES_PER_PAGE);
284 info->swapped -= freed_swp;
/* Entry page fully emptied: hand the index page itself back. */
290 free_page((unsigned long) *ptr);
297 free_page((unsigned long) *dir);
304 * shmem_truncate_indirect - truncate an inode
306 * @info: the info structure of the inode
307 * @index: the index to truncate
309 * This function locates the last doubly indirect block and calls
310 * then shmem_truncate_direct to do the real work
/*
 * shmem_truncate_indirect - free swap entries from @index up to
 * next_index, one doubly-indirect block per call; lowers next_index
 * and returns the number of index pages freed for block accounting.
 */
312 static inline unsigned long
313 shmem_truncate_indirect(struct shmem_inode_info *info, unsigned long index)
316 unsigned long baseidx, start;
317 unsigned long len = info->next_index;
/* Everything still within the direct entries: free and finish here. */
320 if (len <= SHMEM_NR_DIRECT) {
321 info->next_index = index;
324 freed = shmem_free_swp(info->i_direct + index,
325 info->i_direct + len);
326 info->swapped -= freed;
/* Choose the doubly-indirect block holding the tail of the file. */
330 if (len <= ENTRIES_PER_PAGEPAGE/2 + SHMEM_NR_DIRECT) {
331 len -= SHMEM_NR_DIRECT;
332 base = (swp_entry_t ***) &info->i_indirect;
333 baseidx = SHMEM_NR_DIRECT;
335 len -= ENTRIES_PER_PAGEPAGE/2 + SHMEM_NR_DIRECT;
336 BUG_ON(len > ENTRIES_PER_PAGEPAGE*ENTRIES_PER_PAGE/2);
338 baseidx -= baseidx % ENTRIES_PER_PAGEPAGE;
339 base = (swp_entry_t ***) info->i_indirect +
340 ENTRIES_PER_PAGE/2 + baseidx/ENTRIES_PER_PAGEPAGE;
342 baseidx += ENTRIES_PER_PAGEPAGE/2 + SHMEM_NR_DIRECT;
/* Truncation may start inside this block, or swallow it whole. */
345 if (index > baseidx) {
346 info->next_index = index;
347 start = index - baseidx;
349 info->next_index = baseidx;
352 return *base? shmem_truncate_direct(info, base, start, len): 0;
/*
 * shmem_truncate - inode op: drop swap entries (and re-drop any pages
 * raced back into the cache) beyond the new i_size, then return the
 * freed index pages to the superblock's block count.
 */
355 static void shmem_truncate(struct inode *inode)
357 struct shmem_inode_info *info = SHMEM_I(inode);
358 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
359 unsigned long freed = 0;
362 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
/* First index wholly beyond the new size (round size up to a page). */
363 index = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
364 if (index >= info->next_index)
367 spin_lock(&info->lock);
368 while (index < info->next_index)
369 freed += shmem_truncate_indirect(info, index);
370 BUG_ON(info->swapped > info->next_index);
372 if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) {
374 * Call truncate_inode_pages again: racing shmem_unuse_inode
375 * may have swizzled a page in from swap since vmtruncate or
376 * generic_delete_inode did it, before we lowered next_index.
377 * Also, though shmem_getpage checks i_size before adding to
378 * cache, no recheck after: so fix the narrow window there too.
380 info->flags |= SHMEM_TRUNCATE;
381 spin_unlock(&info->lock);
382 truncate_inode_pages(inode->i_mapping, inode->i_size);
383 spin_lock(&info->lock);
384 info->flags &= ~SHMEM_TRUNCATE;
387 spin_unlock(&info->lock);
/* Credit back the index pages freed by the truncate loop. */
388 spin_lock(&sbinfo->stat_lock);
389 sbinfo->free_blocks += freed;
390 inode->i_blocks -= freed*BLOCKS_PER_PAGE;
391 spin_unlock(&sbinfo->stat_lock);
/*
 * shmem_notify_change - setattr hook: before a size change, pin any
 * partial last page in memory and reset SHMEM_PAGEIN so the ensuing
 * shmem_truncate can detect racing page-ins.
 */
394 static int shmem_notify_change(struct dentry *dentry, struct iattr *attr)
396 struct inode *inode = dentry->d_inode;
397 struct page *page = NULL;
400 if (attr->ia_valid & ATTR_SIZE) {
401 if (attr->ia_size < inode->i_size) {
403 * If truncating down to a partial page, then
404 * if that page is already allocated, hold it
405 * in memory until the truncation is over, so
406 * truncate_partial_page cannot miss it were
407 * it assigned to swap.
409 if (attr->ia_size & (PAGE_CACHE_SIZE-1)) {
410 (void) shmem_getpage(inode,
411 attr->ia_size>>PAGE_CACHE_SHIFT,
415 * Reset SHMEM_PAGEIN flag so that shmem_truncate can
416 * detect if any pages might have been added to cache
417 * after truncate_inode_pages. But we needn't bother
418 * if it's being fully truncated to zero-length: the
419 * nrpages check is efficient enough in that case.
422 struct shmem_inode_info *info = SHMEM_I(inode);
423 spin_lock(&info->lock);
424 info->flags &= ~SHMEM_PAGEIN;
425 spin_unlock(&info->lock);
/* Standard attribute validation, then apply (triggers vmtruncate). */
430 error = inode_change_ok(inode, attr);
432 error = inode_setattr(inode, attr);
/* Drop the reference pinning the partial page, if we took one. */
434 page_cache_release(page);
/*
 * shmem_delete_inode - final inode teardown: unhook regular files from
 * the global shmem_inodes list, free all their pages/swap via
 * shmem_truncate, and return the inode to the superblock's count.
 */
438 static void shmem_delete_inode(struct inode *inode)
440 struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
441 struct shmem_inode_info *info = SHMEM_I(inode);
/* Only inodes using shmem_truncate (regular files) are on the list. */
443 if (inode->i_op->truncate == shmem_truncate) {
444 spin_lock(&shmem_ilock);
445 list_del(&info->list);
446 spin_unlock(&shmem_ilock);
448 shmem_truncate(inode);
/* After truncate to zero, no blocks may remain accounted. */
450 BUG_ON(inode->i_blocks);
451 spin_lock(&sbinfo->stat_lock);
452 sbinfo->free_inodes++;
453 spin_unlock(&sbinfo->stat_lock);
/*
 * shmem_find_swp - scan [dir, edir) for a slot matching @entry;
 * used by swapoff's shmem_unuse_inode to locate a swapped-out page.
 */
457 static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir)
461 for (ptr = dir; ptr < edir; ptr++) {
462 if (ptr->val == entry.val)
/*
 * shmem_unuse_inode - search one inode's swap vector for @entry; if
 * found, move @page from the swap cache back into the page cache at
 * that index.  Returns nonzero when the entry belonged to this inode.
 */
468 static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, struct page *page)
471 struct address_space *mapping;
/* Scan the direct entries first. */
477 ptr = info->i_direct;
478 spin_lock(&info->lock);
479 offset = info->next_index;
480 if (offset > SHMEM_NR_DIRECT)
481 offset = SHMEM_NR_DIRECT;
482 offset = shmem_find_swp(entry, ptr, ptr + offset);
/* Then each populated indirect entry page, one page at a time. */
486 for (idx = SHMEM_NR_DIRECT; idx < info->next_index;
487 idx += ENTRIES_PER_PAGE) {
488 ptr = shmem_swp_entry(info, idx, NULL);
491 offset = info->next_index - idx;
492 if (offset > ENTRIES_PER_PAGE)
493 offset = ENTRIES_PER_PAGE;
494 offset = shmem_find_swp(entry, ptr, ptr + offset);
498 spin_unlock(&info->lock);
/* Found: swap cache -> page cache, falling back if the insert races. */
503 mapping = inode->i_mapping;
504 delete_from_swap_cache(page);
505 if (add_to_page_cache_unique(page,
506 mapping, idx, page_hash(mapping, idx)) == 0) {
507 info->flags |= SHMEM_PAGEIN;
510 } else if (add_to_swap_cache(page, entry) != 0)
512 spin_unlock(&info->lock);
513 SetPageUptodate(page);
515 * Decrement swap count even when the entry is left behind:
516 * try_to_unuse will skip over mms, then reincrement count.
523 * shmem_unuse() searches for a possibly swapped-out shmem page.
525 int shmem_unuse(swp_entry_t entry, struct page *page)
528 struct shmem_inode_info *info;
531 spin_lock(&shmem_ilock);
532 list_for_each(p, &shmem_inodes) {
533 info = list_entry(p, struct shmem_inode_info, list);
/* Skip inodes with nothing swapped; stop at the owner of @entry. */
535 if (info->swapped && shmem_unuse_inode(info, entry, page)) {
536 /* move head to start search for next from here */
537 list_move_tail(&shmem_inodes, &info->list);
542 spin_unlock(&shmem_ilock);
547 * Move the page from the page cache to the swap cache.
/*
 * shmem_writepage - address_space writepage: move a locked page out of
 * the page cache into the swap cache, recording its swap entry in the
 * inode's swap vector.  Falls back to fail_writepage on any obstacle.
 */
549 static int shmem_writepage(struct page *page)
551 struct shmem_inode_info *info;
552 swp_entry_t *entry, swap;
553 struct address_space *mapping;
557 BUG_ON(!PageLocked(page));
/* Only launder-initiated writeback is handled here. */
558 if (!PageLaunder(page))
561 mapping = page->mapping;
563 inode = mapping->host;
564 info = SHMEM_I(inode);
/* mlock'ed segments are never pushed to swap. */
565 if (info->flags & VM_LOCKED)
568 swap = get_swap_page();
572 spin_lock(&info->lock);
/* Beyond next_index can only happen mid-truncate. */
573 if (index >= info->next_index) {
574 BUG_ON(!(info->flags & SHMEM_TRUNCATE));
575 spin_unlock(&info->lock);
579 entry = shmem_swp_entry(info, index, NULL);
583 /* Remove it from the page cache */
584 remove_inode_page(page);
585 page_cache_release(page);
587 /* Add it to the swap cache */
588 if (add_to_swap_cache(page, swap) != 0) {
590 * Raced with "speculative" read_swap_cache_async.
591 * Add page back to page cache, unref swap, try again.
593 add_to_page_cache_locked(page, mapping, index);
594 info->flags |= SHMEM_PAGEIN;
595 spin_unlock(&info->lock);
602 spin_unlock(&info->lock);
/* Now in swap cache: mark dirty so the swap write actually happens. */
603 SetPageUptodate(page);
604 set_page_dirty(page);
608 return fail_writepage(page);
612 * shmem_getpage - either get the page from swap or allocate a new one
614 * If we allocate a new one we do not mark it dirty. That's up to the
615 * vm. If we swap it in we mark it dirty since we also free the swap
616 * entry since a page cannot live in both the swap and page cache
/*
 * shmem_getpage - central lookup: return in *pagep the page for @idx,
 * reading it back from swap or allocating a fresh zeroed page as the
 * sgp mode permits.  SGP_READ may hand back ZERO_PAGE for holes.
 * Returns 0 on success or a negative errno.
 */
618 static int shmem_getpage(struct inode *inode, unsigned long idx, struct page **pagep, enum sgp_type sgp)
620 struct address_space *mapping = inode->i_mapping;
621 struct shmem_inode_info *info = SHMEM_I(inode);
622 struct shmem_sb_info *sbinfo;
623 struct page *filepage = *pagep;
624 struct page *swappage;
629 if (idx >= SHMEM_MAX_INDEX) {
635 * Normally, filepage is NULL on entry, and either found
636 * uptodate immediately, or allocated and zeroed, or read
637 * in under swappage, which is then assigned to filepage.
638 * But shmem_readpage and shmem_prepare_write pass in a locked
639 * filepage, which may be found not uptodate by other callers
640 * too, and may need to be copied from the swappage read in.
644 filepage = find_lock_page(mapping, idx);
645 if (filepage && Page_Uptodate(filepage))
648 spin_lock(&info->lock);
649 entry = shmem_swp_alloc(info, idx, sgp);
651 spin_unlock(&info->lock);
652 error = PTR_ERR(entry);
658 /* Look it up and read it in.. */
659 swappage = lookup_swap_cache(swap);
661 spin_unlock(&info->lock);
662 swapin_readahead(swap);
663 swappage = read_swap_cache_async(swap);
/* Re-lock and revalidate: the entry may have changed while we slept. */
665 spin_lock(&info->lock);
666 entry = shmem_swp_alloc(info, idx, sgp);
668 error = PTR_ERR(entry);
669 else if (entry->val == swap.val)
671 spin_unlock(&info->lock);
676 wait_on_page(swappage);
677 page_cache_release(swappage);
681 /* We have to do this with page locked to prevent races */
682 if (TryLockPage(swappage)) {
683 spin_unlock(&info->lock);
684 wait_on_page(swappage);
685 page_cache_release(swappage);
688 if (!Page_Uptodate(swappage)) {
689 spin_unlock(&info->lock);
690 UnlockPage(swappage);
691 page_cache_release(swappage);
696 delete_from_swap_cache(swappage);
/* Caller supplied a locked filepage: copy swap contents into it. */
700 spin_unlock(&info->lock);
701 flush_page_to_ram(swappage);
702 copy_highpage(filepage, swappage);
703 UnlockPage(swappage);
704 page_cache_release(swappage);
705 flush_dcache_page(filepage);
706 SetPageUptodate(filepage);
707 SetPageDirty(filepage);
/* No filepage: install the swap page itself as the file page. */
709 } else if (add_to_page_cache_unique(swappage,
710 mapping, idx, page_hash(mapping, idx)) == 0) {
711 info->flags |= SHMEM_PAGEIN;
714 spin_unlock(&info->lock);
716 SetPageUptodate(filepage);
717 SetPageDirty(filepage);
/* Insert raced: push the page back to swap cache and retry. */
720 if (add_to_swap_cache(swappage, swap) != 0)
722 spin_unlock(&info->lock);
723 SetPageUptodate(swappage);
724 SetPageDirty(swappage);
725 UnlockPage(swappage);
726 page_cache_release(swappage);
/* Read of a hole: take an existing cache page or fall through to ZERO_PAGE. */
729 } else if (sgp == SGP_READ && !filepage) {
730 filepage = find_get_page(mapping, idx);
732 (!Page_Uptodate(filepage) || TryLockPage(filepage))) {
733 spin_unlock(&info->lock);
734 wait_on_page(filepage);
735 page_cache_release(filepage);
739 spin_unlock(&info->lock);
/* Allocate a fresh page, charging the superblock first. */
741 sbinfo = SHMEM_SB(inode->i_sb);
742 spin_lock(&sbinfo->stat_lock);
743 if (sbinfo->free_blocks == 0) {
744 spin_unlock(&sbinfo->stat_lock);
745 spin_unlock(&info->lock);
749 sbinfo->free_blocks--;
750 inode->i_blocks += BLOCKS_PER_PAGE;
751 spin_unlock(&sbinfo->stat_lock);
754 spin_unlock(&info->lock);
755 filepage = page_cache_alloc(mapping);
757 shmem_free_block(inode);
/* Revalidate under the lock: another task may have beaten us to it. */
762 spin_lock(&info->lock);
763 entry = shmem_swp_alloc(info, idx, sgp);
765 error = PTR_ERR(entry);
766 if (error || entry->val ||
767 add_to_page_cache_unique(filepage,
768 mapping, idx, page_hash(mapping, idx)) != 0) {
769 spin_unlock(&info->lock);
770 page_cache_release(filepage);
771 shmem_free_block(inode);
777 info->flags |= SHMEM_PAGEIN;
780 spin_unlock(&info->lock);
781 clear_highpage(filepage);
782 flush_dcache_page(filepage);
783 SetPageUptodate(filepage);
788 UnlockPage(filepage);
/* Hole read with no page: the shared zero page stands in. */
790 filepage = ZERO_PAGE(0);
793 if (PageError(filepage))
794 ClearPageError(filepage);
/* Failure path: flag an error only on a caller-owned filepage. */
799 if (*pagep == filepage)
800 SetPageError(filepage);
802 UnlockPage(filepage);
803 page_cache_release(filepage);
/*
 * shmem_nopage - vm_operations nopage handler: translate the faulting
 * address to a file index and fetch the page via shmem_getpage.
 */
809 struct page *shmem_nopage(struct vm_area_struct *vma, unsigned long address, int unused)
811 struct inode *inode = vma->vm_file->f_dentry->d_inode;
812 struct page *page = NULL;
/* Fault address -> page-cache index, honouring the vma's pgoff. */
816 idx = (address - vma->vm_start) >> PAGE_SHIFT;
817 idx += vma->vm_pgoff;
818 idx >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
820 error = shmem_getpage(inode, idx, &page, SGP_CACHE);
822 return (error == -ENOMEM)? NOPAGE_OOM: NOPAGE_SIGBUS;
824 mark_page_accessed(page);
825 flush_page_to_ram(page);
/*
 * shmem_lock - set or clear VM_LOCKED on the inode's flags; when set,
 * shmem_writepage refuses to push this file's pages to swap.
 */
829 void shmem_lock(struct file *file, int lock)
831 struct inode *inode = file->f_dentry->d_inode;
832 struct shmem_inode_info *info = SHMEM_I(inode);
834 spin_lock(&info->lock);
836 info->flags |= VM_LOCKED;
838 info->flags &= ~VM_LOCKED;
839 spin_unlock(&info->lock);
/* shmem_mmap - file op: attach shmem_vm_ops; only regular files mmap. */
842 static int shmem_mmap(struct file *file, struct vm_area_struct *vma)
844 struct vm_operations_struct *ops;
845 struct inode *inode = file->f_dentry->d_inode;
848 if (!S_ISREG(inode->i_mode))
/*
 * shmem_get_inode - allocate and initialize a new tmpfs inode of the
 * given mode, charging it against the superblock's inode quota and
 * wiring up the per-type ops tables.
 */
855 static struct inode *shmem_get_inode(struct super_block *sb, int mode, int dev)
858 struct shmem_inode_info *info;
859 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
/* Respect the mount's nr_inodes limit. */
861 spin_lock(&sbinfo->stat_lock);
862 if (!sbinfo->free_inodes) {
863 spin_unlock(&sbinfo->stat_lock);
866 sbinfo->free_inodes--;
867 spin_unlock(&sbinfo->stat_lock);
869 inode = new_inode(sb);
871 inode->i_mode = mode;
872 inode->i_uid = current->fsuid;
873 inode->i_gid = current->fsgid;
874 inode->i_blksize = PAGE_CACHE_SIZE;
876 inode->i_rdev = NODEV;
877 inode->i_mapping->a_ops = &shmem_aops;
878 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
879 info = SHMEM_I(inode);
881 spin_lock_init(&info->lock);
882 switch (mode & S_IFMT) {
884 init_special_inode(inode, mode, dev);
/* Regular file: join the global list used by swapoff (shmem_unuse). */
887 inode->i_op = &shmem_inode_operations;
888 inode->i_fop = &shmem_file_operations;
889 spin_lock(&shmem_ilock);
890 list_add_tail(&info->list, &shmem_inodes);
891 spin_unlock(&shmem_ilock);
895 /* Some things misbehave if size == 0 on a directory */
896 inode->i_size = 2 * BOGO_DIRENT_SIZE;
897 inode->i_op = &shmem_dir_inode_operations;
898 inode->i_fop = &dcache_dir_ops;
/*
 * shmem_set_size - apply new block/inode limits (mount or remount),
 * refusing limits below current usage.
 */
907 static int shmem_set_size(struct shmem_sb_info *info,
908 unsigned long max_blocks, unsigned long max_inodes)
911 unsigned long blocks, inodes;
913 spin_lock(&info->stat_lock);
/* Current usage = max - free; new limits must cover it. */
914 blocks = info->max_blocks - info->free_blocks;
915 inodes = info->max_inodes - info->free_inodes;
917 if (max_blocks < blocks)
919 if (max_inodes < inodes)
922 info->max_blocks = max_blocks;
923 info->free_blocks = max_blocks - blocks;
924 info->max_inodes = max_inodes;
925 info->free_inodes = max_inodes - inodes;
927 spin_unlock(&info->stat_lock);
933 static struct inode_operations shmem_symlink_inode_operations;
934 static struct inode_operations shmem_symlink_inline_operations;
937 * tmpfs itself makes no use of generic_file_read, generic_file_mmap
938 * or generic_file_write; but shmem_readpage, shmem_prepare_write and
939 * shmem_commit_write let a tmpfs file be used below the loop driver,
940 * and shmem_readpage lets a tmpfs file be used by sendfile.
/* shmem_readpage - a_ops readpage (for sendfile/loop): fill via shmem_getpage. */
943 shmem_readpage(struct file *file, struct page *page)
945 struct inode *inode = page->mapping->host;
946 int error = shmem_getpage(inode, page->index, &page, SGP_CACHE);
/* shmem_prepare_write - a_ops hook: ensure the target page exists. */
952 shmem_prepare_write(struct file *file, struct page *page, unsigned offset, unsigned to)
954 struct inode *inode = page->mapping->host;
955 return shmem_getpage(inode, page->index, &page, SGP_WRITE);
/* shmem_commit_write - a_ops hook: extend i_size past the written range. */
959 shmem_commit_write(struct file *file, struct page *page, unsigned offset, unsigned to)
961 struct inode *inode = page->mapping->host;
962 loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
964 if (pos > inode->i_size)
/*
 * shmem_file_write - file op write: loop copying user data page by
 * page through shmem_getpage, growing i_size as we go.
 */
971 shmem_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos)
973 struct inode *inode = file->f_dentry->d_inode;
975 unsigned long written;
978 if ((ssize_t) count < 0)
981 if (!access_ok(VERIFY_READ, buf, count))
989 err = precheck_file_write(file, inode, &count, &pos);
994 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
997 struct page *page = NULL;
998 unsigned long bytes, index, offset;
/* Split the write at page-cache page boundaries. */
1002 offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
1003 index = pos >> PAGE_CACHE_SHIFT;
1004 bytes = PAGE_CACHE_SIZE - offset;
1009 * We don't hold page lock across copy from user -
1010 * what would it guard against? - so no deadlock here.
1013 err = shmem_getpage(inode, index, &page, SGP_WRITE);
1018 left = __copy_from_user(kaddr + offset, buf, bytes);
1025 if (pos > inode->i_size)
1026 inode->i_size = pos;
1028 flush_dcache_page(page);
1030 SetPageReferenced(page);
1031 page_cache_release(page);
/*
 * do_shmem_file_read - core read loop: fetch each page with SGP_READ
 * (holes come back as ZERO_PAGE) and feed it to the read descriptor's
 * actor until the request or the file is exhausted.
 */
1049 static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_t *desc)
1051 struct inode *inode = filp->f_dentry->d_inode;
1052 struct address_space *mapping = inode->i_mapping;
1053 unsigned long index, offset;
1056 if (unlikely(pos < 0))
1059 index = pos >> PAGE_CACHE_SHIFT;
1060 offset = pos & ~PAGE_CACHE_MASK;
1063 struct page *page = NULL;
1064 unsigned long end_index, nr, ret;
/* Clamp the last page's byte count to i_size. */
1066 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1067 if (index > end_index)
1069 if (index == end_index) {
1070 nr = inode->i_size & ~PAGE_CACHE_MASK;
1075 desc->error = shmem_getpage(inode, index, &page, SGP_READ);
1077 if (desc->error == -EINVAL)
1083 * We must evaluate after, since reads (unlike writes)
1084 * are called without i_sem protection against truncate
1086 nr = PAGE_CACHE_SIZE;
1087 end_index = inode->i_size >> PAGE_CACHE_SHIFT;
1088 if (index == end_index) {
1089 nr = inode->i_size & ~PAGE_CACHE_MASK;
1091 page_cache_release(page);
1097 if (page != ZERO_PAGE(0)) {
1099 * If users can be writing to this page using arbitrary
1100 * virtual addresses, take care about potential aliasing
1101 * before reading the page on the kernel side.
1103 if (mapping->i_mmap_shared != NULL)
1104 flush_dcache_page(page);
1106 * Mark the page accessed if we read the
1107 * beginning or we just did an lseek.
1109 if (!offset || !filp->f_reada)
1110 mark_page_accessed(page);
1114 * Ok, we have the page, and it's up-to-date, so
1115 * now we can copy it to user space...
1117 * The actor routine returns how many bytes were actually used..
1118 * NOTE! This may not be the same as how much of a user buffer
1119 * we filled up (we may be padding etc), so we can only update
1120 * "pos" here (the actor routine has to update the user buffer
1121 * pointers and the remaining count).
1123 ret = file_read_actor(desc, page, offset, nr);
1125 index += offset >> PAGE_CACHE_SHIFT;
1126 offset &= ~PAGE_CACHE_MASK;
1128 page_cache_release(page);
/* Short copy or request satisfied: stop the loop. */
1129 if (ret != nr || !desc->count)
1133 *ppos = ((loff_t) index << PAGE_CACHE_SHIFT) + offset;
1135 UPDATE_ATIME(inode);
/* shmem_file_read - file op read: wrap the loop in a read descriptor. */
1138 static ssize_t shmem_file_read(struct file *filp, char *buf, size_t count, loff_t *ppos)
1140 read_descriptor_t desc;
1142 if ((ssize_t) count < 0)
1144 if (!access_ok(VERIFY_WRITE, buf, count))
1154 do_shmem_file_read(filp, ppos, &desc);
1156 return desc.written;
/*
 * shmem_statfs - report the mount's limits and free counts, taken
 * consistently under stat_lock.
 */
1160 static int shmem_statfs(struct super_block *sb, struct statfs *buf)
1162 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1164 buf->f_type = TMPFS_MAGIC;
1165 buf->f_bsize = PAGE_CACHE_SIZE;
1166 spin_lock(&sbinfo->stat_lock);
1167 buf->f_blocks = sbinfo->max_blocks;
1168 buf->f_bavail = buf->f_bfree = sbinfo->free_blocks;
1169 buf->f_files = sbinfo->max_inodes;
1170 buf->f_ffree = sbinfo->free_inodes;
1171 spin_unlock(&sbinfo->stat_lock);
1172 buf->f_namelen = NAME_MAX;
1177 * Retaining negative dentries for an in-memory filesystem just wastes
1178 * memory and lookup time: arrange for them to be deleted immediately.
1180 static int shmem_delete_dentry(struct dentry *dentry)
1186 * Lookup the data. This is trivial - if the dentry didn't already
1187 * exist, we know it is negative. Set d_op to delete negative dentries.
1189 static struct dentry *shmem_lookup(struct inode *dir, struct dentry *dentry)
1191 static struct dentry_operations shmem_dentry_operations = {
1192 .d_delete = shmem_delete_dentry,
1195 if (dentry->d_name.len > NAME_MAX)
1196 return ERR_PTR(-ENAMETOOLONG);
1197 dentry->d_op = &shmem_dentry_operations;
/* Always add a negative dentry; real data lives only in the dcache. */
1198 d_add(dentry, NULL);
1203 * File creation. Allocate an inode, and we're done..
/*
 * shmem_mknod - create an inode of any type and bind it to @dentry;
 * the workhorse behind create/mkdir/symlink-less node creation.
 */
1205 static int shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, int dev)
1207 struct inode *inode = shmem_get_inode(dir->i_sb, mode, dev);
1208 int error = -ENOSPC;
/* Propagate setgid directories (BSD group semantics). */
1211 if (dir->i_mode & S_ISGID) {
1212 inode->i_gid = dir->i_gid;
1214 inode->i_mode |= S_ISGID;
1216 dir->i_size += BOGO_DIRENT_SIZE;
1217 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1218 d_instantiate(dentry, inode);
1219 dget(dentry); /* Extra count - pin the dentry in core */
/* shmem_mkdir - create a directory via shmem_mknod. */
1225 static int shmem_mkdir(struct inode *dir, struct dentry *dentry, int mode)
1229 if ((error = shmem_mknod(dir, dentry, mode | S_IFDIR, 0)))
/* shmem_create - create a regular file via shmem_mknod. */
1235 static int shmem_create(struct inode *dir, struct dentry *dentry, int mode)
1237 return shmem_mknod(dir, dentry, mode | S_IFREG, 0);
/* shmem_link - hard link: bump counts and instantiate the new dentry. */
1243 static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry)
1245 struct inode *inode = old_dentry->d_inode;
1247 if (S_ISDIR(inode->i_mode))
1250 dir->i_size += BOGO_DIRENT_SIZE;
1251 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1253 atomic_inc(&inode->i_count); /* New dentry reference */
1254 dget(dentry); /* Extra pinning count for the created dentry */
1255 d_instantiate(dentry, inode);
/* shmem_positive - a dentry counts only if hashed with a real inode. */
1259 static inline int shmem_positive(struct dentry *dentry)
1261 return dentry->d_inode && !d_unhashed(dentry);
1265 * Check that a directory is empty (this works
1266 * for regular files too, they'll just always be
1267 * considered empty..).
1269 * Note that an empty directory can still have
1270 * children, they just all have to be negative..
1272 static int shmem_empty(struct dentry *dentry)
1274 struct list_head *list;
1276 spin_lock(&dcache_lock);
1277 list = dentry->d_subdirs.next;
1279 while (list != &dentry->d_subdirs) {
1280 struct dentry *de = list_entry(list, struct dentry, d_child);
1282 if (shmem_positive(de)) {
1283 spin_unlock(&dcache_lock);
1288 spin_unlock(&dcache_lock);
/* shmem_unlink - drop the pinned dentry; the dcache does the rest. */
1292 static int shmem_unlink(struct inode *dir, struct dentry *dentry)
1294 struct inode *inode = dentry->d_inode;
1296 dir->i_size -= BOGO_DIRENT_SIZE;
1297 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1299 dput(dentry); /* Undo the count from "create" - this does all the work */
/* shmem_rmdir - only empty directories may go; then it's an unlink. */
1303 static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
1305 if (!shmem_empty(dentry))
1309 return shmem_unlink(dir, dentry);
1313 * The VFS layer already does all the dentry stuff for rename,
1314 * we just have to decrement the usage count for the target if
1315 * it exists so that the VFS layer correctly free's it when it
/*
 * shmem_rename - inode op rename: the dcache has already moved the
 * dentry; here we only unlink a displaced target, keep the bogus
 * directory sizes balanced, and refresh the timestamps.
 */
1318 static int shmem_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
1320 struct inode *inode = old_dentry->d_inode;
1321 int they_are_dirs = S_ISDIR(inode->i_mode);
/* May not overwrite a non-empty directory. */
1323 if (!shmem_empty(new_dentry))
1326 if (new_dentry->d_inode) {
1327 (void) shmem_unlink(new_dir, new_dentry);
1330 } else if (they_are_dirs) {
1335 old_dir->i_size -= BOGO_DIRENT_SIZE;
1336 new_dir->i_size += BOGO_DIRENT_SIZE;
1337 old_dir->i_ctime = old_dir->i_mtime =
1338 new_dir->i_ctime = new_dir->i_mtime =
1339 inode->i_ctime = CURRENT_TIME;
/*
 * shmem_symlink - create a symlink: short targets are stored inline
 * in the shmem_inode_info itself; longer ones go into page 0 of the
 * inode's data, with the matching op table selected accordingly.
 */
1343 static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *symname)
1347 struct inode *inode;
1348 struct page *page = NULL;
1350 struct shmem_inode_info *info;
1352 len = strlen(symname) + 1;
1353 if (len > PAGE_CACHE_SIZE)
1354 return -ENAMETOOLONG;
1356 inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0);
1360 info = SHMEM_I(inode);
1361 inode->i_size = len-1;
/* Inline form: the target overwrites the (unused) info structure. */
1362 if (len <= sizeof(struct shmem_inode_info)) {
1364 memcpy(info, symname, len);
1365 inode->i_op = &shmem_symlink_inline_operations;
/* Paged form: target lives in data page 0, so join the swap list. */
1367 error = shmem_getpage(inode, 0, &page, SGP_WRITE);
1372 inode->i_op = &shmem_symlink_inode_operations;
1373 spin_lock(&shmem_ilock);
1374 list_add_tail(&info->list, &shmem_inodes);
1375 spin_unlock(&shmem_ilock);
1377 memcpy(kaddr, symname, len);
1380 page_cache_release(page);
1382 if (dir->i_mode & S_ISGID)
1383 inode->i_gid = dir->i_gid;
1384 dir->i_size += BOGO_DIRENT_SIZE;
1385 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
1386 d_instantiate(dentry, inode);
/* Inline-symlink readers: target text is in the info struct itself. */
1391 static int shmem_readlink_inline(struct dentry *dentry, char *buffer, int buflen)
1393 return vfs_readlink(dentry, buffer, buflen, (const char *)SHMEM_I(dentry->d_inode));
1396 static int shmem_follow_link_inline(struct dentry *dentry, struct nameidata *nd)
1398 return vfs_follow_link(nd, (const char *)SHMEM_I(dentry->d_inode));
/* Paged-symlink readers: fetch page 0 and hand its text to the VFS. */
1401 static int shmem_readlink(struct dentry *dentry, char *buffer, int buflen)
1403 struct page *page = NULL;
1404 int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ);
1407 res = vfs_readlink(dentry, buffer, buflen, kmap(page));
1409 mark_page_accessed(page);
1410 page_cache_release(page);
1414 static int shmem_follow_link(struct dentry *dentry, struct nameidata *nd)
1416 struct page *page = NULL;
1417 int res = shmem_getpage(dentry->d_inode, 0, &page, SGP_READ);
1420 res = vfs_follow_link(nd, kmap(page));
1422 mark_page_accessed(page);
1423 page_cache_release(page);
/* Op tables for the two symlink representations. */
1427 static struct inode_operations shmem_symlink_inline_operations = {
1428 readlink: shmem_readlink_inline,
1429 follow_link: shmem_follow_link_inline,
1432 static struct inode_operations shmem_symlink_inode_operations = {
1433 truncate: shmem_truncate,
1434 readlink: shmem_readlink,
1435 follow_link: shmem_follow_link,
/*
 * shmem_parse_options - parse the comma-separated mount option string
 * (size/nr_blocks/nr_inodes/mode/uid/gid).  NULL output pointers mark
 * options that are invalid in this context (e.g. mode on remount).
 * Returns nonzero on a bad option or value.
 */
1438 static int shmem_parse_options(char *options, int *mode, uid_t *uid, gid_t *gid, unsigned long *blocks, unsigned long *inodes)
1440 char *this_char, *value, *rest;
1442 while ((this_char = strsep(&options, ",")) != NULL) {
1445 if ((value = strchr(this_char,'=')) != NULL) {
1449 "tmpfs: No value for mount option '%s'\n",
1454 if (!strcmp(this_char,"size")) {
1455 unsigned long long size;
1456 size = memparse(value,&rest);
/* "size=N%" scales against total RAM. */
1460 size <<= PAGE_SHIFT;
1461 size *= si.totalram;
1467 *blocks = size >> PAGE_CACHE_SHIFT;
1468 } else if (!strcmp(this_char,"nr_blocks")) {
1469 *blocks = memparse(value,&rest);
1472 } else if (!strcmp(this_char,"nr_inodes")) {
1473 *inodes = memparse(value,&rest);
1476 } else if (!strcmp(this_char,"mode")) {
1479 *mode = simple_strtoul(value,&rest,8);
1482 } else if (!strcmp(this_char,"uid")) {
1485 *uid = simple_strtoul(value,&rest,0);
1488 } else if (!strcmp(this_char,"gid")) {
1491 *gid = simple_strtoul(value,&rest,0);
1495 printk(KERN_ERR "tmpfs: Bad mount option %s\n",
1503 printk(KERN_ERR "tmpfs: Bad value '%s' for mount option '%s'\n",
/*
 * shmem_remount_fs - super op: re-parse size limits only and apply
 * them via shmem_set_size (fails if below current usage).
 */
1508 static int shmem_remount_fs(struct super_block *sb, int *flags, char *data)
1510 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1511 unsigned long max_blocks = sbinfo->max_blocks;
1512 unsigned long max_inodes = sbinfo->max_inodes;
1514 if (shmem_parse_options(data, NULL, NULL, NULL, &max_blocks, &max_inodes))
1516 return shmem_set_size(sbinfo, max_blocks, max_inodes);
1519 static int shmem_sync_file(struct file *file, struct dentry *dentry, int datasync)
1525 static struct super_block *shmem_read_super(struct super_block *sb, void *data, int silent)
1527 struct inode *inode;
1528 struct dentry *root;
1529 unsigned long blocks, inodes;
1530 int mode = S_IRWXUGO | S_ISVTX;
1531 uid_t uid = current->fsuid;
1532 gid_t gid = current->fsgid;
1533 struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
1537 * Per default we only allow half of the physical ram per
1541 blocks = inodes = si.totalram / 2;
1544 if (shmem_parse_options(data, &mode, &uid, &gid, &blocks, &inodes))
1548 spin_lock_init(&sbinfo->stat_lock);
1549 sbinfo->max_blocks = blocks;
1550 sbinfo->free_blocks = blocks;
1551 sbinfo->max_inodes = inodes;
1552 sbinfo->free_inodes = inodes;
1553 sb->s_maxbytes = SHMEM_MAX_BYTES;
1554 sb->s_blocksize = PAGE_CACHE_SIZE;
1555 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
1556 sb->s_magic = TMPFS_MAGIC;
1557 sb->s_op = &shmem_ops;
1558 inode = shmem_get_inode(sb, S_IFDIR | mode, 0);
1564 root = d_alloc_root(inode);
1573 static struct address_space_operations shmem_aops = {
1574 removepage: shmem_removepage,
1575 writepage: shmem_writepage,
1577 readpage: shmem_readpage,
1578 prepare_write: shmem_prepare_write,
1579 commit_write: shmem_commit_write,
1583 static struct file_operations shmem_file_operations = {
1586 read: shmem_file_read,
1587 write: shmem_file_write,
1588 fsync: shmem_sync_file,
1592 static struct inode_operations shmem_inode_operations = {
1593 truncate: shmem_truncate,
1594 setattr: shmem_notify_change,
1597 static struct inode_operations shmem_dir_inode_operations = {
1599 create: shmem_create,
1600 lookup: shmem_lookup,
1602 unlink: shmem_unlink,
1603 symlink: shmem_symlink,
1607 rename: shmem_rename,
1611 static struct super_operations shmem_ops = {
1613 statfs: shmem_statfs,
1614 remount_fs: shmem_remount_fs,
1616 delete_inode: shmem_delete_inode,
1617 put_inode: force_delete,
1620 static struct vm_operations_struct shmem_vm_ops = {
1621 nopage: shmem_nopage,
1625 /* type "shm" will be tagged obsolete in 2.5 */
1626 static DECLARE_FSTYPE(shmem_fs_type, "shm", shmem_read_super, FS_LITTER);
1627 static DECLARE_FSTYPE(tmpfs_fs_type, "tmpfs", shmem_read_super, FS_LITTER);
1629 static DECLARE_FSTYPE(tmpfs_fs_type, "tmpfs", shmem_read_super, FS_LITTER|FS_NOMOUNT);
1631 static struct vfsmount *shm_mnt;
1633 static int __init init_tmpfs(void)
1637 error = register_filesystem(&tmpfs_fs_type);
1639 printk(KERN_ERR "Could not register tmpfs\n");
1643 error = register_filesystem(&shmem_fs_type);
1645 printk(KERN_ERR "Could not register shm fs\n");
1648 devfs_mk_dir(NULL, "shm", NULL);
1650 shm_mnt = kern_mount(&tmpfs_fs_type);
1651 if (IS_ERR(shm_mnt)) {
1652 error = PTR_ERR(shm_mnt);
1653 printk(KERN_ERR "Could not kern_mount tmpfs\n");
1657 /* The internal instance should not do size checking */
1658 shmem_set_size(SHMEM_SB(shm_mnt->mnt_sb), ULONG_MAX, ULONG_MAX);
1663 unregister_filesystem(&shmem_fs_type);
1666 unregister_filesystem(&tmpfs_fs_type);
1668 shm_mnt = ERR_PTR(error);
1671 module_init(init_tmpfs)
1674 * shmem_file_setup - get an unlinked file living in tmpfs
 * @name: name for dentry (to be seen in /proc/&lt;pid&gt;/maps)
1677 * @size: size to be set for the file
1680 struct file *shmem_file_setup(char *name, loff_t size)
1684 struct inode *inode;
1685 struct dentry *dentry, *root;
1687 int vm_enough_memory(long pages);
1689 if (IS_ERR(shm_mnt))
1690 return (void *)shm_mnt;
1692 if (size > SHMEM_MAX_BYTES)
1693 return ERR_PTR(-EINVAL);
1695 if (!vm_enough_memory(VM_ACCT(size)))
1696 return ERR_PTR(-ENOMEM);
1699 this.len = strlen(name);
1700 this.hash = 0; /* will go */
1701 root = shm_mnt->mnt_root;
1702 dentry = d_alloc(root, &this);
1704 return ERR_PTR(-ENOMEM);
1707 file = get_empty_filp();
1712 inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0);
1716 d_instantiate(dentry, inode);
1717 inode->i_size = size;
1718 inode->i_nlink = 0; /* It is unlinked */
1719 file->f_vfsmnt = mntget(shm_mnt);
1720 file->f_dentry = dentry;
1721 file->f_op = &shmem_file_operations;
1722 file->f_mode = FMODE_WRITE | FMODE_READ;
1729 return ERR_PTR(error);
1733 * shmem_zero_setup - setup a shared anonymous mapping
 * @vma: the vma to be mmapped, as prepared by do_mmap_pgoff
1737 int shmem_zero_setup(struct vm_area_struct *vma)
1740 loff_t size = vma->vm_end - vma->vm_start;
1742 file = shmem_file_setup("dev/zero", size);
1744 return PTR_ERR(file);
1748 vma->vm_file = file;
1749 vma->vm_ops = &shmem_vm_ops;
1753 EXPORT_SYMBOL(shmem_file_setup);