ext4: remove unused header files
[uclinux-h8/linux.git] / fs / ext4 / file.c
/*
 *  linux/fs/ext4/file.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/file.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  ext4 fs regular file handling primitives
 *
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *      (jj@sunsite.ms.mff.cuni.cz)
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/mount.h>
#include <linux/path.h>
#include <linux/aio.h>
#include <linux/quotaops.h>
#include <linux/pagevec.h>
#include "ext4.h"
#include "ext4_jbd2.h"
#include "xattr.h"
#include "acl.h"

/*
 * Called when an inode is released. Note that this is different
 * from ext4_file_open: open gets called at every open, but release
 * gets called only when /all/ the files are closed.
 */
static int ext4_release_file(struct inode *inode, struct file *filp)
{
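        /*
         * EXT4_STATE_DA_ALLOC_CLOSE asks us to allocate any outstanding
         * delayed-allocation blocks on this close (it is set, for example,
         * when a file appears to be rewritten via truncate), so the new
         * contents reach disk promptly.
         */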
        if (ext4_test_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE)) {
                ext4_alloc_da_blocks(inode);
                ext4_clear_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
        }
        /* if we are the last writer on the inode, drop the block reservation */
        if ((filp->f_mode & FMODE_WRITE) &&
                        (atomic_read(&inode->i_writecount) == 1) &&
                        !EXT4_I(inode)->i_reserved_data_blocks)
        {
                down_write(&EXT4_I(inode)->i_data_sem);
                ext4_discard_preallocations(inode);
                up_write(&EXT4_I(inode)->i_data_sem);
        }
        if (is_dx(inode) && filp->private_data)
                ext4_htree_free_dir_info(filp->private_data);

        return 0;
}

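/*
 * Wait until all pending conversions of unwritten extents on this inode
 * have completed (EXT4_I(inode)->i_unwritten drops to zero).
 */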
static void ext4_unwritten_wait(struct inode *inode)
{
        wait_queue_head_t *wq = ext4_ioend_wq(inode);

        wait_event(*wq, (atomic_read(&EXT4_I(inode)->i_unwritten) == 0));
}

/*
 * This tests whether the IO in question is block-aligned or not.
 * Ext4 utilizes unwritten extents when hole-filling during direct IO, and they
 * are converted to written only after the IO is complete.  Until they are
 * mapped, these blocks appear as holes, so dio_zero_block() will assume that
 * it needs to zero out portions of the start and/or end block.  If 2 AIO
 * threads are at work on the same unwritten block, they must be synchronized
 * or one thread will zero the other's data, causing corruption.
 */
static int
ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos)
{
        struct super_block *sb = inode->i_sb;
        int blockmask = sb->s_blocksize - 1;

        if (pos >= i_size_read(inode))
                return 0;

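        /*
         * The write is unaligned if either the starting offset or the
         * iov_iter's alignment is not a multiple of the block size.
         */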
        if ((pos | iov_iter_alignment(from)) & blockmask)
                return 1;

        return 0;
}

static ssize_t
ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
{
        struct file *file = iocb->ki_filp;
        struct inode *inode = file_inode(iocb->ki_filp);
        struct mutex *aio_mutex = NULL;
        struct blk_plug plug;
        int o_direct = io_is_direct(file);
        int overwrite = 0;
        size_t length = iov_iter_count(from);
        ssize_t ret;
        loff_t pos = iocb->ki_pos;

        /*
         * Unaligned direct AIO must be serialized; see the comment above.
         * In the case of O_APPEND, assume that we must always serialize.
         */
        if (o_direct &&
            ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
            !is_sync_kiocb(iocb) &&
            (file->f_flags & O_APPEND ||
             ext4_unaligned_aio(inode, from, pos))) {
                aio_mutex = ext4_aio_mutex(inode);
                mutex_lock(aio_mutex);
                ext4_unwritten_wait(inode);
        }

        mutex_lock(&inode->i_mutex);
        if (file->f_flags & O_APPEND)
                iocb->ki_pos = pos = i_size_read(inode);

        /*
         * If we have encountered a bitmap-format file, the size limit
         * is smaller than s_maxbytes, which is for extent-mapped files.
         */
        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
                struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);

                if ((pos > sbi->s_bitmap_maxbytes) ||
                    (pos == sbi->s_bitmap_maxbytes && length > 0)) {
                        mutex_unlock(&inode->i_mutex);
                        ret = -EFBIG;
                        goto errout;
                }

                if (pos + length > sbi->s_bitmap_maxbytes)
                        iov_iter_truncate(from, sbi->s_bitmap_maxbytes - pos);
        }

        iocb->private = &overwrite;
        if (o_direct) {
                blk_start_plug(&plug);

                /* check whether we do a DIO overwrite or not */
                if (ext4_should_dioread_nolock(inode) && !aio_mutex &&
                    !file->f_mapping->nrpages && pos + length <= i_size_read(inode)) {
                        struct ext4_map_blocks map;
                        unsigned int blkbits = inode->i_blkbits;
                        int err, len;

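                        /* Map the logical block range covered by this write. */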
                        map.m_lblk = pos >> blkbits;
                        map.m_len = (EXT4_BLOCK_ALIGN(pos + length, blkbits) >> blkbits)
                                - map.m_lblk;
                        len = map.m_len;

                        err = ext4_map_blocks(NULL, inode, &map, 0);
                        /*
                         * 'err == len' means that all of the blocks have been
                         * preallocated, whether or not they are initialized.
                         * To exclude unwritten extents we also need to check
                         * m_flags.  There are two conditions that indicate an
                         * initialized extent: 1) if we hit the extent cache,
                         * the EXT4_MAP_MAPPED flag is returned; 2) if we do a
                         * real lookup, no flags are returned.  So we should
                         * check both conditions.
                         */
                        if (err == len && (map.m_flags & EXT4_MAP_MAPPED))
                                overwrite = 1;
                }
        }

        ret = __generic_file_write_iter(iocb, from);
        mutex_unlock(&inode->i_mutex);

        if (ret > 0) {
                ssize_t err;

                err = generic_write_sync(file, iocb->ki_pos - ret, ret);
                if (err < 0)
                        ret = err;
        }
        if (o_direct)
                blk_finish_plug(&plug);

errout:
        if (aio_mutex)
                mutex_unlock(aio_mutex);
        return ret;
}

#ifdef CONFIG_FS_DAX
static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        return dax_fault(vma, vmf, ext4_get_block);
                                        /* Is this the right get_block? */
}

static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        return dax_mkwrite(vma, vmf, ext4_get_block);
}

static const struct vm_operations_struct ext4_dax_vm_ops = {
        .fault          = ext4_dax_fault,
        .page_mkwrite   = ext4_dax_mkwrite,
};
#else
#define ext4_dax_vm_ops ext4_file_vm_ops
#endif

static const struct vm_operations_struct ext4_file_vm_ops = {
        .fault          = filemap_fault,
        .map_pages      = filemap_map_pages,
        .page_mkwrite   = ext4_page_mkwrite,
};

static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
        file_accessed(file);
        if (IS_DAX(file_inode(file))) {
                vma->vm_ops = &ext4_dax_vm_ops;
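                /*
                 * DAX mappings insert raw pfns rather than struct pages,
                 * so the VMA must allow mixed page/pfn mappings.
                 */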
                vma->vm_flags |= VM_MIXEDMAP;
        } else {
                vma->vm_ops = &ext4_file_vm_ops;
        }
        return 0;
}

static int ext4_file_open(struct inode *inode, struct file *filp)
{
        struct super_block *sb = inode->i_sb;
        struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
        struct vfsmount *mnt = filp->f_path.mnt;
        struct path path;
        char buf[64], *cp;

        if (unlikely(!(sbi->s_mount_flags & EXT4_MF_MNTDIR_SAMPLED) &&
                     !(sb->s_flags & MS_RDONLY))) {
                sbi->s_mount_flags |= EXT4_MF_MNTDIR_SAMPLED;
                /*
                 * Sample where the filesystem has been mounted and
                 * store it in the superblock for sysadmin convenience
                 * when trying to sort through large numbers of block
                 * devices or filesystem images.
                 */
                memset(buf, 0, sizeof(buf));
                path.mnt = mnt;
                path.dentry = mnt->mnt_root;
                cp = d_path(&path, buf, sizeof(buf));
                if (!IS_ERR(cp)) {
                        handle_t *handle;
                        int err;

                        handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
                        if (IS_ERR(handle))
                                return PTR_ERR(handle);
                        BUFFER_TRACE(sbi->s_sbh, "get_write_access");
                        err = ext4_journal_get_write_access(handle, sbi->s_sbh);
                        if (err) {
                                ext4_journal_stop(handle);
                                return err;
                        }
                        strlcpy(sbi->s_es->s_last_mounted, cp,
                                sizeof(sbi->s_es->s_last_mounted));
                        ext4_handle_dirty_super(handle, sb);
                        ext4_journal_stop(handle);
                }
        }
        /*
         * Set up the jbd2_inode if we are opening the inode for
         * writing and the journal is present
         */
        if (filp->f_mode & FMODE_WRITE) {
                int ret = ext4_inode_attach_jinode(inode);
                if (ret < 0)
                        return ret;
        }
        return dquot_file_open(inode, filp);
}

/*
 * Here we use ext4_map_blocks() to get a block mapping for an extent-based
 * file rather than ext4_ext_walk_space() because we can handle
 * SEEK_DATA/SEEK_HOLE for both block-mapped and extent-mapped files in the
 * same function.  Once the extent status tree has been fully implemented,
 * it will track all extent status for a file and we can directly use it to
 * retrieve the offset for SEEK_DATA/SEEK_HOLE.
 */

/*
 * When we retrieve the offset for SEEK_DATA/SEEK_HOLE, we need to look up
 * the page cache to check whether there is any data in [startoff, endoff],
 * because if this range contains an unwritten extent, we treat the extent
 * as data or as a hole depending on whether the page cache has data for it.
 */
static int ext4_find_unwritten_pgoff(struct inode *inode,
                                     int whence,
                                     struct ext4_map_blocks *map,
                                     loff_t *offset)
{
        struct pagevec pvec;
        unsigned int blkbits;
        pgoff_t index;
        pgoff_t end;
        loff_t endoff;
        loff_t startoff;
        loff_t lastoff;
        int found = 0;

        blkbits = inode->i_sb->s_blocksize_bits;
        startoff = *offset;
        lastoff = startoff;
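        /* Byte offset just past the extent described by @map. */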
        endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;

        index = startoff >> PAGE_CACHE_SHIFT;
        end = endoff >> PAGE_CACHE_SHIFT;

        pagevec_init(&pvec, 0);
        do {
                int i, num;
                unsigned long nr_pages;

                num = min_t(pgoff_t, end - index, PAGEVEC_SIZE);
                nr_pages = pagevec_lookup(&pvec, inode->i_mapping, index,
                                          (pgoff_t)num);
                if (nr_pages == 0) {
                        if (whence == SEEK_DATA)
                                break;

                        BUG_ON(whence != SEEK_HOLE);
                        /*
                         * If this is the first pass through the loop and the
                         * offset is not beyond the end offset, there is a
                         * hole at this offset.
                         */
                        if (lastoff == startoff || lastoff < endoff)
                                found = 1;
                        break;
                }

                /*
                 * If this is the first pass through the loop and the offset
                 * is smaller than the first page offset, there is a hole at
                 * this offset.
                 */
                if (lastoff == startoff && whence == SEEK_HOLE &&
                    lastoff < page_offset(pvec.pages[0])) {
                        found = 1;
                        break;
                }

                for (i = 0; i < nr_pages; i++) {
                        struct page *page = pvec.pages[i];
                        struct buffer_head *bh, *head;

                        /*
                         * If the current offset is still within the given
                         * range but this page lies beyond it, we have found
                         * a hole at the current offset.
                         */
                        if (lastoff < endoff && whence == SEEK_HOLE &&
                            page->index > end) {
                                found = 1;
                                *offset = lastoff;
                                goto out;
                        }

                        lock_page(page);

                        if (unlikely(page->mapping != inode->i_mapping)) {
                                unlock_page(page);
                                continue;
                        }

                        if (!page_has_buffers(page)) {
                                unlock_page(page);
                                continue;
                        }

                        if (page_has_buffers(page)) {
                                lastoff = page_offset(page);
                                bh = head = page_buffers(page);
                                do {
                                        if (buffer_uptodate(bh) ||
                                            buffer_unwritten(bh)) {
                                                if (whence == SEEK_DATA)
                                                        found = 1;
                                        } else {
                                                if (whence == SEEK_HOLE)
                                                        found = 1;
                                        }
                                        if (found) {
                                                *offset = max_t(loff_t,
                                                        startoff, lastoff);
                                                unlock_page(page);
                                                goto out;
                                        }
                                        lastoff += bh->b_size;
                                        bh = bh->b_this_page;
                                } while (bh != head);
                        }

                        lastoff = page_offset(page) + PAGE_SIZE;
                        unlock_page(page);
                }

                /*
                 * Fewer pages were returned than we asked for, so there
                 * must be a hole after the last page we saw.
                 */
                if (nr_pages < num && whence == SEEK_HOLE) {
                        found = 1;
                        *offset = lastoff;
                        break;
                }

                index = pvec.pages[i - 1]->index + 1;
                pagevec_release(&pvec);
        } while (index <= end);

out:
        pagevec_release(&pvec);
        return found;
}

/*
 * ext4_seek_data() retrieves the offset for SEEK_DATA.
 */
static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
{
        struct inode *inode = file->f_mapping->host;
        struct ext4_map_blocks map;
        struct extent_status es;
        ext4_lblk_t start, last, end;
        loff_t dataoff, isize;
        int blkbits;
        int ret = 0;

        mutex_lock(&inode->i_mutex);

        isize = i_size_read(inode);
        if (offset >= isize) {
                mutex_unlock(&inode->i_mutex);
                return -ENXIO;
        }

        blkbits = inode->i_sb->s_blocksize_bits;
        start = offset >> blkbits;
        last = start;
        end = isize >> blkbits;
        dataoff = offset;

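        /*
         * Scan forward one block at a time from 'last' until we find a
         * block that should be reported as data.
         */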
        do {
                map.m_lblk = last;
                map.m_len = end - last + 1;
                ret = ext4_map_blocks(NULL, inode, &map, 0);
                if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
                        if (last != start)
                                dataoff = (loff_t)last << blkbits;
                        break;
                }

                /*
                 * If there is a delayed extent at this offset, treat it
                 * as data.
                 */
                ext4_es_find_delayed_extent_range(inode, last, last, &es);
                if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
                        if (last != start)
                                dataoff = (loff_t)last << blkbits;
                        break;
                }

                /*
                 * If there is an unwritten extent at this offset, it is
                 * treated as data or as a hole depending on whether the
                 * page cache has data for it.
                 */
                if (map.m_flags & EXT4_MAP_UNWRITTEN) {
                        int unwritten;
                        unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
                                                              &map, &dataoff);
                        if (unwritten)
                                break;
                }

                last++;
                dataoff = (loff_t)last << blkbits;
        } while (last <= end);

        mutex_unlock(&inode->i_mutex);

        if (dataoff > isize)
                return -ENXIO;

        return vfs_setpos(file, dataoff, maxsize);
}

/*
 * ext4_seek_hole() retrieves the offset for SEEK_HOLE.
 */
static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
{
        struct inode *inode = file->f_mapping->host;
        struct ext4_map_blocks map;
        struct extent_status es;
        ext4_lblk_t start, last, end;
        loff_t holeoff, isize;
        int blkbits;
        int ret = 0;

        mutex_lock(&inode->i_mutex);

        isize = i_size_read(inode);
        if (offset >= isize) {
                mutex_unlock(&inode->i_mutex);
                return -ENXIO;
        }

        blkbits = inode->i_sb->s_blocksize_bits;
        start = offset >> blkbits;
        last = start;
        end = isize >> blkbits;
        holeoff = offset;

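        /*
         * Scan forward from 'last', skipping over ranges that contain
         * data, until we find a hole.
         */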
        do {
                map.m_lblk = last;
                map.m_len = end - last + 1;
                ret = ext4_map_blocks(NULL, inode, &map, 0);
                if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
                        last += ret;
                        holeoff = (loff_t)last << blkbits;
                        continue;
                }

                /*
                 * If there is a delayed extent at this offset, skip this
                 * extent (it counts as data).
                 */
                ext4_es_find_delayed_extent_range(inode, last, last, &es);
                if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
                        last = es.es_lblk + es.es_len;
                        holeoff = (loff_t)last << blkbits;
                        continue;
                }

                /*
                 * If there is an unwritten extent at this offset, it is
                 * treated as data or as a hole depending on whether the
                 * page cache has data for it.
                 */
                if (map.m_flags & EXT4_MAP_UNWRITTEN) {
                        int unwritten;
                        unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
                                                              &map, &holeoff);
                        if (!unwritten) {
                                last += ret;
                                holeoff = (loff_t)last << blkbits;
                                continue;
                        }
                }

                /* found a hole */
                break;
        } while (last <= end);

        mutex_unlock(&inode->i_mutex);

        if (holeoff > isize)
                holeoff = isize;

        return vfs_setpos(file, holeoff, maxsize);
}

/*
 * ext4_llseek() handles both block-mapped and extent-mapped maxbytes values
 * by calling generic_file_llseek_size() with the appropriate maxbytes
 * value for each.
 */
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
{
        struct inode *inode = file->f_mapping->host;
        loff_t maxbytes;

        if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
                maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
        else
                maxbytes = inode->i_sb->s_maxbytes;

        switch (whence) {
        case SEEK_SET:
        case SEEK_CUR:
        case SEEK_END:
                return generic_file_llseek_size(file, offset, whence,
                                                maxbytes, i_size_read(inode));
        case SEEK_DATA:
                return ext4_seek_data(file, offset, maxbytes);
        case SEEK_HOLE:
                return ext4_seek_hole(file, offset, maxbytes);
        }

        return -EINVAL;
}

const struct file_operations ext4_file_operations = {
        .llseek         = ext4_llseek,
        .read           = new_sync_read,
        .write          = new_sync_write,
        .read_iter      = generic_file_read_iter,
        .write_iter     = ext4_file_write_iter,
        .unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = ext4_compat_ioctl,
#endif
        .mmap           = ext4_file_mmap,
        .open           = ext4_file_open,
        .release        = ext4_release_file,
        .fsync          = ext4_sync_file,
        .splice_read    = generic_file_splice_read,
        .splice_write   = iter_file_splice_write,
        .fallocate      = ext4_fallocate,
};

#ifdef CONFIG_FS_DAX
const struct file_operations ext4_dax_file_operations = {
        .llseek         = ext4_llseek,
        .read           = new_sync_read,
        .write          = new_sync_write,
        .read_iter      = generic_file_read_iter,
        .write_iter     = ext4_file_write_iter,
        .unlocked_ioctl = ext4_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = ext4_compat_ioctl,
#endif
        .mmap           = ext4_file_mmap,
        .open           = ext4_file_open,
        .release        = ext4_release_file,
        .fsync          = ext4_sync_file,
        /* Splice not yet supported with DAX */
        .fallocate      = ext4_fallocate,
};
#endif

const struct inode_operations ext4_file_inode_operations = {
        .setattr        = ext4_setattr,
        .getattr        = ext4_getattr,
        .setxattr       = generic_setxattr,
        .getxattr       = generic_getxattr,
        .listxattr      = ext4_listxattr,
        .removexattr    = generic_removexattr,
        .get_acl        = ext4_get_acl,
        .set_acl        = ext4_set_acl,
        .fiemap         = ext4_fiemap,
};