Merge tag 'xfs-for-linus-4.2-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git...
[uclinux-h8/linux.git] fs/xfs/xfs_file.c
index 7c62fca..874507d 100644
@@ -80,14 +80,15 @@ xfs_rw_ilock_demote(
 }
 
 /*
- *     xfs_iozero
+ * xfs_iozero clears the specified range supplied via the page cache (except in
+ * the DAX case). Writes through the page cache will allocate blocks over holes,
+ * though the callers usually map the holes first and avoid them. If a block is
+ * not completely zeroed, then it will be read from disk before being partially
+ * zeroed.
  *
- *     xfs_iozero clears the specified range of buffer supplied,
- *     and marks all the affected blocks as valid and modified.  If
- *     an affected block is not allocated, it will be allocated.  If
- *     an affected block is not completely overwritten, and is not
- *     valid before the operation, it will be read from disk before
- *     being partially zeroed.
+ * In the DAX case, we can just directly write to the underlying pages. This
+ * will not allocate blocks, but will avoid holes and unwritten extents and so
+ * not do unnecessary work.
  */
 int
 xfs_iozero(
@@ -97,7 +98,8 @@ xfs_iozero(
 {
        struct page             *page;
        struct address_space    *mapping;
-       int                     status;
+       int                     status = 0;
+
 
        mapping = VFS_I(ip)->i_mapping;
        do {
@@ -109,20 +111,27 @@ xfs_iozero(
                if (bytes > count)
                        bytes = count;
 
-               status = pagecache_write_begin(NULL, mapping, pos, bytes,
-                                       AOP_FLAG_UNINTERRUPTIBLE,
-                                       &page, &fsdata);
-               if (status)
-                       break;
+               if (IS_DAX(VFS_I(ip))) {
+                       status = dax_zero_page_range(VFS_I(ip), pos, bytes,
+                                                    xfs_get_blocks_direct);
+                       if (status)
+                               break;
+               } else {
+                       status = pagecache_write_begin(NULL, mapping, pos, bytes,
+                                               AOP_FLAG_UNINTERRUPTIBLE,
+                                               &page, &fsdata);
+                       if (status)
+                               break;
 
-               zero_user(page, offset, bytes);
+                       zero_user(page, offset, bytes);
 
-               status = pagecache_write_end(NULL, mapping, pos, bytes, bytes,
-                                       page, fsdata);
-               WARN_ON(status <= 0); /* can't return less than zero! */
+                       status = pagecache_write_end(NULL, mapping, pos, bytes,
+                                               bytes, page, fsdata);
+                       WARN_ON(status <= 0); /* can't return less than zero! */
+                       status = 0;
+               }
                pos += bytes;
                count -= bytes;
-               status = 0;
        } while (count);
 
        return status;
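
The rewritten comment at the top of this function describes the two
zeroing strategies: buffered zeroing through
pagecache_write_begin()/zero_user()/pagecache_write_end(), and direct
zeroing of the backing store via dax_zero_page_range(). As a hedged
sketch of how a caller might use xfs_iozero() -- the helper name and
the EOF-extension scenario are illustrative, not part of this patch:

/*
 * Hypothetical caller: zero from the old EOF to the end of its
 * filesystem block so that a file size extension does not expose
 * stale data. xfs_iozero() picks the DAX or page cache path as above.
 */
static int
example_zero_eof_tail(
        struct xfs_inode        *ip,
        xfs_off_t               old_size)
{
        struct xfs_mount        *mp = ip->i_mount;
        xfs_off_t               block_end;

        block_end = roundup_64(old_size, mp->m_sb.sb_blocksize);
        if (block_end == old_size)
                return 0;       /* old EOF already block aligned */
        return xfs_iozero(ip, old_size, block_end - old_size);
}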
@@ -139,7 +148,7 @@ xfs_update_prealloc_flags(
        tp = xfs_trans_alloc(ip->i_mount, XFS_TRANS_WRITEID);
        error = xfs_trans_reserve(tp, &M_RES(ip->i_mount)->tr_writeid, 0, 0);
        if (error) {
-               xfs_trans_cancel(tp, 0);
+               xfs_trans_cancel(tp);
                return error;
        }
 
@@ -161,7 +170,7 @@ xfs_update_prealloc_flags(
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
        if (flags & XFS_PREALLOC_SYNC)
                xfs_trans_set_sync(tp);
-       return xfs_trans_commit(tp, 0);
+       return xfs_trans_commit(tp);
 }
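
The two transaction hunks here are mechanical fallout from the
simplified transaction API: xfs_trans_cancel() and xfs_trans_commit()
no longer take a flags argument. A minimal sketch of the resulting
lifecycle, condensed from xfs_update_prealloc_flags() (the helper name
is illustrative and the modification step is trimmed):

static int
example_trans_lifecycle(
        struct xfs_mount        *mp,
        struct xfs_inode        *ip)
{
        struct xfs_trans        *tp;
        int                     error;

        tp = xfs_trans_alloc(mp, XFS_TRANS_WRITEID);
        error = xfs_trans_reserve(tp, &M_RES(mp)->tr_writeid, 0, 0);
        if (error) {
                xfs_trans_cancel(tp);   /* was xfs_trans_cancel(tp, 0) */
                return error;
        }
        xfs_ilock(ip, XFS_ILOCK_EXCL);
        xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
        xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
        return xfs_trans_commit(tp);    /* was xfs_trans_commit(tp, 0) */
}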
 
 /*
@@ -285,7 +294,7 @@ xfs_file_read_iter(
        if (file->f_mode & FMODE_NOCMTIME)
                ioflags |= XFS_IO_INVIS;
 
-       if (unlikely(ioflags & XFS_IO_ISDIRECT)) {
+       if ((ioflags & XFS_IO_ISDIRECT) && !IS_DAX(inode)) {
                xfs_buftarg_t   *target =
                        XFS_IS_REALTIME_INODE(ip) ?
                                mp->m_rtdev_targp : mp->m_ddev_targp;
@@ -379,7 +388,11 @@ xfs_file_splice_read(
 
        trace_xfs_file_splice_read(ip, count, *ppos, ioflags);
 
-       ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
+       /* for dax, we need to avoid the page cache */
+       if (IS_DAX(VFS_I(ip)))
+               ret = default_file_splice_read(infilp, ppos, pipe, count, flags);
+       else
+               ret = generic_file_splice_read(infilp, ppos, pipe, count, flags);
        if (ret > 0)
                XFS_STATS_ADD(xs_read_bytes, ret);
 
@@ -673,7 +686,7 @@ xfs_file_dio_aio_write(
                                        mp->m_rtdev_targp : mp->m_ddev_targp;
 
        /* DIO must be aligned to device logical sector size */
-       if ((pos | count) & target->bt_logical_sectormask)
+       if (!IS_DAX(inode) && ((pos | count) & target->bt_logical_sectormask))
                return -EINVAL;
 
        /* "unaligned" here means not aligned to a filesystem block */
@@ -759,8 +772,11 @@ xfs_file_dio_aio_write(
 out:
        xfs_rw_iunlock(ip, iolock);
 
-       /* No fallback to buffered IO on errors for XFS. */
-       ASSERT(ret < 0 || ret == count);
+       /*
+        * No fallback to buffered IO on errors for XFS. DAX can result in
+        * partial writes, but direct IO will either complete fully or fail.
+        */
+       ASSERT(ret < 0 || ret == count || IS_DAX(VFS_I(ip)));
        return ret;
 }
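
The updated assertion documents a user-visible difference: classic
direct IO completes fully or fails, while a DAX write may stop short.
A hedged userspace sketch of the defensive pattern a caller might
adopt (nothing here is mandated by the patch):

#include <unistd.h>

static ssize_t write_all(int fd, const char *buf, size_t count)
{
        size_t done = 0;

        while (done < count) {
                ssize_t ret = write(fd, buf + done, count - done);
                if (ret <= 0)
                        return ret;     /* error or no forward progress */
                done += ret;
        }
        return done;
}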
 
@@ -843,7 +859,7 @@ xfs_file_write_iter(
        if (XFS_FORCED_SHUTDOWN(ip->i_mount))
                return -EIO;
 
-       if (unlikely(iocb->ki_flags & IOCB_DIRECT))
+       if ((iocb->ki_flags & IOCB_DIRECT) || IS_DAX(inode))
                ret = xfs_file_dio_aio_write(iocb, from);
        else
                ret = xfs_file_buffered_aio_write(iocb, from);
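
Note the dispatch change: on a DAX inode every write now takes
xfs_file_dio_aio_write(), whether or not the file was opened with
O_DIRECT. A hedged userspace illustration (the mount point and file
name are hypothetical):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        char    buf[4096] = { 0 };
        int     fd;

        /*
         * No O_DIRECT, but on DAX-mounted XFS this still routes to
         * the direct IO path rather than the page cache.
         */
        fd = open("/mnt/dax/file", O_WRONLY | O_CREAT, 0644);
        if (fd < 0)
                return 1;
        write(fd, buf, sizeof(buf));
        close(fd);
        return 0;
}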
@@ -1064,17 +1080,6 @@ xfs_file_readdir(
        return xfs_readdir(ip, ctx, bufsize);
 }
 
-STATIC int
-xfs_file_mmap(
-       struct file     *filp,
-       struct vm_area_struct *vma)
-{
-       vma->vm_ops = &xfs_file_vm_ops;
-
-       file_accessed(filp);
-       return 0;
-}
-
 /*
  * This type is designed to indicate the type of offset we would like
  * to search from page cache for xfs_seek_hole_data().
@@ -1455,48 +1460,83 @@ xfs_file_llseek(
  * ordering of:
  *
  * mmap_sem (MM)
- *   i_mmap_lock (XFS - truncate serialisation)
- *     page_lock (MM)
- *       i_lock (XFS - extent map serialisation)
+ *   sb_start_pagefault(vfs, freeze)
+ *     i_mmap_lock (XFS - truncate serialisation)
+ *       page_lock (MM)
+ *         i_lock (XFS - extent map serialisation)
+ */
+
+/*
+ * mmap()d file has taken write protection fault and is being made writable. We
+ * can set the page state up correctly for a writable page, which means we can
+ * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
+ * mapping.
  */
 STATIC int
-xfs_filemap_fault(
+xfs_filemap_page_mkwrite(
        struct vm_area_struct   *vma,
        struct vm_fault         *vmf)
 {
-       struct xfs_inode        *ip = XFS_I(vma->vm_file->f_mapping->host);
-       int                     error;
+       struct inode            *inode = file_inode(vma->vm_file);
+       int                     ret;
 
-       trace_xfs_filemap_fault(ip);
+       trace_xfs_filemap_page_mkwrite(XFS_I(inode));
 
-       xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
-       error = filemap_fault(vma, vmf);
-       xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
+       sb_start_pagefault(inode->i_sb);
+       file_update_time(vma->vm_file);
+       xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
 
-       return error;
+       if (IS_DAX(inode)) {
+               ret = __dax_mkwrite(vma, vmf, xfs_get_blocks_direct,
+                                   xfs_end_io_dax_write);
+       } else {
+               ret = __block_page_mkwrite(vma, vmf, xfs_get_blocks);
+               ret = block_page_mkwrite_return(ret);
+       }
+
+       xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
+       sb_end_pagefault(inode->i_sb);
+
+       return ret;
 }
 
-/*
- * mmap()d file has taken write protection fault and is being made writable. We
- * can set the page state up correctly for a writable page, which means we can
- * do correct delalloc accounting (ENOSPC checking!) and unwritten extent
- * mapping.
- */
 STATIC int
-xfs_filemap_page_mkwrite(
+xfs_filemap_fault(
        struct vm_area_struct   *vma,
        struct vm_fault         *vmf)
 {
-       struct xfs_inode        *ip = XFS_I(vma->vm_file->f_mapping->host);
-       int                     error;
+       struct xfs_inode        *ip = XFS_I(file_inode(vma->vm_file));
+       int                     ret;
+
+       trace_xfs_filemap_fault(ip);
 
-       trace_xfs_filemap_page_mkwrite(ip);
+       /* DAX can shortcut the normal fault path on write faults! */
+       if ((vmf->flags & FAULT_FLAG_WRITE) && IS_DAX(VFS_I(ip)))
+               return xfs_filemap_page_mkwrite(vma, vmf);
 
        xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
-       error = block_page_mkwrite(vma, vmf, xfs_get_blocks);
+       ret = filemap_fault(vma, vmf);
        xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
 
-       return error;
+       return ret;
+}
+
+static const struct vm_operations_struct xfs_file_vm_ops = {
+       .fault          = xfs_filemap_fault,
+       .map_pages      = filemap_map_pages,
+       .page_mkwrite   = xfs_filemap_page_mkwrite,
+};
+
+STATIC int
+xfs_file_mmap(
+       struct file     *filp,
+       struct vm_area_struct *vma)
+{
+       file_accessed(filp);
+       vma->vm_ops = &xfs_file_vm_ops;
+       if (IS_DAX(file_inode(filp)))
+               vma->vm_flags |= VM_MIXEDMAP;
+       return 0;
 }
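
Three points worth noting in this hunk. The extended ordering comment
now includes freeze protection, and the new xfs_filemap_page_mkwrite()
takes its locks in exactly that order. Write faults on DAX inodes
shortcut xfs_filemap_fault() straight into the mkwrite handler, since a
DAX write fault must set the mapping up writable in a single pass. And
xfs_file_mmap() sets VM_MIXEDMAP for DAX files because DAX faults
install raw pfns with no struct page behind them, so the MM core must
be told the vma can contain both page and pfn mappings. The nesting,
condensed from the write fault path above (illustrative pseudocode
only):

        /* mmap_sem is already held by the MM fault path */
        sb_start_pagefault(inode->i_sb);                /* freeze protection */
        xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);   /* vs truncate */
        /*   page lock taken inside __block_page_mkwrite() */
        /*     i_lock taken inside xfs_get_blocks() */
        xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
        sb_end_pagefault(inode->i_sb);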
 
 const struct file_operations xfs_file_operations = {
@@ -1527,9 +1567,3 @@ const struct file_operations xfs_dir_file_operations = {
 #endif
        .fsync          = xfs_dir_fsync,
 };
-
-static const struct vm_operations_struct xfs_file_vm_ops = {
-       .fault          = xfs_filemap_fault,
-       .map_pages      = filemap_map_pages,
-       .page_mkwrite   = xfs_filemap_page_mkwrite,
-};