fs/xfs/xfs_aops.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   4  * All Rights Reserved.
   5  */
   6 #include "xfs.h"
   7 #include "xfs_shared.h"
   8 #include "xfs_format.h"
   9 #include "xfs_log_format.h"
  10 #include "xfs_trans_resv.h"
  11 #include "xfs_mount.h"
  12 #include "xfs_inode.h"
  13 #include "xfs_trans.h"
  14 #include "xfs_inode_item.h"
  15 #include "xfs_alloc.h"
  16 #include "xfs_error.h"
  17 #include "xfs_iomap.h"
  18 #include "xfs_trace.h"
  19 #include "xfs_bmap.h"
  20 #include "xfs_bmap_util.h"
  21 #include "xfs_bmap_btree.h"
  22 #include "xfs_reflink.h"
  23 #include <linux/writeback.h>
  24
/*
 * Writeback context.  The structure is owned by writepages and passed to
 * each individual writepage call so that state can be carried from one
 * page to the next.
 */
struct xfs_writepage_ctx {
        /* extent mapping cached by xfs_map_blocks() */
        struct xfs_bmbt_irec    imap;
        /* XFS_IO_* type that xfs_map_blocks() assigned to the cached mapping */
        unsigned int            io_type;
        /* ioend currently being built by xfs_add_to_ioend(), if any */
        struct xfs_ioend        *ioend;
};
  33
  34 struct block_device *
  35 xfs_find_bdev_for_inode(
  36         struct inode            *inode)
  37 {
  38         struct xfs_inode        *ip = XFS_I(inode);
  39         struct xfs_mount        *mp = ip->i_mount;
  40
  41         if (XFS_IS_REALTIME_INODE(ip))
  42                 return mp->m_rtdev_targp->bt_bdev;
  43         else
  44                 return mp->m_ddev_targp->bt_bdev;
  45 }
  46
  47 struct dax_device *
  48 xfs_find_daxdev_for_inode(
  49         struct inode            *inode)
  50 {
  51         struct xfs_inode        *ip = XFS_I(inode);
  52         struct xfs_mount        *mp = ip->i_mount;
  53
  54         if (XFS_IS_REALTIME_INODE(ip))
  55                 return mp->m_rtdev_targp->bt_daxdev;
  56         else
  57                 return mp->m_ddev_targp->bt_daxdev;
  58 }
  59
  60 static void
  61 xfs_finish_page_writeback(
  62         struct inode            *inode,
  63         struct bio_vec          *bvec,
  64         int                     error)
  65 {
  66         struct iomap_page       *iop = to_iomap_page(bvec->bv_page);
  67
  68         if (error) {
  69                 SetPageError(bvec->bv_page);
  70                 mapping_set_error(inode->i_mapping, -EIO);
  71         }
  72
  73         ASSERT(iop || i_blocksize(inode) == PAGE_SIZE);
  74         ASSERT(!iop || atomic_read(&iop->write_count) > 0);
  75
  76         if (!iop || atomic_dec_and_test(&iop->write_count))
  77                 end_page_writeback(bvec->bv_page);
  78 }
  79
  80 /*
  81  * We're now finished for good with this ioend structure.  Update the page
  82  * state, release holds on bios, and finally free up memory.  Do not use the
  83  * ioend after this.
  84  */
  85 STATIC void
  86 xfs_destroy_ioend(
  87         struct xfs_ioend        *ioend,
  88         int                     error)
  89 {
  90         struct inode            *inode = ioend->io_inode;
  91         struct bio              *bio = &ioend->io_inline_bio;
  92         struct bio              *last = ioend->io_bio, *next;
  93         u64                     start = bio->bi_iter.bi_sector;
  94         bool                    quiet = bio_flagged(bio, BIO_QUIET);
  95
  96         for (bio = &ioend->io_inline_bio; bio; bio = next) {
  97                 struct bio_vec  *bvec;
  98                 int             i;
  99
 100                 /*
 101                  * For the last bio, bi_private points to the ioend, so we
 102                  * need to explicitly end the iteration here.
 103                  */
 104                 if (bio == last)
 105                         next = NULL;
 106                 else
 107                         next = bio->bi_private;
 108
 109                 /* walk each page on bio, ending page IO on them */
 110                 bio_for_each_segment_all(bvec, bio, i)
 111                         xfs_finish_page_writeback(inode, bvec, error);
 112                 bio_put(bio);
 113         }
 114
 115         if (unlikely(error && !quiet)) {
 116                 xfs_err_ratelimited(XFS_I(inode)->i_mount,
 117                         "writeback error on sector %llu", start);
 118         }
 119 }
 120
 121 /*
 122  * Fast and loose check if this write could update the on-disk inode size.
 123  */
 124 static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
 125 {
 126         return ioend->io_offset + ioend->io_size >
 127                 XFS_I(ioend->io_inode)->i_d.di_size;
 128 }
 129
 130 STATIC int
 131 xfs_setfilesize_trans_alloc(
 132         struct xfs_ioend        *ioend)
 133 {
 134         struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
 135         struct xfs_trans        *tp;
 136         int                     error;
 137
 138         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0,
 139                                 XFS_TRANS_NOFS, &tp);
 140         if (error)
 141                 return error;
 142
 143         ioend->io_append_trans = tp;
 144
 145         /*
 146          * We may pass freeze protection with a transaction.  So tell lockdep
 147          * we released it.
 148          */
 149         __sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
 150         /*
 151          * We hand off the transaction to the completion thread now, so
 152          * clear the flag here.
 153          */
 154         current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 155         return 0;
 156 }
 157
 158 /*
 159  * Update on-disk file size now that data has been written to disk.
 160  */
 161 STATIC int
 162 __xfs_setfilesize(
 163         struct xfs_inode        *ip,
 164         struct xfs_trans        *tp,
 165         xfs_off_t               offset,
 166         size_t                  size)
 167 {
 168         xfs_fsize_t             isize;
 169
 170         xfs_ilock(ip, XFS_ILOCK_EXCL);
 171         isize = xfs_new_eof(ip, offset + size);
 172         if (!isize) {
 173                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
 174                 xfs_trans_cancel(tp);
 175                 return 0;
 176         }
 177
 178         trace_xfs_setfilesize(ip, offset, size);
 179
 180         ip->i_d.di_size = isize;
 181         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 182         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 183
 184         return xfs_trans_commit(tp);
 185 }
 186
 187 int
 188 xfs_setfilesize(
 189         struct xfs_inode        *ip,
 190         xfs_off_t               offset,
 191         size_t                  size)
 192 {
 193         struct xfs_mount        *mp = ip->i_mount;
 194         struct xfs_trans        *tp;
 195         int                     error;
 196
 197         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
 198         if (error)
 199                 return error;
 200
 201         return __xfs_setfilesize(ip, tp, offset, size);
 202 }
 203
 204 STATIC int
 205 xfs_setfilesize_ioend(
 206         struct xfs_ioend        *ioend,
 207         int                     error)
 208 {
 209         struct xfs_inode        *ip = XFS_I(ioend->io_inode);
 210         struct xfs_trans        *tp = ioend->io_append_trans;
 211
 212         /*
 213          * The transaction may have been allocated in the I/O submission thread,
 214          * thus we need to mark ourselves as being in a transaction manually.
 215          * Similarly for freeze protection.
 216          */
 217         current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
 218         __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
 219
 220         /* we abort the update if there was an IO error */
 221         if (error) {
 222                 xfs_trans_cancel(tp);
 223                 return error;
 224         }
 225
 226         return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
 227 }
 228
 229 /*
 230  * IO write completion.
 231  */
 232 STATIC void
 233 xfs_end_io(
 234         struct work_struct *work)
 235 {
 236         struct xfs_ioend        *ioend =
 237                 container_of(work, struct xfs_ioend, io_work);
 238         struct xfs_inode        *ip = XFS_I(ioend->io_inode);
 239         xfs_off_t               offset = ioend->io_offset;
 240         size_t                  size = ioend->io_size;
 241         int                     error;
 242
 243         /*
 244          * Just clean up the in-memory strutures if the fs has been shut down.
 245          */
 246         if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 247                 error = -EIO;
 248                 goto done;
 249         }
 250
 251         /*
 252          * Clean up any COW blocks on an I/O error.
 253          */
 254         error = blk_status_to_errno(ioend->io_bio->bi_status);
 255         if (unlikely(error)) {
 256                 switch (ioend->io_type) {
 257                 case XFS_IO_COW:
 258                         xfs_reflink_cancel_cow_range(ip, offset, size, true);
 259                         break;
 260                 }
 261
 262                 goto done;
 263         }
 264
 265         /*
 266          * Success:  commit the COW or unwritten blocks if needed.
 267          */
 268         switch (ioend->io_type) {
 269         case XFS_IO_COW:
 270                 error = xfs_reflink_end_cow(ip, offset, size);
 271                 break;
 272         case XFS_IO_UNWRITTEN:
 273                 /* writeback should never update isize */
 274                 error = xfs_iomap_write_unwritten(ip, offset, size, false);
 275                 break;
 276         default:
 277                 ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
 278                 break;
 279         }
 280
 281 done:
 282         if (ioend->io_append_trans)
 283                 error = xfs_setfilesize_ioend(ioend, error);
 284         xfs_destroy_ioend(ioend, error);
 285 }
 286
 287 STATIC void
 288 xfs_end_bio(
 289         struct bio              *bio)
 290 {
 291         struct xfs_ioend        *ioend = bio->bi_private;
 292         struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
 293
 294         if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
 295                 queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
 296         else if (ioend->io_append_trans)
 297                 queue_work(mp->m_data_workqueue, &ioend->io_work);
 298         else
 299                 xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
 300 }
 301
/*
 * Make sure wpc->imap and wpc->io_type describe the block at @offset.
 *
 * The cached mapping is reused when it still covers @offset and, on reflink
 * inodes, no COW fork extent takes precedence over it.  Otherwise the COW
 * fork and then the data fork are looked up under ILOCK_SHARED.  Holes are
 * reported through XFS_IO_HOLE; COW and delalloc extents are passed to
 * xfs_iomap_write_allocate().
 *
 * Returns 0 on success, -EIO if the filesystem has been shut down, or the
 * error returned by xfs_iomap_write_allocate().
 */
STATIC int
xfs_map_blocks(
        struct xfs_writepage_ctx *wpc,
        struct inode            *inode,
        loff_t                  offset)
{
        struct xfs_inode        *ip = XFS_I(inode);
        struct xfs_mount        *mp = ip->i_mount;
        ssize_t                 count = i_blocksize(inode);
        xfs_fileoff_t           offset_fsb = XFS_B_TO_FSBT(mp, offset), end_fsb;
        struct xfs_bmbt_irec    imap;
        int                     whichfork = XFS_DATA_FORK;
        struct xfs_iext_cursor  icur;
        bool                    imap_valid;
        int                     error = 0;

        /*
         * We have to make sure the cached mapping is within EOF to protect
         * against eofblocks trimming on file release leaving us with a stale
         * mapping. Otherwise, a page for a subsequent file extending buffered
         * write could get picked up by this writeback cycle and written to the
         * wrong blocks.
         *
         * Note that what we really want here is a generic mapping invalidation
         * mechanism to protect us from arbitrary extent modifying contexts, not
         * just eofblocks.
         */
        xfs_trim_extent_eof(&wpc->imap, ip);

        /*
         * COW fork blocks can overlap data fork blocks even if the blocks
         * aren't shared.  COW I/O always takes precedence, so we must always
         * check for overlap on reflink inodes unless the mapping is already a
         * COW one.
         */
        imap_valid = offset_fsb >= wpc->imap.br_startoff &&
                     offset_fsb < wpc->imap.br_startoff + wpc->imap.br_blockcount;
        if (imap_valid &&
            (!xfs_is_reflink_inode(ip) || wpc->io_type == XFS_IO_COW))
                return 0;

        if (XFS_FORCED_SHUTDOWN(mp))
                return -EIO;

        /*
         * If we don't have a valid map, now it's time to get a new one for this
         * offset.  This will convert delayed allocations (including COW ones)
         * into real extents.  If we return without a valid map, it means we
         * landed in a hole and we skip the block.
         */
        xfs_ilock(ip, XFS_ILOCK_SHARED);
        ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
               (ip->i_df.if_flags & XFS_IFEXTENTS));
        ASSERT(offset <= mp->m_super->s_maxbytes);

        /* clamp the range so that it does not extend past s_maxbytes */
        if (offset > mp->m_super->s_maxbytes - count)
                count = mp->m_super->s_maxbytes - offset;
        end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);

        /*
         * Check if this offset is covered by a COW extent, and if so use it
         * directly instead of looking up anything in the data fork.
         */
        if (xfs_is_reflink_inode(ip) &&
            xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap) &&
            imap.br_startoff <= offset_fsb) {
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                /*
                 * Truncate can race with writeback since writeback doesn't
                 * take the iolock and truncate decreases the file size before
                 * it starts truncating the pages between new_size and old_size.
                 * Therefore, we can end up in the situation where writeback
                 * gets a CoW fork mapping but the truncate makes the mapping
                 * invalid and we end up in here trying to get a new mapping.
                 * Bail out here so that we simply never get a valid mapping
                 * and so we drop the write altogether.  The page truncation
                 * will kill the contents anyway.
                 */
                if (offset > i_size_read(inode)) {
                        wpc->io_type = XFS_IO_HOLE;
                        return 0;
                }
                whichfork = XFS_COW_FORK;
                wpc->io_type = XFS_IO_COW;
                goto allocate_blocks;
        }

        /*
         * Map valid and no COW extent in the way?  We're done.
         */
        if (imap_valid) {
                xfs_iunlock(ip, XFS_ILOCK_SHARED);
                return 0;
        }

        /*
         * If we don't have a valid map, now it's time to get a new one for this
         * offset.  This will convert delayed allocations (including COW ones)
         * into real extents.
         */
        if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &icur, &imap))
                imap.br_startoff = end_fsb;     /* fake a hole past EOF */
        xfs_iunlock(ip, XFS_ILOCK_SHARED);

        if (imap.br_startoff > offset_fsb) {
                /* landed in a hole or beyond EOF */
                imap.br_blockcount = imap.br_startoff - offset_fsb;
                imap.br_startoff = offset_fsb;
                imap.br_startblock = HOLESTARTBLOCK;
                wpc->io_type = XFS_IO_HOLE;
        } else {
                if (isnullstartblock(imap.br_startblock)) {
                        /* got a delalloc extent */
                        wpc->io_type = XFS_IO_DELALLOC;
                        goto allocate_blocks;
                }

                if (imap.br_state == XFS_EXT_UNWRITTEN)
                        wpc->io_type = XFS_IO_UNWRITTEN;
                else
                        wpc->io_type = XFS_IO_OVERWRITE;
        }

        /* cache the mapping that was found so later blocks can reuse it */
        wpc->imap = imap;
        trace_xfs_map_blocks_found(ip, offset, count, wpc->io_type, &imap);
        return 0;
allocate_blocks:
        /* COW and delalloc extents go through the allocator */
        error = xfs_iomap_write_allocate(ip, whichfork, offset, &imap);
        if (error)
                return error;
        wpc->imap = imap;
        trace_xfs_map_blocks_alloc(ip, offset, count, wpc->io_type, &imap);
        return 0;
}
 436
 437 /*
 438  * Submit the bio for an ioend. We are passed an ioend with a bio attached to
 439  * it, and we submit that bio. The ioend may be used for multiple bio
 440  * submissions, so we only want to allocate an append transaction for the ioend
 441  * once. In the case of multiple bio submission, each bio will take an IO
 442  * reference to the ioend to ensure that the ioend completion is only done once
 443  * all bios have been submitted and the ioend is really done.
 444  *
 445  * If @fail is non-zero, it means that we have a situation where some part of
 446  * the submission process has failed after we have marked paged for writeback
 447  * and unlocked them. In this situation, we need to fail the bio and ioend
 448  * rather than submit it to IO. This typically only happens on a filesystem
 449  * shutdown.
 450  */
 451 STATIC int
 452 xfs_submit_ioend(
 453         struct writeback_control *wbc,
 454         struct xfs_ioend        *ioend,
 455         int                     status)
 456 {
 457         /* Convert CoW extents to regular */
 458         if (!status && ioend->io_type == XFS_IO_COW) {
 459                 /*
 460                  * Yuk. This can do memory allocation, but is not a
 461                  * transactional operation so everything is done in GFP_KERNEL
 462                  * context. That can deadlock, because we hold pages in
 463                  * writeback state and GFP_KERNEL allocations can block on them.
 464                  * Hence we must operate in nofs conditions here.
 465                  */
 466                 unsigned nofs_flag;
 467
 468                 nofs_flag = memalloc_nofs_save();
 469                 status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
 470                                 ioend->io_offset, ioend->io_size);
 471                 memalloc_nofs_restore(nofs_flag);
 472         }
 473
 474         /* Reserve log space if we might write beyond the on-disk inode size. */
 475         if (!status &&
 476             ioend->io_type != XFS_IO_UNWRITTEN &&
 477             xfs_ioend_is_append(ioend) &&
 478             !ioend->io_append_trans)
 479                 status = xfs_setfilesize_trans_alloc(ioend);
 480
 481         ioend->io_bio->bi_private = ioend;
 482         ioend->io_bio->bi_end_io = xfs_end_bio;
 483         ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
 484
 485         /*
 486          * If we are failing the IO now, just mark the ioend with an
 487          * error and finish it. This will run IO completion immediately
 488          * as there is only one reference to the ioend at this point in
 489          * time.
 490          */
 491         if (status) {
 492                 ioend->io_bio->bi_status = errno_to_blk_status(status);
 493                 bio_endio(ioend->io_bio);
 494                 return status;
 495         }
 496
 497         ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
 498         submit_bio(ioend->io_bio);
 499         return 0;
 500 }
 501
 502 static struct xfs_ioend *
 503 xfs_alloc_ioend(
 504         struct inode            *inode,
 505         unsigned int            type,
 506         xfs_off_t               offset,
 507         struct block_device     *bdev,
 508         sector_t                sector)
 509 {
 510         struct xfs_ioend        *ioend;
 511         struct bio              *bio;
 512
 513         bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, &xfs_ioend_bioset);
 514         bio_set_dev(bio, bdev);
 515         bio->bi_iter.bi_sector = sector;
 516
 517         ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
 518         INIT_LIST_HEAD(&ioend->io_list);
 519         ioend->io_type = type;
 520         ioend->io_inode = inode;
 521         ioend->io_size = 0;
 522         ioend->io_offset = offset;
 523         INIT_WORK(&ioend->io_work, xfs_end_io);
 524         ioend->io_append_trans = NULL;
 525         ioend->io_bio = bio;
 526         return ioend;
 527 }
 528
 529 /*
 530  * Allocate a new bio, and chain the old bio to the new one.
 531  *
 532  * Note that we have to do perform the chaining in this unintuitive order
 533  * so that the bi_private linkage is set up in the right direction for the
 534  * traversal in xfs_destroy_ioend().
 535  */
 536 static void
 537 xfs_chain_bio(
 538         struct xfs_ioend        *ioend,
 539         struct writeback_control *wbc,
 540         struct block_device     *bdev,
 541         sector_t                sector)
 542 {
 543         struct bio *new;
 544
 545         new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
 546         bio_set_dev(new, bdev);
 547         new->bi_iter.bi_sector = sector;
 548         bio_chain(ioend->io_bio, new);
 549         bio_get(ioend->io_bio);         /* for xfs_destroy_ioend */
 550         ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc);
 551         ioend->io_bio->bi_write_hint = ioend->io_inode->i_write_hint;
 552         submit_bio(ioend->io_bio);
 553         ioend->io_bio = new;
 554 }
 555
 556 /*
 557  * Test to see if we have an existing ioend structure that we could append to
 558  * first, otherwise finish off the current ioend and start another.
 559  */
 560 STATIC void
 561 xfs_add_to_ioend(
 562         struct inode            *inode,
 563         xfs_off_t               offset,
 564         struct page             *page,
 565         struct iomap_page       *iop,
 566         struct xfs_writepage_ctx *wpc,
 567         struct writeback_control *wbc,
 568         struct list_head        *iolist)
 569 {
 570         struct xfs_inode        *ip = XFS_I(inode);
 571         struct xfs_mount        *mp = ip->i_mount;
 572         struct block_device     *bdev = xfs_find_bdev_for_inode(inode);
 573         unsigned                len = i_blocksize(inode);
 574         unsigned                poff = offset & (PAGE_SIZE - 1);
 575         sector_t                sector;
 576
 577         sector = xfs_fsb_to_db(ip, wpc->imap.br_startblock) +
 578                 ((offset - XFS_FSB_TO_B(mp, wpc->imap.br_startoff)) >> 9);
 579
 580         if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
 581             sector != bio_end_sector(wpc->ioend->io_bio) ||
 582             offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
 583                 if (wpc->ioend)
 584                         list_add(&wpc->ioend->io_list, iolist);
 585                 wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset,
 586                                 bdev, sector);
 587         }
 588
 589         if (!__bio_try_merge_page(wpc->ioend->io_bio, page, len, poff)) {
 590                 if (iop)
 591                         atomic_inc(&iop->write_count);
 592                 if (bio_full(wpc->ioend->io_bio))
 593                         xfs_chain_bio(wpc->ioend, wbc, bdev, sector);
 594                 __bio_add_page(wpc->ioend->io_bio, page, len, poff);
 595         }
 596
 597         wpc->ioend->io_size += len;
 598 }
 599
 600 STATIC void
 601 xfs_vm_invalidatepage(
 602         struct page             *page,
 603         unsigned int            offset,
 604         unsigned int            length)
 605 {
 606         trace_xfs_invalidatepage(page->mapping->host, page, offset, length);
 607         iomap_invalidatepage(page, offset, length);
 608 }
 609
 610 /*
 611  * If the page has delalloc blocks on it, we need to punch them out before we
 612  * invalidate the page.  If we don't, we leave a stale delalloc mapping on the
 613  * inode that can trip up a later direct I/O read operation on the same region.
 614  *
 615  * We prevent this by truncating away the delalloc regions on the page.  Because
 616  * they are delalloc, we can do this without needing a transaction. Indeed - if
 617  * we get ENOSPC errors, we have to be able to do this truncation without a
 618  * transaction as there is no space left for block reservation (typically why we
 619  * see a ENOSPC in writeback).
 620  */
 621 STATIC void
 622 xfs_aops_discard_page(
 623         struct page             *page)
 624 {
 625         struct inode            *inode = page->mapping->host;
 626         struct xfs_inode        *ip = XFS_I(inode);
 627         struct xfs_mount        *mp = ip->i_mount;
 628         loff_t                  offset = page_offset(page);
 629         xfs_fileoff_t           start_fsb = XFS_B_TO_FSBT(mp, offset);
 630         int                     error;
 631
 632         if (XFS_FORCED_SHUTDOWN(mp))
 633                 goto out_invalidate;
 634
 635         xfs_alert(mp,
 636                 "page discard on page "PTR_FMT", inode 0x%llx, offset %llu.",
 637                         page, ip->i_ino, offset);
 638
 639         error = xfs_bmap_punch_delalloc_range(ip, start_fsb,
 640                         PAGE_SIZE / i_blocksize(inode));
 641         if (error && !XFS_FORCED_SHUTDOWN(mp))
 642                 xfs_alert(mp, "page discard unable to remove delalloc mapping.");
 643 out_invalidate:
 644         xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
 645 }
 646
/*
 * Map the blocks of one locked page and add them to ioends.
 *
 * We implement an immediate ioend submission policy here to avoid needing to
 * chain multiple ioends and hence nest mempool allocations which can violate
 * forward progress guarantees we need to provide. The current ioend we are
 * adding blocks to is cached on the writepage context, and if the new block
 * does not append to the cached ioend it will create a new ioend and cache that
 * instead.
 *
 * If a new ioend is created and cached, the old ioend is returned and queued
 * locally for submission once the entire page is processed or an error has been
 * detected.  While ioends are submitted immediately after they are completed,
 * batching optimisations are provided by higher level block plugging.
 *
 * At the end of a writeback pass, there will be a cached ioend remaining on the
 * writepage context that the caller will need to submit.
 *
 * @end_offset bounds the walk: blocks at or beyond it are not written.  The
 * page is unlocked before returning.  Returns 0 or a negative errno, which is
 * also recorded in the page's mapping.
 */
static int
xfs_writepage_map(
        struct xfs_writepage_ctx *wpc,
        struct writeback_control *wbc,
        struct inode            *inode,
        struct page             *page,
        uint64_t                end_offset)
{
        LIST_HEAD(submit_list);
        struct iomap_page       *iop = to_iomap_page(page);
        unsigned                len = i_blocksize(inode);
        struct xfs_ioend        *ioend, *next;
        uint64_t                file_offset;    /* file offset of page */
        int                     error = 0, count = 0, i;

        ASSERT(iop || i_blocksize(inode) == PAGE_SIZE);
        ASSERT(!iop || atomic_read(&iop->write_count) == 0);

        /*
         * Walk through the page to find areas to write back. If we run off the
         * end of the current map or find the current map invalid, grab a new
         * one.
         */
        for (i = 0, file_offset = page_offset(page);
             i < (PAGE_SIZE >> inode->i_blkbits) && file_offset < end_offset;
             i++, file_offset += len) {
                /* skip blocks that are not marked uptodate */
                if (iop && !test_bit(i, iop->uptodate))
                        continue;

                error = xfs_map_blocks(wpc, inode, file_offset);
                if (error)
                        break;
                /* holes have nothing to write */
                if (wpc->io_type == XFS_IO_HOLE)
                        continue;
                xfs_add_to_ioend(inode, file_offset, page, iop, wpc, wbc,
                                 &submit_list);
                count++;
        }

        ASSERT(wpc->ioend || list_empty(&submit_list));
        ASSERT(PageLocked(page));
        ASSERT(!PageWriteback(page));

        /*
         * On error, we have to fail the ioend here because we may have set
         * pages under writeback, we have to make sure we run IO completion to
         * mark the error state of the IO appropriately, so we can't cancel the
         * ioend directly here.  That means we have to mark this page as under
         * writeback if we included any blocks from it in the ioend chain so
         * that completion treats it correctly.
         *
         * If we didn't include the page in the ioend, then on error we can
         * simply discard and unlock it as there are no other users of the page
         * now.  The caller will still need to trigger submission of outstanding
         * ioends on the writepage context so they are treated correctly on
         * error.
         */
        if (unlikely(error)) {
                if (!count) {
                        xfs_aops_discard_page(page);
                        ClearPageUptodate(page);
                        unlock_page(page);
                        goto done;
                }

                /*
                 * If the page was not fully cleaned, we need to ensure that the
                 * higher layers come back to it correctly.  That means we need
                 * to keep the page dirty, and for WB_SYNC_ALL writeback we need
                 * to ensure the PAGECACHE_TAG_TOWRITE index mark is not removed
                 * so another attempt to write this page in this writeback sweep
                 * will be made.
                 */
                set_page_writeback_keepwrite(page);
        } else {
                clear_page_dirty_for_io(page);
                set_page_writeback(page);
        }

        unlock_page(page);

        /*
         * Preserve the original error if there was one, otherwise catch
         * submission errors here and propagate into subsequent ioend
         * submissions.
         */
        list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
                int error2;

                list_del_init(&ioend->io_list);
                error2 = xfs_submit_ioend(wbc, ioend, error);
                if (error2 && !error)
                        error = error2;
        }

        /*
         * We can end up here with no error and nothing to write only if we race
         * with a partial page truncate on a sub-page block sized filesystem.
         */
        if (!count)
                end_page_writeback(page);
done:
        mapping_set_error(page->mapping, error);
        return error;
}
 768
 769 /*
 770  * Write out a dirty page.
 771  *
 772  * For delalloc space on the page we need to allocate space and flush it.
 773  * For unwritten space on the page we need to start the conversion to
 774  * regular allocated space.
 775  */
 776 STATIC int
 777 xfs_do_writepage(
 778         struct page             *page,
 779         struct writeback_control *wbc,
 780         void                    *data)
 781 {
 782         struct xfs_writepage_ctx *wpc = data;
 783         struct inode            *inode = page->mapping->host;
 784         loff_t                  offset;
 785         uint64_t              end_offset;
 786         pgoff_t                 end_index;
 787
 788         trace_xfs_writepage(inode, page, 0, 0);
 789
 790         /*
 791          * Refuse to write the page out if we are called from reclaim context.
 792          *
 793          * This avoids stack overflows when called from deeply used stacks in
 794          * random callers for direct reclaim or memcg reclaim.  We explicitly
 795          * allow reclaim from kswapd as the stack usage there is relatively low.
 796          *
 797          * This should never happen except in the case of a VM regression so
 798          * warn about it.
 799          */
 800         if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
 801                         PF_MEMALLOC))
 802                 goto redirty;
 803
 804         /*
 805          * Given that we do not allow direct reclaim to call us, we should
 806          * never be called while in a filesystem transaction.
 807          */
 808         if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
 809                 goto redirty;
 810
 811         /*
 812          * Is this page beyond the end of the file?
 813          *
 814          * The page index is less than the end_index, adjust the end_offset
 815          * to the highest offset that this page should represent.
 816          * -----------------------------------------------------
 817          * |                    file mapping           | <EOF> |
 818          * -----------------------------------------------------
 819          * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
 820          * ^--------------------------------^----------|--------
 821          * |     desired writeback range    |      see else    |
 822          * ---------------------------------^------------------|
 823          */
 824         offset = i_size_read(inode);
 825         end_index = offset >> PAGE_SHIFT;
 826         if (page->index < end_index)
 827                 end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT;
 828         else {
 829                 /*
 830                  * Check whether the page to write out is beyond or straddles
 831                  * i_size or not.
 832                  * -------------------------------------------------------
 833                  * |            file mapping                    | <EOF>  |
 834                  * -------------------------------------------------------
 835                  * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
 836                  * ^--------------------------------^-----------|---------
 837                  * |                                |      Straddles     |
 838                  * ---------------------------------^-----------|--------|
 839                  */
 840                 unsigned offset_into_page = offset & (PAGE_SIZE - 1);
 841
 842                 /*
 843                  * Skip the page if it is fully outside i_size, e.g. due to a
 844                  * truncate operation that is in progress. We must redirty the
 845                  * page so that reclaim stops reclaiming it. Otherwise
 846                  * xfs_vm_releasepage() is called on it and gets confused.
 847                  *
 848                  * Note that the end_index is unsigned long, it would overflow
 849                  * if the given offset is greater than 16TB on 32-bit system
 850                  * and if we do check the page is fully outside i_size or not
 851                  * via "if (page->index >= end_index + 1)" as "end_index + 1"
 852                  * will be evaluated to 0.  Hence this page will be redirtied
 853                  * and be written out repeatedly which would result in an
 854                  * infinite loop, the user program that perform this operation
 855                  * will hang.  Instead, we can verify this situation by checking
 856                  * if the page to write is totally beyond the i_size or if it's
 857                  * offset is just equal to the EOF.
 858                  */
 859                 if (page->index > end_index ||
 860                     (page->index == end_index && offset_into_page == 0))
 861                         goto redirty;
 862
 863                 /*
 864                  * The page straddles i_size.  It must be zeroed out on each
 865                  * and every writepage invocation because it may be mmapped.
 866                  * "A file is mapped in multiples of the page size.  For a file
 867                  * that is not a multiple of the page size, the remaining
 868                  * memory is zeroed when mapped, and writes to that region are
 869                  * not written out to the file."
 870                  */
 871                 zero_user_segment(page, offset_into_page, PAGE_SIZE);
 872
 873                 /* Adjust the end_offset to the end of file */
 874                 end_offset = offset;
 875         }
 876
 877         return xfs_writepage_map(wpc, wbc, inode, page, end_offset);
 878
 879 redirty:
 880         redirty_page_for_writepage(wbc, page);
 881         unlock_page(page);
 882         return 0;
 883 }
 884
 885 STATIC int
 886 xfs_vm_writepage(
 887         struct page             *page,
 888         struct writeback_control *wbc)
 889 {
 890         struct xfs_writepage_ctx wpc = {
 891                 .io_type = XFS_IO_INVALID,
 892         };
 893         int                     ret;
 894
 895         ret = xfs_do_writepage(page, wbc, &wpc);
 896         if (wpc.ioend)
 897                 ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
 898         return ret;
 899 }
 900
 901 STATIC int
 902 xfs_vm_writepages(
 903         struct address_space    *mapping,
 904         struct writeback_control *wbc)
 905 {
 906         struct xfs_writepage_ctx wpc = {
 907                 .io_type = XFS_IO_INVALID,
 908         };
 909         int                     ret;
 910
 911         xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
 912         ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
 913         if (wpc.ioend)
 914                 ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
 915         return ret;
 916 }
 917
 918 STATIC int
 919 xfs_dax_writepages(
 920         struct address_space    *mapping,
 921         struct writeback_control *wbc)
 922 {
 923         xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
 924         return dax_writeback_mapping_range(mapping,
 925                         xfs_find_bdev_for_inode(mapping->host), wbc);
 926 }
 927
 928 STATIC int
 929 xfs_vm_releasepage(
 930         struct page             *page,
 931         gfp_t                   gfp_mask)
 932 {
 933         trace_xfs_releasepage(page->mapping->host, page, 0, 0);
 934         return iomap_releasepage(page, gfp_mask);
 935 }
 936
 937 STATIC sector_t
 938 xfs_vm_bmap(
 939         struct address_space    *mapping,
 940         sector_t                block)
 941 {
 942         struct xfs_inode        *ip = XFS_I(mapping->host);
 943
 944         trace_xfs_vm_bmap(ip);
 945
 946         /*
 947          * The swap code (ab-)uses ->bmap to get a block mapping and then
 948          * bypasses the file system for actual I/O.  We really can't allow
 949          * that on reflinks inodes, so we have to skip out here.  And yes,
 950          * 0 is the magic code for a bmap error.
 951          *
 952          * Since we don't pass back blockdev info, we can't return bmap
 953          * information for rt files either.
 954          */
 955         if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip))
 956                 return 0;
 957         return iomap_bmap(mapping, block, &xfs_iomap_ops);
 958 }
 959
 960 STATIC int
 961 xfs_vm_readpage(
 962         struct file             *unused,
 963         struct page             *page)
 964 {
 965         trace_xfs_vm_readpage(page->mapping->host, 1);
 966         return iomap_readpage(page, &xfs_iomap_ops);
 967 }
 968
 969 STATIC int
 970 xfs_vm_readpages(
 971         struct file             *unused,
 972         struct address_space    *mapping,
 973         struct list_head        *pages,
 974         unsigned                nr_pages)
 975 {
 976         trace_xfs_vm_readpages(mapping->host, nr_pages);
 977         return iomap_readpages(mapping, pages, nr_pages, &xfs_iomap_ops);
 978 }
 979
 980 static int
 981 xfs_iomap_swapfile_activate(
 982         struct swap_info_struct         *sis,
 983         struct file                     *swap_file,
 984         sector_t                        *span)
 985 {
 986         sis->bdev = xfs_find_bdev_for_inode(file_inode(swap_file));
 987         return iomap_swapfile_activate(sis, swap_file, span, &xfs_iomap_ops);
 988 }
 989
 990 const struct address_space_operations xfs_address_space_operations = {
 991         .readpage               = xfs_vm_readpage,
 992         .readpages              = xfs_vm_readpages,
 993         .writepage              = xfs_vm_writepage,
 994         .writepages             = xfs_vm_writepages,
 995         .set_page_dirty         = iomap_set_page_dirty,
 996         .releasepage            = xfs_vm_releasepage,
 997         .invalidatepage         = xfs_vm_invalidatepage,
 998         .bmap                   = xfs_vm_bmap,
 999         .direct_IO              = noop_direct_IO,
1000         .migratepage            = iomap_migrate_page,
1001         .is_partially_uptodate  = iomap_is_partially_uptodate,
1002         .error_remove_page      = generic_error_remove_page,
1003         .swap_activate          = xfs_iomap_swapfile_activate,
1004 };
1005
1006 const struct address_space_operations xfs_dax_aops = {
1007         .writepages             = xfs_dax_writepages,
1008         .direct_IO              = noop_direct_IO,
1009         .set_page_dirty         = noop_set_page_dirty,
1010         .invalidatepage         = noop_invalidatepage,
1011         .swap_activate          = xfs_iomap_swapfile_activate,
1012 };