block/io.c

   1 /*
   2  * Block layer I/O functions
   3  *
   4  * Copyright (c) 2003 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "qemu/osdep.h"
  26 #include "trace.h"
  27 #include "sysemu/block-backend.h"
  28 #include "block/blockjob.h"
  29 #include "block/blockjob_int.h"
  30 #include "block/block_int.h"
  31 #include "qemu/cutils.h"
  32 #include "qapi/error.h"
  33 #include "qemu/error-report.h"
  34
/* Placeholder return value: RwCo.ret is initialised to this and polled until
 * the request coroutine overwrites it with the real result. */
#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */

/* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */
#define MAX_BOUNCE_BUFFER (32768 << BDRV_SECTOR_BITS)

/* Forward declaration; the definition is not part of this section. */
static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
    int64_t offset, int bytes, BdrvRequestFlags flags);
  42
  43 void bdrv_parent_drained_begin(BlockDriverState *bs)
  44 {
  45     BdrvChild *c, *next;
  46
  47     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
  48         if (c->role->drained_begin) {
  49             c->role->drained_begin(c);
  50         }
  51     }
  52 }
  53
  54 void bdrv_parent_drained_end(BlockDriverState *bs)
  55 {
  56     BdrvChild *c, *next;
  57
  58     QLIST_FOREACH_SAFE(c, &bs->parents, next_parent, next) {
  59         if (c->role->drained_end) {
  60             c->role->drained_end(c);
  61         }
  62     }
  63 }
  64
  65 static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
  66 {
  67     dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
  68     dst->max_transfer = MIN_NON_ZERO(dst->max_transfer, src->max_transfer);
  69     dst->opt_mem_alignment = MAX(dst->opt_mem_alignment,
  70                                  src->opt_mem_alignment);
  71     dst->min_mem_alignment = MAX(dst->min_mem_alignment,
  72                                  src->min_mem_alignment);
  73     dst->max_iov = MIN_NON_ZERO(dst->max_iov, src->max_iov);
  74 }
  75
  76 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
  77 {
  78     BlockDriver *drv = bs->drv;
  79     Error *local_err = NULL;
  80
  81     memset(&bs->bl, 0, sizeof(bs->bl));
  82
  83     if (!drv) {
  84         return;
  85     }
  86
  87     /* Default alignment based on whether driver has byte interface */
  88     bs->bl.request_alignment = drv->bdrv_co_preadv ? 1 : 512;
  89
  90     /* Take some limits from the children as a default */
  91     if (bs->file) {
  92         bdrv_refresh_limits(bs->file->bs, &local_err);
  93         if (local_err) {
  94             error_propagate(errp, local_err);
  95             return;
  96         }
  97         bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
  98     } else {
  99         bs->bl.min_mem_alignment = 512;
 100         bs->bl.opt_mem_alignment = getpagesize();
 101
 102         /* Safe default since most protocols use readv()/writev()/etc */
 103         bs->bl.max_iov = IOV_MAX;
 104     }
 105
 106     if (bs->backing) {
 107         bdrv_refresh_limits(bs->backing->bs, &local_err);
 108         if (local_err) {
 109             error_propagate(errp, local_err);
 110             return;
 111         }
 112         bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
 113     }
 114
 115     /* Then let the driver override it */
 116     if (drv->bdrv_refresh_limits) {
 117         drv->bdrv_refresh_limits(bs, errp);
 118     }
 119 }
 120
 121 /**
 122  * The copy-on-read flag is actually a reference count so multiple users may
 123  * use the feature without worrying about clobbering its previous state.
 124  * Copy-on-read stays enabled until all users have called to disable it.
 125  */
 126 void bdrv_enable_copy_on_read(BlockDriverState *bs)
 127 {
 128     atomic_inc(&bs->copy_on_read);
 129 }
 130
 131 void bdrv_disable_copy_on_read(BlockDriverState *bs)
 132 {
 133     int old = atomic_fetch_dec(&bs->copy_on_read);
 134     assert(old >= 1);
 135 }
 136
 137 /* Check if any requests are in-flight (including throttled requests) */
 138 bool bdrv_requests_pending(BlockDriverState *bs)
 139 {
 140     BdrvChild *child;
 141
 142     if (atomic_read(&bs->in_flight)) {
 143         return true;
 144     }
 145
 146     QLIST_FOREACH(child, &bs->children, next) {
 147         if (bdrv_requests_pending(child->bs)) {
 148             return true;
 149         }
 150     }
 151
 152     return false;
 153 }
 154
/* State shared between a drain requester and the coroutine or bottom half
 * that performs the drain step on its behalf. */
typedef struct {
    Coroutine *co;          /* coroutine involved in the hand-over */
    BlockDriverState *bs;   /* node being drained */
    bool done;              /* set once the drain step has completed */
    bool begin;             /* true: drain begin; false: drain end */
} BdrvCoDrainData;
 161
 162 static void coroutine_fn bdrv_drain_invoke_entry(void *opaque)
 163 {
 164     BdrvCoDrainData *data = opaque;
 165     BlockDriverState *bs = data->bs;
 166
 167     if (data->begin) {
 168         bs->drv->bdrv_co_drain_begin(bs);
 169     } else {
 170         bs->drv->bdrv_co_drain_end(bs);
 171     }
 172
 173     /* Set data->done before reading bs->wakeup.  */
 174     atomic_mb_set(&data->done, true);
 175     bdrv_wakeup(bs);
 176 }
 177
 178 /* Recursively call BlockDriver.bdrv_co_drain_begin/end callbacks */
 179 static void bdrv_drain_invoke(BlockDriverState *bs, bool begin)
 180 {
 181     BdrvChild *child, *tmp;
 182     BdrvCoDrainData data = { .bs = bs, .done = false, .begin = begin};
 183
 184     if (!bs->drv || (begin && !bs->drv->bdrv_co_drain_begin) ||
 185             (!begin && !bs->drv->bdrv_co_drain_end)) {
 186         return;
 187     }
 188
 189     data.co = qemu_coroutine_create(bdrv_drain_invoke_entry, &data);
 190     bdrv_coroutine_enter(bs, data.co);
 191     BDRV_POLL_WHILE(bs, !data.done);
 192
 193     QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
 194         bdrv_drain_invoke(child->bs, begin);
 195     }
 196 }
 197
 198 static bool bdrv_drain_recurse(BlockDriverState *bs)
 199 {
 200     BdrvChild *child, *tmp;
 201     bool waited;
 202
 203     /* Wait for drained requests to finish */
 204     waited = BDRV_POLL_WHILE(bs, atomic_read(&bs->in_flight) > 0);
 205
 206     QLIST_FOREACH_SAFE(child, &bs->children, next, tmp) {
 207         BlockDriverState *bs = child->bs;
 208         bool in_main_loop =
 209             qemu_get_current_aio_context() == qemu_get_aio_context();
 210         assert(bs->refcnt > 0);
 211         if (in_main_loop) {
 212             /* In case the recursive bdrv_drain_recurse processes a
 213              * block_job_defer_to_main_loop BH and modifies the graph,
 214              * let's hold a reference to bs until we are done.
 215              *
 216              * IOThread doesn't have such a BH, and it is not safe to call
 217              * bdrv_unref without BQL, so skip doing it there.
 218              */
 219             bdrv_ref(bs);
 220         }
 221         waited |= bdrv_drain_recurse(bs);
 222         if (in_main_loop) {
 223             bdrv_unref(bs);
 224         }
 225     }
 226
 227     return waited;
 228 }
 229
 230 static void bdrv_co_drain_bh_cb(void *opaque)
 231 {
 232     BdrvCoDrainData *data = opaque;
 233     Coroutine *co = data->co;
 234     BlockDriverState *bs = data->bs;
 235
 236     bdrv_dec_in_flight(bs);
 237     if (data->begin) {
 238         bdrv_drained_begin(bs);
 239     } else {
 240         bdrv_drained_end(bs);
 241     }
 242
 243     data->done = true;
 244     aio_co_wake(co);
 245 }
 246
 247 static void coroutine_fn bdrv_co_yield_to_drain(BlockDriverState *bs,
 248                                                 bool begin)
 249 {
 250     BdrvCoDrainData data;
 251
 252     /* Calling bdrv_drain() from a BH ensures the current coroutine yields and
 253      * other coroutines run if they were queued from
 254      * qemu_co_queue_run_restart(). */
 255
 256     assert(qemu_in_coroutine());
 257     data = (BdrvCoDrainData) {
 258         .co = qemu_coroutine_self(),
 259         .bs = bs,
 260         .done = false,
 261         .begin = begin,
 262     };
 263     bdrv_inc_in_flight(bs);
 264     aio_bh_schedule_oneshot(bdrv_get_aio_context(bs),
 265                             bdrv_co_drain_bh_cb, &data);
 266
 267     qemu_coroutine_yield();
 268     /* If we are resumed from some other event (such as an aio completion or a
 269      * timer callback), it is a bug in the caller that should be fixed. */
 270     assert(data.done);
 271 }
 272
 273 void bdrv_drained_begin(BlockDriverState *bs)
 274 {
 275     if (qemu_in_coroutine()) {
 276         bdrv_co_yield_to_drain(bs, true);
 277         return;
 278     }
 279
 280     /* Stop things in parent-to-child order */
 281     if (atomic_fetch_inc(&bs->quiesce_counter) == 0) {
 282         aio_disable_external(bdrv_get_aio_context(bs));
 283         bdrv_parent_drained_begin(bs);
 284     }
 285
 286     bdrv_drain_invoke(bs, true);
 287     bdrv_drain_recurse(bs);
 288 }
 289
 290 void bdrv_drained_end(BlockDriverState *bs)
 291 {
 292     if (qemu_in_coroutine()) {
 293         bdrv_co_yield_to_drain(bs, false);
 294         return;
 295     }
 296     assert(bs->quiesce_counter > 0);
 297     if (atomic_fetch_dec(&bs->quiesce_counter) > 1) {
 298         return;
 299     }
 300
 301     /* Re-enable things in child-to-parent order */
 302     bdrv_drain_invoke(bs, false);
 303     bdrv_parent_drained_end(bs);
 304     aio_enable_external(bdrv_get_aio_context(bs));
 305 }
 306
 307 /*
 308  * Wait for pending requests to complete on a single BlockDriverState subtree,
 309  * and suspend block driver's internal I/O until next request arrives.
 310  *
 311  * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
 312  * AioContext.
 313  *
 314  * Only this BlockDriverState's AioContext is run, so in-flight requests must
 315  * not depend on events in other AioContexts.  In that case, use
 316  * bdrv_drain_all() instead.
 317  */
 318 void coroutine_fn bdrv_co_drain(BlockDriverState *bs)
 319 {
 320     assert(qemu_in_coroutine());
 321     bdrv_drained_begin(bs);
 322     bdrv_drained_end(bs);
 323 }
 324
 325 void bdrv_drain(BlockDriverState *bs)
 326 {
 327     bdrv_drained_begin(bs);
 328     bdrv_drained_end(bs);
 329 }
 330
 331 /*
 332  * Wait for pending requests to complete across all BlockDriverStates
 333  *
 334  * This function does not flush data to disk, use bdrv_flush_all() for that
 335  * after calling this function.
 336  *
 337  * This pauses all block jobs and disables external clients. It must
 338  * be paired with bdrv_drain_all_end().
 339  *
 340  * NOTE: no new block jobs or BlockDriverStates can be created between
 341  * the bdrv_drain_all_begin() and bdrv_drain_all_end() calls.
 342  */
 343 void bdrv_drain_all_begin(void)
 344 {
 345     /* Always run first iteration so any pending completion BHs run */
 346     bool waited = true;
 347     BlockDriverState *bs;
 348     BdrvNextIterator it;
 349     GSList *aio_ctxs = NULL, *ctx;
 350
 351     block_job_pause_all();
 352
 353     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 354         AioContext *aio_context = bdrv_get_aio_context(bs);
 355
 356         /* Stop things in parent-to-child order */
 357         aio_context_acquire(aio_context);
 358         aio_disable_external(aio_context);
 359         bdrv_parent_drained_begin(bs);
 360         bdrv_drain_invoke(bs, true);
 361         aio_context_release(aio_context);
 362
 363         if (!g_slist_find(aio_ctxs, aio_context)) {
 364             aio_ctxs = g_slist_prepend(aio_ctxs, aio_context);
 365         }
 366     }
 367
 368     /* Note that completion of an asynchronous I/O operation can trigger any
 369      * number of other I/O operations on other devices---for example a
 370      * coroutine can submit an I/O request to another device in response to
 371      * request completion.  Therefore we must keep looping until there was no
 372      * more activity rather than simply draining each device independently.
 373      */
 374     while (waited) {
 375         waited = false;
 376
 377         for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) {
 378             AioContext *aio_context = ctx->data;
 379
 380             aio_context_acquire(aio_context);
 381             for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 382                 if (aio_context == bdrv_get_aio_context(bs)) {
 383                     waited |= bdrv_drain_recurse(bs);
 384                 }
 385             }
 386             aio_context_release(aio_context);
 387         }
 388     }
 389
 390     g_slist_free(aio_ctxs);
 391 }
 392
 393 void bdrv_drain_all_end(void)
 394 {
 395     BlockDriverState *bs;
 396     BdrvNextIterator it;
 397
 398     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 399         AioContext *aio_context = bdrv_get_aio_context(bs);
 400
 401         /* Re-enable things in child-to-parent order */
 402         aio_context_acquire(aio_context);
 403         bdrv_drain_invoke(bs, false);
 404         bdrv_parent_drained_end(bs);
 405         aio_enable_external(aio_context);
 406         aio_context_release(aio_context);
 407     }
 408
 409     block_job_resume_all();
 410 }
 411
 412 void bdrv_drain_all(void)
 413 {
 414     bdrv_drain_all_begin();
 415     bdrv_drain_all_end();
 416 }
 417
 418 /**
 419  * Remove an active request from the tracked requests list
 420  *
 421  * This function should be called when a tracked request is completing.
 422  */
 423 static void tracked_request_end(BdrvTrackedRequest *req)
 424 {
 425     if (req->serialising) {
 426         atomic_dec(&req->bs->serialising_in_flight);
 427     }
 428
 429     qemu_co_mutex_lock(&req->bs->reqs_lock);
 430     QLIST_REMOVE(req, list);
 431     qemu_co_queue_restart_all(&req->wait_queue);
 432     qemu_co_mutex_unlock(&req->bs->reqs_lock);
 433 }
 434
 435 /**
 436  * Add an active request to the tracked requests list
 437  */
 438 static void tracked_request_begin(BdrvTrackedRequest *req,
 439                                   BlockDriverState *bs,
 440                                   int64_t offset,
 441                                   unsigned int bytes,
 442                                   enum BdrvTrackedRequestType type)
 443 {
 444     *req = (BdrvTrackedRequest){
 445         .bs = bs,
 446         .offset         = offset,
 447         .bytes          = bytes,
 448         .type           = type,
 449         .co             = qemu_coroutine_self(),
 450         .serialising    = false,
 451         .overlap_offset = offset,
 452         .overlap_bytes  = bytes,
 453     };
 454
 455     qemu_co_queue_init(&req->wait_queue);
 456
 457     qemu_co_mutex_lock(&bs->reqs_lock);
 458     QLIST_INSERT_HEAD(&bs->tracked_requests, req, list);
 459     qemu_co_mutex_unlock(&bs->reqs_lock);
 460 }
 461
 462 static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align)
 463 {
 464     int64_t overlap_offset = req->offset & ~(align - 1);
 465     unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align)
 466                                - overlap_offset;
 467
 468     if (!req->serialising) {
 469         atomic_inc(&req->bs->serialising_in_flight);
 470         req->serialising = true;
 471     }
 472
 473     req->overlap_offset = MIN(req->overlap_offset, overlap_offset);
 474     req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes);
 475 }
 476
 477 /**
 478  * Round a region to cluster boundaries
 479  */
 480 void bdrv_round_to_clusters(BlockDriverState *bs,
 481                             int64_t offset, int64_t bytes,
 482                             int64_t *cluster_offset,
 483                             int64_t *cluster_bytes)
 484 {
 485     BlockDriverInfo bdi;
 486
 487     if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) {
 488         *cluster_offset = offset;
 489         *cluster_bytes = bytes;
 490     } else {
 491         int64_t c = bdi.cluster_size;
 492         *cluster_offset = QEMU_ALIGN_DOWN(offset, c);
 493         *cluster_bytes = QEMU_ALIGN_UP(offset - *cluster_offset + bytes, c);
 494     }
 495 }
 496
 497 static int bdrv_get_cluster_size(BlockDriverState *bs)
 498 {
 499     BlockDriverInfo bdi;
 500     int ret;
 501
 502     ret = bdrv_get_info(bs, &bdi);
 503     if (ret < 0 || bdi.cluster_size == 0) {
 504         return bs->bl.request_alignment;
 505     } else {
 506         return bdi.cluster_size;
 507     }
 508 }
 509
 510 static bool tracked_request_overlaps(BdrvTrackedRequest *req,
 511                                      int64_t offset, unsigned int bytes)
 512 {
 513     /*        aaaa   bbbb */
 514     if (offset >= req->overlap_offset + req->overlap_bytes) {
 515         return false;
 516     }
 517     /* bbbb   aaaa        */
 518     if (req->overlap_offset >= offset + bytes) {
 519         return false;
 520     }
 521     return true;
 522 }
 523
 524 void bdrv_inc_in_flight(BlockDriverState *bs)
 525 {
 526     atomic_inc(&bs->in_flight);
 527 }
 528
 529 static void dummy_bh_cb(void *opaque)
 530 {
 531 }
 532
 533 void bdrv_wakeup(BlockDriverState *bs)
 534 {
 535     /* The barrier (or an atomic op) is in the caller.  */
 536     if (atomic_read(&bs->wakeup)) {
 537         aio_bh_schedule_oneshot(qemu_get_aio_context(), dummy_bh_cb, NULL);
 538     }
 539 }
 540
 541 void bdrv_dec_in_flight(BlockDriverState *bs)
 542 {
 543     atomic_dec(&bs->in_flight);
 544     bdrv_wakeup(bs);
 545 }
 546
 547 static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self)
 548 {
 549     BlockDriverState *bs = self->bs;
 550     BdrvTrackedRequest *req;
 551     bool retry;
 552     bool waited = false;
 553
 554     if (!atomic_read(&bs->serialising_in_flight)) {
 555         return false;
 556     }
 557
 558     do {
 559         retry = false;
 560         qemu_co_mutex_lock(&bs->reqs_lock);
 561         QLIST_FOREACH(req, &bs->tracked_requests, list) {
 562             if (req == self || (!req->serialising && !self->serialising)) {
 563                 continue;
 564             }
 565             if (tracked_request_overlaps(req, self->overlap_offset,
 566                                          self->overlap_bytes))
 567             {
 568                 /* Hitting this means there was a reentrant request, for
 569                  * example, a block driver issuing nested requests.  This must
 570                  * never happen since it means deadlock.
 571                  */
 572                 assert(qemu_coroutine_self() != req->co);
 573
 574                 /* If the request is already (indirectly) waiting for us, or
 575                  * will wait for us as soon as it wakes up, then just go on
 576                  * (instead of producing a deadlock in the former case). */
 577                 if (!req->waiting_for) {
 578                     self->waiting_for = req;
 579                     qemu_co_queue_wait(&req->wait_queue, &bs->reqs_lock);
 580                     self->waiting_for = NULL;
 581                     retry = true;
 582                     waited = true;
 583                     break;
 584                 }
 585             }
 586         }
 587         qemu_co_mutex_unlock(&bs->reqs_lock);
 588     } while (retry);
 589
 590     return waited;
 591 }
 592
 593 static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset,
 594                                    size_t size)
 595 {
 596     if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) {
 597         return -EIO;
 598     }
 599
 600     if (!bdrv_is_inserted(bs)) {
 601         return -ENOMEDIUM;
 602     }
 603
 604     if (offset < 0) {
 605         return -EIO;
 606     }
 607
 608     return 0;
 609 }
 610
/* Parameters and result slot of a synchronous request that is executed by
 * bdrv_rw_co_entry() on behalf of bdrv_prwv_co(). */
typedef struct RwCo {
    BdrvChild *child;           /* child the request is issued on */
    int64_t offset;             /* start offset in bytes */
    QEMUIOVector *qiov;         /* data vector; its size is the length */
    bool is_write;              /* true: write request; false: read */
    int ret;                    /* NOT_DONE until the request completes */
    BdrvRequestFlags flags;     /* flags passed on with the request */
} RwCo;
 619
 620 static void coroutine_fn bdrv_rw_co_entry(void *opaque)
 621 {
 622     RwCo *rwco = opaque;
 623
 624     if (!rwco->is_write) {
 625         rwco->ret = bdrv_co_preadv(rwco->child, rwco->offset,
 626                                    rwco->qiov->size, rwco->qiov,
 627                                    rwco->flags);
 628     } else {
 629         rwco->ret = bdrv_co_pwritev(rwco->child, rwco->offset,
 630                                     rwco->qiov->size, rwco->qiov,
 631                                     rwco->flags);
 632     }
 633 }
 634
 635 /*
 636  * Process a vectored synchronous request using coroutines
 637  */
 638 static int bdrv_prwv_co(BdrvChild *child, int64_t offset,
 639                         QEMUIOVector *qiov, bool is_write,
 640                         BdrvRequestFlags flags)
 641 {
 642     Coroutine *co;
 643     RwCo rwco = {
 644         .child = child,
 645         .offset = offset,
 646         .qiov = qiov,
 647         .is_write = is_write,
 648         .ret = NOT_DONE,
 649         .flags = flags,
 650     };
 651
 652     if (qemu_in_coroutine()) {
 653         /* Fast-path if already in coroutine context */
 654         bdrv_rw_co_entry(&rwco);
 655     } else {
 656         co = qemu_coroutine_create(bdrv_rw_co_entry, &rwco);
 657         bdrv_coroutine_enter(child->bs, co);
 658         BDRV_POLL_WHILE(child->bs, rwco.ret == NOT_DONE);
 659     }
 660     return rwco.ret;
 661 }
 662
 663 /*
 664  * Process a synchronous request using coroutines
 665  */
 666 static int bdrv_rw_co(BdrvChild *child, int64_t sector_num, uint8_t *buf,
 667                       int nb_sectors, bool is_write, BdrvRequestFlags flags)
 668 {
 669     QEMUIOVector qiov;
 670     struct iovec iov = {
 671         .iov_base = (void *)buf,
 672         .iov_len = nb_sectors * BDRV_SECTOR_SIZE,
 673     };
 674
 675     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
 676         return -EINVAL;
 677     }
 678
 679     qemu_iovec_init_external(&qiov, &iov, 1);
 680     return bdrv_prwv_co(child, sector_num << BDRV_SECTOR_BITS,
 681                         &qiov, is_write, flags);
 682 }
 683
 684 /* return < 0 if error. See bdrv_write() for the return codes */
 685 int bdrv_read(BdrvChild *child, int64_t sector_num,
 686               uint8_t *buf, int nb_sectors)
 687 {
 688     return bdrv_rw_co(child, sector_num, buf, nb_sectors, false, 0);
 689 }
 690
 691 /* Return < 0 if error. Important errors are:
 692   -EIO         generic I/O error (may happen for all errors)
 693   -ENOMEDIUM   No media inserted.
 694   -EINVAL      Invalid sector number or nb_sectors
 695   -EACCES      Trying to write a read-only device
 696 */
 697 int bdrv_write(BdrvChild *child, int64_t sector_num,
 698                const uint8_t *buf, int nb_sectors)
 699 {
 700     return bdrv_rw_co(child, sector_num, (uint8_t *)buf, nb_sectors, true, 0);
 701 }
 702
 703 int bdrv_pwrite_zeroes(BdrvChild *child, int64_t offset,
 704                        int bytes, BdrvRequestFlags flags)
 705 {
 706     QEMUIOVector qiov;
 707     struct iovec iov = {
 708         .iov_base = NULL,
 709         .iov_len = bytes,
 710     };
 711
 712     qemu_iovec_init_external(&qiov, &iov, 1);
 713     return bdrv_prwv_co(child, offset, &qiov, true,
 714                         BDRV_REQ_ZERO_WRITE | flags);
 715 }
 716
 717 /*
 718  * Completely zero out a block device with the help of bdrv_pwrite_zeroes.
 719  * The operation is sped up by checking the block status and only writing
 720  * zeroes to the device if they currently do not return zeroes. Optional
 721  * flags are passed through to bdrv_pwrite_zeroes (e.g. BDRV_REQ_MAY_UNMAP,
 722  * BDRV_REQ_FUA).
 723  *
 724  * Returns < 0 on error, 0 on success. For error codes see bdrv_write().
 725  */
 726 int bdrv_make_zero(BdrvChild *child, BdrvRequestFlags flags)
 727 {
 728     int ret;
 729     int64_t target_size, bytes, offset = 0;
 730     BlockDriverState *bs = child->bs;
 731
 732     target_size = bdrv_getlength(bs);
 733     if (target_size < 0) {
 734         return target_size;
 735     }
 736
 737     for (;;) {
 738         bytes = MIN(target_size - offset, BDRV_REQUEST_MAX_BYTES);
 739         if (bytes <= 0) {
 740             return 0;
 741         }
 742         ret = bdrv_block_status(bs, offset, bytes, &bytes, NULL, NULL);
 743         if (ret < 0) {
 744             error_report("error getting block status at offset %" PRId64 ": %s",
 745                          offset, strerror(-ret));
 746             return ret;
 747         }
 748         if (ret & BDRV_BLOCK_ZERO) {
 749             offset += bytes;
 750             continue;
 751         }
 752         ret = bdrv_pwrite_zeroes(child, offset, bytes, flags);
 753         if (ret < 0) {
 754             error_report("error writing zeroes at offset %" PRId64 ": %s",
 755                          offset, strerror(-ret));
 756             return ret;
 757         }
 758         offset += bytes;
 759     }
 760 }
 761
 762 int bdrv_preadv(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
 763 {
 764     int ret;
 765
 766     ret = bdrv_prwv_co(child, offset, qiov, false, 0);
 767     if (ret < 0) {
 768         return ret;
 769     }
 770
 771     return qiov->size;
 772 }
 773
 774 int bdrv_pread(BdrvChild *child, int64_t offset, void *buf, int bytes)
 775 {
 776     QEMUIOVector qiov;
 777     struct iovec iov = {
 778         .iov_base = (void *)buf,
 779         .iov_len = bytes,
 780     };
 781
 782     if (bytes < 0) {
 783         return -EINVAL;
 784     }
 785
 786     qemu_iovec_init_external(&qiov, &iov, 1);
 787     return bdrv_preadv(child, offset, &qiov);
 788 }
 789
 790 int bdrv_pwritev(BdrvChild *child, int64_t offset, QEMUIOVector *qiov)
 791 {
 792     int ret;
 793
 794     ret = bdrv_prwv_co(child, offset, qiov, true, 0);
 795     if (ret < 0) {
 796         return ret;
 797     }
 798
 799     return qiov->size;
 800 }
 801
 802 int bdrv_pwrite(BdrvChild *child, int64_t offset, const void *buf, int bytes)
 803 {
 804     QEMUIOVector qiov;
 805     struct iovec iov = {
 806         .iov_base   = (void *) buf,
 807         .iov_len    = bytes,
 808     };
 809
 810     if (bytes < 0) {
 811         return -EINVAL;
 812     }
 813
 814     qemu_iovec_init_external(&qiov, &iov, 1);
 815     return bdrv_pwritev(child, offset, &qiov);
 816 }
 817
 818 /*
 819  * Writes to the file and ensures that no writes are reordered across this
 820  * request (acts as a barrier)
 821  *
 822  * Returns 0 on success, -errno in error cases.
 823  */
 824 int bdrv_pwrite_sync(BdrvChild *child, int64_t offset,
 825                      const void *buf, int count)
 826 {
 827     int ret;
 828
 829     ret = bdrv_pwrite(child, offset, buf, count);
 830     if (ret < 0) {
 831         return ret;
 832     }
 833
 834     ret = bdrv_flush(child->bs);
 835     if (ret < 0) {
 836         return ret;
 837     }
 838
 839     return 0;
 840 }
 841
/* Links a callback-based AIO request to the coroutine that waits for it. */
typedef struct CoroutineIOCompletion {
    Coroutine *coroutine;   /* coroutine to wake when the request completes */
    int ret;                /* completion status passed to the callback */
} CoroutineIOCompletion;
 846
 847 static void bdrv_co_io_em_complete(void *opaque, int ret)
 848 {
 849     CoroutineIOCompletion *co = opaque;
 850
 851     co->ret = ret;
 852     aio_co_wake(co->coroutine);
 853 }
 854
 855 static int coroutine_fn bdrv_driver_preadv(BlockDriverState *bs,
 856                                            uint64_t offset, uint64_t bytes,
 857                                            QEMUIOVector *qiov, int flags)
 858 {
 859     BlockDriver *drv = bs->drv;
 860     int64_t sector_num;
 861     unsigned int nb_sectors;
 862
 863     assert(!(flags & ~BDRV_REQ_MASK));
 864
 865     if (!drv) {
 866         return -ENOMEDIUM;
 867     }
 868
 869     if (drv->bdrv_co_preadv) {
 870         return drv->bdrv_co_preadv(bs, offset, bytes, qiov, flags);
 871     }
 872
 873     sector_num = offset >> BDRV_SECTOR_BITS;
 874     nb_sectors = bytes >> BDRV_SECTOR_BITS;
 875
 876     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
 877     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 878     assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
 879
 880     if (drv->bdrv_co_readv) {
 881         return drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov);
 882     } else {
 883         BlockAIOCB *acb;
 884         CoroutineIOCompletion co = {
 885             .coroutine = qemu_coroutine_self(),
 886         };
 887
 888         acb = bs->drv->bdrv_aio_readv(bs, sector_num, qiov, nb_sectors,
 889                                       bdrv_co_io_em_complete, &co);
 890         if (acb == NULL) {
 891             return -EIO;
 892         } else {
 893             qemu_coroutine_yield();
 894             return co.ret;
 895         }
 896     }
 897 }
 898
 899 static int coroutine_fn bdrv_driver_pwritev(BlockDriverState *bs,
 900                                             uint64_t offset, uint64_t bytes,
 901                                             QEMUIOVector *qiov, int flags)
 902 {
 903     BlockDriver *drv = bs->drv;
 904     int64_t sector_num;
 905     unsigned int nb_sectors;
 906     int ret;
 907
 908     assert(!(flags & ~BDRV_REQ_MASK));
 909
 910     if (!drv) {
 911         return -ENOMEDIUM;
 912     }
 913
 914     if (drv->bdrv_co_pwritev) {
 915         ret = drv->bdrv_co_pwritev(bs, offset, bytes, qiov,
 916                                    flags & bs->supported_write_flags);
 917         flags &= ~bs->supported_write_flags;
 918         goto emulate_flags;
 919     }
 920
 921     sector_num = offset >> BDRV_SECTOR_BITS;
 922     nb_sectors = bytes >> BDRV_SECTOR_BITS;
 923
 924     assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
 925     assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
 926     assert((bytes >> BDRV_SECTOR_BITS) <= BDRV_REQUEST_MAX_SECTORS);
 927
 928     if (drv->bdrv_co_writev_flags) {
 929         ret = drv->bdrv_co_writev_flags(bs, sector_num, nb_sectors, qiov,
 930                                         flags & bs->supported_write_flags);
 931         flags &= ~bs->supported_write_flags;
 932     } else if (drv->bdrv_co_writev) {
 933         assert(!bs->supported_write_flags);
 934         ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov);
 935     } else {
 936         BlockAIOCB *acb;
 937         CoroutineIOCompletion co = {
 938             .coroutine = qemu_coroutine_self(),
 939         };
 940
 941         acb = bs->drv->bdrv_aio_writev(bs, sector_num, qiov, nb_sectors,
 942                                        bdrv_co_io_em_complete, &co);
 943         if (acb == NULL) {
 944             ret = -EIO;
 945         } else {
 946             qemu_coroutine_yield();
 947             ret = co.ret;
 948         }
 949     }
 950
 951 emulate_flags:
 952     if (ret == 0 && (flags & BDRV_REQ_FUA)) {
 953         ret = bdrv_co_flush(bs);
 954     }
 955
 956     return ret;
 957 }
 958
 959 static int coroutine_fn
 960 bdrv_driver_pwritev_compressed(BlockDriverState *bs, uint64_t offset,
 961                                uint64_t bytes, QEMUIOVector *qiov)
 962 {
 963     BlockDriver *drv = bs->drv;
 964
 965     if (!drv) {
 966         return -ENOMEDIUM;
 967     }
 968
 969     if (!drv->bdrv_co_pwritev_compressed) {
 970         return -ENOTSUP;
 971     }
 972
 973     return drv->bdrv_co_pwritev_compressed(bs, offset, bytes, qiov);
 974 }
 975
 976 static int coroutine_fn bdrv_co_do_copy_on_readv(BdrvChild *child,
 977         int64_t offset, unsigned int bytes, QEMUIOVector *qiov)
 978 {
 979     BlockDriverState *bs = child->bs;
 980
 981     /* Perform I/O through a temporary buffer so that users who scribble over
 982      * their read buffer while the operation is in progress do not end up
 983      * modifying the image file.  This is critical for zero-copy guest I/O
 984      * where anything might happen inside guest memory.
 985      */
 986     void *bounce_buffer;
 987
 988     BlockDriver *drv = bs->drv;
 989     struct iovec iov;
 990     QEMUIOVector local_qiov;
 991     int64_t cluster_offset;
 992     int64_t cluster_bytes;
 993     size_t skip_bytes;
 994     int ret;
 995     int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer,
 996                                     BDRV_REQUEST_MAX_BYTES);
 997     unsigned int progress = 0;
 998
 999     if (!drv) {
1000         return -ENOMEDIUM;
1001     }
1002
1003     /* FIXME We cannot require callers to have write permissions when all they
1004      * are doing is a read request. If we did things right, write permissions
1005      * would be obtained anyway, but internally by the copy-on-read code. As
1006      * long as it is implemented here rather than in a separate filter driver,
1007      * the copy-on-read code doesn't have its own BdrvChild, however, for which
1008      * it could request permissions. Therefore we have to bypass the permission
1009      * system for the moment. */
1010     // assert(child->perm & (BLK_PERM_WRITE_UNCHANGED | BLK_PERM_WRITE));
1011
1012     /* Cover entire cluster so no additional backing file I/O is required when
1013      * allocating cluster in the image file.  Note that this value may exceed
1014      * BDRV_REQUEST_MAX_BYTES (even when the original read did not), which
1015      * is one reason we loop rather than doing it all at once.
1016      */
1017     bdrv_round_to_clusters(bs, offset, bytes, &cluster_offset, &cluster_bytes);
1018     skip_bytes = offset - cluster_offset;
1019
1020     trace_bdrv_co_do_copy_on_readv(bs, offset, bytes,
1021                                    cluster_offset, cluster_bytes);
1022
1023     bounce_buffer = qemu_try_blockalign(bs,
1024                                         MIN(MIN(max_transfer, cluster_bytes),
1025                                             MAX_BOUNCE_BUFFER));
1026     if (bounce_buffer == NULL) {
1027         ret = -ENOMEM;
1028         goto err;
1029     }
1030
1031     while (cluster_bytes) {
1032         int64_t pnum;
1033
1034         ret = bdrv_is_allocated(bs, cluster_offset,
1035                                 MIN(cluster_bytes, max_transfer), &pnum);
1036         if (ret < 0) {
1037             /* Safe to treat errors in querying allocation as if
1038              * unallocated; we'll probably fail again soon on the
1039              * read, but at least that will set a decent errno.
1040              */
1041             pnum = MIN(cluster_bytes, max_transfer);
1042         }
1043
1044         assert(skip_bytes < pnum);
1045
1046         if (ret <= 0) {
1047             /* Must copy-on-read; use the bounce buffer */
1048             iov.iov_base = bounce_buffer;
1049             iov.iov_len = pnum = MIN(pnum, MAX_BOUNCE_BUFFER);
1050             qemu_iovec_init_external(&local_qiov, &iov, 1);
1051
1052             ret = bdrv_driver_preadv(bs, cluster_offset, pnum,
1053                                      &local_qiov, 0);
1054             if (ret < 0) {
1055                 goto err;
1056             }
1057
1058             bdrv_debug_event(bs, BLKDBG_COR_WRITE);
1059             if (drv->bdrv_co_pwrite_zeroes &&
1060                 buffer_is_zero(bounce_buffer, pnum)) {
1061                 /* FIXME: Should we (perhaps conditionally) be setting
1062                  * BDRV_REQ_MAY_UNMAP, if it will allow for a sparser copy
1063                  * that still correctly reads as zero? */
1064                 ret = bdrv_co_do_pwrite_zeroes(bs, cluster_offset, pnum, 0);
1065             } else {
1066                 /* This does not change the data on the disk, it is not
1067                  * necessary to flush even in cache=writethrough mode.
1068                  */
1069                 ret = bdrv_driver_pwritev(bs, cluster_offset, pnum,
1070                                           &local_qiov, 0);
1071             }
1072
1073             if (ret < 0) {
1074                 /* It might be okay to ignore write errors for guest
1075                  * requests.  If this is a deliberate copy-on-read
1076                  * then we don't want to ignore the error.  Simply
1077                  * report it in all cases.
1078                  */
1079                 goto err;
1080             }
1081
1082             qemu_iovec_from_buf(qiov, progress, bounce_buffer + skip_bytes,
1083                                 pnum - skip_bytes);
1084         } else {
1085             /* Read directly into the destination */
1086             qemu_iovec_init(&local_qiov, qiov->niov);
1087             qemu_iovec_concat(&local_qiov, qiov, progress, pnum - skip_bytes);
1088             ret = bdrv_driver_preadv(bs, offset + progress, local_qiov.size,
1089                                      &local_qiov, 0);
1090             qemu_iovec_destroy(&local_qiov);
1091             if (ret < 0) {
1092                 goto err;
1093             }
1094         }
1095
1096         cluster_offset += pnum;
1097         cluster_bytes -= pnum;
1098         progress += pnum - skip_bytes;
1099         skip_bytes = 0;
1100     }
1101     ret = 0;
1102
1103 err:
1104     qemu_vfree(bounce_buffer);
1105     return ret;
1106 }
1107
1108 /*
1109  * Forwards an already correctly aligned request to the BlockDriver. This
1110  * handles copy on read, zeroing after EOF, and fragmentation of large
1111  * reads; any other features must be implemented by the caller.
1112  */
1113 static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
1114     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1115     int64_t align, QEMUIOVector *qiov, int flags)
1116 {
1117     BlockDriverState *bs = child->bs;
1118     int64_t total_bytes, max_bytes;
1119     int ret = 0;
1120     uint64_t bytes_remaining = bytes;
1121     int max_transfer;
1122
1123     assert(is_power_of_2(align));
1124     assert((offset & (align - 1)) == 0);
1125     assert((bytes & (align - 1)) == 0);
1126     assert(!qiov || bytes == qiov->size);
1127     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1128     max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
1129                                    align);
1130
1131     /* TODO: We would need a per-BDS .supported_read_flags and
1132      * potential fallback support, if we ever implement any read flags
1133      * to pass through to drivers.  For now, there aren't any
1134      * passthrough flags.  */
1135     assert(!(flags & ~(BDRV_REQ_NO_SERIALISING | BDRV_REQ_COPY_ON_READ)));
1136
1137     /* Handle Copy on Read and associated serialisation */
1138     if (flags & BDRV_REQ_COPY_ON_READ) {
1139         /* If we touch the same cluster it counts as an overlap.  This
1140          * guarantees that allocating writes will be serialized and not race
1141          * with each other for the same cluster.  For example, in copy-on-read
1142          * it ensures that the CoR read and write operations are atomic and
1143          * guest writes cannot interleave between them. */
1144         mark_request_serialising(req, bdrv_get_cluster_size(bs));
1145     }
1146
1147     if (!(flags & BDRV_REQ_NO_SERIALISING)) {
1148         wait_serialising_requests(req);
1149     }
1150
1151     if (flags & BDRV_REQ_COPY_ON_READ) {
1152         int64_t pnum;
1153
1154         ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
1155         if (ret < 0) {
1156             goto out;
1157         }
1158
1159         if (!ret || pnum != bytes) {
1160             ret = bdrv_co_do_copy_on_readv(child, offset, bytes, qiov);
1161             goto out;
1162         }
1163     }
1164
1165     /* Forward the request to the BlockDriver, possibly fragmenting it */
1166     total_bytes = bdrv_getlength(bs);
1167     if (total_bytes < 0) {
1168         ret = total_bytes;
1169         goto out;
1170     }
1171
1172     max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
1173     if (bytes <= max_bytes && bytes <= max_transfer) {
1174         ret = bdrv_driver_preadv(bs, offset, bytes, qiov, 0);
1175         goto out;
1176     }
1177
1178     while (bytes_remaining) {
1179         int num;
1180
1181         if (max_bytes) {
1182             QEMUIOVector local_qiov;
1183
1184             num = MIN(bytes_remaining, MIN(max_bytes, max_transfer));
1185             assert(num);
1186             qemu_iovec_init(&local_qiov, qiov->niov);
1187             qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
1188
1189             ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
1190                                      num, &local_qiov, 0);
1191             max_bytes -= num;
1192             qemu_iovec_destroy(&local_qiov);
1193         } else {
1194             num = bytes_remaining;
1195             ret = qemu_iovec_memset(qiov, bytes - bytes_remaining, 0,
1196                                     bytes_remaining);
1197         }
1198         if (ret < 0) {
1199             goto out;
1200         }
1201         bytes_remaining -= num;
1202     }
1203
1204 out:
1205     return ret < 0 ? ret : 0;
1206 }
1207
1208 /*
1209  * Handle a read request in coroutine context
1210  */
1211 int coroutine_fn bdrv_co_preadv(BdrvChild *child,
1212     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1213     BdrvRequestFlags flags)
1214 {
1215     BlockDriverState *bs = child->bs;
1216     BlockDriver *drv = bs->drv;
1217     BdrvTrackedRequest req;
1218
1219     uint64_t align = bs->bl.request_alignment;
1220     uint8_t *head_buf = NULL;
1221     uint8_t *tail_buf = NULL;
1222     QEMUIOVector local_qiov;
1223     bool use_local_qiov = false;
1224     int ret;
1225
1226     trace_bdrv_co_preadv(child->bs, offset, bytes, flags);
1227
1228     if (!drv) {
1229         return -ENOMEDIUM;
1230     }
1231
1232     ret = bdrv_check_byte_request(bs, offset, bytes);
1233     if (ret < 0) {
1234         return ret;
1235     }
1236
1237     bdrv_inc_in_flight(bs);
1238
1239     /* Don't do copy-on-read if we read data before write operation */
1240     if (atomic_read(&bs->copy_on_read) && !(flags & BDRV_REQ_NO_SERIALISING)) {
1241         flags |= BDRV_REQ_COPY_ON_READ;
1242     }
1243
1244     /* Align read if necessary by padding qiov */
1245     if (offset & (align - 1)) {
1246         head_buf = qemu_blockalign(bs, align);
1247         qemu_iovec_init(&local_qiov, qiov->niov + 2);
1248         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1249         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1250         use_local_qiov = true;
1251
1252         bytes += offset & (align - 1);
1253         offset = offset & ~(align - 1);
1254     }
1255
1256     if ((offset + bytes) & (align - 1)) {
1257         if (!use_local_qiov) {
1258             qemu_iovec_init(&local_qiov, qiov->niov + 1);
1259             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1260             use_local_qiov = true;
1261         }
1262         tail_buf = qemu_blockalign(bs, align);
1263         qemu_iovec_add(&local_qiov, tail_buf,
1264                        align - ((offset + bytes) & (align - 1)));
1265
1266         bytes = ROUND_UP(bytes, align);
1267     }
1268
1269     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
1270     ret = bdrv_aligned_preadv(child, &req, offset, bytes, align,
1271                               use_local_qiov ? &local_qiov : qiov,
1272                               flags);
1273     tracked_request_end(&req);
1274     bdrv_dec_in_flight(bs);
1275
1276     if (use_local_qiov) {
1277         qemu_iovec_destroy(&local_qiov);
1278         qemu_vfree(head_buf);
1279         qemu_vfree(tail_buf);
1280     }
1281
1282     return ret;
1283 }
1284
1285 static int coroutine_fn bdrv_co_do_readv(BdrvChild *child,
1286     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1287     BdrvRequestFlags flags)
1288 {
1289     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1290         return -EINVAL;
1291     }
1292
1293     return bdrv_co_preadv(child, sector_num << BDRV_SECTOR_BITS,
1294                           nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1295 }
1296
1297 int coroutine_fn bdrv_co_readv(BdrvChild *child, int64_t sector_num,
1298                                int nb_sectors, QEMUIOVector *qiov)
1299 {
1300     return bdrv_co_do_readv(child, sector_num, nb_sectors, qiov, 0);
1301 }
1302
1303 static int coroutine_fn bdrv_co_do_pwrite_zeroes(BlockDriverState *bs,
1304     int64_t offset, int bytes, BdrvRequestFlags flags)
1305 {
1306     BlockDriver *drv = bs->drv;
1307     QEMUIOVector qiov;
1308     struct iovec iov = {0};
1309     int ret = 0;
1310     bool need_flush = false;
1311     int head = 0;
1312     int tail = 0;
1313
1314     int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX);
1315     int alignment = MAX(bs->bl.pwrite_zeroes_alignment,
1316                         bs->bl.request_alignment);
1317     int max_transfer = MIN_NON_ZERO(bs->bl.max_transfer, MAX_BOUNCE_BUFFER);
1318
1319     if (!drv) {
1320         return -ENOMEDIUM;
1321     }
1322
1323     assert(alignment % bs->bl.request_alignment == 0);
1324     head = offset % alignment;
1325     tail = (offset + bytes) % alignment;
1326     max_write_zeroes = QEMU_ALIGN_DOWN(max_write_zeroes, alignment);
1327     assert(max_write_zeroes >= bs->bl.request_alignment);
1328
1329     while (bytes > 0 && !ret) {
1330         int num = bytes;
1331
1332         /* Align request.  Block drivers can expect the "bulk" of the request
1333          * to be aligned, and that unaligned requests do not cross cluster
1334          * boundaries.
1335          */
1336         if (head) {
1337             /* Make a small request up to the first aligned sector. For
1338              * convenience, limit this request to max_transfer even if
1339              * we don't need to fall back to writes.  */
1340             num = MIN(MIN(bytes, max_transfer), alignment - head);
1341             head = (head + num) % alignment;
1342             assert(num < max_write_zeroes);
1343         } else if (tail && num > alignment) {
1344             /* Shorten the request to the last aligned sector.  */
1345             num -= tail;
1346         }
1347
1348         /* limit request size */
1349         if (num > max_write_zeroes) {
1350             num = max_write_zeroes;
1351         }
1352
1353         ret = -ENOTSUP;
1354         /* First try the efficient write zeroes operation */
1355         if (drv->bdrv_co_pwrite_zeroes) {
1356             ret = drv->bdrv_co_pwrite_zeroes(bs, offset, num,
1357                                              flags & bs->supported_zero_flags);
1358             if (ret != -ENOTSUP && (flags & BDRV_REQ_FUA) &&
1359                 !(bs->supported_zero_flags & BDRV_REQ_FUA)) {
1360                 need_flush = true;
1361             }
1362         } else {
1363             assert(!bs->supported_zero_flags);
1364         }
1365
1366         if (ret == -ENOTSUP) {
1367             /* Fall back to bounce buffer if write zeroes is unsupported */
1368             BdrvRequestFlags write_flags = flags & ~BDRV_REQ_ZERO_WRITE;
1369
1370             if ((flags & BDRV_REQ_FUA) &&
1371                 !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1372                 /* No need for bdrv_driver_pwrite() to do a fallback
1373                  * flush on each chunk; use just one at the end */
1374                 write_flags &= ~BDRV_REQ_FUA;
1375                 need_flush = true;
1376             }
1377             num = MIN(num, max_transfer);
1378             iov.iov_len = num;
1379             if (iov.iov_base == NULL) {
1380                 iov.iov_base = qemu_try_blockalign(bs, num);
1381                 if (iov.iov_base == NULL) {
1382                     ret = -ENOMEM;
1383                     goto fail;
1384                 }
1385                 memset(iov.iov_base, 0, num);
1386             }
1387             qemu_iovec_init_external(&qiov, &iov, 1);
1388
1389             ret = bdrv_driver_pwritev(bs, offset, num, &qiov, write_flags);
1390
1391             /* Keep bounce buffer around if it is big enough for all
1392              * all future requests.
1393              */
1394             if (num < max_transfer) {
1395                 qemu_vfree(iov.iov_base);
1396                 iov.iov_base = NULL;
1397             }
1398         }
1399
1400         offset += num;
1401         bytes -= num;
1402     }
1403
1404 fail:
1405     if (ret == 0 && need_flush) {
1406         ret = bdrv_co_flush(bs);
1407     }
1408     qemu_vfree(iov.iov_base);
1409     return ret;
1410 }
1411
1412 /*
1413  * Forwards an already correctly aligned write request to the BlockDriver,
1414  * after possibly fragmenting it.
1415  */
1416 static int coroutine_fn bdrv_aligned_pwritev(BdrvChild *child,
1417     BdrvTrackedRequest *req, int64_t offset, unsigned int bytes,
1418     int64_t align, QEMUIOVector *qiov, int flags)
1419 {
1420     BlockDriverState *bs = child->bs;
1421     BlockDriver *drv = bs->drv;
1422     bool waited;
1423     int ret;
1424
1425     int64_t end_sector = DIV_ROUND_UP(offset + bytes, BDRV_SECTOR_SIZE);
1426     uint64_t bytes_remaining = bytes;
1427     int max_transfer;
1428
1429     if (!drv) {
1430         return -ENOMEDIUM;
1431     }
1432
1433     if (bdrv_has_readonly_bitmaps(bs)) {
1434         return -EPERM;
1435     }
1436
1437     assert(is_power_of_2(align));
1438     assert((offset & (align - 1)) == 0);
1439     assert((bytes & (align - 1)) == 0);
1440     assert(!qiov || bytes == qiov->size);
1441     assert((bs->open_flags & BDRV_O_NO_IO) == 0);
1442     assert(!(flags & ~BDRV_REQ_MASK));
1443     max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
1444                                    align);
1445
1446     waited = wait_serialising_requests(req);
1447     assert(!waited || !req->serialising);
1448     assert(req->overlap_offset <= offset);
1449     assert(offset + bytes <= req->overlap_offset + req->overlap_bytes);
1450     assert(child->perm & BLK_PERM_WRITE);
1451     assert(end_sector <= bs->total_sectors || child->perm & BLK_PERM_RESIZE);
1452
1453     ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req);
1454
1455     if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF &&
1456         !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_pwrite_zeroes &&
1457         qemu_iovec_is_zero(qiov)) {
1458         flags |= BDRV_REQ_ZERO_WRITE;
1459         if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) {
1460             flags |= BDRV_REQ_MAY_UNMAP;
1461         }
1462     }
1463
1464     if (ret < 0) {
1465         /* Do nothing, write notifier decided to fail this request */
1466     } else if (flags & BDRV_REQ_ZERO_WRITE) {
1467         bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO);
1468         ret = bdrv_co_do_pwrite_zeroes(bs, offset, bytes, flags);
1469     } else if (flags & BDRV_REQ_WRITE_COMPRESSED) {
1470         ret = bdrv_driver_pwritev_compressed(bs, offset, bytes, qiov);
1471     } else if (bytes <= max_transfer) {
1472         bdrv_debug_event(bs, BLKDBG_PWRITEV);
1473         ret = bdrv_driver_pwritev(bs, offset, bytes, qiov, flags);
1474     } else {
1475         bdrv_debug_event(bs, BLKDBG_PWRITEV);
1476         while (bytes_remaining) {
1477             int num = MIN(bytes_remaining, max_transfer);
1478             QEMUIOVector local_qiov;
1479             int local_flags = flags;
1480
1481             assert(num);
1482             if (num < bytes_remaining && (flags & BDRV_REQ_FUA) &&
1483                 !(bs->supported_write_flags & BDRV_REQ_FUA)) {
1484                 /* If FUA is going to be emulated by flush, we only
1485                  * need to flush on the last iteration */
1486                 local_flags &= ~BDRV_REQ_FUA;
1487             }
1488             qemu_iovec_init(&local_qiov, qiov->niov);
1489             qemu_iovec_concat(&local_qiov, qiov, bytes - bytes_remaining, num);
1490
1491             ret = bdrv_driver_pwritev(bs, offset + bytes - bytes_remaining,
1492                                       num, &local_qiov, local_flags);
1493             qemu_iovec_destroy(&local_qiov);
1494             if (ret < 0) {
1495                 break;
1496             }
1497             bytes_remaining -= num;
1498         }
1499     }
1500     bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE);
1501
1502     atomic_inc(&bs->write_gen);
1503     bdrv_set_dirty(bs, offset, bytes);
1504
1505     stat64_max(&bs->wr_highest_offset, offset + bytes);
1506
1507     if (ret >= 0) {
1508         bs->total_sectors = MAX(bs->total_sectors, end_sector);
1509         ret = 0;
1510     }
1511
1512     return ret;
1513 }
1514
1515 static int coroutine_fn bdrv_co_do_zero_pwritev(BdrvChild *child,
1516                                                 int64_t offset,
1517                                                 unsigned int bytes,
1518                                                 BdrvRequestFlags flags,
1519                                                 BdrvTrackedRequest *req)
1520 {
1521     BlockDriverState *bs = child->bs;
1522     uint8_t *buf = NULL;
1523     QEMUIOVector local_qiov;
1524     struct iovec iov;
1525     uint64_t align = bs->bl.request_alignment;
1526     unsigned int head_padding_bytes, tail_padding_bytes;
1527     int ret = 0;
1528
1529     head_padding_bytes = offset & (align - 1);
1530     tail_padding_bytes = (align - (offset + bytes)) & (align - 1);
1531
1532
1533     assert(flags & BDRV_REQ_ZERO_WRITE);
1534     if (head_padding_bytes || tail_padding_bytes) {
1535         buf = qemu_blockalign(bs, align);
1536         iov = (struct iovec) {
1537             .iov_base   = buf,
1538             .iov_len    = align,
1539         };
1540         qemu_iovec_init_external(&local_qiov, &iov, 1);
1541     }
1542     if (head_padding_bytes) {
1543         uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes);
1544
1545         /* RMW the unaligned part before head. */
1546         mark_request_serialising(req, align);
1547         wait_serialising_requests(req);
1548         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1549         ret = bdrv_aligned_preadv(child, req, offset & ~(align - 1), align,
1550                                   align, &local_qiov, 0);
1551         if (ret < 0) {
1552             goto fail;
1553         }
1554         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1555
1556         memset(buf + head_padding_bytes, 0, zero_bytes);
1557         ret = bdrv_aligned_pwritev(child, req, offset & ~(align - 1), align,
1558                                    align, &local_qiov,
1559                                    flags & ~BDRV_REQ_ZERO_WRITE);
1560         if (ret < 0) {
1561             goto fail;
1562         }
1563         offset += zero_bytes;
1564         bytes -= zero_bytes;
1565     }
1566
1567     assert(!bytes || (offset & (align - 1)) == 0);
1568     if (bytes >= align) {
1569         /* Write the aligned part in the middle. */
1570         uint64_t aligned_bytes = bytes & ~(align - 1);
1571         ret = bdrv_aligned_pwritev(child, req, offset, aligned_bytes, align,
1572                                    NULL, flags);
1573         if (ret < 0) {
1574             goto fail;
1575         }
1576         bytes -= aligned_bytes;
1577         offset += aligned_bytes;
1578     }
1579
1580     assert(!bytes || (offset & (align - 1)) == 0);
1581     if (bytes) {
1582         assert(align == tail_padding_bytes + bytes);
1583         /* RMW the unaligned part after tail. */
1584         mark_request_serialising(req, align);
1585         wait_serialising_requests(req);
1586         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1587         ret = bdrv_aligned_preadv(child, req, offset, align,
1588                                   align, &local_qiov, 0);
1589         if (ret < 0) {
1590             goto fail;
1591         }
1592         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1593
1594         memset(buf, 0, bytes);
1595         ret = bdrv_aligned_pwritev(child, req, offset, align, align,
1596                                    &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE);
1597     }
1598 fail:
1599     qemu_vfree(buf);
1600     return ret;
1601
1602 }
1603
1604 /*
1605  * Handle a write request in coroutine context
1606  */
1607 int coroutine_fn bdrv_co_pwritev(BdrvChild *child,
1608     int64_t offset, unsigned int bytes, QEMUIOVector *qiov,
1609     BdrvRequestFlags flags)
1610 {
1611     BlockDriverState *bs = child->bs;
1612     BdrvTrackedRequest req;
1613     uint64_t align = bs->bl.request_alignment;
1614     uint8_t *head_buf = NULL;
1615     uint8_t *tail_buf = NULL;
1616     QEMUIOVector local_qiov;
1617     bool use_local_qiov = false;
1618     int ret;
1619
1620     trace_bdrv_co_pwritev(child->bs, offset, bytes, flags);
1621
1622     if (!bs->drv) {
1623         return -ENOMEDIUM;
1624     }
1625     if (bs->read_only) {
1626         return -EPERM;
1627     }
1628     assert(!(bs->open_flags & BDRV_O_INACTIVE));
1629
1630     ret = bdrv_check_byte_request(bs, offset, bytes);
1631     if (ret < 0) {
1632         return ret;
1633     }
1634
1635     bdrv_inc_in_flight(bs);
1636     /*
1637      * Align write if necessary by performing a read-modify-write cycle.
1638      * Pad qiov with the read parts and be sure to have a tracked request not
1639      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
1640      */
1641     tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
1642
1643     if (!qiov) {
1644         ret = bdrv_co_do_zero_pwritev(child, offset, bytes, flags, &req);
1645         goto out;
1646     }
1647
1648     if (offset & (align - 1)) {
1649         QEMUIOVector head_qiov;
1650         struct iovec head_iov;
1651
1652         mark_request_serialising(&req, align);
1653         wait_serialising_requests(&req);
1654
1655         head_buf = qemu_blockalign(bs, align);
1656         head_iov = (struct iovec) {
1657             .iov_base   = head_buf,
1658             .iov_len    = align,
1659         };
1660         qemu_iovec_init_external(&head_qiov, &head_iov, 1);
1661
1662         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD);
1663         ret = bdrv_aligned_preadv(child, &req, offset & ~(align - 1), align,
1664                                   align, &head_qiov, 0);
1665         if (ret < 0) {
1666             goto fail;
1667         }
1668         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD);
1669
1670         qemu_iovec_init(&local_qiov, qiov->niov + 2);
1671         qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1));
1672         qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1673         use_local_qiov = true;
1674
1675         bytes += offset & (align - 1);
1676         offset = offset & ~(align - 1);
1677
1678         /* We have read the tail already if the request is smaller
1679          * than one aligned block.
1680          */
1681         if (bytes < align) {
1682             qemu_iovec_add(&local_qiov, head_buf + bytes, align - bytes);
1683             bytes = align;
1684         }
1685     }
1686
1687     if ((offset + bytes) & (align - 1)) {
1688         QEMUIOVector tail_qiov;
1689         struct iovec tail_iov;
1690         size_t tail_bytes;
1691         bool waited;
1692
1693         mark_request_serialising(&req, align);
1694         waited = wait_serialising_requests(&req);
1695         assert(!waited || !use_local_qiov);
1696
1697         tail_buf = qemu_blockalign(bs, align);
1698         tail_iov = (struct iovec) {
1699             .iov_base   = tail_buf,
1700             .iov_len    = align,
1701         };
1702         qemu_iovec_init_external(&tail_qiov, &tail_iov, 1);
1703
1704         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL);
1705         ret = bdrv_aligned_preadv(child, &req, (offset + bytes) & ~(align - 1),
1706                                   align, align, &tail_qiov, 0);
1707         if (ret < 0) {
1708             goto fail;
1709         }
1710         bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL);
1711
1712         if (!use_local_qiov) {
1713             qemu_iovec_init(&local_qiov, qiov->niov + 1);
1714             qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size);
1715             use_local_qiov = true;
1716         }
1717
1718         tail_bytes = (offset + bytes) & (align - 1);
1719         qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes);
1720
1721         bytes = ROUND_UP(bytes, align);
1722     }
1723
1724     ret = bdrv_aligned_pwritev(child, &req, offset, bytes, align,
1725                                use_local_qiov ? &local_qiov : qiov,
1726                                flags);
1727
1728 fail:
1729
1730     if (use_local_qiov) {
1731         qemu_iovec_destroy(&local_qiov);
1732     }
1733     qemu_vfree(head_buf);
1734     qemu_vfree(tail_buf);
1735 out:
1736     tracked_request_end(&req);
1737     bdrv_dec_in_flight(bs);
1738     return ret;
1739 }
1740
1741 static int coroutine_fn bdrv_co_do_writev(BdrvChild *child,
1742     int64_t sector_num, int nb_sectors, QEMUIOVector *qiov,
1743     BdrvRequestFlags flags)
1744 {
1745     if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) {
1746         return -EINVAL;
1747     }
1748
1749     return bdrv_co_pwritev(child, sector_num << BDRV_SECTOR_BITS,
1750                            nb_sectors << BDRV_SECTOR_BITS, qiov, flags);
1751 }
1752
1753 int coroutine_fn bdrv_co_writev(BdrvChild *child, int64_t sector_num,
1754     int nb_sectors, QEMUIOVector *qiov)
1755 {
1756     return bdrv_co_do_writev(child, sector_num, nb_sectors, qiov, 0);
1757 }
1758
1759 int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset,
1760                                        int bytes, BdrvRequestFlags flags)
1761 {
1762     trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags);
1763
1764     if (!(child->bs->open_flags & BDRV_O_UNMAP)) {
1765         flags &= ~BDRV_REQ_MAY_UNMAP;
1766     }
1767
1768     return bdrv_co_pwritev(child, offset, bytes, NULL,
1769                            BDRV_REQ_ZERO_WRITE | flags);
1770 }
1771
1772 /*
1773  * Flush ALL BDSes regardless of if they are reachable via a BlkBackend or not.
1774  */
1775 int bdrv_flush_all(void)
1776 {
1777     BdrvNextIterator it;
1778     BlockDriverState *bs = NULL;
1779     int result = 0;
1780
1781     for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
1782         AioContext *aio_context = bdrv_get_aio_context(bs);
1783         int ret;
1784
1785         aio_context_acquire(aio_context);
1786         ret = bdrv_flush(bs);
1787         if (ret < 0 && !result) {
1788             result = ret;
1789         }
1790         aio_context_release(aio_context);
1791     }
1792
1793     return result;
1794 }
1795
1796
/*
 * Bundle of inputs and outputs for a block-status query.
 *
 * NOTE(review): the code that fills in and consumes this structure lies
 * outside this excerpt, so the per-field notes below are inferred from the
 * field names and types only -- confirm them against the users.
 */
1797 typedef struct BdrvCoBlockStatusData {
1798     BlockDriverState *bs;       /* presumably the node being queried */
1799     BlockDriverState *base;     /* presumably where the query stops; TODO confirm */
1800     bool want_zero;             /* presumably the 'want_zero' mode of the query */
1801     int64_t offset;             /* presumably start of the queried range */
1802     int64_t bytes;              /* presumably length of the queried range */
1803     int64_t *pnum;              /* out pointer; meaning set by the user, TODO confirm */
1804     int64_t *map;               /* out pointer; meaning set by the user, TODO confirm */
1805     BlockDriverState **file;    /* out pointer; meaning set by the user, TODO confirm */
1806     int ret;                    /* presumably the result of the query */
1807     bool done;                  /* presumably set once 'ret' is valid */
1808 } BdrvCoBlockStatusData;
1809
1810 int64_t coroutine_fn bdrv_co_get_block_status_from_file(BlockDriverState *bs,
1811                                                         int64_t sector_num,
1812                                                         int nb_sectors,
1813                                                         int *pnum,
1814                                                         BlockDriverState **file)
1815 {
1816     assert(bs->file && bs->file->bs);
1817     *pnum = nb_sectors;
1818     *file = bs->file->bs;
1819     return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID |
1820            (sector_num << BDRV_SECTOR_BITS);
1821 }
1822
1823 int64_t coroutine_fn bdrv_co_get_block_status_from_backing(BlockDriverState *bs,
1824                                                            int64_t sector_num,
1825                                                            int nb_sectors,
1826                                                            int *pnum,
1827                                                            BlockDriverState **file)
1828 {
1829     assert(bs->backing && bs->backing->bs);
1830     *pnum = nb_sectors;
1831     *file = bs->backing->bs;
1832     return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID |
1833            (sector_num << BDRV_SECTOR_BITS);
1834 }
1835
/*
 * Returns the allocation status of the specified byte range.
 * Drivers not implementing the functionality are assumed to not support
 * backing files, hence all their sectors are reported as allocated.
 *
 * If 'want_zero' is true, the caller is querying for mapping purposes,
 * and the result should include BDRV_BLOCK_OFFSET_VALID and
 * BDRV_BLOCK_ZERO where possible; otherwise, the result may omit those
 * bits particularly if it allows for a larger value in 'pnum'.
 *
 * If 'offset' is at or beyond the end of the disk image the return value is
 * BDRV_BLOCK_EOF and 'pnum' is set to 0.
 *
 * 'bytes' is the max value 'pnum' should be set to.  If bytes goes
 * beyond the end of the disk image it will be clamped; if 'pnum' is set to
 * the end of the image, then the returned value will include BDRV_BLOCK_EOF.
 * A zero-length request ('bytes' == 0) that starts inside the image returns
 * 0 with 'pnum' set to 0.
 *
 * 'pnum' is set to the number of bytes (including and immediately
 * following the specified offset) that are easily known to be in the
 * same allocated/unallocated state.  Note that a second call starting
 * at the original offset plus returned pnum may have the same status.
 * The returned value is non-zero on success except at end-of-file.
 *
 * Returns negative errno on failure.  Otherwise, if the
 * BDRV_BLOCK_OFFSET_VALID bit is set, 'map' and 'file' (if non-NULL) are
 * set to the host mapping and BDS corresponding to the guest offset.
 * 'map' and 'file' (if non-NULL) are written on every return path; when no
 * mapping was determined they receive 0 and NULL respectively.
 */
static int coroutine_fn bdrv_co_block_status(BlockDriverState *bs,
                                             bool want_zero,
                                             int64_t offset, int64_t bytes,
                                             int64_t *pnum, int64_t *map,
                                             BlockDriverState **file)
{
    int64_t total_size;
    int64_t n; /* bytes */
    int ret;
    int64_t local_map = 0;
    BlockDriverState *local_file = NULL;
    int64_t aligned_offset, aligned_bytes;
    uint32_t align;

    assert(pnum);
    *pnum = 0;
    total_size = bdrv_getlength(bs);
    if (total_size < 0) {
        ret = total_size;
        goto early_out;
    }

    if (offset >= total_size) {
        ret = BDRV_BLOCK_EOF;
        goto early_out;
    }
    if (!bytes) {
        ret = 0;
        goto early_out;
    }

    /* Clamp the request so that it does not extend past the image end */
    n = total_size - offset;
    if (n < bytes) {
        bytes = n;
    }

    /* Must be non-NULL or bdrv_getlength() would have failed */
    assert(bs->drv);
    if (!bs->drv->bdrv_co_get_block_status) {
        /*
         * No driver callback: report the whole (clamped) range as
         * allocated data.
         */
        *pnum = bytes;
        ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED;
        if (offset + bytes == total_size) {
            ret |= BDRV_BLOCK_EOF;
        }
        if (bs->drv->protocol_name) {
            /* The guest offset is reported as the mapping within bs itself */
            ret |= BDRV_BLOCK_OFFSET_VALID;
            local_map = offset;
            local_file = bs;
        }
        goto early_out;
    }

    /* From here on every exit must go through 'out' to drop this count */
    bdrv_inc_in_flight(bs);

    /* Round out to request_alignment boundaries */
    /* TODO: until we have a byte-based driver callback, we also have to
     * round out to sectors, even if that is bigger than request_alignment */
    align = MAX(bs->bl.request_alignment, BDRV_SECTOR_SIZE);
    aligned_offset = QEMU_ALIGN_DOWN(offset, align);
    aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset;

    {
        int count; /* sectors */
        int64_t longret;

        assert(QEMU_IS_ALIGNED(aligned_offset | aligned_bytes,
                               BDRV_SECTOR_SIZE));
        /*
         * The contract allows us to return pnum smaller than bytes, even
         * if the next query would see the same status; we truncate the
         * request to avoid overflowing the driver's 32-bit interface.
         */
        longret = bs->drv->bdrv_co_get_block_status(
            bs, aligned_offset >> BDRV_SECTOR_BITS,
            MIN(INT_MAX, aligned_bytes) >> BDRV_SECTOR_BITS, &count,
            &local_file);
        if (longret < 0) {
            assert(INT_MIN <= longret);
            ret = longret;
            goto out;
        }
        /*
         * The driver's 64-bit result carries both status flags and, when
         * BDRV_BLOCK_OFFSET_VALID is set, the mapped offset; split them.
         */
        if (longret & BDRV_BLOCK_OFFSET_VALID) {
            local_map = longret & BDRV_BLOCK_OFFSET_MASK;
        }
        ret = longret & ~BDRV_BLOCK_OFFSET_MASK;
        *pnum = count * BDRV_SECTOR_SIZE;
    }

    /*
     * The driver's result must be a multiple of request_alignment.
     * Clamp pnum and adjust map to original request.
     */
    assert(QEMU_IS_ALIGNED(*pnum, align) && align > offset - aligned_offset);
    *pnum -= offset - aligned_offset;
    if (*pnum > bytes) {
        *pnum = bytes;
    }
    if (ret & BDRV_BLOCK_OFFSET_VALID) {
        local_map += offset - aligned_offset;
    }

    if (ret & BDRV_BLOCK_RAW) {
        /*
         * The driver deferred to local_file: the answer is whatever that
         * node reports at the mapped offset.  The recursive call overwrites
         * *pnum, local_map and local_file.
         */
        assert(ret & BDRV_BLOCK_OFFSET_VALID && local_file);
        ret = bdrv_co_block_status(local_file, want_zero, local_map,
                                   *pnum, pnum, &local_map, &local_file);
        goto out;
    }

    if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
        ret |= BDRV_BLOCK_ALLOCATED;
    } else if (want_zero) {
        /*
         * Unallocated in this layer: it is still known to read as zero if
         * the driver says unallocated blocks are zero, or if the offset is
         * at or past the end of the backing node.
         */
        if (bdrv_unallocated_blocks_are_zero(bs)) {
            ret |= BDRV_BLOCK_ZERO;
        } else if (bs->backing) {
            BlockDriverState *bs2 = bs->backing->bs;
            int64_t size2 = bdrv_getlength(bs2);

            if (size2 >= 0 && offset >= size2) {
                ret |= BDRV_BLOCK_ZERO;
            }
        }
    }

    /*
     * For data with a valid mapping into a different node, ask that node
     * whether the mapped range is known to read as zero.
     */
    if (want_zero && local_file && local_file != bs &&
        (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) &&
        (ret & BDRV_BLOCK_OFFSET_VALID)) {
        int64_t file_pnum;
        int ret2;

        ret2 = bdrv_co_block_status(local_file, want_zero, local_map,
                                    *pnum, &file_pnum, NULL, NULL);
        if (ret2 >= 0) {
            /* Ignore errors.  This is just providing extra information, it
             * is useful but not necessary.
             */
            if (ret2 & BDRV_BLOCK_EOF &&
                (!file_pnum || ret2 & BDRV_BLOCK_ZERO)) {
                /*
                 * It is valid for the format block driver to read
                 * beyond the end of the underlying file's current
                 * size; such areas read as zero.
                 */
                ret |= BDRV_BLOCK_ZERO;
            } else {
                /* Limit request to the range reported by the protocol driver */
                *pnum = file_pnum;
                ret |= (ret2 & BDRV_BLOCK_ZERO);
            }
        }
    }

out:
    bdrv_dec_in_flight(bs);
    /* Flag EOF when the reported range ends exactly at the image end */
    if (ret >= 0 && offset + *pnum == total_size) {
        ret |= BDRV_BLOCK_EOF;
    }
early_out:
    /* Reached without the in-flight count by the paths that bail out early */
    if (file) {
        *file = local_file;
    }
    if (map) {
        *map = local_map;
    }
    return ret;
}
2028
2029 static int coroutine_fn bdrv_co_block_status_above(BlockDriverState *bs,
2030                                                    BlockDriverState *base,
2031                                                    bool want_zero,
2032                                                    int64_t offset,
2033                                                    int64_t bytes,
2034                                                    int64_t *pnum,
2035                                                    int64_t *map,
2036                                                    BlockDriverState **file)
2037 {
2038     BlockDriverState *p;
2039     int ret = 0;
2040     bool first = true;
2041
2042     assert(bs != base);
2043     for (p = bs; p != base; p = backing_bs(p)) {
2044         ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
2045                                    file);
2046         if (ret < 0) {
2047             break;
2048         }
2049         if (ret & BDRV_BLOCK_ZERO && ret & BDRV_BLOCK_EOF && !first) {
2050             /*
2051              * Reading beyond the end of the file continues to read
2052              * zeroes, but we can only widen the result to the
2053              * unallocated length we learned from an earlier
2054              * iteration.
2055              */
2056             *pnum = bytes;
2057         }
2058         if (ret & (BDRV_BLOCK_ZERO | BDRV_BLOCK_DATA)) {
2059             break;
2060         }
2061         /* [offset, pnum] unallocated on this layer, which could be only
2062          * the first part of [offset, bytes].  */
2063         bytes = MIN(bytes, *pnum);
2064         first = false;
2065     }
2066     return ret;
2067 }
2068
2069 /* Coroutine wrapper for bdrv_block_status_above() */
2070 static void coroutine_fn bdrv_block_status_above_co_entry(void *opaque)
2071 {
2072     BdrvCoBlockStatusData *data = opaque;
2073
2074     data->ret = bdrv_co_block_status_above(data->bs, data->base,
2075                                            data->want_zero,
2076                                            data->offset, data->bytes,
2077                                            data->pnum, data->map, data->file);
2078     data->done = true;
2079 }
2080
2081 /*
2082  * Synchronous wrapper around bdrv_co_block_status_above().
2083  *
2084  * See bdrv_co_block_status_above() for details.
2085  */
2086 static int bdrv_common_block_status_above(BlockDriverState *bs,
2087                                           BlockDriverState *base,
2088                                           bool want_zero, int64_t offset,
2089                                           int64_t bytes, int64_t *pnum,
2090                                           int64_t *map,
2091                                           BlockDriverState **file)
2092 {
2093     Coroutine *co;
2094     BdrvCoBlockStatusData data = {
2095         .bs = bs,
2096         .base = base,
2097         .want_zero = want_zero,
2098         .offset = offset,
2099         .bytes = bytes,
2100         .pnum = pnum,
2101         .map = map,
2102         .file = file,
2103         .done = false,
2104     };
2105
2106     if (qemu_in_coroutine()) {
2107         /* Fast-path if already in coroutine context */
2108         bdrv_block_status_above_co_entry(&data);
2109     } else {
2110         co = qemu_coroutine_create(bdrv_block_status_above_co_entry, &data);
2111         bdrv_coroutine_enter(bs, co);
2112         BDRV_POLL_WHILE(bs, !data.done);
2113     }
2114     return data.ret;
2115 }
2116
2117 int bdrv_block_status_above(BlockDriverState *bs, BlockDriverState *base,
2118                             int64_t offset, int64_t bytes, int64_t *pnum,
2119                             int64_t *map, BlockDriverState **file)
2120 {
2121     return bdrv_common_block_status_above(bs, base, true, offset, bytes,
2122                                           pnum, map, file);
2123 }
2124
2125 int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
2126                       int64_t *pnum, int64_t *map, BlockDriverState **file)
2127 {
2128     return bdrv_block_status_above(bs, backing_bs(bs),
2129                                    offset, bytes, pnum, map, file);
2130 }
2131
2132 int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t offset,
2133                                    int64_t bytes, int64_t *pnum)
2134 {
2135     int ret;
2136     int64_t dummy;
2137
2138     ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset,
2139                                          bytes, pnum ? pnum : &dummy, NULL,
2140                                          NULL);
2141     if (ret < 0) {
2142         return ret;
2143     }
2144     return !!(ret & BDRV_BLOCK_ALLOCATED);
2145 }
2146
2147 /*
2148  * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP]
2149  *
2150  * Return true if (a prefix of) the given range is allocated in any image
2151  * between BASE and TOP (inclusive).  BASE can be NULL to check if the given
2152  * offset is allocated in any image of the chain.  Return false otherwise,
2153  * or negative errno on failure.
2154  *
2155  * 'pnum' is set to the number of bytes (including and immediately
2156  * following the specified offset) that are known to be in the same
2157  * allocated/unallocated state.  Note that a subsequent call starting
2158  * at 'offset + *pnum' may return the same allocation status (in other
2159  * words, the result is not necessarily the maximum possible range);
2160  * but 'pnum' will only be 0 when end of file is reached.
2161  *
2162  */
2163 int bdrv_is_allocated_above(BlockDriverState *top,
2164                             BlockDriverState *base,
2165                             int64_t offset, int64_t bytes, int64_t *pnum)
2166 {
2167     BlockDriverState *intermediate;
2168     int ret;
2169     int64_t n = bytes;
2170
2171     intermediate = top;
2172     while (intermediate && intermediate != base) {
2173         int64_t pnum_inter;
2174         int64_t size_inter;
2175
2176         ret = bdrv_is_allocated(intermediate, offset, bytes, &pnum_inter);
2177         if (ret < 0) {
2178             return ret;
2179         }
2180         if (ret) {
2181             *pnum = pnum_inter;
2182             return 1;
2183         }
2184
2185         size_inter = bdrv_getlength(intermediate);
2186         if (size_inter < 0) {
2187             return size_inter;
2188         }
2189         if (n > pnum_inter &&
2190             (intermediate == top || offset + pnum_inter < size_inter)) {
2191             n = pnum_inter;
2192         }
2193
2194         intermediate = backing_bs(intermediate);
2195     }
2196
2197     *pnum = n;
2198     return 0;
2199 }
2200
/*
 * Argument/result bundle handed to bdrv_co_rw_vmstate_entry() so that
 * bdrv_co_rw_vmstate() can be run inside a coroutine.
 */
typedef struct BdrvVmstateCo {
    BlockDriverState    *bs;        /* node to operate on */
    QEMUIOVector        *qiov;      /* I/O vector passed to the operation */
    int64_t             pos;        /* position passed to the operation */
    bool                is_read;    /* true: load, false: save */
    int                 ret;        /* result of bdrv_co_rw_vmstate() */
} BdrvVmstateCo;
2208
2209 static int coroutine_fn
2210 bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
2211                    bool is_read)
2212 {
2213     BlockDriver *drv = bs->drv;
2214     int ret = -ENOTSUP;
2215
2216     bdrv_inc_in_flight(bs);
2217
2218     if (!drv) {
2219         ret = -ENOMEDIUM;
2220     } else if (drv->bdrv_load_vmstate) {
2221         if (is_read) {
2222             ret = drv->bdrv_load_vmstate(bs, qiov, pos);
2223         } else {
2224             ret = drv->bdrv_save_vmstate(bs, qiov, pos);
2225         }
2226     } else if (bs->file) {
2227         ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
2228     }
2229
2230     bdrv_dec_in_flight(bs);
2231     return ret;
2232 }
2233
2234 static void coroutine_fn bdrv_co_rw_vmstate_entry(void *opaque)
2235 {
2236     BdrvVmstateCo *co = opaque;
2237     co->ret = bdrv_co_rw_vmstate(co->bs, co->qiov, co->pos, co->is_read);
2238 }
2239
2240 static inline int
2241 bdrv_rw_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos,
2242                 bool is_read)
2243 {
2244     if (qemu_in_coroutine()) {
2245         return bdrv_co_rw_vmstate(bs, qiov, pos, is_read);
2246     } else {
2247         BdrvVmstateCo data = {
2248             .bs         = bs,
2249             .qiov       = qiov,
2250             .pos        = pos,
2251             .is_read    = is_read,
2252             .ret        = -EINPROGRESS,
2253         };
2254         Coroutine *co = qemu_coroutine_create(bdrv_co_rw_vmstate_entry, &data);
2255
2256         bdrv_coroutine_enter(bs, co);
2257         BDRV_POLL_WHILE(bs, data.ret == -EINPROGRESS);
2258         return data.ret;
2259     }
2260 }
2261
2262 int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf,
2263                       int64_t pos, int size)
2264 {
2265     QEMUIOVector qiov;
2266     struct iovec iov = {
2267         .iov_base   = (void *) buf,
2268         .iov_len    = size,
2269     };
2270     int ret;
2271
2272     qemu_iovec_init_external(&qiov, &iov, 1);
2273
2274     ret = bdrv_writev_vmstate(bs, &qiov, pos);
2275     if (ret < 0) {
2276         return ret;
2277     }
2278
2279     return size;
2280 }
2281
2282 int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2283 {
2284     return bdrv_rw_vmstate(bs, qiov, pos, false);
2285 }
2286
2287 int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf,
2288                       int64_t pos, int size)
2289 {
2290     QEMUIOVector qiov;
2291     struct iovec iov = {
2292         .iov_base   = buf,
2293         .iov_len    = size,
2294     };
2295     int ret;
2296
2297     qemu_iovec_init_external(&qiov, &iov, 1);
2298     ret = bdrv_readv_vmstate(bs, &qiov, pos);
2299     if (ret < 0) {
2300         return ret;
2301     }
2302
2303     return size;
2304 }
2305
2306 int bdrv_readv_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos)
2307 {
2308     return bdrv_rw_vmstate(bs, qiov, pos, true);
2309 }
2310
2311 /**************************************************************/
2312 /* async I/Os */
2313
2314 void bdrv_aio_cancel(BlockAIOCB *acb)
2315 {
2316     qemu_aio_ref(acb);
2317     bdrv_aio_cancel_async(acb);
2318     while (acb->refcnt > 1) {
2319         if (acb->aiocb_info->get_aio_context) {
2320             aio_poll(acb->aiocb_info->get_aio_context(acb), true);
2321         } else if (acb->bs) {
2322             /* qemu_aio_ref and qemu_aio_unref are not thread-safe, so
2323              * assert that we're not using an I/O thread.  Thread-safe
2324              * code should use bdrv_aio_cancel_async exclusively.
2325              */
2326             assert(bdrv_get_aio_context(acb->bs) == qemu_get_aio_context());
2327             aio_poll(bdrv_get_aio_context(acb->bs), true);
2328         } else {
2329             abort();
2330         }
2331     }
2332     qemu_aio_unref(acb);
2333 }
2334
2335 /* Async version of aio cancel. The caller is not blocked if the acb implements
2336  * cancel_async, otherwise we do nothing and let the request normally complete.
2337  * In either case the completion callback must be called. */
2338 void bdrv_aio_cancel_async(BlockAIOCB *acb)
2339 {
2340     if (acb->aiocb_info->cancel_async) {
2341         acb->aiocb_info->cancel_async(acb);
2342     }
2343 }
2344
2345 /**************************************************************/
2346 /* Coroutine block device emulation */
2347
/*
 * Argument/result bundle handed to bdrv_flush_co_entry() so that
 * bdrv_co_flush() can be run inside a coroutine.
 */
typedef struct FlushCo {
    BlockDriverState *bs;   /* node to flush */
    int ret;                /* result of bdrv_co_flush() */
} FlushCo;
2352
2353
2354 static void coroutine_fn bdrv_flush_co_entry(void *opaque)
2355 {
2356     FlushCo *rwco = opaque;
2357
2358     rwco->ret = bdrv_co_flush(rwco->bs);
2359 }
2360
/*
 * Flush 'bs' from coroutine context.
 *
 * Nothing is done (and 0 is returned) when the node is not inserted, is
 * read-only, or is an sg device.  Otherwise the flush waits for any flush
 * already active on the node, then either calls the driver's single
 * bdrv_co_flush callback, or runs the flush-to-OS / flush-to-disk callbacks
 * followed by a flush of bs->file.
 *
 * The node is counted as in flight for the duration of the call.
 *
 * Returns 0 on success or a negative errno on failure.
 */
int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
{
    int current_gen;
    int ret = 0;

    bdrv_inc_in_flight(bs);

    if (!bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
        bdrv_is_sg(bs)) {
        goto early_exit;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    /* Snapshot the write generation this flush is going to cover */
    current_gen = atomic_read(&bs->write_gen);

    /* Wait until any previous flushes are completed */
    while (bs->active_flush_req) {
        qemu_co_queue_wait(&bs->flush_queue, &bs->reqs_lock);
    }

    /* Flushes reach this point in nondecreasing current_gen order.  */
    bs->active_flush_req = true;
    qemu_co_mutex_unlock(&bs->reqs_lock);

    /* Write back all layers by calling one driver function */
    if (bs->drv->bdrv_co_flush) {
        ret = bs->drv->bdrv_co_flush(bs);
        goto out;
    }

    /* Write back cached data to the OS even with cache=unsafe */
    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
    if (bs->drv->bdrv_co_flush_to_os) {
        ret = bs->drv->bdrv_co_flush_to_os(bs);
        if (ret < 0) {
            goto out;
        }
    }

    /* But don't actually force it to the disk with cache=unsafe */
    if (bs->open_flags & BDRV_O_NO_FLUSH) {
        goto flush_parent;
    }

    /* Check if we really need to flush anything */
    if (bs->flushed_gen == current_gen) {
        goto flush_parent;
    }

    BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
    if (!bs->drv) {
        /* bs->drv->bdrv_co_flush() might have ejected the BDS
         * (even in case of apparent success) */
        ret = -ENOMEDIUM;
        goto out;
    }
    if (bs->drv->bdrv_co_flush_to_disk) {
        ret = bs->drv->bdrv_co_flush_to_disk(bs);
    } else if (bs->drv->bdrv_aio_flush) {
        BlockAIOCB *acb;
        CoroutineIOCompletion co = {
            .coroutine = qemu_coroutine_self(),
        };

        /* Emulate a coroutine flush on top of the driver's AIO callback */
        acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co);
        if (acb == NULL) {
            ret = -EIO;
        } else {
            qemu_coroutine_yield();
            ret = co.ret;
        }
    } else {
        /*
         * Some block drivers always operate in either writethrough or unsafe
         * mode and don't support bdrv_flush therefore. Usually qemu doesn't
         * know how the server works (because the behaviour is hardcoded or
         * depends on server-side configuration), so we can't ensure that
         * everything is safe on disk. Returning an error doesn't work because
         * that would break guests even if the server operates in writethrough
         * mode.
         *
         * Let's hope the user knows what he's doing.
         */
        ret = 0;
    }

    if (ret < 0) {
        goto out;
    }

    /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
     * in the case of cache=unsafe, so there are no useless flushes.
     */
flush_parent:
    /* Despite the label name, this flushes the bs->file child (if any) */
    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
out:
    /* Notify any pending flushes that we have completed */
    if (ret == 0) {
        /* Record the generation snapshotted above as flushed */
        bs->flushed_gen = current_gen;
    }

    qemu_co_mutex_lock(&bs->reqs_lock);
    bs->active_flush_req = false;
    /* Return value is ignored - it's ok if wait queue is empty */
    qemu_co_queue_next(&bs->flush_queue);
    qemu_co_mutex_unlock(&bs->reqs_lock);

early_exit:
    bdrv_dec_in_flight(bs);
    return ret;
}
2472
2473 int bdrv_flush(BlockDriverState *bs)
2474 {
2475     Coroutine *co;
2476     FlushCo flush_co = {
2477         .bs = bs,
2478         .ret = NOT_DONE,
2479     };
2480
2481     if (qemu_in_coroutine()) {
2482         /* Fast-path if already in coroutine context */
2483         bdrv_flush_co_entry(&flush_co);
2484     } else {
2485         co = qemu_coroutine_create(bdrv_flush_co_entry, &flush_co);
2486         bdrv_coroutine_enter(bs, co);
2487         BDRV_POLL_WHILE(bs, flush_co.ret == NOT_DONE);
2488     }
2489
2490     return flush_co.ret;
2491 }
2492
/*
 * Argument/result bundle handed to bdrv_pdiscard_co_entry() so that
 * bdrv_co_pdiscard() can be run inside a coroutine.
 */
typedef struct DiscardCo {
    BlockDriverState *bs;   /* node to discard on */
    int64_t offset;         /* start of the range, in bytes */
    int bytes;              /* length of the range, in bytes */
    int ret;                /* result of bdrv_co_pdiscard() */
} DiscardCo;
2499 static void coroutine_fn bdrv_pdiscard_co_entry(void *opaque)
2500 {
2501     DiscardCo *rwco = opaque;
2502
2503     rwco->ret = bdrv_co_pdiscard(rwco->bs, rwco->offset, rwco->bytes);
2504 }
2505
/*
 * Discard the byte range [offset, offset + bytes) on 'bs'.
 *
 * The request is split into fragments: a short head fragment up to the
 * next discard-alignment boundary, aligned middle fragments of at most
 * max_pdiscard bytes, and a tail fragment.  Each fragment is passed to the
 * driver's bdrv_co_pdiscard callback, or to bdrv_aio_pdiscard when only
 * that one exists.  A fragment failing with -ENOTSUP is skipped and the
 * loop continues; any other error stops the loop.
 *
 * Returns:
 *   0           on success, and also when discard is disabled for the node
 *               (BDRV_O_UNMAP not set) or the driver has no discard
 *               callback;
 *   -ENOMEDIUM  if the node has no driver (checked again per fragment);
 *   -EPERM      if the node has read-only bitmaps or is read-only;
 *   -EIO        if the AIO callback returns no request;
 *   otherwise the negative value from bdrv_check_byte_request(), the
 *   before-write notifiers, or the driver.
 */
int coroutine_fn bdrv_co_pdiscard(BlockDriverState *bs, int64_t offset,
                                  int bytes)
{
    BdrvTrackedRequest req;
    int max_pdiscard, ret;
    int head, tail, align;

    if (!bs->drv) {
        return -ENOMEDIUM;
    }

    if (bdrv_has_readonly_bitmaps(bs)) {
        return -EPERM;
    }

    ret = bdrv_check_byte_request(bs, offset, bytes);
    if (ret < 0) {
        return ret;
    } else if (bs->read_only) {
        return -EPERM;
    }
    assert(!(bs->open_flags & BDRV_O_INACTIVE));

    /* Do nothing if disabled.  */
    if (!(bs->open_flags & BDRV_O_UNMAP)) {
        return 0;
    }

    /* Driver cannot discard at all: treated as success */
    if (!bs->drv->bdrv_co_pdiscard && !bs->drv->bdrv_aio_pdiscard) {
        return 0;
    }

    /* Discard is advisory, but some devices track and coalesce
     * unaligned requests, so we must pass everything down rather than
     * round here.  Still, most devices will just silently ignore
     * unaligned requests (by returning -ENOTSUP), so we must fragment
     * the request accordingly.  */
    align = MAX(bs->bl.pdiscard_alignment, bs->bl.request_alignment);
    assert(align % bs->bl.request_alignment == 0);
    /* Misalignment of the start and of the end relative to 'align' */
    head = offset % align;
    tail = (offset + bytes) % align;

    bdrv_inc_in_flight(bs);
    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_DISCARD);

    ret = notifier_with_return_list_notify(&bs->before_write_notifiers, &req);
    if (ret < 0) {
        goto out;
    }

    /* Largest fragment size: the driver limit (INT_MAX if 0), aligned down */
    max_pdiscard = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_pdiscard, INT_MAX),
                                   align);
    assert(max_pdiscard >= bs->bl.request_alignment);

    while (bytes > 0) {
        int num = bytes;

        if (head) {
            /* Make small requests to get to alignment boundaries. */
            num = MIN(bytes, align - head);
            if (!QEMU_IS_ALIGNED(num, bs->bl.request_alignment)) {
                num %= bs->bl.request_alignment;
            }
            head = (head + num) % align;
            assert(num < max_pdiscard);
        } else if (tail) {
            if (num > align) {
                /* Shorten the request to the last aligned cluster.  */
                num -= tail;
            } else if (!QEMU_IS_ALIGNED(tail, bs->bl.request_alignment) &&
                       tail > bs->bl.request_alignment) {
                /* Split the tail at the last request_alignment boundary */
                tail %= bs->bl.request_alignment;
                num -= tail;
            }
        }
        /* limit request size */
        if (num > max_pdiscard) {
            num = max_pdiscard;
        }

        /* Re-checked on every iteration, not only on entry */
        if (!bs->drv) {
            ret = -ENOMEDIUM;
            goto out;
        }
        if (bs->drv->bdrv_co_pdiscard) {
            ret = bs->drv->bdrv_co_pdiscard(bs, offset, num);
        } else {
            BlockAIOCB *acb;
            CoroutineIOCompletion co = {
                .coroutine = qemu_coroutine_self(),
            };

            acb = bs->drv->bdrv_aio_pdiscard(bs, offset, num,
                                             bdrv_co_io_em_complete, &co);
            if (acb == NULL) {
                ret = -EIO;
                goto out;
            } else {
                qemu_coroutine_yield();
                ret = co.ret;
            }
        }
        /* -ENOTSUP on a fragment is tolerated; move on to the next one */
        if (ret && ret != -ENOTSUP) {
            goto out;
        }

        offset += num;
        bytes -= num;
    }
    ret = 0;
out:
    /*
     * Runs on success and on failure alike: bump the write generation and
     * mark the originally requested range (as recorded in req) dirty.
     */
    atomic_inc(&bs->write_gen);
    bdrv_set_dirty(bs, req.offset, req.bytes);
    tracked_request_end(&req);
    bdrv_dec_in_flight(bs);
    return ret;
}
2623
2624 int bdrv_pdiscard(BlockDriverState *bs, int64_t offset, int bytes)
2625 {
2626     Coroutine *co;
2627     DiscardCo rwco = {
2628         .bs = bs,
2629         .offset = offset,
2630         .bytes = bytes,
2631         .ret = NOT_DONE,
2632     };
2633
2634     if (qemu_in_coroutine()) {
2635         /* Fast-path if already in coroutine context */
2636         bdrv_pdiscard_co_entry(&rwco);
2637     } else {
2638         co = qemu_coroutine_create(bdrv_pdiscard_co_entry, &rwco);
2639         bdrv_coroutine_enter(bs, co);
2640         BDRV_POLL_WHILE(bs, rwco.ret == NOT_DONE);
2641     }
2642
2643     return rwco.ret;
2644 }
2645
2646 int bdrv_co_ioctl(BlockDriverState *bs, int req, void *buf)
2647 {
2648     BlockDriver *drv = bs->drv;
2649     CoroutineIOCompletion co = {
2650         .coroutine = qemu_coroutine_self(),
2651     };
2652     BlockAIOCB *acb;
2653
2654     bdrv_inc_in_flight(bs);
2655     if (!drv || (!drv->bdrv_aio_ioctl && !drv->bdrv_co_ioctl)) {
2656         co.ret = -ENOTSUP;
2657         goto out;
2658     }
2659
2660     if (drv->bdrv_co_ioctl) {
2661         co.ret = drv->bdrv_co_ioctl(bs, req, buf);
2662     } else {
2663         acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
2664         if (!acb) {
2665             co.ret = -ENOTSUP;
2666             goto out;
2667         }
2668         qemu_coroutine_yield();
2669     }
2670 out:
2671     bdrv_dec_in_flight(bs);
2672     return co.ret;
2673 }
2674
2675 void *qemu_blockalign(BlockDriverState *bs, size_t size)
2676 {
2677     return qemu_memalign(bdrv_opt_mem_align(bs), size);
2678 }
2679
2680 void *qemu_blockalign0(BlockDriverState *bs, size_t size)
2681 {
2682     return memset(qemu_blockalign(bs, size), 0, size);
2683 }
2684
2685 void *qemu_try_blockalign(BlockDriverState *bs, size_t size)
2686 {
2687     size_t align = bdrv_opt_mem_align(bs);
2688
2689     /* Ensure that NULL is never returned on success */
2690     assert(align > 0);
2691     if (size == 0) {
2692         size = align;
2693     }
2694
2695     return qemu_try_memalign(align, size);
2696 }
2697
2698 void *qemu_try_blockalign0(BlockDriverState *bs, size_t size)
2699 {
2700     void *mem = qemu_try_blockalign(bs, size);
2701
2702     if (mem) {
2703         memset(mem, 0, size);
2704     }
2705
2706     return mem;
2707 }
2708
2709 /*
2710  * Check if all memory in this vector is sector aligned.
2711  */
2712 bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov)
2713 {
2714     int i;
2715     size_t alignment = bdrv_min_mem_align(bs);
2716
2717     for (i = 0; i < qiov->niov; i++) {
2718         if ((uintptr_t) qiov->iov[i].iov_base % alignment) {
2719             return false;
2720         }
2721         if (qiov->iov[i].iov_len % alignment) {
2722             return false;
2723         }
2724     }
2725
2726     return true;
2727 }
2728
/*
 * Add 'notifier' to the before_write_notifiers list of 'bs'.
 */
void bdrv_add_before_write_notifier(BlockDriverState *bs,
                                    NotifierWithReturn *notifier)
{
    notifier_with_return_list_add(&bs->before_write_notifiers, notifier);
}
2734
2735 void bdrv_io_plug(BlockDriverState *bs)
2736 {
2737     BdrvChild *child;
2738
2739     QLIST_FOREACH(child, &bs->children, next) {
2740         bdrv_io_plug(child->bs);
2741     }
2742
2743     if (atomic_fetch_inc(&bs->io_plugged) == 0) {
2744         BlockDriver *drv = bs->drv;
2745         if (drv && drv->bdrv_io_plug) {
2746             drv->bdrv_io_plug(bs);
2747         }
2748     }
2749 }
2750
2751 void bdrv_io_unplug(BlockDriverState *bs)
2752 {
2753     BdrvChild *child;
2754
2755     assert(bs->io_plugged);
2756     if (atomic_fetch_dec(&bs->io_plugged) == 1) {
2757         BlockDriver *drv = bs->drv;
2758         if (drv && drv->bdrv_io_unplug) {
2759             drv->bdrv_io_unplug(bs);
2760         }
2761     }
2762
2763     QLIST_FOREACH(child, &bs->children, next) {
2764         bdrv_io_unplug(child->bs);
2765     }
2766 }