src/backend/storage/smgr/md.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * md.c
   4  *        This code manages relations that reside on magnetic disk.
   5  *
   6  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
   7  * Portions Copyright (c) 1994, Regents of the University of California
   8  *
   9  *
  10  * IDENTIFICATION
  11  *        $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.137 2008/04/18 06:48:38 heikki Exp $
  12  *
  13  *-------------------------------------------------------------------------
  14  */
  15 #include "postgres.h"
  16
  17 #include <unistd.h>
  18 #include <fcntl.h>
  19 #include <sys/file.h>
  20
  21 #include "catalog/catalog.h"
  22 #include "miscadmin.h"
  23 #include "postmaster/bgwriter.h"
  24 #include "storage/fd.h"
  25 #include "storage/bufmgr.h"
  26 #include "storage/smgr.h"
  27 #include "utils/hsearch.h"
  28 #include "utils/memutils.h"
  29
  30
  31 /* interval for calling AbsorbFsyncRequests in mdsync */
  32 #define FSYNCS_PER_ABSORB               10
  33
  34 /* special values for the segno arg to RememberFsyncRequest */
  35 #define FORGET_RELATION_FSYNC   (InvalidBlockNumber)
  36 #define FORGET_DATABASE_FSYNC   (InvalidBlockNumber-1)
  37 #define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
  38
  39 /*
  40  * On Windows, we have to interpret EACCES as possibly meaning the same as
  41  * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
  42  * that's what you get.  Ugh.  This code is designed so that we don't
  43  * actually believe these cases are okay without further evidence (namely,
  44  * a pending fsync request getting revoked ... see mdsync).
  45  */
  46 #ifndef WIN32
  47 #define FILE_POSSIBLY_DELETED(err)      ((err) == ENOENT)
  48 #else
  49 #define FILE_POSSIBLY_DELETED(err)      ((err) == ENOENT || (err) == EACCES)
  50 #endif
  51
  52 /*
  53  *      The magnetic disk storage manager keeps track of open file
  54  *      descriptors in its own descriptor pool.  This is done to make it
  55  *      easier to support relations that are larger than the operating
  56  *      system's file size limit (often 2GBytes).  In order to do that,
  57  *      we break relations up into "segment" files that are each shorter than
  58  *      the OS file size limit.  The segment size is set by the RELSEG_SIZE
  59  *      configuration constant in pg_config_manual.h.
  60  *
  61  *      On disk, a relation must consist of consecutively numbered segment
  62  *      files in the pattern
  63  *              -- Zero or more full segments of exactly RELSEG_SIZE blocks each
  64  *              -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
  65  *              -- Optionally, any number of inactive segments of size 0 blocks.
  66  *      The full and partial segments are collectively the "active" segments.
  67  *      Inactive segments are those that once contained data but are currently
  68  *      not needed because of an mdtruncate() operation.  The reason for leaving
  69  *      them present at size zero, rather than unlinking them, is that other
  70  *      backends and/or the bgwriter might be holding open file references to
  71  *      such segments.  If the relation expands again after mdtruncate(), such
  72  *      that a deactivated segment becomes active again, it is important that
  73  *      such file references still be valid --- else data might get written
  74  *      out to an unlinked old copy of a segment file that will eventually
  75  *      disappear.
  76  *
  77  *      The file descriptor pointer (md_fd field) stored in the SMgrRelation
  78  *      cache is, therefore, just the head of a list of MdfdVec objects, one
  79  *      per segment.  But note the md_fd pointer can be NULL, indicating
  80  *      relation not open.
  81  *
  82  *      Also note that mdfd_chain == NULL does not necessarily mean the relation
  83  *      doesn't have another segment after this one; we may just not have
  84  *      opened the next segment yet.  (We could not have "all segments are
  85  *      in the chain" as an invariant anyway, since another backend could
  86  *      extend the relation when we weren't looking.)  We do not make chain
  87  *      entries for inactive segments, however; as soon as we find a partial
  88  *      segment, we assume that any subsequent segments are inactive.
  89  *
  90  *      All MdfdVec objects are palloc'd in the MdCxt memory context.
  91  *
  92  *      On platforms that support large files, USE_SEGMENTED_FILES can be
  93  *      #undef'd to disable the segmentation logic.  In that case each
  94  *      relation is a single operating-system file.
  95  */
  96
  97 typedef struct _MdfdVec
  98 {
  99         File            mdfd_vfd;               /* fd number in fd.c's pool */
 100         BlockNumber mdfd_segno;         /* segment number, from 0 */
 101 #ifdef USE_SEGMENTED_FILES
 102         struct _MdfdVec *mdfd_chain;    /* next segment, or NULL */
 103 #endif
 104 } MdfdVec;
 105
 106 static MemoryContext MdCxt;             /* context for all md.c allocations */
 107
 108
 109 /*
 110  * In some contexts (currently, standalone backends and the bgwriter process)
 111  * we keep track of pending fsync operations: we need to remember all relation
 112  * segments that have been written since the last checkpoint, so that we can
 113  * fsync them down to disk before completing the next checkpoint.  This hash
 114  * table remembers the pending operations.      We use a hash table mostly as
 115  * a convenient way of eliminating duplicate requests.
 116  *
 117  * We use a similar mechanism to remember no-longer-needed files that can
 118  * be deleted after the next checkpoint, but we use a linked list instead of
 119  * a hash table, because we don't expect there to be any duplicate requests.
 120  *
 121  * (Regular backends do not track pending operations locally, but forward
 122  * them to the bgwriter.)
 123  */
 124 typedef struct
 125 {
 126         RelFileNode rnode;                      /* the targeted relation */
 127         BlockNumber segno;                      /* which segment */
 128 } PendingOperationTag;
 129
 130 typedef uint16 CycleCtr;                /* can be any convenient integer size */
 131
 132 typedef struct
 133 {
 134         PendingOperationTag tag;        /* hash table key (must be first!) */
 135         bool            canceled;               /* T => request canceled, not yet removed */
 136         CycleCtr        cycle_ctr;              /* mdsync_cycle_ctr when request was made */
 137 } PendingOperationEntry;
 138
 139 typedef struct
 140 {
 141         RelFileNode rnode;                      /* the dead relation to delete */
 142         CycleCtr        cycle_ctr;              /* mdckpt_cycle_ctr when request was made */
 143 } PendingUnlinkEntry;
 144
 145 static HTAB *pendingOpsTable = NULL;
 146 static List *pendingUnlinks = NIL;
 147
 148 static CycleCtr mdsync_cycle_ctr = 0;
 149 static CycleCtr mdckpt_cycle_ctr = 0;
 150
 151
 152 typedef enum                                    /* behavior for mdopen & _mdfd_getseg */
 153 {
 154         EXTENSION_FAIL,                         /* ereport if segment not present */
 155         EXTENSION_RETURN_NULL,          /* return NULL if not present */
 156         EXTENSION_CREATE                        /* create new segments as needed */
 157 } ExtensionBehavior;
 158
 159 /* local routines */
 160 static MdfdVec *mdopen(SMgrRelation reln, ExtensionBehavior behavior);
 161 static void register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
 162 static void register_unlink(RelFileNode rnode);
 163 static MdfdVec *_fdvec_alloc(void);
 164
 165 #ifdef USE_SEGMENTED_FILES
 166 static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno,
 167                           int oflags);
 168 #endif
 169 static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno,
 170                          bool isTemp, ExtensionBehavior behavior);
 171 static BlockNumber _mdnblocks(SMgrRelation reln, MdfdVec *seg);
 172
 173
 174 /*
 175  *      mdinit() -- Initialize private state for magnetic disk storage manager.
 176  */
 177 void
 178 mdinit(void)
 179 {
 180         MdCxt = AllocSetContextCreate(TopMemoryContext,
 181                                                                   "MdSmgr",
 182                                                                   ALLOCSET_DEFAULT_MINSIZE,
 183                                                                   ALLOCSET_DEFAULT_INITSIZE,
 184                                                                   ALLOCSET_DEFAULT_MAXSIZE);
 185
 186         /*
 187          * Create pending-operations hashtable if we need it.  Currently, we need
 188          * it if we are standalone (not under a postmaster) OR if we are a
 189          * bootstrap-mode subprocess of a postmaster (that is, a startup or
 190          * bgwriter process).
 191          */
 192         if (!IsUnderPostmaster || IsBootstrapProcessingMode())
 193         {
 194                 HASHCTL         hash_ctl;
 195
 196                 MemSet(&hash_ctl, 0, sizeof(hash_ctl));
 197                 hash_ctl.keysize = sizeof(PendingOperationTag);
 198                 hash_ctl.entrysize = sizeof(PendingOperationEntry);
 199                 hash_ctl.hash = tag_hash;
 200                 hash_ctl.hcxt = MdCxt;
 201                 pendingOpsTable = hash_create("Pending Ops Table",
 202                                                                           100L,
 203                                                                           &hash_ctl,
 204                                                                    HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
 205                 pendingUnlinks = NIL;
 206         }
 207 }
 208
 209 /*
 210  *      mdcreate() -- Create a new relation on magnetic disk.
 211  *
 212  * If isRedo is true, it's okay for the relation to exist already.
 213  */
 214 void
 215 mdcreate(SMgrRelation reln, bool isRedo)
 216 {
 217         char       *path;
 218         File            fd;
 219
 220         if (isRedo && reln->md_fd != NULL)
 221                 return;                                 /* created and opened already... */
 222
 223         Assert(reln->md_fd == NULL);
 224
 225         path = relpath(reln->smgr_rnode);
 226
 227         fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
 228
 229         if (fd < 0)
 230         {
 231                 int                     save_errno = errno;
 232
 233                 /*
 234                  * During bootstrap, there are cases where a system relation will be
 235                  * accessed (by internal backend processes) before the bootstrap
 236                  * script nominally creates it.  Therefore, allow the file to exist
 237                  * already, even if isRedo is not set.  (See also mdopen)
 238                  */
 239                 if (isRedo || IsBootstrapProcessingMode())
 240                         fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
 241                 if (fd < 0)
 242                 {
 243                         pfree(path);
 244                         /* be sure to report the error reported by create, not open */
 245                         errno = save_errno;
 246                         ereport(ERROR,
 247                                         (errcode_for_file_access(),
 248                                          errmsg("could not create relation %u/%u/%u: %m",
 249                                                         reln->smgr_rnode.spcNode,
 250                                                         reln->smgr_rnode.dbNode,
 251                                                         reln->smgr_rnode.relNode)));
 252                 }
 253         }
 254
 255         pfree(path);
 256
 257         reln->md_fd = _fdvec_alloc();
 258
 259         reln->md_fd->mdfd_vfd = fd;
 260         reln->md_fd->mdfd_segno = 0;
 261 #ifdef USE_SEGMENTED_FILES
 262         reln->md_fd->mdfd_chain = NULL;
 263 #endif
 264 }
 265
 266 /*
 267  *      mdunlink() -- Unlink a relation.
 268  *
 269  * Note that we're passed a RelFileNode --- by the time this is called,
 270  * there won't be an SMgrRelation hashtable entry anymore.
 271  *
 272  * Actually, we don't unlink the first segment file of the relation, but
 273  * just truncate it to zero length, and record a request to unlink it after
 274  * the next checkpoint.  Additional segments can be unlinked immediately,
 275  * however.  Leaving the empty file in place prevents that relfilenode
 276  * number from being reused.  The scenario this protects us from is:
 277  * 1. We delete a relation (and commit, and actually remove its file).
 278  * 2. We create a new relation, which by chance gets the same relfilenode as
 279  *        the just-deleted one (OIDs must've wrapped around for that to happen).
 280  * 3. We crash before another checkpoint occurs.
 281  * During replay, we would delete the file and then recreate it, which is fine
 282  * if the contents of the file were repopulated by subsequent WAL entries.
 283  * But if we didn't WAL-log insertions, but instead relied on fsyncing the
 284  * file after populating it (as for instance CLUSTER and CREATE INDEX do),
 285  * the contents of the file would be lost forever.      By leaving the empty file
 286  * until after the next checkpoint, we prevent reassignment of the relfilenode
 287  * number until it's safe, because relfilenode assignment skips over any
 288  * existing file.
 289  *
 290  * If isRedo is true, it's okay for the relation to be already gone.
 291  * Also, we should remove the file immediately instead of queuing a request
 292  * for later, since during redo there's no possibility of creating a
 293  * conflicting relation.
 294  *
 295  * Note: any failure should be reported as WARNING not ERROR, because
 296  * we are usually not in a transaction anymore when this is called.
 297  */
 298 void
 299 mdunlink(RelFileNode rnode, bool isRedo)
 300 {
 301         char       *path;
 302         int                     ret;
 303
 304         /*
 305          * We have to clean out any pending fsync requests for the doomed
 306          * relation, else the next mdsync() will fail.
 307          */
 308         ForgetRelationFsyncRequests(rnode);
 309
 310         path = relpath(rnode);
 311
 312         /*
 313          * Delete or truncate the first segment, or only segment if not doing
 314          * segmenting
 315          */
 316         if (isRedo)
 317                 ret = unlink(path);
 318         else
 319         {
 320                 /* truncate(2) would be easier here, but Windows hasn't got it */
 321                 int                     fd;
 322
 323                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
 324                 if (fd >= 0)
 325                 {
 326                         int                     save_errno;
 327
 328                         ret = ftruncate(fd, 0);
 329                         save_errno = errno;
 330                         close(fd);
 331                         errno = save_errno;
 332                 }
 333                 else
 334                         ret = -1;
 335         }
 336         if (ret < 0)
 337         {
 338                 if (!isRedo || errno != ENOENT)
 339                         ereport(WARNING,
 340                                         (errcode_for_file_access(),
 341                                          errmsg("could not remove relation %u/%u/%u: %m",
 342                                                         rnode.spcNode,
 343                                                         rnode.dbNode,
 344                                                         rnode.relNode)));
 345         }
 346
 347 #ifdef USE_SEGMENTED_FILES
 348         /* Delete the additional segments, if any */
 349         else
 350         {
 351                 char       *segpath = (char *) palloc(strlen(path) + 12);
 352                 BlockNumber segno;
 353
 354                 /*
 355                  * Note that because we loop until getting ENOENT, we will correctly
 356                  * remove all inactive segments as well as active ones.
 357                  */
 358                 for (segno = 1;; segno++)
 359                 {
 360                         sprintf(segpath, "%s.%u", path, segno);
 361                         if (unlink(segpath) < 0)
 362                         {
 363                                 /* ENOENT is expected after the last segment... */
 364                                 if (errno != ENOENT)
 365                                         ereport(WARNING,
 366                                                         (errcode_for_file_access(),
 367                                                          errmsg("could not remove segment %u of relation %u/%u/%u: %m",
 368                                                                         segno,
 369                                                                         rnode.spcNode,
 370                                                                         rnode.dbNode,
 371                                                                         rnode.relNode)));
 372                                 break;
 373                         }
 374                 }
 375                 pfree(segpath);
 376         }
 377 #endif
 378
 379         pfree(path);
 380
 381         /* Register request to unlink first segment later */
 382         if (!isRedo)
 383                 register_unlink(rnode);
 384 }
 385
 386 /*
 387  *      mdextend() -- Add a block to the specified relation.
 388  *
 389  *              The semantics are nearly the same as mdwrite(): write at the
 390  *              specified position.  However, this is to be used for the case of
 391  *              extending a relation (i.e., blocknum is at or beyond the current
 392  *              EOF).  Note that we assume writing a block beyond current EOF
 393  *              causes intervening file space to become filled with zeroes.
 394  */
 395 void
 396 mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
 397 {
 398         off_t           seekpos;
 399         int                     nbytes;
 400         MdfdVec    *v;
 401
 402         /* This assert is too expensive to have on normally ... */
 403 #ifdef CHECK_WRITE_VS_EXTEND
 404         Assert(blocknum >= mdnblocks(reln));
 405 #endif
 406
 407         /*
 408          * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
 409          * more --- we mustn't create a block whose number actually is
 410          * InvalidBlockNumber.
 411          */
 412         if (blocknum == InvalidBlockNumber)
 413                 ereport(ERROR,
 414                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
 415                                  errmsg("cannot extend relation %u/%u/%u beyond %u blocks",
 416                                                 reln->smgr_rnode.spcNode,
 417                                                 reln->smgr_rnode.dbNode,
 418                                                 reln->smgr_rnode.relNode,
 419                                                 InvalidBlockNumber)));
 420
 421         v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_CREATE);
 422
 423 #ifdef USE_SEGMENTED_FILES
 424         seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
 425         Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
 426 #else
 427         seekpos = (off_t) BLCKSZ * blocknum;
 428 #endif
 429
 430         /*
 431          * Note: because caller usually obtained blocknum by calling mdnblocks,
 432          * which did a seek(SEEK_END), this seek is often redundant and will be
 433          * optimized away by fd.c.      It's not redundant, however, if there is a
 434          * partial page at the end of the file. In that case we want to try to
 435          * overwrite the partial page with a full page.  It's also not redundant
 436          * if bufmgr.c had to dump another buffer of the same file to make room
 437          * for the new page's buffer.
 438          */
 439         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
 440                 ereport(ERROR,
 441                                 (errcode_for_file_access(),
 442                                  errmsg("could not seek to block %u of relation %u/%u/%u: %m",
 443                                                 blocknum,
 444                                                 reln->smgr_rnode.spcNode,
 445                                                 reln->smgr_rnode.dbNode,
 446                                                 reln->smgr_rnode.relNode)));
 447
 448         if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
 449         {
 450                 if (nbytes < 0)
 451                         ereport(ERROR,
 452                                         (errcode_for_file_access(),
 453                                          errmsg("could not extend relation %u/%u/%u: %m",
 454                                                         reln->smgr_rnode.spcNode,
 455                                                         reln->smgr_rnode.dbNode,
 456                                                         reln->smgr_rnode.relNode),
 457                                          errhint("Check free disk space.")));
 458                 /* short write: complain appropriately */
 459                 ereport(ERROR,
 460                                 (errcode(ERRCODE_DISK_FULL),
 461                                  errmsg("could not extend relation %u/%u/%u: wrote only %d of %d bytes at block %u",
 462                                                 reln->smgr_rnode.spcNode,
 463                                                 reln->smgr_rnode.dbNode,
 464                                                 reln->smgr_rnode.relNode,
 465                                                 nbytes, BLCKSZ, blocknum),
 466                                  errhint("Check free disk space.")));
 467         }
 468
 469         if (!isTemp)
 470                 register_dirty_segment(reln, v);
 471
 472 #ifdef USE_SEGMENTED_FILES
 473         Assert(_mdnblocks(reln, v) <= ((BlockNumber) RELSEG_SIZE));
 474 #endif
 475 }
 476
 477 /*
 478  *      mdopen() -- Open the specified relation.
 479  *
 480  * Note we only open the first segment, when there are multiple segments.
 481  *
 482  * If first segment is not present, either ereport or return NULL according
 483  * to "behavior".  We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
 484  * EXTENSION_CREATE means it's OK to extend an existing relation, not to
 485  * invent one out of whole cloth.
 486  */
 487 static MdfdVec *
 488 mdopen(SMgrRelation reln, ExtensionBehavior behavior)
 489 {
 490         MdfdVec    *mdfd;
 491         char       *path;
 492         File            fd;
 493
 494         /* No work if already open */
 495         if (reln->md_fd)
 496                 return reln->md_fd;
 497
 498         path = relpath(reln->smgr_rnode);
 499
 500         fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
 501
 502         if (fd < 0)
 503         {
 504                 /*
 505                  * During bootstrap, there are cases where a system relation will be
 506                  * accessed (by internal backend processes) before the bootstrap
 507                  * script nominally creates it.  Therefore, accept mdopen() as a
 508                  * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
 509                  */
 510                 if (IsBootstrapProcessingMode())
 511                         fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
 512                 if (fd < 0)
 513                 {
 514                         pfree(path);
 515                         if (behavior == EXTENSION_RETURN_NULL &&
 516                                 FILE_POSSIBLY_DELETED(errno))
 517                                 return NULL;
 518                         ereport(ERROR,
 519                                         (errcode_for_file_access(),
 520                                          errmsg("could not open relation %u/%u/%u: %m",
 521                                                         reln->smgr_rnode.spcNode,
 522                                                         reln->smgr_rnode.dbNode,
 523                                                         reln->smgr_rnode.relNode)));
 524                 }
 525         }
 526
 527         pfree(path);
 528
 529         reln->md_fd = mdfd = _fdvec_alloc();
 530
 531         mdfd->mdfd_vfd = fd;
 532         mdfd->mdfd_segno = 0;
 533 #ifdef USE_SEGMENTED_FILES
 534         mdfd->mdfd_chain = NULL;
 535         Assert(_mdnblocks(reln, mdfd) <= ((BlockNumber) RELSEG_SIZE));
 536 #endif
 537
 538         return mdfd;
 539 }
 540
 541 /*
 542  *      mdclose() -- Close the specified relation, if it isn't closed already.
 543  */
 544 void
 545 mdclose(SMgrRelation reln)
 546 {
 547         MdfdVec    *v = reln->md_fd;
 548
 549         /* No work if already closed */
 550         if (v == NULL)
 551                 return;
 552
 553         reln->md_fd = NULL;                     /* prevent dangling pointer after error */
 554
 555 #ifdef USE_SEGMENTED_FILES
 556         while (v != NULL)
 557         {
 558                 MdfdVec    *ov = v;
 559
 560                 /* if not closed already */
 561                 if (v->mdfd_vfd >= 0)
 562                         FileClose(v->mdfd_vfd);
 563                 /* Now free vector */
 564                 v = v->mdfd_chain;
 565                 pfree(ov);
 566         }
 567 #else
 568         if (v->mdfd_vfd >= 0)
 569                 FileClose(v->mdfd_vfd);
 570         pfree(v);
 571 #endif
 572 }
 573
 574 /*
 575  *      mdread() -- Read the specified block from a relation.
 576  */
 577 void
 578 mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
 579 {
 580         off_t           seekpos;
 581         int                     nbytes;
 582         MdfdVec    *v;
 583
 584         v = _mdfd_getseg(reln, blocknum, false, EXTENSION_FAIL);
 585
 586 #ifdef USE_SEGMENTED_FILES
 587         seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
 588         Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
 589 #else
 590         seekpos = (off_t) BLCKSZ * blocknum;
 591 #endif
 592
 593         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
 594                 ereport(ERROR,
 595                                 (errcode_for_file_access(),
 596                                  errmsg("could not seek to block %u of relation %u/%u/%u: %m",
 597                                                 blocknum,
 598                                                 reln->smgr_rnode.spcNode,
 599                                                 reln->smgr_rnode.dbNode,
 600                                                 reln->smgr_rnode.relNode)));
 601
 602         if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
 603         {
 604                 if (nbytes < 0)
 605                         ereport(ERROR,
 606                                         (errcode_for_file_access(),
 607                                    errmsg("could not read block %u of relation %u/%u/%u: %m",
 608                                                   blocknum,
 609                                                   reln->smgr_rnode.spcNode,
 610                                                   reln->smgr_rnode.dbNode,
 611                                                   reln->smgr_rnode.relNode)));
 612
 613                 /*
 614                  * Short read: we are at or past EOF, or we read a partial block at
 615                  * EOF.  Normally this is an error; upper levels should never try to
 616                  * read a nonexistent block.  However, if zero_damaged_pages is ON or
 617                  * we are InRecovery, we should instead return zeroes without
 618                  * complaining.  This allows, for example, the case of trying to
 619                  * update a block that was later truncated away.
 620                  */
 621                 if (zero_damaged_pages || InRecovery)
 622                         MemSet(buffer, 0, BLCKSZ);
 623                 else
 624                         ereport(ERROR,
 625                                         (errcode(ERRCODE_DATA_CORRUPTED),
 626                                          errmsg("could not read block %u of relation %u/%u/%u: read only %d of %d bytes",
 627                                                         blocknum,
 628                                                         reln->smgr_rnode.spcNode,
 629                                                         reln->smgr_rnode.dbNode,
 630                                                         reln->smgr_rnode.relNode,
 631                                                         nbytes, BLCKSZ)));
 632         }
 633 }
 634
 635 /*
 636  *      mdwrite() -- Write the supplied block at the appropriate location.
 637  *
 638  *              This is to be used only for updating already-existing blocks of a
 639  *              relation (ie, those before the current EOF).  To extend a relation,
 640  *              use mdextend().
 641  */
 642 void
 643 mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
 644 {
 645         off_t           seekpos;
 646         int                     nbytes;
 647         MdfdVec    *v;
 648
 649         /* This assert is too expensive to have on normally ... */
 650 #ifdef CHECK_WRITE_VS_EXTEND
 651         Assert(blocknum < mdnblocks(reln));
 652 #endif
 653
 654         v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_FAIL);
 655
 656 #ifdef USE_SEGMENTED_FILES
 657         seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
 658         Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
 659 #else
 660         seekpos = (off_t) BLCKSZ * blocknum;
 661 #endif
 662
 663         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
 664                 ereport(ERROR,
 665                                 (errcode_for_file_access(),
 666                                  errmsg("could not seek to block %u of relation %u/%u/%u: %m",
 667                                                 blocknum,
 668                                                 reln->smgr_rnode.spcNode,
 669                                                 reln->smgr_rnode.dbNode,
 670                                                 reln->smgr_rnode.relNode)));
 671
 672         if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
 673         {
 674                 if (nbytes < 0)
 675                         ereport(ERROR,
 676                                         (errcode_for_file_access(),
 677                                   errmsg("could not write block %u of relation %u/%u/%u: %m",
 678                                                  blocknum,
 679                                                  reln->smgr_rnode.spcNode,
 680                                                  reln->smgr_rnode.dbNode,
 681                                                  reln->smgr_rnode.relNode)));
 682                 /* short write: complain appropriately */
 683                 ereport(ERROR,
 684                                 (errcode(ERRCODE_DISK_FULL),
 685                                  errmsg("could not write block %u of relation %u/%u/%u: wrote only %d of %d bytes",
 686                                                 blocknum,
 687                                                 reln->smgr_rnode.spcNode,
 688                                                 reln->smgr_rnode.dbNode,
 689                                                 reln->smgr_rnode.relNode,
 690                                                 nbytes, BLCKSZ),
 691                                  errhint("Check free disk space.")));
 692         }
 693
 694         if (!isTemp)
 695                 register_dirty_segment(reln, v);
 696 }
 697
 698 /*
 699  *      mdnblocks() -- Get the number of blocks stored in a relation.
 700  *
 701  *              Important side effect: all active segments of the relation are opened
 702  *              and added to the mdfd_chain list.  If this routine has not been
 703  *              called, then only segments up to the last one actually touched
 704  *              are present in the chain.
 705  */
 706 BlockNumber
 707 mdnblocks(SMgrRelation reln)
 708 {
 709         MdfdVec    *v = mdopen(reln, EXTENSION_FAIL);
 710
 711 #ifdef USE_SEGMENTED_FILES
 712         BlockNumber nblocks;
 713         BlockNumber segno = 0;
 714
 715         /*
 716          * Skip through any segments that aren't the last one, to avoid redundant
 717          * seeks on them.  We have previously verified that these segments are
 718          * exactly RELSEG_SIZE long, and it's useless to recheck that each time.
 719          *
 720          * NOTE: this assumption could only be wrong if another backend has
 721          * truncated the relation.      We rely on higher code levels to handle that
 722          * scenario by closing and re-opening the md fd, which is handled via
 723          * relcache flush.      (Since the bgwriter doesn't participate in relcache
 724          * flush, it could have segment chain entries for inactive segments;
 725          * that's OK because the bgwriter never needs to compute relation size.)
 726          */
 727         while (v->mdfd_chain != NULL)
 728         {
 729                 segno++;
 730                 v = v->mdfd_chain;
 731         }
 732
 733         for (;;)
 734         {
 735                 nblocks = _mdnblocks(reln, v);
 736                 if (nblocks > ((BlockNumber) RELSEG_SIZE))
 737                         elog(FATAL, "segment too big");
 738                 if (nblocks < ((BlockNumber) RELSEG_SIZE))
 739                         return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
 740
 741                 /*
 742                  * If segment is exactly RELSEG_SIZE, advance to next one.
 743                  */
 744                 segno++;
 745
 746                 if (v->mdfd_chain == NULL)
 747                 {
 748                         /*
 749                          * Because we pass O_CREAT, we will create the next segment (with
 750                          * zero length) immediately, if the last segment is of length
 751                          * RELSEG_SIZE.  While perhaps not strictly necessary, this keeps
 752                          * the logic simple.
 753                          */
 754                         v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
 755                         if (v->mdfd_chain == NULL)
 756                                 ereport(ERROR,
 757                                                 (errcode_for_file_access(),
 758                                  errmsg("could not open segment %u of relation %u/%u/%u: %m",
 759                                                 segno,
 760                                                 reln->smgr_rnode.spcNode,
 761                                                 reln->smgr_rnode.dbNode,
 762                                                 reln->smgr_rnode.relNode)));
 763                 }
 764
 765                 v = v->mdfd_chain;
 766         }
 767 #else
 768         return _mdnblocks(reln, v);
 769 #endif
 770 }
 771
 772 /*
 773  *      mdtruncate() -- Truncate relation to specified number of blocks.
 774  */
 775 void
 776 mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
 777 {
 778         MdfdVec    *v;
 779         BlockNumber curnblk;
 780
 781 #ifdef USE_SEGMENTED_FILES
 782         BlockNumber priorblocks;
 783 #endif
 784
 785         /*
 786          * NOTE: mdnblocks makes sure we have opened all active segments, so that
 787          * truncation loop will get them all!
 788          */
 789         curnblk = mdnblocks(reln);
 790         if (nblocks > curnblk)
 791         {
 792                 /* Bogus request ... but no complaint if InRecovery */
 793                 if (InRecovery)
 794                         return;
 795                 ereport(ERROR,
 796                                 (errmsg("could not truncate relation %u/%u/%u to %u blocks: it's only %u blocks now",
 797                                                 reln->smgr_rnode.spcNode,
 798                                                 reln->smgr_rnode.dbNode,
 799                                                 reln->smgr_rnode.relNode,
 800                                                 nblocks, curnblk)));
 801         }
 802         if (nblocks == curnblk)
 803                 return;                                 /* no work */
 804
 805         v = mdopen(reln, EXTENSION_FAIL);
 806
 807 #ifdef USE_SEGMENTED_FILES
 808         priorblocks = 0;
 809         while (v != NULL)
 810         {
 811                 MdfdVec    *ov = v;
 812
 813                 if (priorblocks > nblocks)
 814                 {
 815                         /*
 816                          * This segment is no longer active (and has already been unlinked
 817                          * from the mdfd_chain). We truncate the file, but do not delete
 818                          * it, for reasons explained in the header comments.
 819                          */
 820                         if (FileTruncate(v->mdfd_vfd, 0) < 0)
 821                                 ereport(ERROR,
 822                                                 (errcode_for_file_access(),
 823                                                  errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
 824                                                                 reln->smgr_rnode.spcNode,
 825                                                                 reln->smgr_rnode.dbNode,
 826                                                                 reln->smgr_rnode.relNode,
 827                                                                 nblocks)));
 828                         if (!isTemp)
 829                                 register_dirty_segment(reln, v);
 830                         v = v->mdfd_chain;
 831                         Assert(ov != reln->md_fd);      /* we never drop the 1st segment */
 832                         pfree(ov);
 833                 }
 834                 else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
 835                 {
 836                         /*
 837                          * This is the last segment we want to keep. Truncate the file to
 838                          * the right length, and clear chain link that points to any
 839                          * remaining segments (which we shall zap). NOTE: if nblocks is
 840                          * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
 841                          * segment to 0 length but keep it. This adheres to the invariant
 842                          * given in the header comments.
 843                          */
 844                         BlockNumber lastsegblocks = nblocks - priorblocks;
 845
 846                         if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ) < 0)
 847                                 ereport(ERROR,
 848                                                 (errcode_for_file_access(),
 849                                                  errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
 850                                                                 reln->smgr_rnode.spcNode,
 851                                                                 reln->smgr_rnode.dbNode,
 852                                                                 reln->smgr_rnode.relNode,
 853                                                                 nblocks)));
 854                         if (!isTemp)
 855                                 register_dirty_segment(reln, v);
 856                         v = v->mdfd_chain;
 857                         ov->mdfd_chain = NULL;
 858                 }
 859                 else
 860                 {
 861                         /*
 862                          * We still need this segment and 0 or more blocks beyond it, so
 863                          * nothing to do here.
 864                          */
 865                         v = v->mdfd_chain;
 866                 }
 867                 priorblocks += RELSEG_SIZE;
 868         }
 869 #else
 870         /* For unsegmented files, it's a lot easier */
 871         if (FileTruncate(v->mdfd_vfd, (off_t) nblocks * BLCKSZ) < 0)
 872                 ereport(ERROR,
 873                                 (errcode_for_file_access(),
 874                           errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
 875                                          reln->smgr_rnode.spcNode,
 876                                          reln->smgr_rnode.dbNode,
 877                                          reln->smgr_rnode.relNode,
 878                                          nblocks)));
 879         if (!isTemp)
 880                 register_dirty_segment(reln, v);
 881 #endif
 882 }
 883
 884 /*
 885  *      mdimmedsync() -- Immediately sync a relation to stable storage.
 886  *
 887  * Note that only writes already issued are synced; this routine knows
 888  * nothing of dirty buffers that may exist inside the buffer manager.
 889  */
 890 void
 891 mdimmedsync(SMgrRelation reln)
 892 {
 893         MdfdVec    *v;
 894         BlockNumber curnblk;
 895
 896         /*
 897          * NOTE: mdnblocks makes sure we have opened all active segments, so that
 898          * fsync loop will get them all!
 899          */
 900         curnblk = mdnblocks(reln);
 901
 902         v = mdopen(reln, EXTENSION_FAIL);
 903
 904 #ifdef USE_SEGMENTED_FILES
 905         while (v != NULL)
 906         {
 907                 if (FileSync(v->mdfd_vfd) < 0)
 908                         ereport(ERROR,
 909                                         (errcode_for_file_access(),
 910                                 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
 911                                            v->mdfd_segno,
 912                                            reln->smgr_rnode.spcNode,
 913                                            reln->smgr_rnode.dbNode,
 914                                            reln->smgr_rnode.relNode)));
 915                 v = v->mdfd_chain;
 916         }
 917 #else
 918         if (FileSync(v->mdfd_vfd) < 0)
 919                 ereport(ERROR,
 920                                 (errcode_for_file_access(),
 921                                  errmsg("could not fsync relation %u/%u/%u: %m",
 922                                                 reln->smgr_rnode.spcNode,
 923                                                 reln->smgr_rnode.dbNode,
 924                                                 reln->smgr_rnode.relNode)));
 925 #endif
 926 }
 927
 928 /*
 929  *      mdsync() -- Sync previous writes to stable storage.
 930  */
 931 void
 932 mdsync(void)
 933 {
 934         static bool mdsync_in_progress = false;
 935
 936         HASH_SEQ_STATUS hstat;
 937         PendingOperationEntry *entry;
 938         int                     absorb_counter;
 939
 940         /*
 941          * This is only called during checkpoints, and checkpoints should only
 942          * occur in processes that have created a pendingOpsTable.
 943          */
 944         if (!pendingOpsTable)
 945                 elog(ERROR, "cannot sync without a pendingOpsTable");
 946
 947         /*
 948          * If we are in the bgwriter, the sync had better include all fsync
 949          * requests that were queued by backends up to this point.      The tightest
 950          * race condition that could occur is that a buffer that must be written
 951          * and fsync'd for the checkpoint could have been dumped by a backend just
 952          * before it was visited by BufferSync().  We know the backend will have
 953          * queued an fsync request before clearing the buffer's dirtybit, so we
 954          * are safe as long as we do an Absorb after completing BufferSync().
 955          */
 956         AbsorbFsyncRequests();
 957
 958         /*
 959          * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
 960          * checkpoint), we want to ignore fsync requests that are entered into the
 961          * hashtable after this point --- they should be processed next time,
 962          * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
 963          * ones: new ones will have cycle_ctr equal to the incremented value of
 964          * mdsync_cycle_ctr.
 965          *
 966          * In normal circumstances, all entries present in the table at this point
 967          * will have cycle_ctr exactly equal to the current (about to be old)
 968          * value of mdsync_cycle_ctr.  However, if we fail partway through the
 969          * fsync'ing loop, then older values of cycle_ctr might remain when we
 970          * come back here to try again.  Repeated checkpoint failures would
 971          * eventually wrap the counter around to the point where an old entry
 972          * might appear new, causing us to skip it, possibly allowing a checkpoint
 973          * to succeed that should not have.  To forestall wraparound, any time the
 974          * previous mdsync() failed to complete, run through the table and
 975          * forcibly set cycle_ctr = mdsync_cycle_ctr.
 976          *
 977          * Think not to merge this loop with the main loop, as the problem is
 978          * exactly that that loop may fail before having visited all the entries.
 979          * From a performance point of view it doesn't matter anyway, as this path
 980          * will never be taken in a system that's functioning normally.
 981          */
 982         if (mdsync_in_progress)
 983         {
 984                 /* prior try failed, so update any stale cycle_ctr values */
 985                 hash_seq_init(&hstat, pendingOpsTable);
 986                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
 987                 {
 988                         entry->cycle_ctr = mdsync_cycle_ctr;
 989                 }
 990         }
 991
 992         /* Advance counter so that new hashtable entries are distinguishable */
 993         mdsync_cycle_ctr++;
 994
 995         /* Set flag to detect failure if we don't reach the end of the loop */
 996         mdsync_in_progress = true;
 997
 998         /* Now scan the hashtable for fsync requests to process */
 999         absorb_counter = FSYNCS_PER_ABSORB;
1000         hash_seq_init(&hstat, pendingOpsTable);
1001         while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1002         {
1003                 /*
1004                  * If the entry is new then don't process it this time.  Note that
1005                  * "continue" bypasses the hash-remove call at the bottom of the loop.
1006                  */
1007                 if (entry->cycle_ctr == mdsync_cycle_ctr)
1008                         continue;
1009
1010                 /* Else assert we haven't missed it */
1011                 Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);
1012
1013                 /*
1014                  * If fsync is off then we don't have to bother opening the file at
1015                  * all.  (We delay checking until this point so that changing fsync on
1016                  * the fly behaves sensibly.)  Also, if the entry is marked canceled,
1017                  * fall through to delete it.
1018                  */
1019                 if (enableFsync && !entry->canceled)
1020                 {
1021                         int                     failures;
1022
1023                         /*
1024                          * If in bgwriter, we want to absorb pending requests every so
1025                          * often to prevent overflow of the fsync request queue.  It is
1026                          * unspecified whether newly-added entries will be visited by
1027                          * hash_seq_search, but we don't care since we don't need to
1028                          * process them anyway.
1029                          */
1030                         if (--absorb_counter <= 0)
1031                         {
1032                                 AbsorbFsyncRequests();
1033                                 absorb_counter = FSYNCS_PER_ABSORB;
1034                         }
1035
1036                         /*
1037                          * The fsync table could contain requests to fsync segments that
1038                          * have been deleted (unlinked) by the time we get to them. Rather
1039                          * than just hoping an ENOENT (or EACCES on Windows) error can be
1040                          * ignored, what we do on error is absorb pending requests and
1041                          * then retry.  Since mdunlink() queues a "revoke" message before
1042                          * actually unlinking, the fsync request is guaranteed to be
1043                          * marked canceled after the absorb if it really was this case.
1044                          * DROP DATABASE likewise has to tell us to forget fsync requests
1045                          * before it starts deletions.
1046                          */
1047                         for (failures = 0;; failures++)         /* loop exits at "break" */
1048                         {
1049                                 SMgrRelation reln;
1050                                 MdfdVec    *seg;
1051
1052                                 /*
1053                                  * Find or create an smgr hash entry for this relation. This
1054                                  * may seem a bit unclean -- md calling smgr?  But it's really
1055                                  * the best solution.  It ensures that the open file reference
1056                                  * isn't permanently leaked if we get an error here. (You may
1057                                  * say "but an unreferenced SMgrRelation is still a leak!" Not
1058                                  * really, because the only case in which a checkpoint is done
1059                                  * by a process that isn't about to shut down is in the
1060                                  * bgwriter, and it will periodically do smgrcloseall(). This
1061                                  * fact justifies our not closing the reln in the success path
1062                                  * either, which is a good thing since in non-bgwriter cases
1063                                  * we couldn't safely do that.)  Furthermore, in many cases
1064                                  * the relation will have been dirtied through this same smgr
1065                                  * relation, and so we can save a file open/close cycle.
1066                                  */
1067                                 reln = smgropen(entry->tag.rnode);
1068
1069                                 /*
1070                                  * It is possible that the relation has been dropped or
1071                                  * truncated since the fsync request was entered.  Therefore,
1072                                  * allow ENOENT, but only if we didn't fail already on this
1073                                  * file.  This applies both during _mdfd_getseg() and during
1074                                  * FileSync, since fd.c might have closed the file behind our
1075                                  * back.
1076                                  */
1077                                 seg = _mdfd_getseg(reln,
1078                                                           entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
1079                                                                    false, EXTENSION_RETURN_NULL);
1080                                 if (seg != NULL &&
1081                                         FileSync(seg->mdfd_vfd) >= 0)
1082                                         break;          /* success; break out of retry loop */
1083
1084                                 /*
1085                                  * XXX is there any point in allowing more than one retry?
1086                                  * Don't see one at the moment, but easy to change the test
1087                                  * here if so.
1088                                  */
1089                                 if (!FILE_POSSIBLY_DELETED(errno) ||
1090                                         failures > 0)
1091                                         ereport(ERROR,
1092                                                         (errcode_for_file_access(),
1093                                                          errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
1094                                                                         entry->tag.segno,
1095                                                                         entry->tag.rnode.spcNode,
1096                                                                         entry->tag.rnode.dbNode,
1097                                                                         entry->tag.rnode.relNode)));
1098                                 else
1099                                         ereport(DEBUG1,
1100                                                         (errcode_for_file_access(),
1101                                                          errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
1102                                                                         entry->tag.segno,
1103                                                                         entry->tag.rnode.spcNode,
1104                                                                         entry->tag.rnode.dbNode,
1105                                                                         entry->tag.rnode.relNode)));
1106
1107                                 /*
1108                                  * Absorb incoming requests and check to see if canceled.
1109                                  */
1110                                 AbsorbFsyncRequests();
1111                                 absorb_counter = FSYNCS_PER_ABSORB;             /* might as well... */
1112
1113                                 if (entry->canceled)
1114                                         break;
1115                         }                                       /* end retry loop */
1116                 }
1117
1118                 /*
1119                  * If we get here, either we fsync'd successfully, or we don't have to
1120                  * because enableFsync is off, or the entry is (now) marked canceled.
1121                  * Okay to delete it.
1122                  */
1123                 if (hash_search(pendingOpsTable, &entry->tag,
1124                                                 HASH_REMOVE, NULL) == NULL)
1125                         elog(ERROR, "pendingOpsTable corrupted");
1126         }                                                       /* end loop over hashtable entries */
1127
1128         /* Flag successful completion of mdsync */
1129         mdsync_in_progress = false;
1130 }
1131
1132 /*
1133  * mdpreckpt() -- Do pre-checkpoint work
1134  *
1135  * To distinguish unlink requests that arrived before this checkpoint
1136  * started from those that arrived during the checkpoint, we use a cycle
1137  * counter similar to the one we use for fsync requests. That cycle
1138  * counter is incremented here.
1139  *
1140  * This must be called *before* the checkpoint REDO point is determined.
1141  * That ensures that we won't delete files too soon.
1142  *
1143  * Note that we can't do anything here that depends on the assumption
1144  * that the checkpoint will be completed.
1145  */
1146 void
1147 mdpreckpt(void)
1148 {
1149         ListCell   *cell;
1150
1151         /*
1152          * In case the prior checkpoint wasn't completed, stamp all entries in the
1153          * list with the current cycle counter.  Anything that's in the list at
1154          * the start of checkpoint can surely be deleted after the checkpoint is
1155          * finished, regardless of when the request was made.
1156          */
1157         foreach(cell, pendingUnlinks)
1158         {
1159                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
1160
1161                 entry->cycle_ctr = mdckpt_cycle_ctr;
1162         }
1163
1164         /*
1165          * Any unlink requests arriving after this point will be assigned the next
1166          * cycle counter, and won't be unlinked until next checkpoint.
1167          */
1168         mdckpt_cycle_ctr++;
1169 }
1170
1171 /*
1172  * mdpostckpt() -- Do post-checkpoint work
1173  *
1174  * Remove any lingering files that can now be safely removed.
1175  */
1176 void
1177 mdpostckpt(void)
1178 {
1179         while (pendingUnlinks != NIL)
1180         {
1181                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
1182                 char       *path;
1183
1184                 /*
1185                  * New entries are appended to the end, so if the entry is new we've
1186                  * reached the end of old entries.
1187                  */
1188                 if (entry->cycle_ctr == mdckpt_cycle_ctr)
1189                         break;
1190
1191                 /* Else assert we haven't missed it */
1192                 Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr);
1193
1194                 /* Unlink the file */
1195                 path = relpath(entry->rnode);
1196                 if (unlink(path) < 0)
1197                 {
1198                         /*
1199                          * There's a race condition, when the database is dropped at the
1200                          * same time that we process the pending unlink requests. If the
1201                          * DROP DATABASE deletes the file before we do, we will get ENOENT
1202                          * here. rmtree() also has to ignore ENOENT errors, to deal with
1203                          * the possibility that we delete the file first.
1204                          */
1205                         if (errno != ENOENT)
1206                                 ereport(WARNING,
1207                                                 (errcode_for_file_access(),
1208                                                  errmsg("could not remove relation %u/%u/%u: %m",
1209                                                                 entry->rnode.spcNode,
1210                                                                 entry->rnode.dbNode,
1211                                                                 entry->rnode.relNode)));
1212                 }
1213                 pfree(path);
1214
1215                 pendingUnlinks = list_delete_first(pendingUnlinks);
1216                 pfree(entry);
1217         }
1218 }
1219
1220 /*
1221  * register_dirty_segment() -- Mark a relation segment as needing fsync
1222  *
1223  * If there is a local pending-ops table, just make an entry in it for
1224  * mdsync to process later.  Otherwise, try to pass off the fsync request
1225  * to the background writer process.  If that fails, just do the fsync
1226  * locally before returning (we expect this will not happen often enough
1227  * to be a performance problem).
1228  */
1229 static void
1230 register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
1231 {
1232         if (pendingOpsTable)
1233         {
1234                 /* push it into local pending-ops table */
1235                 RememberFsyncRequest(reln->smgr_rnode, seg->mdfd_segno);
1236         }
1237         else
1238         {
1239                 if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
1240                         return;                         /* passed it off successfully */
1241
1242                 if (FileSync(seg->mdfd_vfd) < 0)
1243                         ereport(ERROR,
1244                                         (errcode_for_file_access(),
1245                                 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
1246                                            seg->mdfd_segno,
1247                                            reln->smgr_rnode.spcNode,
1248                                            reln->smgr_rnode.dbNode,
1249                                            reln->smgr_rnode.relNode)));
1250         }
1251 }
1252
1253 /*
1254  * register_unlink() -- Schedule a file to be deleted after next checkpoint
1255  *
1256  * As with register_dirty_segment, this could involve either a local or
1257  * a remote pending-ops table.
1258  */
1259 static void
1260 register_unlink(RelFileNode rnode)
1261 {
1262         if (pendingOpsTable)
1263         {
1264                 /* push it into local pending-ops table */
1265                 RememberFsyncRequest(rnode, UNLINK_RELATION_REQUEST);
1266         }
1267         else
1268         {
1269                 /*
1270                  * Notify the bgwriter about it.  If we fail to queue the request
1271                  * message, we have to sleep and try again, because we can't simply
1272                  * delete the file now.  Ugly, but hopefully won't happen often.
1273                  *
1274                  * XXX should we just leave the file orphaned instead?
1275                  */
1276                 Assert(IsUnderPostmaster);
1277                 while (!ForwardFsyncRequest(rnode, UNLINK_RELATION_REQUEST))
1278                         pg_usleep(10000L);      /* 10 msec seems a good number */
1279         }
1280 }
1281
1282 /*
1283  * RememberFsyncRequest() -- callback from bgwriter side of fsync request
1284  *
1285  * We stuff most fsync requests into the local hash table for execution
1286  * during the bgwriter's next checkpoint.  UNLINK requests go into a
1287  * separate linked list, however, because they get processed separately.
1288  *
1289  * The range of possible segment numbers is way less than the range of
1290  * BlockNumber, so we can reserve high values of segno for special purposes.
1291  * We define three:
1292  * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation
1293  * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
1294  * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
1295  *       checkpoint.
1296  *
1297  * (Handling the FORGET_* requests is a tad slow because the hash table has
1298  * to be searched linearly, but it doesn't seem worth rethinking the table
1299  * structure for them.)
1300  */
1301 void
1302 RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
1303 {
1304         Assert(pendingOpsTable);
1305
1306         if (segno == FORGET_RELATION_FSYNC)
1307         {
1308                 /* Remove any pending requests for the entire relation */
1309                 HASH_SEQ_STATUS hstat;
1310                 PendingOperationEntry *entry;
1311
1312                 hash_seq_init(&hstat, pendingOpsTable);
1313                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1314                 {
1315                         if (RelFileNodeEquals(entry->tag.rnode, rnode))
1316                         {
1317                                 /* Okay, cancel this entry */
1318                                 entry->canceled = true;
1319                         }
1320                 }
1321         }
1322         else if (segno == FORGET_DATABASE_FSYNC)
1323         {
1324                 /* Remove any pending requests for the entire database */
1325                 HASH_SEQ_STATUS hstat;
1326                 PendingOperationEntry *entry;
1327                 ListCell   *cell,
1328                                    *prev,
1329                                    *next;
1330
1331                 /* Remove fsync requests */
1332                 hash_seq_init(&hstat, pendingOpsTable);
1333                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1334                 {
1335                         if (entry->tag.rnode.dbNode == rnode.dbNode)
1336                         {
1337                                 /* Okay, cancel this entry */
1338                                 entry->canceled = true;
1339                         }
1340                 }
1341
1342                 /* Remove unlink requests */
1343                 prev = NULL;
1344                 for (cell = list_head(pendingUnlinks); cell; cell = next)
1345                 {
1346                         PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
1347
1348                         next = lnext(cell);
1349                         if (entry->rnode.dbNode == rnode.dbNode)
1350                         {
1351                                 pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
1352                                 pfree(entry);
1353                         }
1354                         else
1355                                 prev = cell;
1356                 }
1357         }
1358         else if (segno == UNLINK_RELATION_REQUEST)
1359         {
1360                 /* Unlink request: put it in the linked list */
1361                 MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
1362                 PendingUnlinkEntry *entry;
1363
1364                 entry = palloc(sizeof(PendingUnlinkEntry));
1365                 entry->rnode = rnode;
1366                 entry->cycle_ctr = mdckpt_cycle_ctr;
1367
1368                 pendingUnlinks = lappend(pendingUnlinks, entry);
1369
1370                 MemoryContextSwitchTo(oldcxt);
1371         }
1372         else
1373         {
1374                 /* Normal case: enter a request to fsync this segment */
1375                 PendingOperationTag key;
1376                 PendingOperationEntry *entry;
1377                 bool            found;
1378
1379                 /* ensure any pad bytes in the hash key are zeroed */
1380                 MemSet(&key, 0, sizeof(key));
1381                 key.rnode = rnode;
1382                 key.segno = segno;
1383
1384                 entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
1385                                                                                                           &key,
1386                                                                                                           HASH_ENTER,
1387                                                                                                           &found);
1388                 /* if new or previously canceled entry, initialize it */
1389                 if (!found || entry->canceled)
1390                 {
1391                         entry->canceled = false;
1392                         entry->cycle_ctr = mdsync_cycle_ctr;
1393                 }
1394
1395                 /*
1396                  * NB: it's intentional that we don't change cycle_ctr if the entry
1397                  * already exists.      The fsync request must be treated as old, even
1398                  * though the new request will be satisfied too by any subsequent
1399                  * fsync.
1400                  *
1401                  * However, if the entry is present but is marked canceled, we should
1402                  * act just as though it wasn't there.  The only case where this could
1403                  * happen would be if a file had been deleted, we received but did not
1404                  * yet act on the cancel request, and the same relfilenode was then
1405                  * assigned to a new file.      We mustn't lose the new request, but it
1406                  * should be considered new not old.
1407                  */
1408         }
1409 }
1410
1411 /*
1412  * ForgetRelationFsyncRequests -- forget any fsyncs for a rel
1413  */
1414 void
1415 ForgetRelationFsyncRequests(RelFileNode rnode)
1416 {
1417         if (pendingOpsTable)
1418         {
1419                 /* standalone backend or startup process: fsync state is local */
1420                 RememberFsyncRequest(rnode, FORGET_RELATION_FSYNC);
1421         }
1422         else if (IsUnderPostmaster)
1423         {
1424                 /*
1425                  * Notify the bgwriter about it.  If we fail to queue the revoke
1426                  * message, we have to sleep and try again ... ugly, but hopefully
1427                  * won't happen often.
1428                  *
1429                  * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
1430                  * error would leave the no-longer-used file still present on disk,
1431                  * which would be bad, so I'm inclined to assume that the bgwriter
1432                  * will always empty the queue soon.
1433                  */
1434                 while (!ForwardFsyncRequest(rnode, FORGET_RELATION_FSYNC))
1435                         pg_usleep(10000L);      /* 10 msec seems a good number */
1436
1437                 /*
1438                  * Note we don't wait for the bgwriter to actually absorb the revoke
1439                  * message; see mdsync() for the implications.
1440                  */
1441         }
1442 }
1443
1444 /*
1445  * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB
1446  */
1447 void
1448 ForgetDatabaseFsyncRequests(Oid dbid)
1449 {
1450         RelFileNode rnode;
1451
1452         rnode.dbNode = dbid;
1453         rnode.spcNode = 0;
1454         rnode.relNode = 0;
1455
1456         if (pendingOpsTable)
1457         {
1458                 /* standalone backend or startup process: fsync state is local */
1459                 RememberFsyncRequest(rnode, FORGET_DATABASE_FSYNC);
1460         }
1461         else if (IsUnderPostmaster)
1462         {
1463                 /* see notes in ForgetRelationFsyncRequests */
1464                 while (!ForwardFsyncRequest(rnode, FORGET_DATABASE_FSYNC))
1465                         pg_usleep(10000L);      /* 10 msec seems a good number */
1466         }
1467 }
1468
1469
1470 /*
1471  *      _fdvec_alloc() -- Make a MdfdVec object.
1472  */
1473 static MdfdVec *
1474 _fdvec_alloc(void)
1475 {
1476         return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
1477 }
1478
1479 #ifdef USE_SEGMENTED_FILES
1480
1481 /*
1482  * Open the specified segment of the relation,
1483  * and make a MdfdVec object for it.  Returns NULL on failure.
1484  */
1485 static MdfdVec *
1486 _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
1487 {
1488         MdfdVec    *v;
1489         int                     fd;
1490         char       *path,
1491                            *fullpath;
1492
1493         path = relpath(reln->smgr_rnode);
1494
1495         if (segno > 0)
1496         {
1497                 /* be sure we have enough space for the '.segno' */
1498                 fullpath = (char *) palloc(strlen(path) + 12);
1499                 sprintf(fullpath, "%s.%u", path, segno);
1500                 pfree(path);
1501         }
1502         else
1503                 fullpath = path;
1504
1505         /* open the file */
1506         fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
1507
1508         pfree(fullpath);
1509
1510         if (fd < 0)
1511                 return NULL;
1512
1513         /* allocate an mdfdvec entry for it */
1514         v = _fdvec_alloc();
1515
1516         /* fill the entry */
1517         v->mdfd_vfd = fd;
1518         v->mdfd_segno = segno;
1519         v->mdfd_chain = NULL;
1520         Assert(_mdnblocks(reln, v) <= ((BlockNumber) RELSEG_SIZE));
1521
1522         /* all done */
1523         return v;
1524 }
1525 #endif   /* USE_SEGMENTED_FILES */
1526
1527 /*
1528  *      _mdfd_getseg() -- Find the segment of the relation holding the
1529  *              specified block.
1530  *
1531  * If the segment doesn't exist, we ereport, return NULL, or create the
1532  * segment, according to "behavior".  Note: isTemp need only be correct
1533  * in the EXTENSION_CREATE case.
1534  */
1535 static MdfdVec *
1536 _mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool isTemp,
1537                          ExtensionBehavior behavior)
1538 {
1539         MdfdVec    *v = mdopen(reln, behavior);
1540
1541 #ifdef USE_SEGMENTED_FILES
1542         BlockNumber targetseg;
1543         BlockNumber nextsegno;
1544
1545         if (!v)
1546                 return NULL;                    /* only possible if EXTENSION_RETURN_NULL */
1547
1548         targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1549         for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
1550         {
1551                 Assert(nextsegno == v->mdfd_segno + 1);
1552
1553                 if (v->mdfd_chain == NULL)
1554                 {
1555                         /*
1556                          * Normally we will create new segments only if authorized by the
1557                          * caller (i.e., we are doing mdextend()).      But when doing WAL
1558                          * recovery, create segments anyway; this allows cases such as
1559                          * replaying WAL data that has a write into a high-numbered
1560                          * segment of a relation that was later deleted.  We want to go
1561                          * ahead and create the segments so we can finish out the replay.
1562                          *
1563                          * We have to maintain the invariant that segments before the last
1564                          * active segment are of size RELSEG_SIZE; therefore, pad them out
1565                          * with zeroes if needed.  (This only matters if caller is
1566                          * extending the relation discontiguously, but that can happen in
1567                          * hash indexes.)
1568                          */
1569                         if (behavior == EXTENSION_CREATE || InRecovery)
1570                         {
1571                                 if (_mdnblocks(reln, v) < RELSEG_SIZE)
1572                                 {
1573                                         char       *zerobuf = palloc0(BLCKSZ);
1574
1575                                         mdextend(reln, nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
1576                                                          zerobuf, isTemp);
1577                                         pfree(zerobuf);
1578                                 }
1579                                 v->mdfd_chain = _mdfd_openseg(reln, nextsegno, O_CREAT);
1580                         }
1581                         else
1582                         {
1583                                 /* We won't create segment if not existent */
1584                                 v->mdfd_chain = _mdfd_openseg(reln, nextsegno, 0);
1585                         }
1586                         if (v->mdfd_chain == NULL)
1587                         {
1588                                 if (behavior == EXTENSION_RETURN_NULL &&
1589                                         FILE_POSSIBLY_DELETED(errno))
1590                                         return NULL;
1591                                 ereport(ERROR,
1592                                                 (errcode_for_file_access(),
1593                                                  errmsg("could not open segment %u of relation %u/%u/%u (target block %u): %m",
1594                                                                 nextsegno,
1595                                                                 reln->smgr_rnode.spcNode,
1596                                                                 reln->smgr_rnode.dbNode,
1597                                                                 reln->smgr_rnode.relNode,
1598                                                                 blkno)));
1599                         }
1600                 }
1601                 v = v->mdfd_chain;
1602         }
1603 #endif
1604
1605         return v;
1606 }
1607
1608 /*
1609  * Get number of blocks present in a single disk file
1610  */
1611 static BlockNumber
1612 _mdnblocks(SMgrRelation reln, MdfdVec *seg)
1613 {
1614         off_t           len;
1615
1616         len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
1617         if (len < 0)
1618                 ereport(ERROR,
1619                                 (errcode_for_file_access(),
1620                 errmsg("could not seek to end of segment %u of relation %u/%u/%u: %m",
1621                            seg->mdfd_segno,
1622                            reln->smgr_rnode.spcNode,
1623                            reln->smgr_rnode.dbNode,
1624                            reln->smgr_rnode.relNode)));
1625         /* note that this calculation will ignore any partial block at EOF */
1626         return (BlockNumber) (len / BLCKSZ);
1627 }