OSDN Git Service

Use "transient" files for blind writes, take 2
[pg-rex/syncrep.git] / src / backend / storage / smgr / md.c
1 /*-------------------------------------------------------------------------
2  *
3  * md.c
4  *        This code manages relations that reside on magnetic disk.
5  *
6  * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *        src/backend/storage/smgr/md.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16
17 #include <unistd.h>
18 #include <fcntl.h>
19 #include <sys/file.h>
20
21 #include "catalog/catalog.h"
22 #include "miscadmin.h"
23 #include "portability/instr_time.h"
24 #include "postmaster/bgwriter.h"
25 #include "storage/fd.h"
26 #include "storage/bufmgr.h"
27 #include "storage/relfilenode.h"
28 #include "storage/smgr.h"
29 #include "utils/hsearch.h"
30 #include "utils/memutils.h"
31 #include "pg_trace.h"
32
33
/* interval for calling AbsorbFsyncRequests in mdsync */
#define FSYNCS_PER_ABSORB               10

/*
 * Special values for the segno arg to RememberFsyncRequest.
 *
 * These are "impossible" segment numbers, counting down from
 * InvalidBlockNumber, so they cannot collide with any real segment number.
 *
 * Note that CompactBgwriterRequestQueue assumes that it's OK to remove an
 * fsync request from the queue if an identical, subsequent request is found.
 * See comments there before making changes here.
 */
#define FORGET_RELATION_FSYNC   (InvalidBlockNumber)
#define FORGET_DATABASE_FSYNC   (InvalidBlockNumber-1)
#define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
47
/*
 * On Windows, we have to interpret EACCES as possibly meaning the same as
 * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
 * that's what you get.  Ugh.  This code is designed so that we don't
 * actually believe these cases are okay without further evidence (namely,
 * a pending fsync request getting revoked ... see mdsync).
 *
 * The argument is an errno value saved from a failed file operation.
 */
#ifndef WIN32
#define FILE_POSSIBLY_DELETED(err)      ((err) == ENOENT)
#else
#define FILE_POSSIBLY_DELETED(err)      ((err) == ENOENT || (err) == EACCES)
#endif
60
61 /*
62  *      The magnetic disk storage manager keeps track of open file
63  *      descriptors in its own descriptor pool.  This is done to make it
64  *      easier to support relations that are larger than the operating
65  *      system's file size limit (often 2GBytes).  In order to do that,
66  *      we break relations up into "segment" files that are each shorter than
67  *      the OS file size limit.  The segment size is set by the RELSEG_SIZE
68  *      configuration constant in pg_config.h.
69  *
70  *      On disk, a relation must consist of consecutively numbered segment
71  *      files in the pattern
72  *              -- Zero or more full segments of exactly RELSEG_SIZE blocks each
73  *              -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
74  *              -- Optionally, any number of inactive segments of size 0 blocks.
75  *      The full and partial segments are collectively the "active" segments.
76  *      Inactive segments are those that once contained data but are currently
77  *      not needed because of an mdtruncate() operation.  The reason for leaving
78  *      them present at size zero, rather than unlinking them, is that other
79  *      backends and/or the bgwriter might be holding open file references to
80  *      such segments.  If the relation expands again after mdtruncate(), such
81  *      that a deactivated segment becomes active again, it is important that
82  *      such file references still be valid --- else data might get written
83  *      out to an unlinked old copy of a segment file that will eventually
84  *      disappear.
85  *
86  *      The file descriptor pointer (md_fd field) stored in the SMgrRelation
87  *      cache is, therefore, just the head of a list of MdfdVec objects, one
88  *      per segment.  But note the md_fd pointer can be NULL, indicating
89  *      relation not open.
90  *
91  *      Also note that mdfd_chain == NULL does not necessarily mean the relation
92  *      doesn't have another segment after this one; we may just not have
93  *      opened the next segment yet.  (We could not have "all segments are
94  *      in the chain" as an invariant anyway, since another backend could
95  *      extend the relation when we weren't looking.)  We do not make chain
96  *      entries for inactive segments, however; as soon as we find a partial
97  *      segment, we assume that any subsequent segments are inactive.
98  *
99  *      All MdfdVec objects are palloc'd in the MdCxt memory context.
100  */
101
/*
 * One open segment file of a relation fork.  Segments are chained in
 * increasing segment-number order, headed by the SMgrRelation's md_fd
 * pointer (see the narrative comment above).
 */
typedef struct _MdfdVec
{
	File		mdfd_vfd;		/* fd number in fd.c's pool */
	BlockNumber mdfd_segno;		/* segment number, from 0 */
	struct _MdfdVec *mdfd_chain;	/* next segment, or NULL */
} MdfdVec;

static MemoryContext MdCxt;		/* context for all md.c allocations */
110
111
112 /*
113  * In some contexts (currently, standalone backends and the bgwriter process)
114  * we keep track of pending fsync operations: we need to remember all relation
115  * segments that have been written since the last checkpoint, so that we can
116  * fsync them down to disk before completing the next checkpoint.  This hash
117  * table remembers the pending operations.      We use a hash table mostly as
118  * a convenient way of eliminating duplicate requests.
119  *
120  * We use a similar mechanism to remember no-longer-needed files that can
121  * be deleted after the next checkpoint, but we use a linked list instead of
122  * a hash table, because we don't expect there to be any duplicate requests.
123  *
124  * (Regular backends do not track pending operations locally, but forward
125  * them to the bgwriter.)
126  */
/* Hash key identifying one relation segment that needs an fsync. */
typedef struct
{
	RelFileNodeBackend rnode;	/* the targeted relation */
	ForkNumber	forknum;		/* which fork of the relation */
	BlockNumber segno;			/* which segment */
} PendingOperationTag;

typedef uint16 CycleCtr;		/* can be any convenient integer size */

typedef struct
{
	PendingOperationTag tag;	/* hash table key (must be first!) */
	bool		canceled;		/* T => request canceled, not yet removed */
	CycleCtr	cycle_ctr;		/* mdsync_cycle_ctr when request was made */
} PendingOperationEntry;

typedef struct
{
	RelFileNodeBackend rnode;	/* the dead relation to delete */
	CycleCtr	cycle_ctr;		/* mdckpt_cycle_ctr when request was made */
} PendingUnlinkEntry;

/* pending fsyncs, keyed by PendingOperationTag; NULL when requests are forwarded */
static HTAB *pendingOpsTable = NULL;
/* PendingUnlinkEntry list: files to remove after the next checkpoint */
static List *pendingUnlinks = NIL;

static CycleCtr mdsync_cycle_ctr = 0;	/* stamped into fsync requests; see mdsync */
static CycleCtr mdckpt_cycle_ctr = 0;	/* stamped into unlink requests */
154
155
/*
 * What to do when a requested segment file does not exist on disk.
 */
typedef enum					/* behavior for mdopen & _mdfd_getseg */
{
	EXTENSION_FAIL,				/* ereport if segment not present */
	EXTENSION_RETURN_NULL,		/* return NULL if not present */
	EXTENSION_CREATE			/* create new segments as needed */
} ExtensionBehavior;
162
163 /* local routines */
164 static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum,
165            ExtensionBehavior behavior);
166 static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
167                                            MdfdVec *seg);
168 static void register_unlink(RelFileNodeBackend rnode);
169 static MdfdVec *_fdvec_alloc(void);
170 static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
171                           BlockNumber segno);
172 static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno,
173                           BlockNumber segno, int oflags);
174 static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno,
175                          BlockNumber blkno, bool skipFsync, ExtensionBehavior behavior);
176 static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
177                    MdfdVec *seg);
178
179
180 /*
181  *      mdinit() -- Initialize private state for magnetic disk storage manager.
182  */
183 void
184 mdinit(void)
185 {
186         MdCxt = AllocSetContextCreate(TopMemoryContext,
187                                                                   "MdSmgr",
188                                                                   ALLOCSET_DEFAULT_MINSIZE,
189                                                                   ALLOCSET_DEFAULT_INITSIZE,
190                                                                   ALLOCSET_DEFAULT_MAXSIZE);
191
192         /*
193          * Create pending-operations hashtable if we need it.  Currently, we need
194          * it if we are standalone (not under a postmaster) OR if we are a
195          * bootstrap-mode subprocess of a postmaster (that is, a startup or
196          * bgwriter process).
197          */
198         if (!IsUnderPostmaster || IsBootstrapProcessingMode())
199         {
200                 HASHCTL         hash_ctl;
201
202                 MemSet(&hash_ctl, 0, sizeof(hash_ctl));
203                 hash_ctl.keysize = sizeof(PendingOperationTag);
204                 hash_ctl.entrysize = sizeof(PendingOperationEntry);
205                 hash_ctl.hash = tag_hash;
206                 hash_ctl.hcxt = MdCxt;
207                 pendingOpsTable = hash_create("Pending Ops Table",
208                                                                           100L,
209                                                                           &hash_ctl,
210                                                                    HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
211                 pendingUnlinks = NIL;
212         }
213 }
214
215 /*
216  * In archive recovery, we rely on bgwriter to do fsyncs, but we will have
217  * already created the pendingOpsTable during initialization of the startup
218  * process.  Calling this function drops the local pendingOpsTable so that
219  * subsequent requests will be forwarded to bgwriter.
220  */
221 void
222 SetForwardFsyncRequests(void)
223 {
224         /* Perform any pending ops we may have queued up */
225         if (pendingOpsTable)
226                 mdsync();
227         pendingOpsTable = NULL;
228 }
229
230 /*
231  *      mdexists() -- Does the physical file exist?
232  *
233  * Note: this will return true for lingering files, with pending deletions
234  */
235 bool
236 mdexists(SMgrRelation reln, ForkNumber forkNum)
237 {
238         /*
239          * Close it first, to ensure that we notice if the fork has been unlinked
240          * since we opened it.
241          */
242         mdclose(reln, forkNum);
243
244         return (mdopen(reln, forkNum, EXTENSION_RETURN_NULL) != NULL);
245 }
246
/*
 *	mdcreate() -- Create a new relation on magnetic disk.
 *
 * Creates the first segment file for the given fork and leaves it open in
 * reln->md_fd[forkNum].
 *
 * If isRedo is true, it's okay for the relation to exist already.
 */
void
mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
{
	char	   *path;
	File		fd;

	if (isRedo && reln->md_fd[forkNum] != NULL)
		return;					/* created and opened already... */

	Assert(reln->md_fd[forkNum] == NULL);

	path = relpath(reln->smgr_rnode, forkNum);

	/* O_EXCL: in the normal case a pre-existing file is an error */
	fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);

	if (fd < 0)
	{
		int			save_errno = errno;

		/*
		 * During bootstrap, there are cases where a system relation will be
		 * accessed (by internal backend processes) before the bootstrap
		 * script nominally creates it.  Therefore, allow the file to exist
		 * already, even if isRedo is not set.  (See also mdopen)
		 */
		if (isRedo || IsBootstrapProcessingMode())
			fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
		if (fd < 0)
		{
			/* be sure to report the error reported by create, not open */
			errno = save_errno;
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not create file \"%s\": %m", path)));
		}
	}

	pfree(path);

	/* propagate the relation's transient marking to the new file */
	if (reln->smgr_transient)
		FileSetTransient(fd);

	reln->md_fd[forkNum] = _fdvec_alloc();

	reln->md_fd[forkNum]->mdfd_vfd = fd;
	reln->md_fd[forkNum]->mdfd_segno = 0;
	reln->md_fd[forkNum]->mdfd_chain = NULL;
}
300
/*
 *	mdunlink() -- Unlink a relation.
 *
 * Note that we're passed a RelFileNode --- by the time this is called,
 * there won't be an SMgrRelation hashtable entry anymore.
 *
 * Actually, we don't unlink the first segment file of the relation, but
 * just truncate it to zero length, and record a request to unlink it after
 * the next checkpoint.  Additional segments can be unlinked immediately,
 * however.  Leaving the empty file in place prevents that relfilenode
 * number from being reused.  The scenario this protects us from is:
 * 1. We delete a relation (and commit, and actually remove its file).
 * 2. We create a new relation, which by chance gets the same relfilenode as
 *	  the just-deleted one (OIDs must've wrapped around for that to happen).
 * 3. We crash before another checkpoint occurs.
 * During replay, we would delete the file and then recreate it, which is fine
 * if the contents of the file were repopulated by subsequent WAL entries.
 * But if we didn't WAL-log insertions, but instead relied on fsyncing the
 * file after populating it (as for instance CLUSTER and CREATE INDEX do),
 * the contents of the file would be lost forever.  By leaving the empty file
 * until after the next checkpoint, we prevent reassignment of the relfilenode
 * number until it's safe, because relfilenode assignment skips over any
 * existing file.
 *
 * If isRedo is true, it's okay for the relation to be already gone.
 * Also, we should remove the file immediately instead of queuing a request
 * for later, since during redo there's no possibility of creating a
 * conflicting relation.
 *
 * Note: any failure should be reported as WARNING not ERROR, because
 * we are usually not in a transaction anymore when this is called.
 */
void
mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
{
	char	   *path;
	int			ret;

	/*
	 * We have to clean out any pending fsync requests for the doomed
	 * relation, else the next mdsync() will fail.
	 */
	ForgetRelationFsyncRequests(rnode, forkNum);

	path = relpath(rnode, forkNum);

	/*
	 * Delete or truncate the first segment.  Only a main-fork, non-redo
	 * unlink gets the truncate-now-delete-later treatment described above;
	 * everything else is removed immediately.
	 */
	if (isRedo || forkNum != MAIN_FORKNUM)
	{
		ret = unlink(path);
		if (ret < 0)
		{
			/* during redo a missing file is expected, not warning-worthy */
			if (!isRedo || errno != ENOENT)
				ereport(WARNING,
						(errcode_for_file_access(),
						 errmsg("could not remove file \"%s\": %m", path)));
		}
	}
	else
	{
		/* truncate(2) would be easier here, but Windows hasn't got it */
		int			fd;

		fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
		if (fd >= 0)
		{
			int			save_errno;

			ret = ftruncate(fd, 0);
			save_errno = errno; /* close() might clobber ftruncate's errno */
			close(fd);
			errno = save_errno;
		}
		else
			ret = -1;
		if (ret < 0 && errno != ENOENT)
			ereport(WARNING,
					(errcode_for_file_access(),
					 errmsg("could not truncate file \"%s\": %m", path)));
	}

	/*
	 * Delete any additional segments.  (Skipped if we failed on the first
	 * segment above.)
	 */
	if (ret >= 0)
	{
		char	   *segpath = (char *) palloc(strlen(path) + 12);
		BlockNumber segno;

		/*
		 * Note that because we loop until getting ENOENT, we will correctly
		 * remove all inactive segments as well as active ones.
		 */
		for (segno = 1;; segno++)
		{
			sprintf(segpath, "%s.%u", path, segno);
			if (unlink(segpath) < 0)
			{
				/* ENOENT is expected after the last segment... */
				if (errno != ENOENT)
					ereport(WARNING,
							(errcode_for_file_access(),
							 errmsg("could not remove file \"%s\": %m", segpath)));
				break;
			}
		}
		pfree(segpath);
	}

	pfree(path);

	/* Register request to unlink first segment later */
	if (!isRedo && forkNum == MAIN_FORKNUM)
		register_unlink(rnode);
}
418
/*
 *	mdextend() -- Add a block to the specified relation.
 *
 *		The semantics are nearly the same as mdwrite(): write at the
 *		specified position.  However, this is to be used for the case of
 *		extending a relation (i.e., blocknum is at or beyond the current
 *		EOF).  Note that we assume writing a block beyond current EOF
 *		causes intervening file space to become filled with zeroes.
 */
void
mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
		 char *buffer, bool skipFsync)
{
	off_t		seekpos;
	int			nbytes;
	MdfdVec    *v;

	/* This assert is too expensive to have on normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
	Assert(blocknum >= mdnblocks(reln, forknum));
#endif

	/*
	 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
	 * more --- we mustn't create a block whose number actually is
	 * InvalidBlockNumber.
	 */
	if (blocknum == InvalidBlockNumber)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("cannot extend file \"%s\" beyond %u blocks",
						relpath(reln->smgr_rnode, forknum),
						InvalidBlockNumber)));

	v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);

	/* byte offset of the block within its RELSEG_SIZE-block segment file */
	seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));

	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

	/*
	 * Note: because caller usually obtained blocknum by calling mdnblocks,
	 * which did a seek(SEEK_END), this seek is often redundant and will be
	 * optimized away by fd.c.	It's not redundant, however, if there is a
	 * partial page at the end of the file. In that case we want to try to
	 * overwrite the partial page with a full page.  It's also not redundant
	 * if bufmgr.c had to dump another buffer of the same file to make room
	 * for the new page's buffer.
	 */
	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek to block %u in file \"%s\": %m",
						blocknum, FilePathName(v->mdfd_vfd))));

	if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
	{
		if (nbytes < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not extend file \"%s\": %m",
							FilePathName(v->mdfd_vfd)),
					 errhint("Check free disk space.")));
		/* short write: complain appropriately */
		ereport(ERROR,
				(errcode(ERRCODE_DISK_FULL),
				 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
						FilePathName(v->mdfd_vfd),
						nbytes, BLCKSZ, blocknum),
				 errhint("Check free disk space.")));
	}

	/* schedule an fsync unless the caller opted out or the rel is temp */
	if (!skipFsync && !SmgrIsTemp(reln))
		register_dirty_segment(reln, forknum, v);

	Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
}
496
/*
 *	mdopen() -- Open the specified relation.
 *
 * Note we only open the first segment, when there are multiple segments.
 *
 * If first segment is not present, either ereport or return NULL according
 * to "behavior".  We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
 * EXTENSION_CREATE means it's OK to extend an existing relation, not to
 * invent one out of whole cloth.
 */
static MdfdVec *
mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior)
{
	MdfdVec    *mdfd;
	char	   *path;
	File		fd;

	/* No work if already open */
	if (reln->md_fd[forknum])
		return reln->md_fd[forknum];

	path = relpath(reln->smgr_rnode, forknum);

	fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);

	if (fd < 0)
	{
		/*
		 * During bootstrap, there are cases where a system relation will be
		 * accessed (by internal backend processes) before the bootstrap
		 * script nominally creates it.  Therefore, accept mdopen() as a
		 * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
		 */
		if (IsBootstrapProcessingMode())
			fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
		if (fd < 0)
		{
			/* a possibly-deleted file is OK only for EXTENSION_RETURN_NULL */
			if (behavior == EXTENSION_RETURN_NULL &&
				FILE_POSSIBLY_DELETED(errno))
			{
				pfree(path);
				return NULL;
			}
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", path)));
		}
	}

	pfree(path);

	/* propagate the relation's transient marking to the opened file */
	if (reln->smgr_transient)
		FileSetTransient(fd);

	reln->md_fd[forknum] = mdfd = _fdvec_alloc();

	mdfd->mdfd_vfd = fd;
	mdfd->mdfd_segno = 0;
	mdfd->mdfd_chain = NULL;
	Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));

	return mdfd;
}
560
561 /*
562  *      mdclose() -- Close the specified relation, if it isn't closed already.
563  */
564 void
565 mdclose(SMgrRelation reln, ForkNumber forknum)
566 {
567         MdfdVec    *v = reln->md_fd[forknum];
568
569         /* No work if already closed */
570         if (v == NULL)
571                 return;
572
573         reln->md_fd[forknum] = NULL;    /* prevent dangling pointer after error */
574
575         while (v != NULL)
576         {
577                 MdfdVec    *ov = v;
578
579                 /* if not closed already */
580                 if (v->mdfd_vfd >= 0)
581                         FileClose(v->mdfd_vfd);
582                 /* Now free vector */
583                 v = v->mdfd_chain;
584                 pfree(ov);
585         }
586 }
587
588 /*
589  *      mdprefetch() -- Initiate asynchronous read of the specified block of a relation
590  */
591 void
592 mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
593 {
594 #ifdef USE_PREFETCH
595         off_t           seekpos;
596         MdfdVec    *v;
597
598         v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);
599
600         seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
601
602         Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
603
604         (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ);
605 #endif   /* USE_PREFETCH */
606 }
607
608
/*
 *	mdread() -- Read the specified block from a relation.
 *
 * Reads exactly BLCKSZ bytes into "buffer".  A short read is normally an
 * error, but may be converted to a zero-filled page (see below).
 */
void
mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
	   char *buffer)
{
	off_t		seekpos;
	int			nbytes;
	MdfdVec    *v;

	TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
										reln->smgr_rnode.node.spcNode,
										reln->smgr_rnode.node.dbNode,
										reln->smgr_rnode.node.relNode,
										reln->smgr_rnode.backend);

	v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);

	/* byte offset of the block within its segment file */
	seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));

	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek to block %u in file \"%s\": %m",
						blocknum, FilePathName(v->mdfd_vfd))));

	nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ);

	TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
									   reln->smgr_rnode.node.spcNode,
									   reln->smgr_rnode.node.dbNode,
									   reln->smgr_rnode.node.relNode,
									   reln->smgr_rnode.backend,
									   nbytes,
									   BLCKSZ);

	if (nbytes != BLCKSZ)
	{
		if (nbytes < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not read block %u in file \"%s\": %m",
							blocknum, FilePathName(v->mdfd_vfd))));

		/*
		 * Short read: we are at or past EOF, or we read a partial block at
		 * EOF.  Normally this is an error; upper levels should never try to
		 * read a nonexistent block.  However, if zero_damaged_pages is ON or
		 * we are InRecovery, we should instead return zeroes without
		 * complaining.  This allows, for example, the case of trying to
		 * update a block that was later truncated away.
		 */
		if (zero_damaged_pages || InRecovery)
			MemSet(buffer, 0, BLCKSZ);
		else
			ereport(ERROR,
					(errcode(ERRCODE_DATA_CORRUPTED),
					 errmsg("could not read block %u in file \"%s\": read only %d of %d bytes",
							blocknum, FilePathName(v->mdfd_vfd),
							nbytes, BLCKSZ)));
	}
}
674
/*
 *	mdwrite() -- Write the supplied block at the appropriate location.
 *
 *		This is to be used only for updating already-existing blocks of a
 *		relation (ie, those before the current EOF).  To extend a relation,
 *		use mdextend().
 */
void
mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
		char *buffer, bool skipFsync)
{
	off_t		seekpos;
	int			nbytes;
	MdfdVec    *v;

	/* This assert is too expensive to have on normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
	Assert(blocknum < mdnblocks(reln, forknum));
#endif

	TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
										 reln->smgr_rnode.node.spcNode,
										 reln->smgr_rnode.node.dbNode,
										 reln->smgr_rnode.node.relNode,
										 reln->smgr_rnode.backend);

	v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_FAIL);

	/* byte offset of the block within its segment file */
	seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));

	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek to block %u in file \"%s\": %m",
						blocknum, FilePathName(v->mdfd_vfd))));

	nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ);

	TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
										reln->smgr_rnode.node.spcNode,
										reln->smgr_rnode.node.dbNode,
										reln->smgr_rnode.node.relNode,
										reln->smgr_rnode.backend,
										nbytes,
										BLCKSZ);

	if (nbytes != BLCKSZ)
	{
		if (nbytes < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write block %u in file \"%s\": %m",
							blocknum, FilePathName(v->mdfd_vfd))));
		/* short write: complain appropriately */
		ereport(ERROR,
				(errcode(ERRCODE_DISK_FULL),
				 errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes",
						blocknum,
						FilePathName(v->mdfd_vfd),
						nbytes, BLCKSZ),
				 errhint("Check free disk space.")));
	}

	/* schedule an fsync unless the caller opted out or the rel is temp */
	if (!skipFsync && !SmgrIsTemp(reln))
		register_dirty_segment(reln, forknum, v);
}
743
744 /*
745  *      mdnblocks() -- Get the number of blocks stored in a relation.
746  *
747  *              Important side effect: all active segments of the relation are opened
748  *              and added to the mdfd_chain list.  If this routine has not been
749  *              called, then only segments up to the last one actually touched
750  *              are present in the chain.
751  */
BlockNumber
mdnblocks(SMgrRelation reln, ForkNumber forknum)
{
	MdfdVec    *v = mdopen(reln, forknum, EXTENSION_FAIL);
	BlockNumber nblocks;
	BlockNumber segno = 0;

	/*
	 * Skip through any segments that aren't the last one, to avoid redundant
	 * seeks on them.  We have previously verified that these segments are
	 * exactly RELSEG_SIZE long, and it's useless to recheck that each time.
	 *
	 * NOTE: this assumption could only be wrong if another backend has
	 * truncated the relation.  We rely on higher code levels to handle that
	 * scenario by closing and re-opening the md fd, which is handled via
	 * relcache flush.  (Since the bgwriter doesn't participate in relcache
	 * flush, it could have segment chain entries for inactive segments;
	 * that's OK because the bgwriter never needs to compute relation size.)
	 */
	while (v->mdfd_chain != NULL)
	{
		segno++;
		v = v->mdfd_chain;
	}

	/*
	 * Starting from the last already-opened segment, probe each segment's
	 * length until we find one shorter than RELSEG_SIZE; that one is the
	 * true last segment, and the total size follows from segno.
	 */
	for (;;)
	{
		nblocks = _mdnblocks(reln, forknum, v);
		if (nblocks > ((BlockNumber) RELSEG_SIZE))
			elog(FATAL, "segment too big");
		if (nblocks < ((BlockNumber) RELSEG_SIZE))
			return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;

		/*
		 * If segment is exactly RELSEG_SIZE, advance to next one.
		 */
		segno++;

		if (v->mdfd_chain == NULL)
		{
			/*
			 * Because we pass O_CREAT, we will create the next segment (with
			 * zero length) immediately, if the last segment is of length
			 * RELSEG_SIZE.  While perhaps not strictly necessary, this keeps
			 * the logic simple.
			 */
			v->mdfd_chain = _mdfd_openseg(reln, forknum, segno, O_CREAT);
			if (v->mdfd_chain == NULL)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not open file \"%s\": %m",
								_mdfd_segpath(reln, forknum, segno))));
		}

		v = v->mdfd_chain;
	}
}
809
810 /*
811  *      mdtruncate() -- Truncate relation to specified number of blocks.
812  */
void
mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
{
	MdfdVec    *v;
	BlockNumber curnblk;
	BlockNumber priorblocks;	/* blocks accounted for by prior segments */

	/*
	 * NOTE: mdnblocks makes sure we have opened all active segments, so that
	 * truncation loop will get them all!
	 */
	curnblk = mdnblocks(reln, forknum);
	if (nblocks > curnblk)
	{
		/* Bogus request ... but no complaint if InRecovery */
		if (InRecovery)
			return;
		ereport(ERROR,
				(errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
						relpath(reln->smgr_rnode, forknum),
						nblocks, curnblk)));
	}
	if (nblocks == curnblk)
		return;					/* no work */

	v = mdopen(reln, forknum, EXTENSION_FAIL);

	/*
	 * Walk the segment chain; for each segment decide whether it lies wholly
	 * past the truncation point (truncate to zero length and drop the chain
	 * entry), straddles it (truncate to the partial length and cut the
	 * chain), or lies wholly before it (keep as-is).
	 */
	priorblocks = 0;
	while (v != NULL)
	{
		MdfdVec    *ov = v;

		if (priorblocks > nblocks)
		{
			/*
			 * This segment is no longer active (and has already been unlinked
			 * from the mdfd_chain). We truncate the file, but do not delete
			 * it, for reasons explained in the header comments.
			 */
			if (FileTruncate(v->mdfd_vfd, 0) < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not truncate file \"%s\": %m",
								FilePathName(v->mdfd_vfd))));

			if (!SmgrIsTemp(reln))
				register_dirty_segment(reln, forknum, v);
			v = v->mdfd_chain;
			Assert(ov != reln->md_fd[forknum]); /* we never drop the 1st
												 * segment */
			pfree(ov);
		}
		else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
		{
			/*
			 * This is the last segment we want to keep. Truncate the file to
			 * the right length, and clear chain link that points to any
			 * remaining segments (which we shall zap). NOTE: if nblocks is
			 * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
			 * segment to 0 length but keep it. This adheres to the invariant
			 * given in the header comments.
			 */
			BlockNumber lastsegblocks = nblocks - priorblocks;

			if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ) < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
					errmsg("could not truncate file \"%s\" to %u blocks: %m",
						   FilePathName(v->mdfd_vfd),
						   nblocks)));
			if (!SmgrIsTemp(reln))
				register_dirty_segment(reln, forknum, v);
			v = v->mdfd_chain;
			ov->mdfd_chain = NULL;
		}
		else
		{
			/*
			 * We still need this segment and 0 or more blocks beyond it, so
			 * nothing to do here.
			 */
			v = v->mdfd_chain;
		}
		priorblocks += RELSEG_SIZE;
	}
}
899
900 /*
901  *      mdimmedsync() -- Immediately sync a relation to stable storage.
902  *
903  * Note that only writes already issued are synced; this routine knows
904  * nothing of dirty buffers that may exist inside the buffer manager.
905  */
906 void
907 mdimmedsync(SMgrRelation reln, ForkNumber forknum)
908 {
909         MdfdVec    *v;
910
911         /*
912          * NOTE: mdnblocks makes sure we have opened all active segments, so that
913          * fsync loop will get them all!
914          */
915         mdnblocks(reln, forknum);
916
917         v = mdopen(reln, forknum, EXTENSION_FAIL);
918
919         while (v != NULL)
920         {
921                 if (FileSync(v->mdfd_vfd) < 0)
922                         ereport(ERROR,
923                                         (errcode_for_file_access(),
924                                          errmsg("could not fsync file \"%s\": %m",
925                                                         FilePathName(v->mdfd_vfd))));
926                 v = v->mdfd_chain;
927         }
928 }
929
930 /*
931  *      mdsync() -- Sync previous writes to stable storage.
932  */
void
mdsync(void)
{
	/* true while a sync pass is underway; stays true if we error out */
	static bool mdsync_in_progress = false;

	HASH_SEQ_STATUS hstat;
	PendingOperationEntry *entry;
	int			absorb_counter;

	/* Statistics on sync times */
	int			processed = 0;
	instr_time	sync_start,
				sync_end,
				sync_diff;
	uint64		elapsed;
	uint64		longest = 0;
	uint64		total_elapsed = 0;

	/*
	 * This is only called during checkpoints, and checkpoints should only
	 * occur in processes that have created a pendingOpsTable.
	 */
	if (!pendingOpsTable)
		elog(ERROR, "cannot sync without a pendingOpsTable");

	/*
	 * If we are in the bgwriter, the sync had better include all fsync
	 * requests that were queued by backends up to this point.  The tightest
	 * race condition that could occur is that a buffer that must be written
	 * and fsync'd for the checkpoint could have been dumped by a backend just
	 * before it was visited by BufferSync().  We know the backend will have
	 * queued an fsync request before clearing the buffer's dirtybit, so we
	 * are safe as long as we do an Absorb after completing BufferSync().
	 */
	AbsorbFsyncRequests();

	/*
	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
	 * checkpoint), we want to ignore fsync requests that are entered into the
	 * hashtable after this point --- they should be processed next time,
	 * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
	 * ones: new ones will have cycle_ctr equal to the incremented value of
	 * mdsync_cycle_ctr.
	 *
	 * In normal circumstances, all entries present in the table at this point
	 * will have cycle_ctr exactly equal to the current (about to be old)
	 * value of mdsync_cycle_ctr.  However, if we fail partway through the
	 * fsync'ing loop, then older values of cycle_ctr might remain when we
	 * come back here to try again.  Repeated checkpoint failures would
	 * eventually wrap the counter around to the point where an old entry
	 * might appear new, causing us to skip it, possibly allowing a checkpoint
	 * to succeed that should not have.  To forestall wraparound, any time the
	 * previous mdsync() failed to complete, run through the table and
	 * forcibly set cycle_ctr = mdsync_cycle_ctr.
	 *
	 * Think not to merge this loop with the main loop, as the problem is
	 * exactly that that loop may fail before having visited all the entries.
	 * From a performance point of view it doesn't matter anyway, as this path
	 * will never be taken in a system that's functioning normally.
	 */
	if (mdsync_in_progress)
	{
		/* prior try failed, so update any stale cycle_ctr values */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			entry->cycle_ctr = mdsync_cycle_ctr;
		}
	}

	/* Advance counter so that new hashtable entries are distinguishable */
	mdsync_cycle_ctr++;

	/* Set flag to detect failure if we don't reach the end of the loop */
	mdsync_in_progress = true;

	/* Now scan the hashtable for fsync requests to process */
	absorb_counter = FSYNCS_PER_ABSORB;
	hash_seq_init(&hstat, pendingOpsTable);
	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
	{
		/*
		 * If the entry is new then don't process it this time.  Note that
		 * "continue" bypasses the hash-remove call at the bottom of the loop.
		 */
		if (entry->cycle_ctr == mdsync_cycle_ctr)
			continue;

		/* Else assert we haven't missed it */
		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);

		/*
		 * If fsync is off then we don't have to bother opening the file at
		 * all.  (We delay checking until this point so that changing fsync on
		 * the fly behaves sensibly.)  Also, if the entry is marked canceled,
		 * fall through to delete it.
		 */
		if (enableFsync && !entry->canceled)
		{
			int			failures;

			/*
			 * If in bgwriter, we want to absorb pending requests every so
			 * often to prevent overflow of the fsync request queue.  It is
			 * unspecified whether newly-added entries will be visited by
			 * hash_seq_search, but we don't care since we don't need to
			 * process them anyway.
			 */
			if (--absorb_counter <= 0)
			{
				AbsorbFsyncRequests();
				absorb_counter = FSYNCS_PER_ABSORB;
			}

			/*
			 * The fsync table could contain requests to fsync segments that
			 * have been deleted (unlinked) by the time we get to them. Rather
			 * than just hoping an ENOENT (or EACCES on Windows) error can be
			 * ignored, what we do on error is absorb pending requests and
			 * then retry.  Since mdunlink() queues a "revoke" message before
			 * actually unlinking, the fsync request is guaranteed to be
			 * marked canceled after the absorb if it really was this case.
			 * DROP DATABASE likewise has to tell us to forget fsync requests
			 * before it starts deletions.
			 */
			for (failures = 0;; failures++)		/* loop exits at "break" */
			{
				SMgrRelation reln;
				MdfdVec    *seg;
				char	   *path;

				/*
				 * Find or create an smgr hash entry for this relation. This
				 * may seem a bit unclean -- md calling smgr?  But it's really
				 * the best solution.  It ensures that the open file reference
				 * isn't permanently leaked if we get an error here. (You may
				 * say "but an unreferenced SMgrRelation is still a leak!" Not
				 * really, because the only case in which a checkpoint is done
				 * by a process that isn't about to shut down is in the
				 * bgwriter, and it will periodically do smgrcloseall(). This
				 * fact justifies our not closing the reln in the success path
				 * either, which is a good thing since in non-bgwriter cases
				 * we couldn't safely do that.)  Furthermore, in many cases
				 * the relation will have been dirtied through this same smgr
				 * relation, and so we can save a file open/close cycle.
				 */
				reln = smgropen(entry->tag.rnode.node,
								entry->tag.rnode.backend);

				/*
				 * It is possible that the relation has been dropped or
				 * truncated since the fsync request was entered.  Therefore,
				 * allow ENOENT, but only if we didn't fail already on this
				 * file.  This applies both during _mdfd_getseg() and during
				 * FileSync, since fd.c might have closed the file behind our
				 * back.
				 */
				seg = _mdfd_getseg(reln, entry->tag.forknum,
						  entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
								   false, EXTENSION_RETURN_NULL);

				/* Time each fsync only when we'll log checkpoint stats. */
				if (log_checkpoints)
					INSTR_TIME_SET_CURRENT(sync_start);
				else
					INSTR_TIME_SET_ZERO(sync_start);

				if (seg != NULL &&
					FileSync(seg->mdfd_vfd) >= 0)
				{
					if (log_checkpoints && (!INSTR_TIME_IS_ZERO(sync_start)))
					{
						INSTR_TIME_SET_CURRENT(sync_end);
						sync_diff = sync_end;
						INSTR_TIME_SUBTRACT(sync_diff, sync_start);
						elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
						if (elapsed > longest)
							longest = elapsed;
						total_elapsed += elapsed;
						processed++;
						elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
							 processed, FilePathName(seg->mdfd_vfd), (double) elapsed / 1000);
					}

					break;		/* success; break out of retry loop */
				}

				/*
				 * XXX is there any point in allowing more than one retry?
				 * Don't see one at the moment, but easy to change the test
				 * here if so.
				 */
				path = _mdfd_segpath(reln, entry->tag.forknum,
									 entry->tag.segno);
				if (!FILE_POSSIBLY_DELETED(errno) ||
					failures > 0)
					ereport(ERROR,
							(errcode_for_file_access(),
						   errmsg("could not fsync file \"%s\": %m", path)));
				else
					ereport(DEBUG1,
							(errcode_for_file_access(),
					   errmsg("could not fsync file \"%s\" but retrying: %m",
							  path)));
				pfree(path);

				/*
				 * Absorb incoming requests and check to see if canceled.
				 */
				AbsorbFsyncRequests();
				absorb_counter = FSYNCS_PER_ABSORB;		/* might as well... */

				if (entry->canceled)
					break;
			}					/* end retry loop */
		}

		/*
		 * If we get here, either we fsync'd successfully, or we don't have to
		 * because enableFsync is off, or the entry is (now) marked canceled.
		 * Okay to delete it.
		 */
		if (hash_search(pendingOpsTable, &entry->tag,
						HASH_REMOVE, NULL) == NULL)
			elog(ERROR, "pendingOpsTable corrupted");
	}							/* end loop over hashtable entries */

	/* Return sync performance metrics for report at checkpoint end */
	CheckpointStats.ckpt_sync_rels = processed;
	CheckpointStats.ckpt_longest_sync = longest;
	CheckpointStats.ckpt_agg_sync_time = total_elapsed;

	/* Flag successful completion of mdsync */
	mdsync_in_progress = false;
}
1167
1168 /*
1169  * mdpreckpt() -- Do pre-checkpoint work
1170  *
1171  * To distinguish unlink requests that arrived before this checkpoint
1172  * started from those that arrived during the checkpoint, we use a cycle
1173  * counter similar to the one we use for fsync requests. That cycle
1174  * counter is incremented here.
1175  *
1176  * This must be called *before* the checkpoint REDO point is determined.
1177  * That ensures that we won't delete files too soon.
1178  *
1179  * Note that we can't do anything here that depends on the assumption
1180  * that the checkpoint will be completed.
1181  */
1182 void
1183 mdpreckpt(void)
1184 {
1185         ListCell   *cell;
1186
1187         /*
1188          * In case the prior checkpoint wasn't completed, stamp all entries in the
1189          * list with the current cycle counter.  Anything that's in the list at
1190          * the start of checkpoint can surely be deleted after the checkpoint is
1191          * finished, regardless of when the request was made.
1192          */
1193         foreach(cell, pendingUnlinks)
1194         {
1195                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
1196
1197                 entry->cycle_ctr = mdckpt_cycle_ctr;
1198         }
1199
1200         /*
1201          * Any unlink requests arriving after this point will be assigned the next
1202          * cycle counter, and won't be unlinked until next checkpoint.
1203          */
1204         mdckpt_cycle_ctr++;
1205 }
1206
1207 /*
1208  * mdpostckpt() -- Do post-checkpoint work
1209  *
1210  * Remove any lingering files that can now be safely removed.
1211  */
1212 void
1213 mdpostckpt(void)
1214 {
1215         while (pendingUnlinks != NIL)
1216         {
1217                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
1218                 char       *path;
1219
1220                 /*
1221                  * New entries are appended to the end, so if the entry is new we've
1222                  * reached the end of old entries.
1223                  */
1224                 if (entry->cycle_ctr == mdckpt_cycle_ctr)
1225                         break;
1226
1227                 /* Else assert we haven't missed it */
1228                 Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr);
1229
1230                 /* Unlink the file */
1231                 path = relpath(entry->rnode, MAIN_FORKNUM);
1232                 if (unlink(path) < 0)
1233                 {
1234                         /*
1235                          * There's a race condition, when the database is dropped at the
1236                          * same time that we process the pending unlink requests. If the
1237                          * DROP DATABASE deletes the file before we do, we will get ENOENT
1238                          * here. rmtree() also has to ignore ENOENT errors, to deal with
1239                          * the possibility that we delete the file first.
1240                          */
1241                         if (errno != ENOENT)
1242                                 ereport(WARNING,
1243                                                 (errcode_for_file_access(),
1244                                                  errmsg("could not remove file \"%s\": %m", path)));
1245                 }
1246                 pfree(path);
1247
1248                 pendingUnlinks = list_delete_first(pendingUnlinks);
1249                 pfree(entry);
1250         }
1251 }
1252
1253 /*
1254  * register_dirty_segment() -- Mark a relation segment as needing fsync
1255  *
1256  * If there is a local pending-ops table, just make an entry in it for
1257  * mdsync to process later.  Otherwise, try to pass off the fsync request
1258  * to the background writer process.  If that fails, just do the fsync
1259  * locally before returning (we expect this will not happen often enough
1260  * to be a performance problem).
1261  */
1262 static void
1263 register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
1264 {
1265         if (pendingOpsTable)
1266         {
1267                 /* push it into local pending-ops table */
1268                 RememberFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno);
1269         }
1270         else
1271         {
1272                 if (ForwardFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno))
1273                         return;                         /* passed it off successfully */
1274
1275                 ereport(DEBUG1,
1276                                 (errmsg("could not forward fsync request because request queue is full")));
1277
1278                 if (FileSync(seg->mdfd_vfd) < 0)
1279                         ereport(ERROR,
1280                                         (errcode_for_file_access(),
1281                                          errmsg("could not fsync file \"%s\": %m",
1282                                                         FilePathName(seg->mdfd_vfd))));
1283         }
1284 }
1285
1286 /*
1287  * register_unlink() -- Schedule a file to be deleted after next checkpoint
1288  *
1289  * As with register_dirty_segment, this could involve either a local or
1290  * a remote pending-ops table.
1291  */
1292 static void
1293 register_unlink(RelFileNodeBackend rnode)
1294 {
1295         if (pendingOpsTable)
1296         {
1297                 /* push it into local pending-ops table */
1298                 RememberFsyncRequest(rnode, MAIN_FORKNUM, UNLINK_RELATION_REQUEST);
1299         }
1300         else
1301         {
1302                 /*
1303                  * Notify the bgwriter about it.  If we fail to queue the request
1304                  * message, we have to sleep and try again, because we can't simply
1305                  * delete the file now.  Ugly, but hopefully won't happen often.
1306                  *
1307                  * XXX should we just leave the file orphaned instead?
1308                  */
1309                 Assert(IsUnderPostmaster);
1310                 while (!ForwardFsyncRequest(rnode, MAIN_FORKNUM,
1311                                                                         UNLINK_RELATION_REQUEST))
1312                         pg_usleep(10000L);      /* 10 msec seems a good number */
1313         }
1314 }
1315
/*
 * RememberFsyncRequest() -- callback from bgwriter side of fsync request
 *
 * We stuff most fsync requests into the local hash table for execution
 * during the bgwriter's next checkpoint.  UNLINK requests go into a
 * separate linked list, however, because they get processed separately.
 *
 * The range of possible segment numbers is way less than the range of
 * BlockNumber, so we can reserve high values of segno for special purposes.
 * We define three:
 * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation
 * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
 * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
 *	 checkpoint.
 *
 * (Handling the FORGET_* requests is a tad slow because the hash table has
 * to be searched linearly, but it doesn't seem worth rethinking the table
 * structure for them.)
 */
void
RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
					 BlockNumber segno)
{
	/* Only processes that own fsync state (bgwriter, standalone backend,
	 * startup process) may be called here. */
	Assert(pendingOpsTable);

	if (segno == FORGET_RELATION_FSYNC)
	{
		/* Remove any pending requests for the entire relation */
		HASH_SEQ_STATUS hstat;
		PendingOperationEntry *entry;

		/*
		 * NOTE(review): entries are marked canceled rather than removed from
		 * the hash table — presumably because this can run while another
		 * hash_seq scan (e.g. in mdsync) is in progress, where deleting
		 * arbitrary entries would be unsafe.  Confirm against dynahash rules.
		 */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			/* match both relfilenode and fork to cancel only this relation */
			if (RelFileNodeBackendEquals(entry->tag.rnode, rnode) &&
				entry->tag.forknum == forknum)
			{
				/* Okay, cancel this entry */
				entry->canceled = true;
			}
		}
	}
	else if (segno == FORGET_DATABASE_FSYNC)
	{
		/* Remove any pending requests for the entire database */
		HASH_SEQ_STATUS hstat;
		PendingOperationEntry *entry;
		ListCell   *cell,
				   *prev,
				   *next;

		/* Remove fsync requests: dbNode alone identifies the database */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			if (entry->tag.rnode.node.dbNode == rnode.node.dbNode)
			{
				/* Okay, cancel this entry */
				entry->canceled = true;
			}
		}

		/*
		 * Remove unlink requests.  Unlike the hash entries above, list cells
		 * CAN be deleted in place; track 'prev' so list_delete_cell can
		 * unlink the current cell, and fetch 'next' before deletion.
		 */
		prev = NULL;
		for (cell = list_head(pendingUnlinks); cell; cell = next)
		{
			PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);

			next = lnext(cell);
			if (entry->rnode.node.dbNode == rnode.node.dbNode)
			{
				pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
				pfree(entry);
			}
			else
				prev = cell;
		}
	}
	else if (segno == UNLINK_RELATION_REQUEST)
	{
		/*
		 * Unlink request: put it in the linked list.  The entry must live
		 * until the next checkpoint, so allocate it in MdCxt rather than
		 * the caller's (possibly short-lived) memory context.
		 */
		MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
		PendingUnlinkEntry *entry;

		entry = palloc(sizeof(PendingUnlinkEntry));
		entry->rnode = rnode;
		/* stamp with the current checkpoint cycle so mdpostckpt knows when
		 * it is safe to actually unlink */
		entry->cycle_ctr = mdckpt_cycle_ctr;

		pendingUnlinks = lappend(pendingUnlinks, entry);

		MemoryContextSwitchTo(oldcxt);
	}
	else
	{
		/* Normal case: enter a request to fsync this segment */
		PendingOperationTag key;
		PendingOperationEntry *entry;
		bool		found;

		/* ensure any pad bytes in the hash key are zeroed */
		MemSet(&key, 0, sizeof(key));
		key.rnode = rnode;
		key.forknum = forknum;
		key.segno = segno;

		entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
													  &key,
													  HASH_ENTER,
													  &found);
		/* if new or previously canceled entry, initialize it */
		if (!found || entry->canceled)
		{
			entry->canceled = false;
			entry->cycle_ctr = mdsync_cycle_ctr;
		}

		/*
		 * NB: it's intentional that we don't change cycle_ctr if the entry
		 * already exists.      The fsync request must be treated as old, even
		 * though the new request will be satisfied too by any subsequent
		 * fsync.
		 *
		 * However, if the entry is present but is marked canceled, we should
		 * act just as though it wasn't there.  The only case where this could
		 * happen would be if a file had been deleted, we received but did not
		 * yet act on the cancel request, and the same relfilenode was then
		 * assigned to a new file.      We mustn't lose the new request, but it
		 * should be considered new not old.
		 */
	}
}
1447
1448 /*
1449  * ForgetRelationFsyncRequests -- forget any fsyncs for a rel
1450  */
1451 void
1452 ForgetRelationFsyncRequests(RelFileNodeBackend rnode, ForkNumber forknum)
1453 {
1454         if (pendingOpsTable)
1455         {
1456                 /* standalone backend or startup process: fsync state is local */
1457                 RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC);
1458         }
1459         else if (IsUnderPostmaster)
1460         {
1461                 /*
1462                  * Notify the bgwriter about it.  If we fail to queue the revoke
1463                  * message, we have to sleep and try again ... ugly, but hopefully
1464                  * won't happen often.
1465                  *
1466                  * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
1467                  * error would leave the no-longer-used file still present on disk,
1468                  * which would be bad, so I'm inclined to assume that the bgwriter
1469                  * will always empty the queue soon.
1470                  */
1471                 while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
1472                         pg_usleep(10000L);      /* 10 msec seems a good number */
1473
1474                 /*
1475                  * Note we don't wait for the bgwriter to actually absorb the revoke
1476                  * message; see mdsync() for the implications.
1477                  */
1478         }
1479 }
1480
1481 /*
1482  * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB
1483  */
1484 void
1485 ForgetDatabaseFsyncRequests(Oid dbid)
1486 {
1487         RelFileNodeBackend rnode;
1488
1489         rnode.node.dbNode = dbid;
1490         rnode.node.spcNode = 0;
1491         rnode.node.relNode = 0;
1492         rnode.backend = InvalidBackendId;
1493
1494         if (pendingOpsTable)
1495         {
1496                 /* standalone backend or startup process: fsync state is local */
1497                 RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC);
1498         }
1499         else if (IsUnderPostmaster)
1500         {
1501                 /* see notes in ForgetRelationFsyncRequests */
1502                 while (!ForwardFsyncRequest(rnode, InvalidForkNumber,
1503                                                                         FORGET_DATABASE_FSYNC))
1504                         pg_usleep(10000L);      /* 10 msec seems a good number */
1505         }
1506 }
1507
1508
1509 /*
1510  *      _fdvec_alloc() -- Make a MdfdVec object.
1511  */
1512 static MdfdVec *
1513 _fdvec_alloc(void)
1514 {
1515         return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
1516 }
1517
1518 /*
1519  * Return the filename for the specified segment of the relation. The
1520  * returned string is palloc'd.
1521  */
1522 static char *
1523 _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
1524 {
1525         char       *path,
1526                            *fullpath;
1527
1528         path = relpath(reln->smgr_rnode, forknum);
1529
1530         if (segno > 0)
1531         {
1532                 /* be sure we have enough space for the '.segno' */
1533                 fullpath = (char *) palloc(strlen(path) + 12);
1534                 sprintf(fullpath, "%s.%u", path, segno);
1535                 pfree(path);
1536         }
1537         else
1538                 fullpath = path;
1539
1540         return fullpath;
1541 }
1542
1543 /*
1544  * Open the specified segment of the relation,
1545  * and make a MdfdVec object for it.  Returns NULL on failure.
1546  */
1547 static MdfdVec *
1548 _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
1549                           int oflags)
1550 {
1551         MdfdVec    *v;
1552         int                     fd;
1553         char       *fullpath;
1554
1555         fullpath = _mdfd_segpath(reln, forknum, segno);
1556
1557         /* open the file */
1558         fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
1559
1560         pfree(fullpath);
1561
1562         if (fd < 0)
1563                 return NULL;
1564
1565         if (reln->smgr_transient)
1566                 FileSetTransient(fd);
1567
1568         /* allocate an mdfdvec entry for it */
1569         v = _fdvec_alloc();
1570
1571         /* fill the entry */
1572         v->mdfd_vfd = fd;
1573         v->mdfd_segno = segno;
1574         v->mdfd_chain = NULL;
1575         Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
1576
1577         /* all done */
1578         return v;
1579 }
1580
/*
 *	_mdfd_getseg() -- Find the segment of the relation holding the
 *		specified block.
 *
 * If the segment doesn't exist, we ereport, return NULL, or create the
 * segment, according to "behavior".  Note: skipFsync is only used in the
 * EXTENSION_CREATE case.
 */
static MdfdVec *
_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
			 bool skipFsync, ExtensionBehavior behavior)
{
	MdfdVec    *v = mdopen(reln, forknum, behavior);
	BlockNumber targetseg;
	BlockNumber nextsegno;

	if (!v)
		return NULL;			/* only possible if EXTENSION_RETURN_NULL */

	/* Walk (and, if needed, extend) the segment chain up to the target. */
	targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
	for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
	{
		/* chain invariant: segments are linked in consecutive order */
		Assert(nextsegno == v->mdfd_segno + 1);

		if (v->mdfd_chain == NULL)
		{
			/*
			 * Normally we will create new segments only if authorized by the
			 * caller (i.e., we are doing mdextend()).      But when doing WAL
			 * recovery, create segments anyway; this allows cases such as
			 * replaying WAL data that has a write into a high-numbered
			 * segment of a relation that was later deleted.  We want to go
			 * ahead and create the segments so we can finish out the replay.
			 *
			 * We have to maintain the invariant that segments before the last
			 * active segment are of size RELSEG_SIZE; therefore, pad them out
			 * with zeroes if needed.  (This only matters if caller is
			 * extending the relation discontiguously, but that can happen in
			 * hash indexes.)
			 */
			if (behavior == EXTENSION_CREATE || InRecovery)
			{
				if (_mdnblocks(reln, forknum, v) < RELSEG_SIZE)
				{
					char	   *zerobuf = palloc0(BLCKSZ);

					/* writing the last block of segment v pads it to full size */
					mdextend(reln, forknum,
							 nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
							 zerobuf, skipFsync);
					pfree(zerobuf);
				}
				/* NOTE(review): the unary '+' before nextsegno is a no-op,
				 * apparently a historical typo; harmless but odd. */
				v->mdfd_chain = _mdfd_openseg(reln, forknum, +nextsegno, O_CREAT);
			}
			else
			{
				/* We won't create segment if not existent */
				v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, 0);
			}
			if (v->mdfd_chain == NULL)
			{
				/* errno was set by PathNameOpenFile inside _mdfd_openseg */
				if (behavior == EXTENSION_RETURN_NULL &&
					FILE_POSSIBLY_DELETED(errno))
					return NULL;
				/* _mdfd_segpath result is not freed here; ereport(ERROR)
				 * does not return and context cleanup reclaims it */
				ereport(ERROR,
						(errcode_for_file_access(),
				   errmsg("could not open file \"%s\" (target block %u): %m",
						  _mdfd_segpath(reln, forknum, nextsegno),
						  blkno)));
			}
		}
		v = v->mdfd_chain;
	}
	return v;
}
1655
1656 /*
1657  * Get number of blocks present in a single disk file
1658  */
1659 static BlockNumber
1660 _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
1661 {
1662         off_t           len;
1663
1664         len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
1665         if (len < 0)
1666                 ereport(ERROR,
1667                                 (errcode_for_file_access(),
1668                                  errmsg("could not seek to end of file \"%s\": %m",
1669                                                 FilePathName(seg->mdfd_vfd))));
1670         /* note that this calculation will ignore any partial block at EOF */
1671         return (BlockNumber) (len / BLCKSZ);
1672 }