OSDN Git Service

Use "transient" files for blind writes, take 2
[pg-rex/syncrep.git] / src / backend / storage / smgr / md.c
1 /*-------------------------------------------------------------------------
2  *
3  * md.c
4  *        This code manages relations that reside on magnetic disk.
5  *
6  * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *        src/backend/storage/smgr/md.c
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16
17 #include <unistd.h>
18 #include <fcntl.h>
19 #include <sys/file.h>
20
21 #include "catalog/catalog.h"
22 #include "miscadmin.h"
23 #include "portability/instr_time.h"
24 #include "postmaster/bgwriter.h"
25 #include "storage/fd.h"
26 #include "storage/bufmgr.h"
27 #include "storage/relfilenode.h"
28 #include "storage/smgr.h"
29 #include "utils/hsearch.h"
30 #include "utils/memutils.h"
31 #include "pg_trace.h"
32
33
/* interval for calling AbsorbFsyncRequests in mdsync */
#define FSYNCS_PER_ABSORB               10

/*
 * Special values for the segno arg to RememberFsyncRequest.
 *
 * These are "impossible" segment numbers, counting down from
 * InvalidBlockNumber, so they cannot collide with any real segment number.
 *
 * Note that CompactBgwriterRequestQueue assumes that it's OK to remove an
 * fsync request from the queue if an identical, subsequent request is found.
 * See comments there before making changes here.
 */
#define FORGET_RELATION_FSYNC   (InvalidBlockNumber)
#define FORGET_DATABASE_FSYNC   (InvalidBlockNumber-1)
#define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
47
/*
 * On Windows, we have to interpret EACCES as possibly meaning the same as
 * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
 * that's what you get.  Ugh.  This code is designed so that we don't
 * actually believe these cases are okay without further evidence (namely,
 * a pending fsync request getting revoked ... see mdsync).
 *
 * The argument is an errno value saved from a failed file operation.
 */
#ifndef WIN32
#define FILE_POSSIBLY_DELETED(err)      ((err) == ENOENT)
#else
#define FILE_POSSIBLY_DELETED(err)      ((err) == ENOENT || (err) == EACCES)
#endif
60
61 /*
62  *      The magnetic disk storage manager keeps track of open file
63  *      descriptors in its own descriptor pool.  This is done to make it
64  *      easier to support relations that are larger than the operating
65  *      system's file size limit (often 2GBytes).  In order to do that,
66  *      we break relations up into "segment" files that are each shorter than
67  *      the OS file size limit.  The segment size is set by the RELSEG_SIZE
68  *      configuration constant in pg_config.h.
69  *
70  *      On disk, a relation must consist of consecutively numbered segment
71  *      files in the pattern
72  *              -- Zero or more full segments of exactly RELSEG_SIZE blocks each
73  *              -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
74  *              -- Optionally, any number of inactive segments of size 0 blocks.
75  *      The full and partial segments are collectively the "active" segments.
76  *      Inactive segments are those that once contained data but are currently
77  *      not needed because of an mdtruncate() operation.  The reason for leaving
78  *      them present at size zero, rather than unlinking them, is that other
79  *      backends and/or the bgwriter might be holding open file references to
80  *      such segments.  If the relation expands again after mdtruncate(), such
81  *      that a deactivated segment becomes active again, it is important that
82  *      such file references still be valid --- else data might get written
83  *      out to an unlinked old copy of a segment file that will eventually
84  *      disappear.
85  *
86  *      The file descriptor pointer (md_fd field) stored in the SMgrRelation
87  *      cache is, therefore, just the head of a list of MdfdVec objects, one
88  *      per segment.  But note the md_fd pointer can be NULL, indicating
89  *      relation not open.
90  *
91  *      Also note that mdfd_chain == NULL does not necessarily mean the relation
92  *      doesn't have another segment after this one; we may just not have
93  *      opened the next segment yet.  (We could not have "all segments are
94  *      in the chain" as an invariant anyway, since another backend could
95  *      extend the relation when we weren't looking.)  We do not make chain
96  *      entries for inactive segments, however; as soon as we find a partial
97  *      segment, we assume that any subsequent segments are inactive.
98  *
99  *      All MdfdVec objects are palloc'd in the MdCxt memory context.
100  */
101
/*
 * One open segment file of a relation fork.  Segments are chained in
 * increasing segment-number order, headed by the SMgrRelation's md_fd
 * pointer (see the narrative comment above).
 */
typedef struct _MdfdVec
{
	File		mdfd_vfd;		/* fd number in fd.c's pool */
	BlockNumber mdfd_segno;		/* segment number, from 0 */
	struct _MdfdVec *mdfd_chain;	/* next segment, or NULL */
} MdfdVec;

static MemoryContext MdCxt;		/* context for all md.c allocations */
110
111
112 /*
113  * In some contexts (currently, standalone backends and the bgwriter process)
114  * we keep track of pending fsync operations: we need to remember all relation
115  * segments that have been written since the last checkpoint, so that we can
116  * fsync them down to disk before completing the next checkpoint.  This hash
117  * table remembers the pending operations.      We use a hash table mostly as
118  * a convenient way of eliminating duplicate requests.
119  *
120  * We use a similar mechanism to remember no-longer-needed files that can
121  * be deleted after the next checkpoint, but we use a linked list instead of
122  * a hash table, because we don't expect there to be any duplicate requests.
123  *
124  * (Regular backends do not track pending operations locally, but forward
125  * them to the bgwriter.)
126  */
/* Hash key identifying one relation segment that needs an fsync. */
typedef struct
{
	RelFileNodeBackend rnode;	/* the targeted relation */
	ForkNumber	forknum;		/* which fork of the relation */
	BlockNumber segno;			/* which segment */
} PendingOperationTag;

typedef uint16 CycleCtr;		/* can be any convenient integer size */

typedef struct
{
	PendingOperationTag tag;	/* hash table key (must be first!) */
	bool		canceled;		/* T => request canceled, not yet removed */
	CycleCtr	cycle_ctr;		/* mdsync_cycle_ctr when request was made */
} PendingOperationEntry;

typedef struct
{
	RelFileNodeBackend rnode;	/* the dead relation to delete */
	CycleCtr	cycle_ctr;		/* mdckpt_cycle_ctr when request was made */
} PendingUnlinkEntry;

/* pending fsyncs, keyed by PendingOperationTag; NULL when requests are forwarded */
static HTAB *pendingOpsTable = NULL;
/* PendingUnlinkEntry list: files to remove after the next checkpoint */
static List *pendingUnlinks = NIL;

static CycleCtr mdsync_cycle_ctr = 0;	/* stamped into fsync requests; see mdsync */
static CycleCtr mdckpt_cycle_ctr = 0;	/* stamped into unlink requests */
154
155
/*
 * What to do when a requested segment file does not exist on disk.
 */
typedef enum					/* behavior for mdopen & _mdfd_getseg */
{
	EXTENSION_FAIL,				/* ereport if segment not present */
	EXTENSION_RETURN_NULL,		/* return NULL if not present */
	EXTENSION_CREATE			/* create new segments as needed */
} ExtensionBehavior;
162
163 /* local routines */
164 static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum,
165            ExtensionBehavior behavior);
166 static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
167                                            MdfdVec *seg);
168 static void register_unlink(RelFileNodeBackend rnode);
169 static MdfdVec *_fdvec_alloc(void);
170 static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
171                           BlockNumber segno);
172 static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno,
173                           BlockNumber segno, int oflags);
174 static MdfdVec *_mdfd_getseg(SMgrRelation reln, ForkNumber forkno,
175                          BlockNumber blkno, bool skipFsync, ExtensionBehavior behavior);
176 static BlockNumber _mdnblocks(SMgrRelation reln, ForkNumber forknum,
177                    MdfdVec *seg);
178
179
180 /*
181  *      mdinit() -- Initialize private state for magnetic disk storage manager.
182  */
183 void
184 mdinit(void)
185 {
186         MdCxt = AllocSetContextCreate(TopMemoryContext,
187                                                                   "MdSmgr",
188                                                                   ALLOCSET_DEFAULT_MINSIZE,
189                                                                   ALLOCSET_DEFAULT_INITSIZE,
190                                                                   ALLOCSET_DEFAULT_MAXSIZE);
191
192         /*
193          * Create pending-operations hashtable if we need it.  Currently, we need
194          * it if we are standalone (not under a postmaster) OR if we are a
195          * bootstrap-mode subprocess of a postmaster (that is, a startup or
196          * bgwriter process).
197          */
198         if (!IsUnderPostmaster || IsBootstrapProcessingMode())
199         {
200                 HASHCTL         hash_ctl;
201
202                 MemSet(&hash_ctl, 0, sizeof(hash_ctl));
203                 hash_ctl.keysize = sizeof(PendingOperationTag);
204                 hash_ctl.entrysize = sizeof(PendingOperationEntry);
205                 hash_ctl.hash = tag_hash;
206                 hash_ctl.hcxt = MdCxt;
207                 pendingOpsTable = hash_create("Pending Ops Table",
208                                                                           100L,
209                                                                           &hash_ctl,
210                                                                    HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
211                 pendingUnlinks = NIL;
212         }
213 }
214
215 /*
216  * In archive recovery, we rely on bgwriter to do fsyncs, but we will have
217  * already created the pendingOpsTable during initialization of the startup
218  * process.  Calling this function drops the local pendingOpsTable so that
219  * subsequent requests will be forwarded to bgwriter.
220  */
221 void
222 SetForwardFsyncRequests(void)
223 {
224         /* Perform any pending ops we may have queued up */
225         if (pendingOpsTable)
226                 mdsync();
227         pendingOpsTable = NULL;
228 }
229
230 /*
231  *      mdexists() -- Does the physical file exist?
232  *
233  * Note: this will return true for lingering files, with pending deletions
234  */
235 bool
236 mdexists(SMgrRelation reln, ForkNumber forkNum)
237 {
238         /*
239          * Close it first, to ensure that we notice if the fork has been unlinked
240          * since we opened it.
241          */
242         mdclose(reln, forkNum);
243
244         return (mdopen(reln, forkNum, EXTENSION_RETURN_NULL) != NULL);
245 }
246
/*
 *	mdcreate() -- Create a new relation on magnetic disk.
 *
 * Creates the first segment file for the given fork and leaves it open in
 * reln->md_fd[forkNum].
 *
 * If isRedo is true, it's okay for the relation to exist already.
 */
void
mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
{
	char	   *path;
	File		fd;

	if (isRedo && reln->md_fd[forkNum] != NULL)
		return;					/* created and opened already... */

	Assert(reln->md_fd[forkNum] == NULL);

	path = relpath(reln->smgr_rnode, forkNum);

	/* O_EXCL: in the normal case a pre-existing file is an error */
	fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);

	if (fd < 0)
	{
		int			save_errno = errno;

		/*
		 * During bootstrap, there are cases where a system relation will be
		 * accessed (by internal backend processes) before the bootstrap
		 * script nominally creates it.  Therefore, allow the file to exist
		 * already, even if isRedo is not set.  (See also mdopen)
		 */
		if (isRedo || IsBootstrapProcessingMode())
			fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
		if (fd < 0)
		{
			/* be sure to report the error reported by create, not open */
			errno = save_errno;
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not create file \"%s\": %m", path)));
		}
	}

	pfree(path);

	/* propagate the relation's transient marking to the new file */
	if (reln->smgr_transient)
		FileSetTransient(fd);

	reln->md_fd[forkNum] = _fdvec_alloc();

	reln->md_fd[forkNum]->mdfd_vfd = fd;
	reln->md_fd[forkNum]->mdfd_segno = 0;
	reln->md_fd[forkNum]->mdfd_chain = NULL;
}
300
/*
 *	mdunlink() -- Unlink a relation.
 *
 * Note that we're passed a RelFileNode --- by the time this is called,
 * there won't be an SMgrRelation hashtable entry anymore.
 *
 * Actually, we don't unlink the first segment file of the relation, but
 * just truncate it to zero length, and record a request to unlink it after
 * the next checkpoint.  Additional segments can be unlinked immediately,
 * however.  Leaving the empty file in place prevents that relfilenode
 * number from being reused.  The scenario this protects us from is:
 * 1. We delete a relation (and commit, and actually remove its file).
 * 2. We create a new relation, which by chance gets the same relfilenode as
 *	  the just-deleted one (OIDs must've wrapped around for that to happen).
 * 3. We crash before another checkpoint occurs.
 * During replay, we would delete the file and then recreate it, which is fine
 * if the contents of the file were repopulated by subsequent WAL entries.
 * But if we didn't WAL-log insertions, but instead relied on fsyncing the
 * file after populating it (as for instance CLUSTER and CREATE INDEX do),
 * the contents of the file would be lost forever.  By leaving the empty file
 * until after the next checkpoint, we prevent reassignment of the relfilenode
 * number until it's safe, because relfilenode assignment skips over any
 * existing file.
 *
 * If isRedo is true, it's okay for the relation to be already gone.
 * Also, we should remove the file immediately instead of queuing a request
 * for later, since during redo there's no possibility of creating a
 * conflicting relation.
 *
 * Note: any failure should be reported as WARNING not ERROR, because
 * we are usually not in a transaction anymore when this is called.
 */
void
mdunlink(RelFileNodeBackend rnode, ForkNumber forkNum, bool isRedo)
{
	char	   *path;
	int			ret;

	/*
	 * We have to clean out any pending fsync requests for the doomed
	 * relation, else the next mdsync() will fail.
	 */
	ForgetRelationFsyncRequests(rnode, forkNum);

	path = relpath(rnode, forkNum);

	/*
	 * Delete or truncate the first segment.  Only a main-fork, non-redo
	 * unlink gets the truncate-now-delete-later treatment described above;
	 * everything else is removed immediately.
	 */
	if (isRedo || forkNum != MAIN_FORKNUM)
	{
		ret = unlink(path);
		if (ret < 0)
		{
			/* during redo a missing file is expected, not warning-worthy */
			if (!isRedo || errno != ENOENT)
				ereport(WARNING,
						(errcode_for_file_access(),
						 errmsg("could not remove file \"%s\": %m", path)));
		}
	}
	else
	{
		/* truncate(2) would be easier here, but Windows hasn't got it */
		int			fd;

		fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
		if (fd >= 0)
		{
			int			save_errno;

			ret = ftruncate(fd, 0);
			save_errno = errno; /* close() might clobber ftruncate's errno */
			close(fd);
			errno = save_errno;
		}
		else
			ret = -1;
		if (ret < 0 && errno != ENOENT)
			ereport(WARNING,
					(errcode_for_file_access(),
					 errmsg("could not truncate file \"%s\": %m", path)));
	}

	/*
	 * Delete any additional segments.  (Skipped if we failed on the first
	 * segment above.)
	 */
	if (ret >= 0)
	{
		char	   *segpath = (char *) palloc(strlen(path) + 12);
		BlockNumber segno;

		/*
		 * Note that because we loop until getting ENOENT, we will correctly
		 * remove all inactive segments as well as active ones.
		 */
		for (segno = 1;; segno++)
		{
			sprintf(segpath, "%s.%u", path, segno);
			if (unlink(segpath) < 0)
			{
				/* ENOENT is expected after the last segment... */
				if (errno != ENOENT)
					ereport(WARNING,
							(errcode_for_file_access(),
							 errmsg("could not remove file \"%s\": %m", segpath)));
				break;
			}
		}
		pfree(segpath);
	}

	pfree(path);

	/* Register request to unlink first segment later */
	if (!isRedo && forkNum == MAIN_FORKNUM)
		register_unlink(rnode);
}
418
/*
 *	mdextend() -- Add a block to the specified relation.
 *
 *		The semantics are nearly the same as mdwrite(): write at the
 *		specified position.  However, this is to be used for the case of
 *		extending a relation (i.e., blocknum is at or beyond the current
 *		EOF).  Note that we assume writing a block beyond current EOF
 *		causes intervening file space to become filled with zeroes.
 */
void
mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
		 char *buffer, bool skipFsync)
{
	off_t		seekpos;
	int			nbytes;
	MdfdVec    *v;

	/* This assert is too expensive to have on normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
	Assert(blocknum >= mdnblocks(reln, forknum));
#endif

	/*
	 * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
	 * more --- we mustn't create a block whose number actually is
	 * InvalidBlockNumber.
	 */
	if (blocknum == InvalidBlockNumber)
		ereport(ERROR,
				(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
				 errmsg("cannot extend file \"%s\" beyond %u blocks",
						relpath(reln->smgr_rnode, forknum),
						InvalidBlockNumber)));

	v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_CREATE);

	/* byte offset of the block within its RELSEG_SIZE-block segment file */
	seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));

	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

	/*
	 * Note: because caller usually obtained blocknum by calling mdnblocks,
	 * which did a seek(SEEK_END), this seek is often redundant and will be
	 * optimized away by fd.c.	It's not redundant, however, if there is a
	 * partial page at the end of the file. In that case we want to try to
	 * overwrite the partial page with a full page.  It's also not redundant
	 * if bufmgr.c had to dump another buffer of the same file to make room
	 * for the new page's buffer.
	 */
	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek to block %u in file \"%s\": %m",
						blocknum, FilePathName(v->mdfd_vfd))));

	if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
	{
		if (nbytes < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not extend file \"%s\": %m",
							FilePathName(v->mdfd_vfd)),
					 errhint("Check free disk space.")));
		/* short write: complain appropriately */
		ereport(ERROR,
				(errcode(ERRCODE_DISK_FULL),
				 errmsg("could not extend file \"%s\": wrote only %d of %d bytes at block %u",
						FilePathName(v->mdfd_vfd),
						nbytes, BLCKSZ, blocknum),
				 errhint("Check free disk space.")));
	}

	/* schedule an fsync unless the caller opted out or the rel is temp */
	if (!skipFsync && !SmgrIsTemp(reln))
		register_dirty_segment(reln, forknum, v);

	Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
}
496
/*
 *	mdopen() -- Open the specified relation.
 *
 * Note we only open the first segment, when there are multiple segments.
 *
 * If first segment is not present, either ereport or return NULL according
 * to "behavior".  We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
 * EXTENSION_CREATE means it's OK to extend an existing relation, not to
 * invent one out of whole cloth.
 */
static MdfdVec *
mdopen(SMgrRelation reln, ForkNumber forknum, ExtensionBehavior behavior)
{
	MdfdVec    *mdfd;
	char	   *path;
	File		fd;

	/* No work if already open */
	if (reln->md_fd[forknum])
		return reln->md_fd[forknum];

	path = relpath(reln->smgr_rnode, forknum);

	fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);

	if (fd < 0)
	{
		/*
		 * During bootstrap, there are cases where a system relation will be
		 * accessed (by internal backend processes) before the bootstrap
		 * script nominally creates it.  Therefore, accept mdopen() as a
		 * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
		 */
		if (IsBootstrapProcessingMode())
			fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
		if (fd < 0)
		{
			/* a possibly-deleted file is OK only for EXTENSION_RETURN_NULL */
			if (behavior == EXTENSION_RETURN_NULL &&
				FILE_POSSIBLY_DELETED(errno))
			{
				pfree(path);
				return NULL;
			}
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not open file \"%s\": %m", path)));
		}
	}

	pfree(path);

	/* propagate the relation's transient marking to the opened file */
	if (reln->smgr_transient)
		FileSetTransient(fd);

	reln->md_fd[forknum] = mdfd = _fdvec_alloc();

	mdfd->mdfd_vfd = fd;
	mdfd->mdfd_segno = 0;
	mdfd->mdfd_chain = NULL;
	Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));

	return mdfd;
}
560
561 /*
562  *      mdclose() -- Close the specified relation, if it isn't closed already.
563  */
564 void
565 mdclose(SMgrRelation reln, ForkNumber forknum)
566 {
567         MdfdVec    *v = reln->md_fd[forknum];
568
569         /* No work if already closed */
570         if (v == NULL)
571                 return;
572
573         reln->md_fd[forknum] = NULL;    /* prevent dangling pointer after error */
574
575         while (v != NULL)
576         {
577                 MdfdVec    *ov = v;
578
579                 /* if not closed already */
580                 if (v->mdfd_vfd >= 0)
581                         FileClose(v->mdfd_vfd);
582                 /* Now free vector */
583                 v = v->mdfd_chain;
584                 pfree(ov);
585         }
586 }
587
588 /*
589  *      mdprefetch() -- Initiate asynchronous read of the specified block of a relation
590  */
591 void
592 mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
593 {
594 #ifdef USE_PREFETCH
595         off_t           seekpos;
596         MdfdVec    *v;
597
598         v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);
599
600         seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));
601
602         Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
603
604         (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ);
605 #endif   /* USE_PREFETCH */
606 }
607
608
/*
 *	mdread() -- Read the specified block from a relation.
 *
 * Reads exactly BLCKSZ bytes into "buffer".  A short read is normally an
 * error, but may be converted to a zero-filled page (see below).
 */
void
mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
	   char *buffer)
{
	off_t		seekpos;
	int			nbytes;
	MdfdVec    *v;

	TRACE_POSTGRESQL_SMGR_MD_READ_START(forknum, blocknum,
										reln->smgr_rnode.node.spcNode,
										reln->smgr_rnode.node.dbNode,
										reln->smgr_rnode.node.relNode,
										reln->smgr_rnode.backend);

	v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL);

	/* byte offset of the block within its segment file */
	seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));

	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek to block %u in file \"%s\": %m",
						blocknum, FilePathName(v->mdfd_vfd))));

	nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ);

	TRACE_POSTGRESQL_SMGR_MD_READ_DONE(forknum, blocknum,
									   reln->smgr_rnode.node.spcNode,
									   reln->smgr_rnode.node.dbNode,
									   reln->smgr_rnode.node.relNode,
									   reln->smgr_rnode.backend,
									   nbytes,
									   BLCKSZ);

	if (nbytes != BLCKSZ)
	{
		if (nbytes < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not read block %u in file \"%s\": %m",
							blocknum, FilePathName(v->mdfd_vfd))));

		/*
		 * Short read: we are at or past EOF, or we read a partial block at
		 * EOF.  Normally this is an error; upper levels should never try to
		 * read a nonexistent block.  However, if zero_damaged_pages is ON or
		 * we are InRecovery, we should instead return zeroes without
		 * complaining.  This allows, for example, the case of trying to
		 * update a block that was later truncated away.
		 */
		if (zero_damaged_pages || InRecovery)
			MemSet(buffer, 0, BLCKSZ);
		else
			ereport(ERROR,
					(errcode(ERRCODE_DATA_CORRUPTED),
					 errmsg("could not read block %u in file \"%s\": read only %d of %d bytes",
							blocknum, FilePathName(v->mdfd_vfd),
							nbytes, BLCKSZ)));
	}
}
674
/*
 *	mdwrite() -- Write the supplied block at the appropriate location.
 *
 *		This is to be used only for updating already-existing blocks of a
 *		relation (ie, those before the current EOF).  To extend a relation,
 *		use mdextend().
 */
void
mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
		char *buffer, bool skipFsync)
{
	off_t		seekpos;
	int			nbytes;
	MdfdVec    *v;

	/* This assert is too expensive to have on normally ... */
#ifdef CHECK_WRITE_VS_EXTEND
	Assert(blocknum < mdnblocks(reln, forknum));
#endif

	TRACE_POSTGRESQL_SMGR_MD_WRITE_START(forknum, blocknum,
										 reln->smgr_rnode.node.spcNode,
										 reln->smgr_rnode.node.dbNode,
										 reln->smgr_rnode.node.relNode,
										 reln->smgr_rnode.backend);

	v = _mdfd_getseg(reln, forknum, blocknum, skipFsync, EXTENSION_FAIL);

	/* byte offset of the block within its segment file */
	seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE));

	Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);

	if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not seek to block %u in file \"%s\": %m",
						blocknum, FilePathName(v->mdfd_vfd))));

	nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ);

	TRACE_POSTGRESQL_SMGR_MD_WRITE_DONE(forknum, blocknum,
										reln->smgr_rnode.node.spcNode,
										reln->smgr_rnode.node.dbNode,
										reln->smgr_rnode.node.relNode,
										reln->smgr_rnode.backend,
										nbytes,
										BLCKSZ);

	if (nbytes != BLCKSZ)
	{
		if (nbytes < 0)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write block %u in file \"%s\": %m",
							blocknum, FilePathName(v->mdfd_vfd))));
		/* short write: complain appropriately */
		ereport(ERROR,
				(errcode(ERRCODE_DISK_FULL),
				 errmsg("could not write block %u in file \"%s\": wrote only %d of %d bytes",
						blocknum,
						FilePathName(v->mdfd_vfd),
						nbytes, BLCKSZ),
				 errhint("Check free disk space.")));
	}

	/* schedule an fsync unless the caller opted out or the rel is temp */
	if (!skipFsync && !SmgrIsTemp(reln))
		register_dirty_segment(reln, forknum, v);
}
743
744 /*
745  *      mdnblocks() -- Get the number of blocks stored in a relation.
746  *
747  *              Important side effect: all active segments of the relation are opened
748  *              and added to the mdfd_chain list.  If this routine has not been
749  *              called, then only segments up to the last one actually touched
750  *              are present in the chain.
751  */
BlockNumber
mdnblocks(SMgrRelation reln, ForkNumber forknum)
{
	MdfdVec    *v = mdopen(reln, forknum, EXTENSION_FAIL);
	BlockNumber nblocks;
	BlockNumber segno = 0;

	/*
	 * Skip through any segments that aren't the last one, to avoid redundant
	 * seeks on them.  We have previously verified that these segments are
	 * exactly RELSEG_SIZE long, and it's useless to recheck that each time.
	 *
	 * NOTE: this assumption could only be wrong if another backend has
	 * truncated the relation.  We rely on higher code levels to handle that
	 * scenario by closing and re-opening the md fd, which is handled via
	 * relcache flush.  (Since the bgwriter doesn't participate in relcache
	 * flush, it could have segment chain entries for inactive segments;
	 * that's OK because the bgwriter never needs to compute relation size.)
	 */
	while (v->mdfd_chain != NULL)
	{
		segno++;
		v = v->mdfd_chain;
	}

	/*
	 * Starting from the last already-opened segment, probe each segment's
	 * length until we find one shorter than RELSEG_SIZE; that one is the
	 * true last segment, and the total size follows from segno.
	 */
	for (;;)
	{
		nblocks = _mdnblocks(reln, forknum, v);
		if (nblocks > ((BlockNumber) RELSEG_SIZE))
			elog(FATAL, "segment too big");
		if (nblocks < ((BlockNumber) RELSEG_SIZE))
			return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;

		/*
		 * If segment is exactly RELSEG_SIZE, advance to next one.
		 */
		segno++;

		if (v->mdfd_chain == NULL)
		{
			/*
			 * Because we pass O_CREAT, we will create the next segment (with
			 * zero length) immediately, if the last segment is of length
			 * RELSEG_SIZE.  While perhaps not strictly necessary, this keeps
			 * the logic simple.
			 */
			v->mdfd_chain = _mdfd_openseg(reln, forknum, segno, O_CREAT);
			if (v->mdfd_chain == NULL)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not open file \"%s\": %m",
								_mdfd_segpath(reln, forknum, segno))));
		}

		v = v->mdfd_chain;
	}
}
809
810 /*
811  *      mdtruncate() -- Truncate relation to specified number of blocks.
812  */
void
mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
{
	MdfdVec    *v;
	BlockNumber curnblk;
	BlockNumber priorblocks;	/* blocks accounted for by prior segments */

	/*
	 * NOTE: mdnblocks makes sure we have opened all active segments, so that
	 * truncation loop will get them all!
	 */
	curnblk = mdnblocks(reln, forknum);
	if (nblocks > curnblk)
	{
		/* Bogus request ... but no complaint if InRecovery */
		if (InRecovery)
			return;
		ereport(ERROR,
				(errmsg("could not truncate file \"%s\" to %u blocks: it's only %u blocks now",
						relpath(reln->smgr_rnode, forknum),
						nblocks, curnblk)));
	}
	if (nblocks == curnblk)
		return;					/* no work */

	v = mdopen(reln, forknum, EXTENSION_FAIL);

	/*
	 * Walk the segment chain; for each segment decide whether it lies wholly
	 * past the truncation point (truncate to zero length and drop the chain
	 * entry), straddles it (truncate to the partial length and cut the
	 * chain), or lies wholly before it (keep as-is).
	 */
	priorblocks = 0;
	while (v != NULL)
	{
		MdfdVec    *ov = v;

		if (priorblocks > nblocks)
		{
			/*
			 * This segment is no longer active (and has already been unlinked
			 * from the mdfd_chain). We truncate the file, but do not delete
			 * it, for reasons explained in the header comments.
			 */
			if (FileTruncate(v->mdfd_vfd, 0) < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not truncate file \"%s\": %m",
								FilePathName(v->mdfd_vfd))));

			if (!SmgrIsTemp(reln))
				register_dirty_segment(reln, forknum, v);
			v = v->mdfd_chain;
			Assert(ov != reln->md_fd[forknum]); /* we never drop the 1st
												 * segment */
			pfree(ov);
		}
		else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
		{
			/*
			 * This is the last segment we want to keep. Truncate the file to
			 * the right length, and clear chain link that points to any
			 * remaining segments (which we shall zap). NOTE: if nblocks is
			 * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
			 * segment to 0 length but keep it. This adheres to the invariant
			 * given in the header comments.
			 */
			BlockNumber lastsegblocks = nblocks - priorblocks;

			if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ) < 0)
				ereport(ERROR,
						(errcode_for_file_access(),
					errmsg("could not truncate file \"%s\" to %u blocks: %m",
						   FilePathName(v->mdfd_vfd),
						   nblocks)));
			if (!SmgrIsTemp(reln))
				register_dirty_segment(reln, forknum, v);
			v = v->mdfd_chain;
			ov->mdfd_chain = NULL;
		}
		else
		{
			/*
			 * We still need this segment and 0 or more blocks beyond it, so
			 * nothing to do here.
			 */
			v = v->mdfd_chain;
		}
		priorblocks += RELSEG_SIZE;
	}
}
899
900 /*
901  *      mdimmedsync() -- Immediately sync a relation to stable storage.
902  *
903  * Note that only writes already issued are synced; this routine knows
904  * nothing of dirty buffers that may exist inside the buffer manager.
905  */
906 void
907 mdimmedsync(SMgrRelation reln, ForkNumber forknum)
908 {
909         MdfdVec    *v;
910
911         /*
912          * NOTE: mdnblocks makes sure we have opened all active segments, so that
913          * fsync loop will get them all!
914          */
915         mdnblocks(reln, forknum);
916
917         v = mdopen(reln, forknum, EXTENSION_FAIL);
918
919         while (v != NULL)
920         {
921                 if (FileSync(v->mdfd_vfd) < 0)
922                         ereport(ERROR,
923                                         (errcode_for_file_access(),
924                                          errmsg("could not fsync file \"%s\": %m",
925                                                         FilePathName(v->mdfd_vfd))));
926                 v = v->mdfd_chain;
927         }
928 }
929
930 /*
931  *      mdsync() -- Sync previous writes to stable storage.
932  */
void
mdsync(void)
{
	/* true while a sync pass is underway; stays true if we error out */
	static bool mdsync_in_progress = false;

	HASH_SEQ_STATUS hstat;
	PendingOperationEntry *entry;
	int			absorb_counter;

	/* Statistics on sync times */
	int			processed = 0;
	instr_time	sync_start,
				sync_end,
				sync_diff;
	uint64		elapsed;
	uint64		longest = 0;
	uint64		total_elapsed = 0;

	/*
	 * This is only called during checkpoints, and checkpoints should only
	 * occur in processes that have created a pendingOpsTable.
	 */
	if (!pendingOpsTable)
		elog(ERROR, "cannot sync without a pendingOpsTable");

	/*
	 * If we are in the bgwriter, the sync had better include all fsync
	 * requests that were queued by backends up to this point.  The tightest
	 * race condition that could occur is that a buffer that must be written
	 * and fsync'd for the checkpoint could have been dumped by a backend just
	 * before it was visited by BufferSync().  We know the backend will have
	 * queued an fsync request before clearing the buffer's dirtybit, so we
	 * are safe as long as we do an Absorb after completing BufferSync().
	 */
	AbsorbFsyncRequests();

	/*
	 * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
	 * checkpoint), we want to ignore fsync requests that are entered into the
	 * hashtable after this point --- they should be processed next time,
	 * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
	 * ones: new ones will have cycle_ctr equal to the incremented value of
	 * mdsync_cycle_ctr.
	 *
	 * In normal circumstances, all entries present in the table at this point
	 * will have cycle_ctr exactly equal to the current (about to be old)
	 * value of mdsync_cycle_ctr.  However, if we fail partway through the
	 * fsync'ing loop, then older values of cycle_ctr might remain when we
	 * come back here to try again.  Repeated checkpoint failures would
	 * eventually wrap the counter around to the point where an old entry
	 * might appear new, causing us to skip it, possibly allowing a checkpoint
	 * to succeed that should not have.  To forestall wraparound, any time the
	 * previous mdsync() failed to complete, run through the table and
	 * forcibly set cycle_ctr = mdsync_cycle_ctr.
	 *
	 * Think not to merge this loop with the main loop, as the problem is
	 * exactly that that loop may fail before having visited all the entries.
	 * From a performance point of view it doesn't matter anyway, as this path
	 * will never be taken in a system that's functioning normally.
	 */
	if (mdsync_in_progress)
	{
		/* prior try failed, so update any stale cycle_ctr values */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			entry->cycle_ctr = mdsync_cycle_ctr;
		}
	}

	/* Advance counter so that new hashtable entries are distinguishable */
	mdsync_cycle_ctr++;

	/* Set flag to detect failure if we don't reach the end of the loop */
	mdsync_in_progress = true;

	/* Now scan the hashtable for fsync requests to process */
	absorb_counter = FSYNCS_PER_ABSORB;
	hash_seq_init(&hstat, pendingOpsTable);
	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
	{
		/*
		 * If the entry is new then don't process it this time.  Note that
		 * "continue" bypasses the hash-remove call at the bottom of the loop.
		 */
		if (entry->cycle_ctr == mdsync_cycle_ctr)
			continue;

		/* Else assert we haven't missed it */
		Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);

		/*
		 * If fsync is off then we don't have to bother opening the file at
		 * all.  (We delay checking until this point so that changing fsync on
		 * the fly behaves sensibly.)  Also, if the entry is marked canceled,
		 * fall through to delete it.
		 */
		if (enableFsync && !entry->canceled)
		{
			int			failures;

			/*
			 * If in bgwriter, we want to absorb pending requests every so
			 * often to prevent overflow of the fsync request queue.  It is
			 * unspecified whether newly-added entries will be visited by
			 * hash_seq_search, but we don't care since we don't need to
			 * process them anyway.
			 */
			if (--absorb_counter <= 0)
			{
				AbsorbFsyncRequests();
				absorb_counter = FSYNCS_PER_ABSORB;
			}

			/*
			 * The fsync table could contain requests to fsync segments that
			 * have been deleted (unlinked) by the time we get to them. Rather
			 * than just hoping an ENOENT (or EACCES on Windows) error can be
			 * ignored, what we do on error is absorb pending requests and
			 * then retry.  Since mdunlink() queues a "revoke" message before
			 * actually unlinking, the fsync request is guaranteed to be
			 * marked canceled after the absorb if it really was this case.
			 * DROP DATABASE likewise has to tell us to forget fsync requests
			 * before it starts deletions.
			 */
			for (failures = 0;; failures++)		/* loop exits at "break" */
			{
				SMgrRelation reln;
				MdfdVec    *seg;
				char	   *path;

				/*
				 * Find or create an smgr hash entry for this relation. This
				 * may seem a bit unclean -- md calling smgr?  But it's really
				 * the best solution.  It ensures that the open file reference
				 * isn't permanently leaked if we get an error here. (You may
				 * say "but an unreferenced SMgrRelation is still a leak!" Not
				 * really, because the only case in which a checkpoint is done
				 * by a process that isn't about to shut down is in the
				 * bgwriter, and it will periodically do smgrcloseall(). This
				 * fact justifies our not closing the reln in the success path
				 * either, which is a good thing since in non-bgwriter cases
				 * we couldn't safely do that.)  Furthermore, in many cases
				 * the relation will have been dirtied through this same smgr
				 * relation, and so we can save a file open/close cycle.
				 */
				reln = smgropen(entry->tag.rnode.node,
								entry->tag.rnode.backend);

				/*
				 * It is possible that the relation has been dropped or
				 * truncated since the fsync request was entered.  Therefore,
				 * allow ENOENT, but only if we didn't fail already on this
				 * file.  This applies both during _mdfd_getseg() and during
				 * FileSync, since fd.c might have closed the file behind our
				 * back.
				 */
				seg = _mdfd_getseg(reln, entry->tag.forknum,
						  entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
								   false, EXTENSION_RETURN_NULL);

				/* Time each fsync only when we'll log checkpoint stats. */
				if (log_checkpoints)
					INSTR_TIME_SET_CURRENT(sync_start);
				else
					INSTR_TIME_SET_ZERO(sync_start);

				if (seg != NULL &&
					FileSync(seg->mdfd_vfd) >= 0)
				{
					if (log_checkpoints && (!INSTR_TIME_IS_ZERO(sync_start)))
					{
						INSTR_TIME_SET_CURRENT(sync_end);
						sync_diff = sync_end;
						INSTR_TIME_SUBTRACT(sync_diff, sync_start);
						elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
						if (elapsed > longest)
							longest = elapsed;
						total_elapsed += elapsed;
						processed++;
						elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec",
							 processed, FilePathName(seg->mdfd_vfd), (double) elapsed / 1000);
					}

					break;		/* success; break out of retry loop */
				}

				/*
				 * XXX is there any point in allowing more than one retry?
				 * Don't see one at the moment, but easy to change the test
				 * here if so.
				 */
				path = _mdfd_segpath(reln, entry->tag.forknum,
									 entry->tag.segno);
				if (!FILE_POSSIBLY_DELETED(errno) ||
					failures > 0)
					ereport(ERROR,
							(errcode_for_file_access(),
						   errmsg("could not fsync file \"%s\": %m", path)));
				else
					ereport(DEBUG1,
							(errcode_for_file_access(),
					   errmsg("could not fsync file \"%s\" but retrying: %m",
							  path)));
				pfree(path);

				/*
				 * Absorb incoming requests and check to see if canceled.
				 */
				AbsorbFsyncRequests();
				absorb_counter = FSYNCS_PER_ABSORB;		/* might as well... */

				if (entry->canceled)
					break;
			}					/* end retry loop */
		}

		/*
		 * If we get here, either we fsync'd successfully, or we don't have to
		 * because enableFsync is off, or the entry is (now) marked canceled.
		 * Okay to delete it.
		 */
		if (hash_search(pendingOpsTable, &entry->tag,
						HASH_REMOVE, NULL) == NULL)
			elog(ERROR, "pendingOpsTable corrupted");
	}							/* end loop over hashtable entries */

	/* Return sync performance metrics for report at checkpoint end */
	CheckpointStats.ckpt_sync_rels = processed;
	CheckpointStats.ckpt_longest_sync = longest;
	CheckpointStats.ckpt_agg_sync_time = total_elapsed;

	/* Flag successful completion of mdsync */
	mdsync_in_progress = false;
}
1167
1168 /*
1169  * mdpreckpt() -- Do pre-checkpoint work
1170  *
1171  * To distinguish unlink requests that arrived before this checkpoint
1172  * started from those that arrived during the checkpoint, we use a cycle
1173  * counter similar to the one we use for fsync requests. That cycle
1174  * counter is incremented here.
1175  *
1176  * This must be called *before* the checkpoint REDO point is determined.
1177  * That ensures that we won't delete files too soon.
1178  *
1179  * Note that we can't do anything here that depends on the assumption
1180  * that the checkpoint will be completed.
1181  */
1182 void
1183 mdpreckpt(void)
1184 {
1185         ListCell   *cell;
1186
1187         /*
1188          * In case the prior checkpoint wasn't completed, stamp all entries in the
1189          * list with the current cycle counter.  Anything that's in the list at
1190          * the start of checkpoint can surely be deleted after the checkpoint is
1191          * finished, regardless of when the request was made.
1192          */
1193         foreach(cell, pendingUnlinks)
1194         {
1195                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
1196
1197                 entry->cycle_ctr = mdckpt_cycle_ctr;
1198         }
1199
1200         /*
1201          * Any unlink requests arriving after this point will be assigned the next
1202          * cycle counter, and won't be unlinked until next checkpoint.
1203          */
1204         mdckpt_cycle_ctr++;
1205 }
1206
1207 /*
1208  * mdpostckpt() -- Do post-checkpoint work
1209  *
1210  * Remove any lingering files that can now be safely removed.
1211  */
1212 void
1213 mdpostckpt(void)
1214 {
1215         while (pendingUnlinks != NIL)
1216         {
1217                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
1218                 char       *path;
1219
1220                 /*
1221                  * New entries are appended to the end, so if the entry is new we've
1222                  * reached the end of old entries.
1223                  */
1224                 if (entry->cycle_ctr == mdckpt_cycle_ctr)
1225                         break;
1226
1227                 /* Else assert we haven't missed it */
1228                 Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr);
1229
1230                 /* Unlink the file */
1231                 path = relpath(entry->rnode, MAIN_FORKNUM);
1232                 if (unlink(path) < 0)
1233                 {
1234                         /*
1235                          * There's a race condition, when the database is dropped at the
1236                          * same time that we process the pending unlink requests. If the
1237                          * DROP DATABASE deletes the file before we do, we will get ENOENT
1238                          * here. rmtree() also has to ignore ENOENT errors, to deal with
1239                          * the possibility that we delete the file first.
1240                          */
1241                         if (errno != ENOENT)
1242                                 ereport(WARNING,
1243                                                 (errcode_for_file_access(),
1244                                                  errmsg("could not remove file \"%s\": %m", path)));
1245                 }
1246                 pfree(path);
1247
1248                 pendingUnlinks = list_delete_first(pendingUnlinks);
1249                 pfree(entry);
1250         }
1251 }
1252
1253 /*
1254  * register_dirty_segment() -- Mark a relation segment as needing fsync
1255  *
1256  * If there is a local pending-ops table, just make an entry in it for
1257  * mdsync to process later.  Otherwise, try to pass off the fsync request
1258  * to the background writer process.  If that fails, just do the fsync
1259  * locally before returning (we expect this will not happen often enough
1260  * to be a performance problem).
1261  */
1262 static void
1263 register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
1264 {
1265         if (pendingOpsTable)
1266         {
1267                 /* push it into local pending-ops table */
1268                 RememberFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno);
1269         }
1270         else
1271         {
1272                 if (ForwardFsyncRequest(reln->smgr_rnode, forknum, seg->mdfd_segno))
1273                         return;                         /* passed it off successfully */
1274
1275                 ereport(DEBUG1,
1276                                 (errmsg("could not forward fsync request because request queue is full")));
1277
1278                 if (FileSync(seg->mdfd_vfd) < 0)
1279                         ereport(ERROR,
1280                                         (errcode_for_file_access(),
1281                                          errmsg("could not fsync file \"%s\": %m",
1282                                                         FilePathName(seg->mdfd_vfd))));
1283         }
1284 }
1285
1286 /*
1287  * register_unlink() -- Schedule a file to be deleted after next checkpoint
1288  *
1289  * As with register_dirty_segment, this could involve either a local or
1290  * a remote pending-ops table.
1291  */
1292 static void
1293 register_unlink(RelFileNodeBackend rnode)
1294 {
1295         if (pendingOpsTable)
1296         {
1297                 /* push it into local pending-ops table */
1298                 RememberFsyncRequest(rnode, MAIN_FORKNUM, UNLINK_RELATION_REQUEST);
1299         }
1300         else
1301         {
1302                 /*
1303                  * Notify the bgwriter about it.  If we fail to queue the request
1304                  * message, we have to sleep and try again, because we can't simply
1305                  * delete the file now.  Ugly, but hopefully won't happen often.
1306                  *
1307                  * XXX should we just leave the file orphaned instead?
1308                  */
1309                 Assert(IsUnderPostmaster);
1310                 while (!ForwardFsyncRequest(rnode, MAIN_FORKNUM,
1311                                                                         UNLINK_RELATION_REQUEST))
1312                         pg_usleep(10000L);      /* 10 msec seems a good number */
1313         }
1314 }
1315
/*
 * RememberFsyncRequest() -- callback from bgwriter side of fsync request
 *
 * We stuff most fsync requests into the local hash table for execution
 * during the bgwriter's next checkpoint.  UNLINK requests go into a
 * separate linked list, however, because they get processed separately.
 *
 * The range of possible segment numbers is way less than the range of
 * BlockNumber, so we can reserve high values of segno for special purposes.
 * We define three:
 * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation
 * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
 * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
 *	 checkpoint.
 *
 * (Handling the FORGET_* requests is a tad slow because the hash table has
 * to be searched linearly, but it doesn't seem worth rethinking the table
 * structure for them.)
 */
void
RememberFsyncRequest(RelFileNodeBackend rnode, ForkNumber forknum,
					 BlockNumber segno)
{
	/* Only processes that own fsync state (bgwriter, standalone backend,
	 * startup process) may be called here. */
	Assert(pendingOpsTable);

	if (segno == FORGET_RELATION_FSYNC)
	{
		/* Remove any pending requests for the entire relation */
		HASH_SEQ_STATUS hstat;
		PendingOperationEntry *entry;

		/*
		 * NOTE(review): entries are marked canceled rather than removed from
		 * the hash table — presumably because this can run while another
		 * hash_seq scan (e.g. in mdsync) is in progress, where deleting
		 * arbitrary entries would be unsafe.  Confirm against dynahash rules.
		 */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			/* match both relfilenode and fork to cancel only this relation */
			if (RelFileNodeBackendEquals(entry->tag.rnode, rnode) &&
				entry->tag.forknum == forknum)
			{
				/* Okay, cancel this entry */
				entry->canceled = true;
			}
		}
	}
	else if (segno == FORGET_DATABASE_FSYNC)
	{
		/* Remove any pending requests for the entire database */
		HASH_SEQ_STATUS hstat;
		PendingOperationEntry *entry;
		ListCell   *cell,
				   *prev,
				   *next;

		/* Remove fsync requests: dbNode alone identifies the database */
		hash_seq_init(&hstat, pendingOpsTable);
		while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
		{
			if (entry->tag.rnode.node.dbNode == rnode.node.dbNode)
			{
				/* Okay, cancel this entry */
				entry->canceled = true;
			}
		}

		/*
		 * Remove unlink requests.  Unlike the hash entries above, list cells
		 * CAN be deleted in place; track 'prev' so list_delete_cell can
		 * unlink the current cell, and fetch 'next' before deletion.
		 */
		prev = NULL;
		for (cell = list_head(pendingUnlinks); cell; cell = next)
		{
			PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);

			next = lnext(cell);
			if (entry->rnode.node.dbNode == rnode.node.dbNode)
			{
				pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
				pfree(entry);
			}
			else
				prev = cell;
		}
	}
	else if (segno == UNLINK_RELATION_REQUEST)
	{
		/*
		 * Unlink request: put it in the linked list.  The entry must live
		 * until the next checkpoint, so allocate it in MdCxt rather than
		 * the caller's (possibly short-lived) memory context.
		 */
		MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
		PendingUnlinkEntry *entry;

		entry = palloc(sizeof(PendingUnlinkEntry));
		entry->rnode = rnode;
		/* stamp with the current checkpoint cycle so mdpostckpt knows when
		 * it is safe to actually unlink */
		entry->cycle_ctr = mdckpt_cycle_ctr;

		pendingUnlinks = lappend(pendingUnlinks, entry);

		MemoryContextSwitchTo(oldcxt);
	}
	else
	{
		/* Normal case: enter a request to fsync this segment */
		PendingOperationTag key;
		PendingOperationEntry *entry;
		bool		found;

		/* ensure any pad bytes in the hash key are zeroed */
		MemSet(&key, 0, sizeof(key));
		key.rnode = rnode;
		key.forknum = forknum;
		key.segno = segno;

		entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
													  &key,
													  HASH_ENTER,
													  &found);
		/* if new or previously canceled entry, initialize it */
		if (!found || entry->canceled)
		{
			entry->canceled = false;
			entry->cycle_ctr = mdsync_cycle_ctr;
		}

		/*
		 * NB: it's intentional that we don't change cycle_ctr if the entry
		 * already exists.      The fsync request must be treated as old, even
		 * though the new request will be satisfied too by any subsequent
		 * fsync.
		 *
		 * However, if the entry is present but is marked canceled, we should
		 * act just as though it wasn't there.  The only case where this could
		 * happen would be if a file had been deleted, we received but did not
		 * yet act on the cancel request, and the same relfilenode was then
		 * assigned to a new file.      We mustn't lose the new request, but it
		 * should be considered new not old.
		 */
	}
}
1447
1448 /*
1449  * ForgetRelationFsyncRequests -- forget any fsyncs for a rel
1450  */
1451 void
1452 ForgetRelationFsyncRequests(RelFileNodeBackend rnode, ForkNumber forknum)
1453 {
1454         if (pendingOpsTable)
1455         {
1456                 /* standalone backend or startup process: fsync state is local */
1457                 RememberFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC);
1458         }
1459         else if (IsUnderPostmaster)
1460         {
1461                 /*
1462                  * Notify the bgwriter about it.  If we fail to queue the revoke
1463                  * message, we have to sleep and try again ... ugly, but hopefully
1464                  * won't happen often.
1465                  *
1466                  * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
1467                  * error would leave the no-longer-used file still present on disk,
1468                  * which would be bad, so I'm inclined to assume that the bgwriter
1469                  * will always empty the queue soon.
1470                  */
1471                 while (!ForwardFsyncRequest(rnode, forknum, FORGET_RELATION_FSYNC))
1472                         pg_usleep(10000L);      /* 10 msec seems a good number */
1473
1474                 /*
1475                  * Note we don't wait for the bgwriter to actually absorb the revoke
1476                  * message; see mdsync() for the implications.
1477                  */
1478         }
1479 }
1480
1481 /*
1482  * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB
1483  */
1484 void
1485 ForgetDatabaseFsyncRequests(Oid dbid)
1486 {
1487         RelFileNodeBackend rnode;
1488
1489         rnode.node.dbNode = dbid;
1490         rnode.node.spcNode = 0;
1491         rnode.node.relNode = 0;
1492         rnode.backend = InvalidBackendId;
1493
1494         if (pendingOpsTable)
1495         {
1496                 /* standalone backend or startup process: fsync state is local */
1497                 RememberFsyncRequest(rnode, InvalidForkNumber, FORGET_DATABASE_FSYNC);
1498         }
1499         else if (IsUnderPostmaster)
1500         {
1501                 /* see notes in ForgetRelationFsyncRequests */
1502                 while (!ForwardFsyncRequest(rnode, InvalidForkNumber,
1503                                                                         FORGET_DATABASE_FSYNC))
1504                         pg_usleep(10000L);      /* 10 msec seems a good number */
1505         }
1506 }
1507
1508
1509 /*
1510  *      _fdvec_alloc() -- Make a MdfdVec object.
1511  */
1512 static MdfdVec *
1513 _fdvec_alloc(void)
1514 {
1515         return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
1516 }
1517
1518 /*
1519  * Return the filename for the specified segment of the relation. The
1520  * returned string is palloc'd.
1521  */
1522 static char *
1523 _mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno)
1524 {
1525         char       *path,
1526                            *fullpath;
1527
1528         path = relpath(reln->smgr_rnode, forknum);
1529
1530         if (segno > 0)
1531         {
1532                 /* be sure we have enough space for the '.segno' */
1533                 fullpath = (char *) palloc(strlen(path) + 12);
1534                 sprintf(fullpath, "%s.%u", path, segno);
1535                 pfree(path);
1536         }
1537         else
1538                 fullpath = path;
1539
1540         return fullpath;
1541 }
1542
1543 /*
1544  * Open the specified segment of the relation,
1545  * and make a MdfdVec object for it.  Returns NULL on failure.
1546  */
1547 static MdfdVec *
1548 _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
1549                           int oflags)
1550 {
1551         MdfdVec    *v;
1552         int                     fd;
1553         char       *fullpath;
1554
1555         fullpath = _mdfd_segpath(reln, forknum, segno);
1556
1557         /* open the file */
1558         fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
1559
1560         pfree(fullpath);
1561
1562         if (fd < 0)
1563                 return NULL;
1564
1565         if (reln->smgr_transient)
1566                 FileSetTransient(fd);
1567
1568         /* allocate an mdfdvec entry for it */
1569         v = _fdvec_alloc();
1570
1571         /* fill the entry */
1572         v->mdfd_vfd = fd;
1573         v->mdfd_segno = segno;
1574         v->mdfd_chain = NULL;
1575         Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
1576
1577         /* all done */
1578         return v;
1579 }
1580
/*
 *	_mdfd_getseg() -- Find the segment of the relation holding the
 *		specified block.
 *
 * If the segment doesn't exist, we ereport, return NULL, or create the
 * segment, according to "behavior".  Note: skipFsync is only used in the
 * EXTENSION_CREATE case.
 */
static MdfdVec *
_mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
			 bool skipFsync, ExtensionBehavior behavior)
{
	MdfdVec    *v = mdopen(reln, forknum, behavior);
	BlockNumber targetseg;
	BlockNumber nextsegno;

	if (!v)
		return NULL;			/* only possible if EXTENSION_RETURN_NULL */

	/* Walk (and, if needed, extend) the segment chain up to the target. */
	targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
	for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
	{
		/* chain invariant: segments are linked in consecutive order */
		Assert(nextsegno == v->mdfd_segno + 1);

		if (v->mdfd_chain == NULL)
		{
			/*
			 * Normally we will create new segments only if authorized by the
			 * caller (i.e., we are doing mdextend()).      But when doing WAL
			 * recovery, create segments anyway; this allows cases such as
			 * replaying WAL data that has a write into a high-numbered
			 * segment of a relation that was later deleted.  We want to go
			 * ahead and create the segments so we can finish out the replay.
			 *
			 * We have to maintain the invariant that segments before the last
			 * active segment are of size RELSEG_SIZE; therefore, pad them out
			 * with zeroes if needed.  (This only matters if caller is
			 * extending the relation discontiguously, but that can happen in
			 * hash indexes.)
			 */
			if (behavior == EXTENSION_CREATE || InRecovery)
			{
				if (_mdnblocks(reln, forknum, v) < RELSEG_SIZE)
				{
					char	   *zerobuf = palloc0(BLCKSZ);

					/* writing the last block of segment v pads it to full size */
					mdextend(reln, forknum,
							 nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
							 zerobuf, skipFsync);
					pfree(zerobuf);
				}
				/* NOTE(review): the unary '+' before nextsegno is a no-op,
				 * apparently a historical typo; harmless but odd. */
				v->mdfd_chain = _mdfd_openseg(reln, forknum, +nextsegno, O_CREAT);
			}
			else
			{
				/* We won't create segment if not existent */
				v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, 0);
			}
			if (v->mdfd_chain == NULL)
			{
				/* errno was set by PathNameOpenFile inside _mdfd_openseg */
				if (behavior == EXTENSION_RETURN_NULL &&
					FILE_POSSIBLY_DELETED(errno))
					return NULL;
				/* _mdfd_segpath result is not freed here; ereport(ERROR)
				 * does not return and context cleanup reclaims it */
				ereport(ERROR,
						(errcode_for_file_access(),
				   errmsg("could not open file \"%s\" (target block %u): %m",
						  _mdfd_segpath(reln, forknum, nextsegno),
						  blkno)));
			}
		}
		v = v->mdfd_chain;
	}
	return v;
}
1655
1656 /*
1657  * Get number of blocks present in a single disk file
1658  */
1659 static BlockNumber
1660 _mdnblocks(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg)
1661 {
1662         off_t           len;
1663
1664         len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
1665         if (len < 0)
1666                 ereport(ERROR,
1667                                 (errcode_for_file_access(),
1668                                  errmsg("could not seek to end of file \"%s\": %m",
1669                                                 FilePathName(seg->mdfd_vfd))));
1670         /* note that this calculation will ignore any partial block at EOF */
1671         return (BlockNumber) (len / BLCKSZ);
1672 }