OSDN Git Service

fdc7c7d07261b8e6d0919f835c69331ca42a4861
[pg-rex/syncrep.git] / src / backend / storage / smgr / md.c
1 /*-------------------------------------------------------------------------
2  *
3  * md.c
4  *        This code manages relations that reside on magnetic disk.
5  *
6  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
7  * Portions Copyright (c) 1994, Regents of the University of California
8  *
9  *
10  * IDENTIFICATION
11  *        $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.137 2008/04/18 06:48:38 heikki Exp $
12  *
13  *-------------------------------------------------------------------------
14  */
15 #include "postgres.h"
16
17 #include <unistd.h>
18 #include <fcntl.h>
19 #include <sys/file.h>
20
21 #include "catalog/catalog.h"
22 #include "miscadmin.h"
23 #include "postmaster/bgwriter.h"
24 #include "storage/fd.h"
25 #include "storage/bufmgr.h"
26 #include "storage/smgr.h"
27 #include "utils/hsearch.h"
28 #include "utils/memutils.h"
29
30
/* interval for calling AbsorbFsyncRequests in mdsync */
#define FSYNCS_PER_ABSORB               10

/*
 * Special values for the segno arg to RememberFsyncRequest.
 *
 * They occupy the top three values of the BlockNumber range
 * (InvalidBlockNumber and the two values just below it); presumably no real
 * segment number ever gets that large -- confirm against
 * RememberFsyncRequest before relying on it.
 */
#define FORGET_RELATION_FSYNC   (InvalidBlockNumber)
#define FORGET_DATABASE_FSYNC   (InvalidBlockNumber-1)
#define UNLINK_RELATION_REQUEST (InvalidBlockNumber-2)
38
/*
 * FILE_POSSIBLY_DELETED(err) -- does this errno value suggest that the file
 * has been removed?
 *
 * On Windows, we have to interpret EACCES as possibly meaning the same as
 * ENOENT, because if a file is unlinked-but-not-yet-gone on that platform,
 * that's what you get.  Ugh.  This code is designed so that we don't
 * actually believe these cases are okay without further evidence (namely,
 * a pending fsync request getting revoked ... see mdsync).
 *
 * The argument is an errno value; on Windows it is evaluated twice, so pass
 * an expression without side effects.
 */
#ifndef WIN32
#define FILE_POSSIBLY_DELETED(err)      ((err) == ENOENT)
#else
#define FILE_POSSIBLY_DELETED(err)      ((err) == ENOENT || (err) == EACCES)
#endif
51
52 /*
53  *      The magnetic disk storage manager keeps track of open file
54  *      descriptors in its own descriptor pool.  This is done to make it
55  *      easier to support relations that are larger than the operating
56  *      system's file size limit (often 2GBytes).  In order to do that,
57  *      we break relations up into "segment" files that are each shorter than
58  *      the OS file size limit.  The segment size is set by the RELSEG_SIZE
59  *      configuration constant in pg_config_manual.h.
60  *
61  *      On disk, a relation must consist of consecutively numbered segment
62  *      files in the pattern
63  *              -- Zero or more full segments of exactly RELSEG_SIZE blocks each
64  *              -- Exactly one partial segment of size 0 <= size < RELSEG_SIZE blocks
65  *              -- Optionally, any number of inactive segments of size 0 blocks.
66  *      The full and partial segments are collectively the "active" segments.
67  *      Inactive segments are those that once contained data but are currently
68  *      not needed because of an mdtruncate() operation.  The reason for leaving
69  *      them present at size zero, rather than unlinking them, is that other
70  *      backends and/or the bgwriter might be holding open file references to
71  *      such segments.  If the relation expands again after mdtruncate(), such
72  *      that a deactivated segment becomes active again, it is important that
73  *      such file references still be valid --- else data might get written
74  *      out to an unlinked old copy of a segment file that will eventually
75  *      disappear.
76  *
77  *      The file descriptor pointer (md_fd field) stored in the SMgrRelation
78  *      cache is, therefore, just the head of a list of MdfdVec objects, one
79  *      per segment.  But note the md_fd pointer can be NULL, indicating
80  *      relation not open.
81  *
82  *      Also note that mdfd_chain == NULL does not necessarily mean the relation
83  *      doesn't have another segment after this one; we may just not have
84  *      opened the next segment yet.  (We could not have "all segments are
85  *      in the chain" as an invariant anyway, since another backend could
86  *      extend the relation when we weren't looking.)  We do not make chain
87  *      entries for inactive segments, however; as soon as we find a partial
88  *      segment, we assume that any subsequent segments are inactive.
89  *
90  *      All MdfdVec objects are palloc'd in the MdCxt memory context.
91  *
92  *      On platforms that support large files, USE_SEGMENTED_FILES can be
93  *      #undef'd to disable the segmentation logic.  In that case each
94  *      relation is a single operating-system file.
95  */
96
/*
 * MdfdVec -- one entry in a relation's list of open segment files.
 *
 * The md_fd field of an SMgrRelation points at the entry for segment 0.
 * When USE_SEGMENTED_FILES is not defined there is no chain link, and the
 * relation is represented by this single entry.
 */
typedef struct _MdfdVec
{
        File            mdfd_vfd;               /* fd number in fd.c's pool */
        BlockNumber mdfd_segno;         /* segment number, from 0 */
#ifdef USE_SEGMENTED_FILES
        struct _MdfdVec *mdfd_chain;    /* next segment, or NULL */
#endif
} MdfdVec;
105
/*
 * Memory context for all md.c allocations.  Created by mdinit() as a child
 * of TopMemoryContext.
 */
static MemoryContext MdCxt;             /* context for all md.c allocations */
107
108
109 /*
110  * In some contexts (currently, standalone backends and the bgwriter process)
111  * we keep track of pending fsync operations: we need to remember all relation
112  * segments that have been written since the last checkpoint, so that we can
113  * fsync them down to disk before completing the next checkpoint.  This hash
114  * table remembers the pending operations.      We use a hash table mostly as
115  * a convenient way of eliminating duplicate requests.
116  *
117  * We use a similar mechanism to remember no-longer-needed files that can
118  * be deleted after the next checkpoint, but we use a linked list instead of
119  * a hash table, because we don't expect there to be any duplicate requests.
120  *
121  * (Regular backends do not track pending operations locally, but forward
122  * them to the bgwriter.)
123  */
/*
 * Hash key of the pending-operations table: identifies one segment of one
 * relation.  mdinit() passes sizeof(PendingOperationTag) as the key size
 * and tag_hash as the hash function.
 */
typedef struct
{
        RelFileNode rnode;                      /* the targeted relation */
        BlockNumber segno;                      /* which segment */
} PendingOperationTag;
129
/* Counter type used to stamp requests with the cycle in which they arrived. */
typedef uint16 CycleCtr;                /* can be any convenient integer size */

/*
 * One entry of the pending-operations hash table, i.e. one remembered
 * fsync request.
 */
typedef struct
{
        PendingOperationTag tag;        /* hash table key (must be first!) */
        bool            canceled;               /* T => request canceled, not yet removed */
        CycleCtr        cycle_ctr;              /* mdsync_cycle_ctr when request was made */
} PendingOperationEntry;
138
/*
 * One element of the pendingUnlinks list: a relation whose remaining file
 * is to be deleted after the next checkpoint.
 */
typedef struct
{
        RelFileNode rnode;                      /* the dead relation to delete */
        CycleCtr        cycle_ctr;              /* mdckpt_cycle_ctr when request was made */
} PendingUnlinkEntry;
144
/*
 * Pending fsync requests and pending unlink requests.  mdinit() creates the
 * hash table only in standalone or bootstrap-mode processes; elsewhere
 * pendingOpsTable stays NULL.
 */
static HTAB *pendingOpsTable = NULL;
static List *pendingUnlinks = NIL;

/* Cycle counters recorded in the cycle_ctr field of new request entries. */
static CycleCtr mdsync_cycle_ctr = 0;
static CycleCtr mdckpt_cycle_ctr = 0;
150
151
/*
 * What mdopen and _mdfd_getseg should do when the segment they need does
 * not exist.
 */
typedef enum                                    /* behavior for mdopen & _mdfd_getseg */
{
        EXTENSION_FAIL,                         /* ereport if segment not present */
        EXTENSION_RETURN_NULL,          /* return NULL if not present */
        EXTENSION_CREATE                        /* create new segments as needed */
} ExtensionBehavior;
158
/*
 * local routines
 *
 * Forward declarations of the file-private helpers.  _mdfd_openseg exists
 * only when relations are split into segment files.
 */
static MdfdVec *mdopen(SMgrRelation reln, ExtensionBehavior behavior);
static void register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
static void register_unlink(RelFileNode rnode);
static MdfdVec *_fdvec_alloc(void);

#ifdef USE_SEGMENTED_FILES
static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno,
                          int oflags);
#endif
static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno,
                         bool isTemp, ExtensionBehavior behavior);
static BlockNumber _mdnblocks(SMgrRelation reln, MdfdVec *seg);
172
173
174 /*
175  *      mdinit() -- Initialize private state for magnetic disk storage manager.
176  */
177 void
178 mdinit(void)
179 {
180         MdCxt = AllocSetContextCreate(TopMemoryContext,
181                                                                   "MdSmgr",
182                                                                   ALLOCSET_DEFAULT_MINSIZE,
183                                                                   ALLOCSET_DEFAULT_INITSIZE,
184                                                                   ALLOCSET_DEFAULT_MAXSIZE);
185
186         /*
187          * Create pending-operations hashtable if we need it.  Currently, we need
188          * it if we are standalone (not under a postmaster) OR if we are a
189          * bootstrap-mode subprocess of a postmaster (that is, a startup or
190          * bgwriter process).
191          */
192         if (!IsUnderPostmaster || IsBootstrapProcessingMode())
193         {
194                 HASHCTL         hash_ctl;
195
196                 MemSet(&hash_ctl, 0, sizeof(hash_ctl));
197                 hash_ctl.keysize = sizeof(PendingOperationTag);
198                 hash_ctl.entrysize = sizeof(PendingOperationEntry);
199                 hash_ctl.hash = tag_hash;
200                 hash_ctl.hcxt = MdCxt;
201                 pendingOpsTable = hash_create("Pending Ops Table",
202                                                                           100L,
203                                                                           &hash_ctl,
204                                                                    HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
205                 pendingUnlinks = NIL;
206         }
207 }
208
209 /*
210  *      mdcreate() -- Create a new relation on magnetic disk.
211  *
212  * If isRedo is true, it's okay for the relation to exist already.
213  */
214 void
215 mdcreate(SMgrRelation reln, bool isRedo)
216 {
217         char       *path;
218         File            fd;
219
220         if (isRedo && reln->md_fd != NULL)
221                 return;                                 /* created and opened already... */
222
223         Assert(reln->md_fd == NULL);
224
225         path = relpath(reln->smgr_rnode);
226
227         fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
228
229         if (fd < 0)
230         {
231                 int                     save_errno = errno;
232
233                 /*
234                  * During bootstrap, there are cases where a system relation will be
235                  * accessed (by internal backend processes) before the bootstrap
236                  * script nominally creates it.  Therefore, allow the file to exist
237                  * already, even if isRedo is not set.  (See also mdopen)
238                  */
239                 if (isRedo || IsBootstrapProcessingMode())
240                         fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
241                 if (fd < 0)
242                 {
243                         pfree(path);
244                         /* be sure to report the error reported by create, not open */
245                         errno = save_errno;
246                         ereport(ERROR,
247                                         (errcode_for_file_access(),
248                                          errmsg("could not create relation %u/%u/%u: %m",
249                                                         reln->smgr_rnode.spcNode,
250                                                         reln->smgr_rnode.dbNode,
251                                                         reln->smgr_rnode.relNode)));
252                 }
253         }
254
255         pfree(path);
256
257         reln->md_fd = _fdvec_alloc();
258
259         reln->md_fd->mdfd_vfd = fd;
260         reln->md_fd->mdfd_segno = 0;
261 #ifdef USE_SEGMENTED_FILES
262         reln->md_fd->mdfd_chain = NULL;
263 #endif
264 }
265
266 /*
267  *      mdunlink() -- Unlink a relation.
268  *
269  * Note that we're passed a RelFileNode --- by the time this is called,
270  * there won't be an SMgrRelation hashtable entry anymore.
271  *
272  * Actually, we don't unlink the first segment file of the relation, but
273  * just truncate it to zero length, and record a request to unlink it after
274  * the next checkpoint.  Additional segments can be unlinked immediately,
275  * however.  Leaving the empty file in place prevents that relfilenode
276  * number from being reused.  The scenario this protects us from is:
277  * 1. We delete a relation (and commit, and actually remove its file).
278  * 2. We create a new relation, which by chance gets the same relfilenode as
279  *        the just-deleted one (OIDs must've wrapped around for that to happen).
280  * 3. We crash before another checkpoint occurs.
281  * During replay, we would delete the file and then recreate it, which is fine
282  * if the contents of the file were repopulated by subsequent WAL entries.
283  * But if we didn't WAL-log insertions, but instead relied on fsyncing the
284  * file after populating it (as for instance CLUSTER and CREATE INDEX do),
285  * the contents of the file would be lost forever.      By leaving the empty file
286  * until after the next checkpoint, we prevent reassignment of the relfilenode
287  * number until it's safe, because relfilenode assignment skips over any
288  * existing file.
289  *
290  * If isRedo is true, it's okay for the relation to be already gone.
291  * Also, we should remove the file immediately instead of queuing a request
292  * for later, since during redo there's no possibility of creating a
293  * conflicting relation.
294  *
295  * Note: any failure should be reported as WARNING not ERROR, because
296  * we are usually not in a transaction anymore when this is called.
297  */
298 void
299 mdunlink(RelFileNode rnode, bool isRedo)
300 {
301         char       *path;
302         int                     ret;
303
304         /*
305          * We have to clean out any pending fsync requests for the doomed
306          * relation, else the next mdsync() will fail.
307          */
308         ForgetRelationFsyncRequests(rnode);
309
310         path = relpath(rnode);
311
312         /*
313          * Delete or truncate the first segment, or only segment if not doing
314          * segmenting
315          */
316         if (isRedo)
317                 ret = unlink(path);
318         else
319         {
320                 /* truncate(2) would be easier here, but Windows hasn't got it */
321                 int                     fd;
322
323                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY, 0);
324                 if (fd >= 0)
325                 {
326                         int                     save_errno;
327
328                         ret = ftruncate(fd, 0);
329                         save_errno = errno;
330                         close(fd);
331                         errno = save_errno;
332                 }
333                 else
334                         ret = -1;
335         }
336         if (ret < 0)
337         {
338                 if (!isRedo || errno != ENOENT)
339                         ereport(WARNING,
340                                         (errcode_for_file_access(),
341                                          errmsg("could not remove relation %u/%u/%u: %m",
342                                                         rnode.spcNode,
343                                                         rnode.dbNode,
344                                                         rnode.relNode)));
345         }
346
347 #ifdef USE_SEGMENTED_FILES
348         /* Delete the additional segments, if any */
349         else
350         {
351                 char       *segpath = (char *) palloc(strlen(path) + 12);
352                 BlockNumber segno;
353
354                 /*
355                  * Note that because we loop until getting ENOENT, we will correctly
356                  * remove all inactive segments as well as active ones.
357                  */
358                 for (segno = 1;; segno++)
359                 {
360                         sprintf(segpath, "%s.%u", path, segno);
361                         if (unlink(segpath) < 0)
362                         {
363                                 /* ENOENT is expected after the last segment... */
364                                 if (errno != ENOENT)
365                                         ereport(WARNING,
366                                                         (errcode_for_file_access(),
367                                                          errmsg("could not remove segment %u of relation %u/%u/%u: %m",
368                                                                         segno,
369                                                                         rnode.spcNode,
370                                                                         rnode.dbNode,
371                                                                         rnode.relNode)));
372                                 break;
373                         }
374                 }
375                 pfree(segpath);
376         }
377 #endif
378
379         pfree(path);
380
381         /* Register request to unlink first segment later */
382         if (!isRedo)
383                 register_unlink(rnode);
384 }
385
386 /*
387  *      mdextend() -- Add a block to the specified relation.
388  *
389  *              The semantics are nearly the same as mdwrite(): write at the
390  *              specified position.  However, this is to be used for the case of
391  *              extending a relation (i.e., blocknum is at or beyond the current
392  *              EOF).  Note that we assume writing a block beyond current EOF
393  *              causes intervening file space to become filled with zeroes.
394  */
395 void
396 mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
397 {
398         off_t           seekpos;
399         int                     nbytes;
400         MdfdVec    *v;
401
402         /* This assert is too expensive to have on normally ... */
403 #ifdef CHECK_WRITE_VS_EXTEND
404         Assert(blocknum >= mdnblocks(reln));
405 #endif
406
407         /*
408          * If a relation manages to grow to 2^32-1 blocks, refuse to extend it any
409          * more --- we mustn't create a block whose number actually is
410          * InvalidBlockNumber.
411          */
412         if (blocknum == InvalidBlockNumber)
413                 ereport(ERROR,
414                                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
415                                  errmsg("cannot extend relation %u/%u/%u beyond %u blocks",
416                                                 reln->smgr_rnode.spcNode,
417                                                 reln->smgr_rnode.dbNode,
418                                                 reln->smgr_rnode.relNode,
419                                                 InvalidBlockNumber)));
420
421         v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_CREATE);
422
423 #ifdef USE_SEGMENTED_FILES
424         seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
425         Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
426 #else
427         seekpos = (off_t) BLCKSZ * blocknum;
428 #endif
429
430         /*
431          * Note: because caller usually obtained blocknum by calling mdnblocks,
432          * which did a seek(SEEK_END), this seek is often redundant and will be
433          * optimized away by fd.c.      It's not redundant, however, if there is a
434          * partial page at the end of the file. In that case we want to try to
435          * overwrite the partial page with a full page.  It's also not redundant
436          * if bufmgr.c had to dump another buffer of the same file to make room
437          * for the new page's buffer.
438          */
439         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
440                 ereport(ERROR,
441                                 (errcode_for_file_access(),
442                                  errmsg("could not seek to block %u of relation %u/%u/%u: %m",
443                                                 blocknum,
444                                                 reln->smgr_rnode.spcNode,
445                                                 reln->smgr_rnode.dbNode,
446                                                 reln->smgr_rnode.relNode)));
447
448         if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
449         {
450                 if (nbytes < 0)
451                         ereport(ERROR,
452                                         (errcode_for_file_access(),
453                                          errmsg("could not extend relation %u/%u/%u: %m",
454                                                         reln->smgr_rnode.spcNode,
455                                                         reln->smgr_rnode.dbNode,
456                                                         reln->smgr_rnode.relNode),
457                                          errhint("Check free disk space.")));
458                 /* short write: complain appropriately */
459                 ereport(ERROR,
460                                 (errcode(ERRCODE_DISK_FULL),
461                                  errmsg("could not extend relation %u/%u/%u: wrote only %d of %d bytes at block %u",
462                                                 reln->smgr_rnode.spcNode,
463                                                 reln->smgr_rnode.dbNode,
464                                                 reln->smgr_rnode.relNode,
465                                                 nbytes, BLCKSZ, blocknum),
466                                  errhint("Check free disk space.")));
467         }
468
469         if (!isTemp)
470                 register_dirty_segment(reln, v);
471
472 #ifdef USE_SEGMENTED_FILES
473         Assert(_mdnblocks(reln, v) <= ((BlockNumber) RELSEG_SIZE));
474 #endif
475 }
476
477 /*
478  *      mdopen() -- Open the specified relation.
479  *
480  * Note we only open the first segment, when there are multiple segments.
481  *
482  * If first segment is not present, either ereport or return NULL according
483  * to "behavior".  We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
484  * EXTENSION_CREATE means it's OK to extend an existing relation, not to
485  * invent one out of whole cloth.
486  */
487 static MdfdVec *
488 mdopen(SMgrRelation reln, ExtensionBehavior behavior)
489 {
490         MdfdVec    *mdfd;
491         char       *path;
492         File            fd;
493
494         /* No work if already open */
495         if (reln->md_fd)
496                 return reln->md_fd;
497
498         path = relpath(reln->smgr_rnode);
499
500         fd = PathNameOpenFile(path, O_RDWR | PG_BINARY, 0600);
501
502         if (fd < 0)
503         {
504                 /*
505                  * During bootstrap, there are cases where a system relation will be
506                  * accessed (by internal backend processes) before the bootstrap
507                  * script nominally creates it.  Therefore, accept mdopen() as a
508                  * substitute for mdcreate() in bootstrap mode only. (See mdcreate)
509                  */
510                 if (IsBootstrapProcessingMode())
511                         fd = PathNameOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, 0600);
512                 if (fd < 0)
513                 {
514                         pfree(path);
515                         if (behavior == EXTENSION_RETURN_NULL &&
516                                 FILE_POSSIBLY_DELETED(errno))
517                                 return NULL;
518                         ereport(ERROR,
519                                         (errcode_for_file_access(),
520                                          errmsg("could not open relation %u/%u/%u: %m",
521                                                         reln->smgr_rnode.spcNode,
522                                                         reln->smgr_rnode.dbNode,
523                                                         reln->smgr_rnode.relNode)));
524                 }
525         }
526
527         pfree(path);
528
529         reln->md_fd = mdfd = _fdvec_alloc();
530
531         mdfd->mdfd_vfd = fd;
532         mdfd->mdfd_segno = 0;
533 #ifdef USE_SEGMENTED_FILES
534         mdfd->mdfd_chain = NULL;
535         Assert(_mdnblocks(reln, mdfd) <= ((BlockNumber) RELSEG_SIZE));
536 #endif
537
538         return mdfd;
539 }
540
541 /*
542  *      mdclose() -- Close the specified relation, if it isn't closed already.
543  */
544 void
545 mdclose(SMgrRelation reln)
546 {
547         MdfdVec    *v = reln->md_fd;
548
549         /* No work if already closed */
550         if (v == NULL)
551                 return;
552
553         reln->md_fd = NULL;                     /* prevent dangling pointer after error */
554
555 #ifdef USE_SEGMENTED_FILES
556         while (v != NULL)
557         {
558                 MdfdVec    *ov = v;
559
560                 /* if not closed already */
561                 if (v->mdfd_vfd >= 0)
562                         FileClose(v->mdfd_vfd);
563                 /* Now free vector */
564                 v = v->mdfd_chain;
565                 pfree(ov);
566         }
567 #else
568         if (v->mdfd_vfd >= 0)
569                 FileClose(v->mdfd_vfd);
570         pfree(v);
571 #endif
572 }
573
574 /*
575  *      mdread() -- Read the specified block from a relation.
576  */
577 void
578 mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
579 {
580         off_t           seekpos;
581         int                     nbytes;
582         MdfdVec    *v;
583
584         v = _mdfd_getseg(reln, blocknum, false, EXTENSION_FAIL);
585
586 #ifdef USE_SEGMENTED_FILES
587         seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
588         Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
589 #else
590         seekpos = (off_t) BLCKSZ * blocknum;
591 #endif
592
593         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
594                 ereport(ERROR,
595                                 (errcode_for_file_access(),
596                                  errmsg("could not seek to block %u of relation %u/%u/%u: %m",
597                                                 blocknum,
598                                                 reln->smgr_rnode.spcNode,
599                                                 reln->smgr_rnode.dbNode,
600                                                 reln->smgr_rnode.relNode)));
601
602         if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
603         {
604                 if (nbytes < 0)
605                         ereport(ERROR,
606                                         (errcode_for_file_access(),
607                                    errmsg("could not read block %u of relation %u/%u/%u: %m",
608                                                   blocknum,
609                                                   reln->smgr_rnode.spcNode,
610                                                   reln->smgr_rnode.dbNode,
611                                                   reln->smgr_rnode.relNode)));
612
613                 /*
614                  * Short read: we are at or past EOF, or we read a partial block at
615                  * EOF.  Normally this is an error; upper levels should never try to
616                  * read a nonexistent block.  However, if zero_damaged_pages is ON or
617                  * we are InRecovery, we should instead return zeroes without
618                  * complaining.  This allows, for example, the case of trying to
619                  * update a block that was later truncated away.
620                  */
621                 if (zero_damaged_pages || InRecovery)
622                         MemSet(buffer, 0, BLCKSZ);
623                 else
624                         ereport(ERROR,
625                                         (errcode(ERRCODE_DATA_CORRUPTED),
626                                          errmsg("could not read block %u of relation %u/%u/%u: read only %d of %d bytes",
627                                                         blocknum,
628                                                         reln->smgr_rnode.spcNode,
629                                                         reln->smgr_rnode.dbNode,
630                                                         reln->smgr_rnode.relNode,
631                                                         nbytes, BLCKSZ)));
632         }
633 }
634
635 /*
636  *      mdwrite() -- Write the supplied block at the appropriate location.
637  *
638  *              This is to be used only for updating already-existing blocks of a
639  *              relation (ie, those before the current EOF).  To extend a relation,
640  *              use mdextend().
641  */
642 void
643 mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
644 {
645         off_t           seekpos;
646         int                     nbytes;
647         MdfdVec    *v;
648
649         /* This assert is too expensive to have on normally ... */
650 #ifdef CHECK_WRITE_VS_EXTEND
651         Assert(blocknum < mdnblocks(reln));
652 #endif
653
654         v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_FAIL);
655
656 #ifdef USE_SEGMENTED_FILES
657         seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE));
658         Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE);
659 #else
660         seekpos = (off_t) BLCKSZ * blocknum;
661 #endif
662
663         if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
664                 ereport(ERROR,
665                                 (errcode_for_file_access(),
666                                  errmsg("could not seek to block %u of relation %u/%u/%u: %m",
667                                                 blocknum,
668                                                 reln->smgr_rnode.spcNode,
669                                                 reln->smgr_rnode.dbNode,
670                                                 reln->smgr_rnode.relNode)));
671
672         if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
673         {
674                 if (nbytes < 0)
675                         ereport(ERROR,
676                                         (errcode_for_file_access(),
677                                   errmsg("could not write block %u of relation %u/%u/%u: %m",
678                                                  blocknum,
679                                                  reln->smgr_rnode.spcNode,
680                                                  reln->smgr_rnode.dbNode,
681                                                  reln->smgr_rnode.relNode)));
682                 /* short write: complain appropriately */
683                 ereport(ERROR,
684                                 (errcode(ERRCODE_DISK_FULL),
685                                  errmsg("could not write block %u of relation %u/%u/%u: wrote only %d of %d bytes",
686                                                 blocknum,
687                                                 reln->smgr_rnode.spcNode,
688                                                 reln->smgr_rnode.dbNode,
689                                                 reln->smgr_rnode.relNode,
690                                                 nbytes, BLCKSZ),
691                                  errhint("Check free disk space.")));
692         }
693
694         if (!isTemp)
695                 register_dirty_segment(reln, v);
696 }
697
698 /*
699  *      mdnblocks() -- Get the number of blocks stored in a relation.
700  *
701  *              Important side effect: all active segments of the relation are opened
702  *              and added to the mdfd_chain list.  If this routine has not been
703  *              called, then only segments up to the last one actually touched
704  *              are present in the chain.
705  */
706 BlockNumber
707 mdnblocks(SMgrRelation reln)
708 {
709         MdfdVec    *v = mdopen(reln, EXTENSION_FAIL);
710
711 #ifdef USE_SEGMENTED_FILES
712         BlockNumber nblocks;
713         BlockNumber segno = 0;
714
715         /*
716          * Skip through any segments that aren't the last one, to avoid redundant
717          * seeks on them.  We have previously verified that these segments are
718          * exactly RELSEG_SIZE long, and it's useless to recheck that each time.
719          *
720          * NOTE: this assumption could only be wrong if another backend has
721          * truncated the relation.      We rely on higher code levels to handle that
722          * scenario by closing and re-opening the md fd, which is handled via
723          * relcache flush.      (Since the bgwriter doesn't participate in relcache
724          * flush, it could have segment chain entries for inactive segments;
725          * that's OK because the bgwriter never needs to compute relation size.)
726          */
727         while (v->mdfd_chain != NULL)
728         {
729                 segno++;
730                 v = v->mdfd_chain;
731         }
732
733         for (;;)
734         {
735                 nblocks = _mdnblocks(reln, v);
736                 if (nblocks > ((BlockNumber) RELSEG_SIZE))
737                         elog(FATAL, "segment too big");
738                 if (nblocks < ((BlockNumber) RELSEG_SIZE))
739                         return (segno * ((BlockNumber) RELSEG_SIZE)) + nblocks;
740
741                 /*
742                  * If segment is exactly RELSEG_SIZE, advance to next one.
743                  */
744                 segno++;
745
746                 if (v->mdfd_chain == NULL)
747                 {
748                         /*
749                          * Because we pass O_CREAT, we will create the next segment (with
750                          * zero length) immediately, if the last segment is of length
751                          * RELSEG_SIZE.  While perhaps not strictly necessary, this keeps
752                          * the logic simple.
753                          */
754                         v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
755                         if (v->mdfd_chain == NULL)
756                                 ereport(ERROR,
757                                                 (errcode_for_file_access(),
758                                  errmsg("could not open segment %u of relation %u/%u/%u: %m",
759                                                 segno,
760                                                 reln->smgr_rnode.spcNode,
761                                                 reln->smgr_rnode.dbNode,
762                                                 reln->smgr_rnode.relNode)));
763                 }
764
765                 v = v->mdfd_chain;
766         }
767 #else
768         return _mdnblocks(reln, v);
769 #endif
770 }
771
772 /*
773  *      mdtruncate() -- Truncate relation to specified number of blocks.
774  */
775 void
776 mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
777 {
778         MdfdVec    *v;
779         BlockNumber curnblk;
780
781 #ifdef USE_SEGMENTED_FILES
782         BlockNumber priorblocks;
783 #endif
784
785         /*
786          * NOTE: mdnblocks makes sure we have opened all active segments, so that
787          * truncation loop will get them all!
788          */
789         curnblk = mdnblocks(reln);
790         if (nblocks > curnblk)
791         {
792                 /* Bogus request ... but no complaint if InRecovery */
793                 if (InRecovery)
794                         return;
795                 ereport(ERROR,
796                                 (errmsg("could not truncate relation %u/%u/%u to %u blocks: it's only %u blocks now",
797                                                 reln->smgr_rnode.spcNode,
798                                                 reln->smgr_rnode.dbNode,
799                                                 reln->smgr_rnode.relNode,
800                                                 nblocks, curnblk)));
801         }
802         if (nblocks == curnblk)
803                 return;                                 /* no work */
804
805         v = mdopen(reln, EXTENSION_FAIL);
806
807 #ifdef USE_SEGMENTED_FILES
808         priorblocks = 0;
809         while (v != NULL)
810         {
811                 MdfdVec    *ov = v;
812
813                 if (priorblocks > nblocks)
814                 {
815                         /*
816                          * This segment is no longer active (and has already been unlinked
817                          * from the mdfd_chain). We truncate the file, but do not delete
818                          * it, for reasons explained in the header comments.
819                          */
820                         if (FileTruncate(v->mdfd_vfd, 0) < 0)
821                                 ereport(ERROR,
822                                                 (errcode_for_file_access(),
823                                                  errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
824                                                                 reln->smgr_rnode.spcNode,
825                                                                 reln->smgr_rnode.dbNode,
826                                                                 reln->smgr_rnode.relNode,
827                                                                 nblocks)));
828                         if (!isTemp)
829                                 register_dirty_segment(reln, v);
830                         v = v->mdfd_chain;
831                         Assert(ov != reln->md_fd);      /* we never drop the 1st segment */
832                         pfree(ov);
833                 }
834                 else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
835                 {
836                         /*
837                          * This is the last segment we want to keep. Truncate the file to
838                          * the right length, and clear chain link that points to any
839                          * remaining segments (which we shall zap). NOTE: if nblocks is
840                          * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
841                          * segment to 0 length but keep it. This adheres to the invariant
842                          * given in the header comments.
843                          */
844                         BlockNumber lastsegblocks = nblocks - priorblocks;
845
846                         if (FileTruncate(v->mdfd_vfd, (off_t) lastsegblocks * BLCKSZ) < 0)
847                                 ereport(ERROR,
848                                                 (errcode_for_file_access(),
849                                                  errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
850                                                                 reln->smgr_rnode.spcNode,
851                                                                 reln->smgr_rnode.dbNode,
852                                                                 reln->smgr_rnode.relNode,
853                                                                 nblocks)));
854                         if (!isTemp)
855                                 register_dirty_segment(reln, v);
856                         v = v->mdfd_chain;
857                         ov->mdfd_chain = NULL;
858                 }
859                 else
860                 {
861                         /*
862                          * We still need this segment and 0 or more blocks beyond it, so
863                          * nothing to do here.
864                          */
865                         v = v->mdfd_chain;
866                 }
867                 priorblocks += RELSEG_SIZE;
868         }
869 #else
870         /* For unsegmented files, it's a lot easier */
871         if (FileTruncate(v->mdfd_vfd, (off_t) nblocks * BLCKSZ) < 0)
872                 ereport(ERROR,
873                                 (errcode_for_file_access(),
874                           errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
875                                          reln->smgr_rnode.spcNode,
876                                          reln->smgr_rnode.dbNode,
877                                          reln->smgr_rnode.relNode,
878                                          nblocks)));
879         if (!isTemp)
880                 register_dirty_segment(reln, v);
881 #endif
882 }
883
884 /*
885  *      mdimmedsync() -- Immediately sync a relation to stable storage.
886  *
887  * Note that only writes already issued are synced; this routine knows
888  * nothing of dirty buffers that may exist inside the buffer manager.
889  */
890 void
891 mdimmedsync(SMgrRelation reln)
892 {
893         MdfdVec    *v;
894         BlockNumber curnblk;
895
896         /*
897          * NOTE: mdnblocks makes sure we have opened all active segments, so that
898          * fsync loop will get them all!
899          */
900         curnblk = mdnblocks(reln);
901
902         v = mdopen(reln, EXTENSION_FAIL);
903
904 #ifdef USE_SEGMENTED_FILES
905         while (v != NULL)
906         {
907                 if (FileSync(v->mdfd_vfd) < 0)
908                         ereport(ERROR,
909                                         (errcode_for_file_access(),
910                                 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
911                                            v->mdfd_segno,
912                                            reln->smgr_rnode.spcNode,
913                                            reln->smgr_rnode.dbNode,
914                                            reln->smgr_rnode.relNode)));
915                 v = v->mdfd_chain;
916         }
917 #else
918         if (FileSync(v->mdfd_vfd) < 0)
919                 ereport(ERROR,
920                                 (errcode_for_file_access(),
921                                  errmsg("could not fsync relation %u/%u/%u: %m",
922                                                 reln->smgr_rnode.spcNode,
923                                                 reln->smgr_rnode.dbNode,
924                                                 reln->smgr_rnode.relNode)));
925 #endif
926 }
927
928 /*
929  *      mdsync() -- Sync previous writes to stable storage.
930  */
931 void
932 mdsync(void)
933 {
934         static bool mdsync_in_progress = false;
935
936         HASH_SEQ_STATUS hstat;
937         PendingOperationEntry *entry;
938         int                     absorb_counter;
939
940         /*
941          * This is only called during checkpoints, and checkpoints should only
942          * occur in processes that have created a pendingOpsTable.
943          */
944         if (!pendingOpsTable)
945                 elog(ERROR, "cannot sync without a pendingOpsTable");
946
947         /*
948          * If we are in the bgwriter, the sync had better include all fsync
949          * requests that were queued by backends up to this point.      The tightest
950          * race condition that could occur is that a buffer that must be written
951          * and fsync'd for the checkpoint could have been dumped by a backend just
952          * before it was visited by BufferSync().  We know the backend will have
953          * queued an fsync request before clearing the buffer's dirtybit, so we
954          * are safe as long as we do an Absorb after completing BufferSync().
955          */
956         AbsorbFsyncRequests();
957
958         /*
959          * To avoid excess fsync'ing (in the worst case, maybe a never-terminating
960          * checkpoint), we want to ignore fsync requests that are entered into the
961          * hashtable after this point --- they should be processed next time,
962          * instead.  We use mdsync_cycle_ctr to tell old entries apart from new
963          * ones: new ones will have cycle_ctr equal to the incremented value of
964          * mdsync_cycle_ctr.
965          *
966          * In normal circumstances, all entries present in the table at this point
967          * will have cycle_ctr exactly equal to the current (about to be old)
968          * value of mdsync_cycle_ctr.  However, if we fail partway through the
969          * fsync'ing loop, then older values of cycle_ctr might remain when we
970          * come back here to try again.  Repeated checkpoint failures would
971          * eventually wrap the counter around to the point where an old entry
972          * might appear new, causing us to skip it, possibly allowing a checkpoint
973          * to succeed that should not have.  To forestall wraparound, any time the
974          * previous mdsync() failed to complete, run through the table and
975          * forcibly set cycle_ctr = mdsync_cycle_ctr.
976          *
977          * Think not to merge this loop with the main loop, as the problem is
978          * exactly that that loop may fail before having visited all the entries.
979          * From a performance point of view it doesn't matter anyway, as this path
980          * will never be taken in a system that's functioning normally.
981          */
982         if (mdsync_in_progress)
983         {
984                 /* prior try failed, so update any stale cycle_ctr values */
985                 hash_seq_init(&hstat, pendingOpsTable);
986                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
987                 {
988                         entry->cycle_ctr = mdsync_cycle_ctr;
989                 }
990         }
991
992         /* Advance counter so that new hashtable entries are distinguishable */
993         mdsync_cycle_ctr++;
994
995         /* Set flag to detect failure if we don't reach the end of the loop */
996         mdsync_in_progress = true;
997
998         /* Now scan the hashtable for fsync requests to process */
999         absorb_counter = FSYNCS_PER_ABSORB;
1000         hash_seq_init(&hstat, pendingOpsTable);
1001         while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1002         {
1003                 /*
1004                  * If the entry is new then don't process it this time.  Note that
1005                  * "continue" bypasses the hash-remove call at the bottom of the loop.
1006                  */
1007                 if (entry->cycle_ctr == mdsync_cycle_ctr)
1008                         continue;
1009
1010                 /* Else assert we haven't missed it */
1011                 Assert((CycleCtr) (entry->cycle_ctr + 1) == mdsync_cycle_ctr);
1012
1013                 /*
1014                  * If fsync is off then we don't have to bother opening the file at
1015                  * all.  (We delay checking until this point so that changing fsync on
1016                  * the fly behaves sensibly.)  Also, if the entry is marked canceled,
1017                  * fall through to delete it.
1018                  */
1019                 if (enableFsync && !entry->canceled)
1020                 {
1021                         int                     failures;
1022
1023                         /*
1024                          * If in bgwriter, we want to absorb pending requests every so
1025                          * often to prevent overflow of the fsync request queue.  It is
1026                          * unspecified whether newly-added entries will be visited by
1027                          * hash_seq_search, but we don't care since we don't need to
1028                          * process them anyway.
1029                          */
1030                         if (--absorb_counter <= 0)
1031                         {
1032                                 AbsorbFsyncRequests();
1033                                 absorb_counter = FSYNCS_PER_ABSORB;
1034                         }
1035
1036                         /*
1037                          * The fsync table could contain requests to fsync segments that
1038                          * have been deleted (unlinked) by the time we get to them. Rather
1039                          * than just hoping an ENOENT (or EACCES on Windows) error can be
1040                          * ignored, what we do on error is absorb pending requests and
1041                          * then retry.  Since mdunlink() queues a "revoke" message before
1042                          * actually unlinking, the fsync request is guaranteed to be
1043                          * marked canceled after the absorb if it really was this case.
1044                          * DROP DATABASE likewise has to tell us to forget fsync requests
1045                          * before it starts deletions.
1046                          */
1047                         for (failures = 0;; failures++)         /* loop exits at "break" */
1048                         {
1049                                 SMgrRelation reln;
1050                                 MdfdVec    *seg;
1051
1052                                 /*
1053                                  * Find or create an smgr hash entry for this relation. This
1054                                  * may seem a bit unclean -- md calling smgr?  But it's really
1055                                  * the best solution.  It ensures that the open file reference
1056                                  * isn't permanently leaked if we get an error here. (You may
1057                                  * say "but an unreferenced SMgrRelation is still a leak!" Not
1058                                  * really, because the only case in which a checkpoint is done
1059                                  * by a process that isn't about to shut down is in the
1060                                  * bgwriter, and it will periodically do smgrcloseall(). This
1061                                  * fact justifies our not closing the reln in the success path
1062                                  * either, which is a good thing since in non-bgwriter cases
1063                                  * we couldn't safely do that.)  Furthermore, in many cases
1064                                  * the relation will have been dirtied through this same smgr
1065                                  * relation, and so we can save a file open/close cycle.
1066                                  */
1067                                 reln = smgropen(entry->tag.rnode);
1068
1069                                 /*
1070                                  * It is possible that the relation has been dropped or
1071                                  * truncated since the fsync request was entered.  Therefore,
1072                                  * allow ENOENT, but only if we didn't fail already on this
1073                                  * file.  This applies both during _mdfd_getseg() and during
1074                                  * FileSync, since fd.c might have closed the file behind our
1075                                  * back.
1076                                  */
1077                                 seg = _mdfd_getseg(reln,
1078                                                           entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
1079                                                                    false, EXTENSION_RETURN_NULL);
1080                                 if (seg != NULL &&
1081                                         FileSync(seg->mdfd_vfd) >= 0)
1082                                         break;          /* success; break out of retry loop */
1083
1084                                 /*
1085                                  * XXX is there any point in allowing more than one retry?
1086                                  * Don't see one at the moment, but easy to change the test
1087                                  * here if so.
1088                                  */
1089                                 if (!FILE_POSSIBLY_DELETED(errno) ||
1090                                         failures > 0)
1091                                         ereport(ERROR,
1092                                                         (errcode_for_file_access(),
1093                                                          errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
1094                                                                         entry->tag.segno,
1095                                                                         entry->tag.rnode.spcNode,
1096                                                                         entry->tag.rnode.dbNode,
1097                                                                         entry->tag.rnode.relNode)));
1098                                 else
1099                                         ereport(DEBUG1,
1100                                                         (errcode_for_file_access(),
1101                                                          errmsg("could not fsync segment %u of relation %u/%u/%u, but retrying: %m",
1102                                                                         entry->tag.segno,
1103                                                                         entry->tag.rnode.spcNode,
1104                                                                         entry->tag.rnode.dbNode,
1105                                                                         entry->tag.rnode.relNode)));
1106
1107                                 /*
1108                                  * Absorb incoming requests and check to see if canceled.
1109                                  */
1110                                 AbsorbFsyncRequests();
1111                                 absorb_counter = FSYNCS_PER_ABSORB;             /* might as well... */
1112
1113                                 if (entry->canceled)
1114                                         break;
1115                         }                                       /* end retry loop */
1116                 }
1117
1118                 /*
1119                  * If we get here, either we fsync'd successfully, or we don't have to
1120                  * because enableFsync is off, or the entry is (now) marked canceled.
1121                  * Okay to delete it.
1122                  */
1123                 if (hash_search(pendingOpsTable, &entry->tag,
1124                                                 HASH_REMOVE, NULL) == NULL)
1125                         elog(ERROR, "pendingOpsTable corrupted");
1126         }                                                       /* end loop over hashtable entries */
1127
1128         /* Flag successful completion of mdsync */
1129         mdsync_in_progress = false;
1130 }
1131
1132 /*
1133  * mdpreckpt() -- Do pre-checkpoint work
1134  *
1135  * To distinguish unlink requests that arrived before this checkpoint
1136  * started from those that arrived during the checkpoint, we use a cycle
1137  * counter similar to the one we use for fsync requests. That cycle
1138  * counter is incremented here.
1139  *
1140  * This must be called *before* the checkpoint REDO point is determined.
1141  * That ensures that we won't delete files too soon.
1142  *
1143  * Note that we can't do anything here that depends on the assumption
1144  * that the checkpoint will be completed.
1145  */
1146 void
1147 mdpreckpt(void)
1148 {
1149         ListCell   *cell;
1150
1151         /*
1152          * In case the prior checkpoint wasn't completed, stamp all entries in the
1153          * list with the current cycle counter.  Anything that's in the list at
1154          * the start of checkpoint can surely be deleted after the checkpoint is
1155          * finished, regardless of when the request was made.
1156          */
1157         foreach(cell, pendingUnlinks)
1158         {
1159                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
1160
1161                 entry->cycle_ctr = mdckpt_cycle_ctr;
1162         }
1163
1164         /*
1165          * Any unlink requests arriving after this point will be assigned the next
1166          * cycle counter, and won't be unlinked until next checkpoint.
1167          */
1168         mdckpt_cycle_ctr++;
1169 }
1170
1171 /*
1172  * mdpostckpt() -- Do post-checkpoint work
1173  *
1174  * Remove any lingering files that can now be safely removed.
1175  */
1176 void
1177 mdpostckpt(void)
1178 {
1179         while (pendingUnlinks != NIL)
1180         {
1181                 PendingUnlinkEntry *entry = (PendingUnlinkEntry *) linitial(pendingUnlinks);
1182                 char       *path;
1183
1184                 /*
1185                  * New entries are appended to the end, so if the entry is new we've
1186                  * reached the end of old entries.
1187                  */
1188                 if (entry->cycle_ctr == mdckpt_cycle_ctr)
1189                         break;
1190
1191                 /* Else assert we haven't missed it */
1192                 Assert((CycleCtr) (entry->cycle_ctr + 1) == mdckpt_cycle_ctr);
1193
1194                 /* Unlink the file */
1195                 path = relpath(entry->rnode);
1196                 if (unlink(path) < 0)
1197                 {
1198                         /*
1199                          * There's a race condition, when the database is dropped at the
1200                          * same time that we process the pending unlink requests. If the
1201                          * DROP DATABASE deletes the file before we do, we will get ENOENT
1202                          * here. rmtree() also has to ignore ENOENT errors, to deal with
1203                          * the possibility that we delete the file first.
1204                          */
1205                         if (errno != ENOENT)
1206                                 ereport(WARNING,
1207                                                 (errcode_for_file_access(),
1208                                                  errmsg("could not remove relation %u/%u/%u: %m",
1209                                                                 entry->rnode.spcNode,
1210                                                                 entry->rnode.dbNode,
1211                                                                 entry->rnode.relNode)));
1212                 }
1213                 pfree(path);
1214
1215                 pendingUnlinks = list_delete_first(pendingUnlinks);
1216                 pfree(entry);
1217         }
1218 }
1219
1220 /*
1221  * register_dirty_segment() -- Mark a relation segment as needing fsync
1222  *
1223  * If there is a local pending-ops table, just make an entry in it for
1224  * mdsync to process later.  Otherwise, try to pass off the fsync request
1225  * to the background writer process.  If that fails, just do the fsync
1226  * locally before returning (we expect this will not happen often enough
1227  * to be a performance problem).
1228  */
1229 static void
1230 register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
1231 {
1232         if (pendingOpsTable)
1233         {
1234                 /* push it into local pending-ops table */
1235                 RememberFsyncRequest(reln->smgr_rnode, seg->mdfd_segno);
1236         }
1237         else
1238         {
1239                 if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
1240                         return;                         /* passed it off successfully */
1241
1242                 if (FileSync(seg->mdfd_vfd) < 0)
1243                         ereport(ERROR,
1244                                         (errcode_for_file_access(),
1245                                 errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
1246                                            seg->mdfd_segno,
1247                                            reln->smgr_rnode.spcNode,
1248                                            reln->smgr_rnode.dbNode,
1249                                            reln->smgr_rnode.relNode)));
1250         }
1251 }
1252
1253 /*
1254  * register_unlink() -- Schedule a file to be deleted after next checkpoint
1255  *
1256  * As with register_dirty_segment, this could involve either a local or
1257  * a remote pending-ops table.
1258  */
1259 static void
1260 register_unlink(RelFileNode rnode)
1261 {
1262         if (pendingOpsTable)
1263         {
1264                 /* push it into local pending-ops table */
1265                 RememberFsyncRequest(rnode, UNLINK_RELATION_REQUEST);
1266         }
1267         else
1268         {
1269                 /*
1270                  * Notify the bgwriter about it.  If we fail to queue the request
1271                  * message, we have to sleep and try again, because we can't simply
1272                  * delete the file now.  Ugly, but hopefully won't happen often.
1273                  *
1274                  * XXX should we just leave the file orphaned instead?
1275                  */
1276                 Assert(IsUnderPostmaster);
1277                 while (!ForwardFsyncRequest(rnode, UNLINK_RELATION_REQUEST))
1278                         pg_usleep(10000L);      /* 10 msec seems a good number */
1279         }
1280 }
1281
1282 /*
1283  * RememberFsyncRequest() -- callback from bgwriter side of fsync request
1284  *
1285  * We stuff most fsync requests into the local hash table for execution
1286  * during the bgwriter's next checkpoint.  UNLINK requests go into a
1287  * separate linked list, however, because they get processed separately.
1288  *
1289  * The range of possible segment numbers is way less than the range of
1290  * BlockNumber, so we can reserve high values of segno for special purposes.
1291  * We define three:
1292  * - FORGET_RELATION_FSYNC means to cancel pending fsyncs for a relation
1293  * - FORGET_DATABASE_FSYNC means to cancel pending fsyncs for a whole database
1294  * - UNLINK_RELATION_REQUEST is a request to delete the file after the next
1295  *       checkpoint.
1296  *
1297  * (Handling the FORGET_* requests is a tad slow because the hash table has
1298  * to be searched linearly, but it doesn't seem worth rethinking the table
1299  * structure for them.)
1300  */
1301 void
1302 RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
1303 {
1304         Assert(pendingOpsTable);
1305
1306         if (segno == FORGET_RELATION_FSYNC)
1307         {
1308                 /* Remove any pending requests for the entire relation */
1309                 HASH_SEQ_STATUS hstat;
1310                 PendingOperationEntry *entry;
1311
1312                 hash_seq_init(&hstat, pendingOpsTable);
1313                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1314                 {
1315                         if (RelFileNodeEquals(entry->tag.rnode, rnode))
1316                         {
1317                                 /* Okay, cancel this entry */
1318                                 entry->canceled = true;
1319                         }
1320                 }
1321         }
1322         else if (segno == FORGET_DATABASE_FSYNC)
1323         {
1324                 /* Remove any pending requests for the entire database */
1325                 HASH_SEQ_STATUS hstat;
1326                 PendingOperationEntry *entry;
1327                 ListCell   *cell, 
1328                                    *prev,
1329                                    *next;
1330
1331                 /* Remove fsync requests */
1332                 hash_seq_init(&hstat, pendingOpsTable);
1333                 while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
1334                 {
1335                         if (entry->tag.rnode.dbNode == rnode.dbNode)
1336                         {
1337                                 /* Okay, cancel this entry */
1338                                 entry->canceled = true;
1339                         }
1340                 }
1341         
1342                 /* Remove unlink requests */
1343                 prev = NULL;
1344                 for (cell = list_head(pendingUnlinks); cell; cell = next)
1345                 {
1346                         PendingUnlinkEntry *entry = (PendingUnlinkEntry *) lfirst(cell);
1347
1348                         next = lnext(cell);
1349                         if (entry->rnode.dbNode == rnode.dbNode) 
1350                         {
1351                                 pendingUnlinks = list_delete_cell(pendingUnlinks, cell, prev);
1352                                 pfree(entry);
1353                         }
1354                         else
1355                                 prev = cell;
1356                 }
1357         }
1358         else if (segno == UNLINK_RELATION_REQUEST)
1359         {
1360                 /* Unlink request: put it in the linked list */
1361                 MemoryContext oldcxt = MemoryContextSwitchTo(MdCxt);
1362                 PendingUnlinkEntry *entry;
1363
1364                 entry = palloc(sizeof(PendingUnlinkEntry));
1365                 entry->rnode = rnode;
1366                 entry->cycle_ctr = mdckpt_cycle_ctr;
1367
1368                 pendingUnlinks = lappend(pendingUnlinks, entry);
1369
1370                 MemoryContextSwitchTo(oldcxt);
1371         }
1372         else
1373         {
1374                 /* Normal case: enter a request to fsync this segment */
1375                 PendingOperationTag key;
1376                 PendingOperationEntry *entry;
1377                 bool            found;
1378
1379                 /* ensure any pad bytes in the hash key are zeroed */
1380                 MemSet(&key, 0, sizeof(key));
1381                 key.rnode = rnode;
1382                 key.segno = segno;
1383
1384                 entry = (PendingOperationEntry *) hash_search(pendingOpsTable,
1385                                                                                                           &key,
1386                                                                                                           HASH_ENTER,
1387                                                                                                           &found);
1388                 /* if new or previously canceled entry, initialize it */
1389                 if (!found || entry->canceled)
1390                 {
1391                         entry->canceled = false;
1392                         entry->cycle_ctr = mdsync_cycle_ctr;
1393                 }
1394
1395                 /*
1396                  * NB: it's intentional that we don't change cycle_ctr if the entry
1397                  * already exists.      The fsync request must be treated as old, even
1398                  * though the new request will be satisfied too by any subsequent
1399                  * fsync.
1400                  *
1401                  * However, if the entry is present but is marked canceled, we should
1402                  * act just as though it wasn't there.  The only case where this could
1403                  * happen would be if a file had been deleted, we received but did not
1404                  * yet act on the cancel request, and the same relfilenode was then
1405                  * assigned to a new file.      We mustn't lose the new request, but it
1406                  * should be considered new not old.
1407                  */
1408         }
1409 }
1410
1411 /*
1412  * ForgetRelationFsyncRequests -- forget any fsyncs for a rel
1413  */
1414 void
1415 ForgetRelationFsyncRequests(RelFileNode rnode)
1416 {
1417         if (pendingOpsTable)
1418         {
1419                 /* standalone backend or startup process: fsync state is local */
1420                 RememberFsyncRequest(rnode, FORGET_RELATION_FSYNC);
1421         }
1422         else if (IsUnderPostmaster)
1423         {
1424                 /*
1425                  * Notify the bgwriter about it.  If we fail to queue the revoke
1426                  * message, we have to sleep and try again ... ugly, but hopefully
1427                  * won't happen often.
1428                  *
1429                  * XXX should we CHECK_FOR_INTERRUPTS in this loop?  Escaping with an
1430                  * error would leave the no-longer-used file still present on disk,
1431                  * which would be bad, so I'm inclined to assume that the bgwriter
1432                  * will always empty the queue soon.
1433                  */
1434                 while (!ForwardFsyncRequest(rnode, FORGET_RELATION_FSYNC))
1435                         pg_usleep(10000L);      /* 10 msec seems a good number */
1436
1437                 /*
1438                  * Note we don't wait for the bgwriter to actually absorb the revoke
1439                  * message; see mdsync() for the implications.
1440                  */
1441         }
1442 }
1443
1444 /*
1445  * ForgetDatabaseFsyncRequests -- forget any fsyncs and unlinks for a DB
1446  */
1447 void
1448 ForgetDatabaseFsyncRequests(Oid dbid)
1449 {
1450         RelFileNode rnode;
1451
1452         rnode.dbNode = dbid;
1453         rnode.spcNode = 0;
1454         rnode.relNode = 0;
1455
1456         if (pendingOpsTable)
1457         {
1458                 /* standalone backend or startup process: fsync state is local */
1459                 RememberFsyncRequest(rnode, FORGET_DATABASE_FSYNC);
1460         }
1461         else if (IsUnderPostmaster)
1462         {
1463                 /* see notes in ForgetRelationFsyncRequests */
1464                 while (!ForwardFsyncRequest(rnode, FORGET_DATABASE_FSYNC))
1465                         pg_usleep(10000L);      /* 10 msec seems a good number */
1466         }
1467 }
1468
1469
1470 /*
1471  *      _fdvec_alloc() -- Make a MdfdVec object.
1472  */
1473 static MdfdVec *
1474 _fdvec_alloc(void)
1475 {
1476         return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
1477 }
1478
1479 #ifdef USE_SEGMENTED_FILES
1480
1481 /*
1482  * Open the specified segment of the relation,
1483  * and make a MdfdVec object for it.  Returns NULL on failure.
1484  */
1485 static MdfdVec *
1486 _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
1487 {
1488         MdfdVec    *v;
1489         int                     fd;
1490         char       *path,
1491                            *fullpath;
1492
1493         path = relpath(reln->smgr_rnode);
1494
1495         if (segno > 0)
1496         {
1497                 /* be sure we have enough space for the '.segno' */
1498                 fullpath = (char *) palloc(strlen(path) + 12);
1499                 sprintf(fullpath, "%s.%u", path, segno);
1500                 pfree(path);
1501         }
1502         else
1503                 fullpath = path;
1504
1505         /* open the file */
1506         fd = PathNameOpenFile(fullpath, O_RDWR | PG_BINARY | oflags, 0600);
1507
1508         pfree(fullpath);
1509
1510         if (fd < 0)
1511                 return NULL;
1512
1513         /* allocate an mdfdvec entry for it */
1514         v = _fdvec_alloc();
1515
1516         /* fill the entry */
1517         v->mdfd_vfd = fd;
1518         v->mdfd_segno = segno;
1519         v->mdfd_chain = NULL;
1520         Assert(_mdnblocks(reln, v) <= ((BlockNumber) RELSEG_SIZE));
1521
1522         /* all done */
1523         return v;
1524 }
1525 #endif   /* USE_SEGMENTED_FILES */
1526
1527 /*
1528  *      _mdfd_getseg() -- Find the segment of the relation holding the
1529  *              specified block.
1530  *
1531  * If the segment doesn't exist, we ereport, return NULL, or create the
1532  * segment, according to "behavior".  Note: isTemp need only be correct
1533  * in the EXTENSION_CREATE case.
1534  */
1535 static MdfdVec *
1536 _mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool isTemp,
1537                          ExtensionBehavior behavior)
1538 {
1539         MdfdVec    *v = mdopen(reln, behavior);
1540
1541 #ifdef USE_SEGMENTED_FILES
1542         BlockNumber targetseg;
1543         BlockNumber nextsegno;
1544
1545         if (!v)
1546                 return NULL;                    /* only possible if EXTENSION_RETURN_NULL */
1547
1548         targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
1549         for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
1550         {
1551                 Assert(nextsegno == v->mdfd_segno + 1);
1552
1553                 if (v->mdfd_chain == NULL)
1554                 {
1555                         /*
1556                          * Normally we will create new segments only if authorized by the
1557                          * caller (i.e., we are doing mdextend()).      But when doing WAL
1558                          * recovery, create segments anyway; this allows cases such as
1559                          * replaying WAL data that has a write into a high-numbered
1560                          * segment of a relation that was later deleted.  We want to go
1561                          * ahead and create the segments so we can finish out the replay.
1562                          *
1563                          * We have to maintain the invariant that segments before the last
1564                          * active segment are of size RELSEG_SIZE; therefore, pad them out
1565                          * with zeroes if needed.  (This only matters if caller is
1566                          * extending the relation discontiguously, but that can happen in
1567                          * hash indexes.)
1568                          */
1569                         if (behavior == EXTENSION_CREATE || InRecovery)
1570                         {
1571                                 if (_mdnblocks(reln, v) < RELSEG_SIZE)
1572                                 {
1573                                         char       *zerobuf = palloc0(BLCKSZ);
1574
1575                                         mdextend(reln, nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
1576                                                          zerobuf, isTemp);
1577                                         pfree(zerobuf);
1578                                 }
1579                                 v->mdfd_chain = _mdfd_openseg(reln, nextsegno, O_CREAT);
1580                         }
1581                         else
1582                         {
1583                                 /* We won't create segment if not existent */
1584                                 v->mdfd_chain = _mdfd_openseg(reln, nextsegno, 0);
1585                         }
1586                         if (v->mdfd_chain == NULL)
1587                         {
1588                                 if (behavior == EXTENSION_RETURN_NULL &&
1589                                         FILE_POSSIBLY_DELETED(errno))
1590                                         return NULL;
1591                                 ereport(ERROR,
1592                                                 (errcode_for_file_access(),
1593                                                  errmsg("could not open segment %u of relation %u/%u/%u (target block %u): %m",
1594                                                                 nextsegno,
1595                                                                 reln->smgr_rnode.spcNode,
1596                                                                 reln->smgr_rnode.dbNode,
1597                                                                 reln->smgr_rnode.relNode,
1598                                                                 blkno)));
1599                         }
1600                 }
1601                 v = v->mdfd_chain;
1602         }
1603 #endif
1604
1605         return v;
1606 }
1607
1608 /*
1609  * Get number of blocks present in a single disk file
1610  */
1611 static BlockNumber
1612 _mdnblocks(SMgrRelation reln, MdfdVec *seg)
1613 {
1614         off_t           len;
1615
1616         len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
1617         if (len < 0)
1618                 ereport(ERROR,
1619                                 (errcode_for_file_access(),
1620                 errmsg("could not seek to end of segment %u of relation %u/%u/%u: %m",
1621                            seg->mdfd_segno,
1622                            reln->smgr_rnode.spcNode,
1623                            reln->smgr_rnode.dbNode,
1624                            reln->smgr_rnode.relNode)));
1625         /* note that this calculation will ignore any partial block at EOF */
1626         return (BlockNumber) (len / BLCKSZ);
1627 }