1 /*-------------------------------------------------------------------------
4 * public interface routines to storage manager switch.
6 * All file system operations in POSTGRES dispatch through these routines.
9 * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
10 * Portions Copyright (c) 1994, Regents of the University of California
14 * src/backend/storage/smgr/smgr.c
16 *-------------------------------------------------------------------------
20 #include "access/xlogutils.h"
21 #include "catalog/catalog.h"
22 #include "commands/tablespace.h"
23 #include "storage/bufmgr.h"
24 #include "storage/ipc.h"
25 #include "storage/smgr.h"
26 #include "utils/hsearch.h"
27 #include "utils/inval.h"
31 * This struct of function pointers defines the API between smgr.c and
32 * any individual storage manager module. Note that smgr subfunctions are
33 * generally expected to report problems via elog(ERROR). An exception is
34 * that smgr_unlink should use elog(WARNING), rather than erroring out,
35 * because we normally unlink relations during post-commit/abort cleanup,
36 * and so it's too late to raise an error. Also, various conditions that
37 * would normally be errors should be allowed during bootstrap and/or WAL
38 * recovery --- see comments in md.c for details.
42 void (*smgr_init) (void); /* may be NULL */
43 void (*smgr_shutdown) (void); /* may be NULL */
/*
 * Per-relation entry points.  Unlike the entries marked "may be NULL", the
 * wrappers in this file call these through smgrsw[...] without a NULL test.
 * NOTE(review): the parameter lists of smgr_create, smgr_unlink and
 * smgr_truncate continue on lines that are missing from this listing; their
 * call sites pass a third argument (isRedo, isRedo and nblocks respectively).
 */
44 void (*smgr_close) (SMgrRelation reln, ForkNumber forknum);
45 void (*smgr_create) (SMgrRelation reln, ForkNumber forknum,
47 bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum);
48 void (*smgr_unlink) (RelFileNodeBackend rnode, ForkNumber forknum,
50 void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
51 BlockNumber blocknum, char *buffer, bool skipFsync);
52 void (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
53 BlockNumber blocknum);
54 void (*smgr_read) (SMgrRelation reln, ForkNumber forknum,
55 BlockNumber blocknum, char *buffer);
56 void (*smgr_write) (SMgrRelation reln, ForkNumber forknum,
57 BlockNumber blocknum, char *buffer, bool skipFsync);
58 BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
59 void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
61 void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
/* Checkpoint hooks; the callers in this file test each for NULL first. */
62 void (*smgr_pre_ckpt) (void); /* may be NULL */
63 void (*smgr_sync) (void); /* may be NULL */
64 void (*smgr_post_ckpt) (void); /* may be NULL */
/*
 * The switch table.  The one visible initializer lists the md* routines
 * positionally, in the same order as the f_smgr fields, with NULL in the
 * smgr_shutdown slot.  NSmgr is the number of entries in the table and
 * bounds the loops that visit every manager.
 * NOTE(review): the closing braces of the initializer are on lines missing
 * from this listing.
 */
68 static const f_smgr smgrsw[] = {
70 {mdinit, NULL, mdclose, mdcreate, mdexists, mdunlink, mdextend,
71 mdprefetch, mdread, mdwrite, mdnblocks, mdtruncate, mdimmedsync,
72 mdpreckpt, mdsync, mdpostckpt
76 static const int NSmgr = lengthof(smgrsw);
80 * Each backend has a hashtable that stores all extant SMgrRelation objects.
/*
 * The table pointer starts out NULL and is created by smgropen() on its
 * first call; smgrcloseall() and smgrclosenode() test for NULL and treat an
 * absent table as "nothing to do".
 */
82 static HTAB *SMgrRelationHash = NULL;
84 /* local function prototypes */
85 static void smgrshutdown(int code, Datum arg);
89 * smgrinit(), smgrshutdown() -- Initialize or shut down storage managers.
92 * Note: smgrinit is called during backend startup (normal or standalone
93 * case), *not* during postmaster start. Therefore, any resources created
94 * here or destroyed in smgrshutdown are backend-local.
/*
 * NOTE(review): the function header and the declaration of i are on lines
 * missing from this listing; the preceding comment attributes this code to
 * smgrinit().  The loop runs the smgr_init routine of every table entry
 * that supplies one, after which smgrshutdown is registered through
 * on_proc_exit with a zero argument.
 */
101 for (i = 0; i < NSmgr; i++)
103 if (smgrsw[i].smgr_init)
104 (*(smgrsw[i].smgr_init)) ();
107 /* register the shutdown proc */
108 on_proc_exit(smgrshutdown, 0);
112 * on_proc_exit hook for smgr cleanup during backend shutdown
/*
 * Runs the smgr_shutdown routine of every table entry that supplies one.
 * Neither code nor arg is referenced in the visible lines.
 */
115 smgrshutdown(int code, Datum arg)
119 for (i = 0; i < NSmgr; i++)
121 if (smgrsw[i].smgr_shutdown)
122 (*(smgrsw[i].smgr_shutdown)) ();
127 * smgropen() -- Return an SMgrRelation object, creating it if need be.
129 * This does not attempt to actually open the object.
/*
 * Finds or creates the SMgrRelationHash entry keyed by a RelFileNodeBackend
 * built from rnode and backend.  The hash table itself is created lazily on
 * the first call.
 * NOTE(review): several lines are missing from this listing, including the
 * remaining local declarations, the rest of the hash_search() argument
 * list, the test that guards the initialization below, and the return
 * statement.  HASH_FUNCTION is passed to hash_create(), but the assignment
 * of the hash function into ctl is not visible here.
 */
132 smgropen(RelFileNode rnode, BackendId backend)
134 RelFileNodeBackend brnode;
138 if (SMgrRelationHash == NULL)
140 /* First time through: initialize the hash table */
143 MemSet(&ctl, 0, sizeof(ctl));
144 ctl.keysize = sizeof(RelFileNodeBackend);
145 ctl.entrysize = sizeof(SMgrRelationData);
147 SMgrRelationHash = hash_create("smgr relation table", 400,
148 &ctl, HASH_ELEM | HASH_FUNCTION);
151 /* Look up or create an entry */
153 brnode.backend = backend;
154 reln = (SMgrRelation) hash_search(SMgrRelationHash,
158 /* Initialize it if not present before */
/*
 * A fresh entry gets no owner, invalid cached block numbers, manager index 0
 * (the only entry in smgrsw[]), and a NULL md_fd slot for every fork.
 */
163 /* hash_search already filled in the lookup key */
164 reln->smgr_owner = NULL;
165 reln->smgr_targblock = InvalidBlockNumber;
166 reln->smgr_fsm_nblocks = InvalidBlockNumber;
167 reln->smgr_vm_nblocks = InvalidBlockNumber;
168 reln->smgr_which = 0; /* we only have md.c at present */
170 /* mark it not open */
171 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
172 reln->md_fd[forknum] = NULL;
179 * smgrsetowner() -- Establish a long-lived reference to an SMgrRelation object
181 * There can be only one owner at a time; this is sufficient since currently
182 * the only such owners exist in the relcache.
/*
 * owner is the address of the caller's SMgrRelation pointer.  If reln
 * already records an owner, that owner's pointer is set to NULL before
 * reln->smgr_owner is redirected to the new one.
 * NOTE(review): no store into *owner is visible in this listing; confirm
 * against the full file whether the caller's pointer is also set here.
 */
185 smgrsetowner(SMgrRelation *owner, SMgrRelation reln)
188 * First, unhook any old owner. (Normally there shouldn't be any, but it
189 * seems possible that this can happen during swap_relation_files()
190 * depending on the order of processing. It's ok to close the old
191 * relcache entry early in that case.)
193 if (reln->smgr_owner)
194 *(reln->smgr_owner) = NULL;
196 /* Now establish the ownership relationship. */
197 reln->smgr_owner = owner;
202 * smgrexists() -- Does the underlying file for a fork exist?
/*
 * Returns whatever the smgr_exists routine of the manager selected by
 * reln->smgr_which reports for the given fork.
 */
205 smgrexists(SMgrRelation reln, ForkNumber forknum)
207 return (*(smgrsw[reln->smgr_which].smgr_exists)) (reln, forknum);
211 * smgrclose() -- Close and delete an SMgrRelation object.
/*
 * Calls smgr_close for every fork number from 0 through MAX_FORKNUM, then
 * removes reln's entry from SMgrRelationHash, raising an ERROR if the entry
 * is not found.  The owner pointer is copied out before the removal,
 * presumably because reln must not be dereferenced once its entry is gone
 * -- confirm.
 * NOTE(review): the statement that actually unhooks the owner is on a line
 * missing from this listing.
 */
214 smgrclose(SMgrRelation reln)
219 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
220 (*(smgrsw[reln->smgr_which].smgr_close)) (reln, forknum);
222 owner = reln->smgr_owner;
224 if (hash_search(SMgrRelationHash,
225 (void *) &(reln->smgr_rnode),
226 HASH_REMOVE, NULL) == NULL)
227 elog(ERROR, "SMgrRelation hashtable corrupted");
230 * Unhook the owner pointer, if any. We do this last since in the remote
231 * possibility of failure above, the SMgrRelation object will still exist.
238 * smgrcloseall() -- Close all existing SMgrRelation objects.
/*
 * NOTE(review): the function header, the statement under the NULL test and
 * the loop body are on lines missing from this listing; the preceding
 * comment attributes this code to smgrcloseall().  What is visible is a
 * sequential scan over every entry of SMgrRelationHash.
 */
243 HASH_SEQ_STATUS status;
246 /* Nothing to do if hashtable not set up */
247 if (SMgrRelationHash == NULL)
250 hash_seq_init(&status, SMgrRelationHash);
252 while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL)
257 * smgrclosenode() -- Close SMgrRelation object for given RelFileNode, if one exists.
260 * This has the same effects as smgrclose(smgropen(rnode)), but it avoids
261 * uselessly creating a hashtable entry only to drop it again when no
262 * such entry exists already.
/*
 * Looks rnode up in SMgrRelationHash when the table exists.
 * NOTE(review): the statement under the NULL test, the rest of the
 * hash_search() argument list and the handling of its result are on lines
 * missing from this listing.
 */
265 smgrclosenode(RelFileNodeBackend rnode)
269 /* Nothing to do if hashtable not set up */
270 if (SMgrRelationHash == NULL)
273 reln = (SMgrRelation) hash_search(SMgrRelationHash,
281 * smgrcreate() -- Create a new relation.
283 * Given an already-created (but presumably unused) SMgrRelation,
284 * cause the underlying disk file or other storage for the fork
287 * If isRedo is true, it is okay for the underlying file to exist
288 * already because we are in a WAL replay sequence.
/*
 * Two steps are visible after the redo shortcut: TablespaceCreateDbspace()
 * is called with the relation's spcNode and dbNode, and then the selected
 * manager's smgr_create routine is invoked with (reln, forknum, isRedo).
 * NOTE(review): the statement under the isRedo test and the final argument
 * of TablespaceCreateDbspace() are on lines missing from this listing.
 */
291 smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
294 * Exit quickly in WAL replay mode if we've already opened the file. If
295 * it's open, it surely must exist.
297 if (isRedo && reln->md_fd[forknum] != NULL)
301 * We may be using the target table space for the first time in this
302 * database, so create a per-database subdirectory if needed.
304 * XXX this is a fairly ugly violation of module layering, but this seems
305 * to be the best place to put the check. Maybe TablespaceCreateDbspace
306 * should be here and not in commands/tablespace.c? But that would imply
307 * importing a lot of stuff that smgr.c oughtn't know, either.
309 TablespaceCreateDbspace(reln->smgr_rnode.node.spcNode,
310 reln->smgr_rnode.node.dbNode,
313 (*(smgrsw[reln->smgr_which].smgr_create)) (reln, forknum, isRedo);
317 * smgrdounlink() -- Immediately unlink a relation.
319 * The specified fork of the relation is removed from the store. This
320 * should not be used during transactional operations, since it can't be undone.
323 * If isRedo is true, it is okay for the underlying file to be gone already.
/*
 * Visible order of operations: close the fork, drop its buffers, send the
 * smgr invalidation message, then call the manager's smgr_unlink routine.
 * rnode and which are copied out of reln at the top, and every call after
 * the smgr_close uses those copies rather than reln.
 * The 0 passed to DropRelFileNodeBuffers() sits where smgrtruncate() passes
 * nblocks; presumably it is the first block to drop, so 0 drops them all
 * -- confirm.
 */
327 smgrdounlink(SMgrRelation reln, ForkNumber forknum, bool isRedo)
329 RelFileNodeBackend rnode = reln->smgr_rnode;
330 int which = reln->smgr_which;
333 (*(smgrsw[which].smgr_close)) (reln, forknum);
336 * Get rid of any remaining buffers for the relation. bufmgr will just
337 * drop them without bothering to write the contents.
339 DropRelFileNodeBuffers(rnode, forknum, 0);
342 * It'd be nice to tell the stats collector to forget it immediately, too.
343 * But we can't because we don't know the OID (and in cases involving
344 * relfilenode swaps, it's not always clear which table OID to forget,
349 * Send a shared-inval message to force other backends to close any
350 * dangling smgr references they may have for this rel. We should do this
351 * before starting the actual unlinking, in case we fail partway through
352 * that step. Note that the sinval message will eventually come back to
353 * this backend, too, and thereby provide a backstop that we closed our
356 CacheInvalidateSmgr(rnode);
359 * Delete the physical file(s).
361 * Note: smgr_unlink must treat deletion failure as a WARNING, not an
362 * ERROR, because we've already decided to commit or abort the current
365 (*(smgrsw[which].smgr_unlink)) (rnode, forknum, isRedo);
369 * smgrextend() -- Add a new block to a file.
371 * The semantics are nearly the same as smgrwrite(): write at the
372 * specified position. However, this is to be used for the case of
373 * extending a relation (i.e., blocknum is at or beyond the current
374 * EOF). Note that we assume writing a block beyond current EOF
375 * causes intervening file space to become filled with zeroes.
/*
 * Forwards to the smgr_extend routine of the manager selected by
 * reln->smgr_which.
 * NOTE(review): the tail of the call's argument list is on a line missing
 * from this listing.
 */
378 smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
379 char *buffer, bool skipFsync)
381 (*(smgrsw[reln->smgr_which].smgr_extend)) (reln, forknum, blocknum,
386 * smgrprefetch() -- Initiate asynchronous read of the specified block of a relation.
/*
 * Forwards (reln, forknum, blocknum) unchanged to the smgr_prefetch routine
 * of the manager selected by reln->smgr_which.
 */
389 smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
391 (*(smgrsw[reln->smgr_which].smgr_prefetch)) (reln, forknum, blocknum);
395 * smgrread() -- read a particular block from a relation into the supplied buffer.
398 * This routine is called from the buffer manager in order to
399 * instantiate pages in the shared buffer cache. All storage managers
400 * return pages in the format that POSTGRES expects.
/*
 * Forwards to the smgr_read routine of the manager selected by
 * reln->smgr_which, passing buffer as the destination argument.
 * NOTE(review): the end of the parameter list is on a line missing from
 * this listing.
 */
403 smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
406 (*(smgrsw[reln->smgr_which].smgr_read)) (reln, forknum, blocknum, buffer);
410 * smgrwrite() -- Write the supplied buffer out.
412 * This is to be used only for updating already-existing blocks of a
413 * relation (ie, those before the current EOF). To extend a relation, use smgrextend().
416 * This is not a synchronous write -- the block is not necessarily
417 * on disk at return, only dumped out to the kernel. However,
418 * provisions will be made to fsync the write before the next checkpoint.
420 * skipFsync indicates that the caller will make other provisions to
421 * fsync the relation, so we needn't bother. Temporary relations also
422 * do not require fsync.
/*
 * Forwards to the smgr_write routine of the manager selected by
 * reln->smgr_which.
 * NOTE(review): the tail of the call's argument list is on a line missing
 * from this listing.
 */
425 smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
426 char *buffer, bool skipFsync)
428 (*(smgrsw[reln->smgr_which].smgr_write)) (reln, forknum, blocknum,
433 * smgrnblocks() -- Calculate the number of blocks in the supplied relation.
/*
 * Returns the BlockNumber produced by the smgr_nblocks routine of the
 * manager selected by reln->smgr_which for the given fork.
 */
437 smgrnblocks(SMgrRelation reln, ForkNumber forknum)
439 return (*(smgrsw[reln->smgr_which].smgr_nblocks)) (reln, forknum);
443 * smgrtruncate() -- Truncate supplied relation to the specified number of blocks.
446 * The truncation is done immediately, so this can't be rolled back.
/*
 * Visible order of operations: drop buffers via DropRelFileNodeBuffers()
 * with nblocks as its third argument, send the smgr invalidation message,
 * then call the manager's smgr_truncate routine with (reln, forknum,
 * nblocks).
 */
449 smgrtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
452 * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
453 * just drop them without bothering to write the contents.
455 DropRelFileNodeBuffers(reln->smgr_rnode, forknum, nblocks);
458 * Send a shared-inval message to force other backends to close any smgr
459 * references they may have for this rel. This is useful because they
460 * might have open file pointers to segments that got removed, and/or
461 * smgr_targblock variables pointing past the new rel end. (The inval
462 * message will come back to our backend, too, causing a
463 * probably-unnecessary local smgr flush. But we don't expect that this
464 * is a performance-critical path.) As in the unlink code, we want to be
465 * sure the message is sent before we start changing things on-disk.
467 CacheInvalidateSmgr(reln->smgr_rnode);
472 (*(smgrsw[reln->smgr_which].smgr_truncate)) (reln, forknum, nblocks);
476 * smgrimmedsync() -- Force the specified relation to stable storage.
478 * Synchronously force all previous writes to the specified relation down to disk.
481 * This is useful for building completely new relations (eg, new
482 * indexes). Instead of incrementally WAL-logging the index build
483 * steps, we can just write completed index pages to disk with smgrwrite
484 * or smgrextend, and then fsync the completed index file before
485 * committing the transaction. (This is sufficient for purposes of
486 * crash recovery, since it effectively duplicates forcing a checkpoint
487 * for the completed index. But it is *not* sufficient if one wishes
488 * to use the WAL log for PITR or replication purposes: in that case
489 * we have to make WAL entries as well.)
491 * The preceding writes should specify skipFsync = true to avoid
492 * duplicative fsyncs.
494 * Note that you need to do FlushRelationBuffers() first if there is
495 * any possibility that there are dirty buffers for the relation;
496 * otherwise the sync is not very meaningful.
/*
 * Forwards (reln, forknum) unchanged to the smgr_immedsync routine of the
 * manager selected by reln->smgr_which.
 */
499 smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
501 (*(smgrsw[reln->smgr_which].smgr_immedsync)) (reln, forknum);
506 * smgrpreckpt() -- Prepare for checkpoint.
/*
 * NOTE(review): the function header and the declaration of i are on lines
 * missing from this listing; the preceding comment attributes this loop to
 * smgrpreckpt().  It calls the smgr_pre_ckpt routine of every table entry
 * that supplies one.
 */
513 for (i = 0; i < NSmgr; i++)
515 if (smgrsw[i].smgr_pre_ckpt)
516 (*(smgrsw[i].smgr_pre_ckpt)) ();
521 * smgrsync() -- Sync files to disk during checkpoint.
/*
 * NOTE(review): the function header and the declaration of i are on lines
 * missing from this listing; the preceding comment attributes this loop to
 * smgrsync().  It calls the smgr_sync routine of every table entry that
 * supplies one.
 */
528 for (i = 0; i < NSmgr; i++)
530 if (smgrsw[i].smgr_sync)
531 (*(smgrsw[i].smgr_sync)) ();
536 * smgrpostckpt() -- Post-checkpoint cleanup.
/*
 * NOTE(review): the function header and the declaration of i are on lines
 * missing from this listing; the preceding comment attributes this loop to
 * smgrpostckpt().  It calls the smgr_post_ckpt routine of every table entry
 * that supplies one.
 */
543 for (i = 0; i < NSmgr; i++)
545 if (smgrsw[i].smgr_post_ckpt)
546 (*(smgrsw[i].smgr_post_ckpt)) ();