1 /*-------------------------------------------------------------------------
4 * public interface routines to storage manager switch.
6 * All file system operations in POSTGRES dispatch through these routines.
9 * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
10 * Portions Copyright (c) 1994, Regents of the University of California
14 * $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.117 2009/06/11 14:49:02 momjian Exp $
16 *-------------------------------------------------------------------------
20 #include "access/xlogutils.h"
21 #include "catalog/catalog.h"
22 #include "commands/tablespace.h"
23 #include "storage/bufmgr.h"
24 #include "storage/ipc.h"
25 #include "storage/smgr.h"
26 #include "utils/hsearch.h"
30 * This struct of function pointers defines the API between smgr.c and
31 * any individual storage manager module. Note that smgr subfunctions are
32 * generally expected to report problems via elog(ERROR). An exception is
33 * that smgr_unlink should use elog(WARNING), rather than erroring out,
34 * because we normally unlink relations during post-commit/abort cleanup,
35 * and so it's too late to raise an error. Also, various conditions that
36 * would normally be errors should be allowed during bootstrap and/or WAL
37 * recovery --- see comments in md.c for details.
41 void (*smgr_init) (void); /* may be NULL */
42 void (*smgr_shutdown) (void); /* may be NULL */
/*
 * The per-relation entries below (smgr_close through smgr_immedsync) are
 * invoked by the smgr* wrappers in this file without a NULL check.
 */
43 void (*smgr_close) (SMgrRelation reln, ForkNumber forknum);
44 void (*smgr_create) (SMgrRelation reln, ForkNumber forknum,
46 bool (*smgr_exists) (SMgrRelation reln, ForkNumber forknum);
47 void (*smgr_unlink) (RelFileNode rnode, ForkNumber forknum,
49 void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum,
50 BlockNumber blocknum, char *buffer, bool isTemp);
51 void (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum,
52 BlockNumber blocknum);
53 void (*smgr_read) (SMgrRelation reln, ForkNumber forknum,
54 BlockNumber blocknum, char *buffer);
55 void (*smgr_write) (SMgrRelation reln, ForkNumber forknum,
56 BlockNumber blocknum, char *buffer, bool isTemp);
57 BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum);
58 void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum,
59 BlockNumber nblocks, bool isTemp);
60 void (*smgr_immedsync) (SMgrRelation reln, ForkNumber forknum);
/*
 * Checkpoint hooks.  The smgrpreckpt(), smgrsync() and smgrpostckpt() loops
 * visit every smgrsw[] entry and call the matching hook unless it is NULL.
 */
61 void (*smgr_pre_ckpt) (void); /* may be NULL */
62 void (*smgr_sync) (void); /* may be NULL */
63 void (*smgr_post_ckpt) (void); /* may be NULL */
67 static const f_smgr smgrsw[] = {
/*
 * The initializer lists the md* routines in f_smgr member order.  The second
 * slot (smgr_shutdown) is NULL, so smgrshutdown() has nothing to call for
 * this entry.
 */
69 {mdinit, NULL, mdclose, mdcreate, mdexists, mdunlink, mdextend,
70 mdprefetch, mdread, mdwrite, mdnblocks, mdtruncate, mdimmedsync,
71 mdpreckpt, mdsync, mdpostckpt
/* Entry count of smgrsw[]; loop bound for the routines that visit every manager. */
75 static const int NSmgr = lengthof(smgrsw);
79 * Each backend has a hashtable that stores all extant SMgrRelation objects.
81 static HTAB *SMgrRelationHash = NULL;
/*
 * NULL until smgropen() builds the table on its first call; smgrcloseall()
 * and smgrclosenode() treat a NULL table as "nothing to do".
 */
83 /* local function prototypes */
84 static void smgrshutdown(int code, Datum arg);
/* Worker behind smgrdounlink(): drops buffers, then unlinks the physical file. */
85 static void smgr_internal_unlink(RelFileNode rnode, ForkNumber forknum,
86 int which, bool isTemp, bool isRedo);
90 * smgrinit(), smgrshutdown() -- Initialize or shut down storage
93 * Note: smgrinit is called during backend startup (normal or standalone
94 * case), *not* during postmaster start. Therefore, any resources created
95 * here or destroyed in smgrshutdown are backend-local.
/* Run each storage manager's init hook; entries whose smgr_init is NULL are skipped. */
102 for (i = 0; i < NSmgr; i++)
104 if (smgrsw[i].smgr_init)
105 (*(smgrsw[i].smgr_init)) ();
108 /* register the shutdown proc */
109 on_proc_exit(smgrshutdown, 0); /* 0 is the Datum argument handed to smgrshutdown() */
113 * on_proc_exit hook for smgr cleanup during backend shutdown
116 smgrshutdown(int code, Datum arg)
/*
 * Counterpart of the init loop: call each manager's smgr_shutdown hook if it
 * is non-NULL.  Neither code nor arg is referenced in the lines visible here.
 */
120 for (i = 0; i < NSmgr; i++)
122 if (smgrsw[i].smgr_shutdown)
123 (*(smgrsw[i].smgr_shutdown)) ();
128 * smgropen() -- Return an SMgrRelation object, creating it if need be.
130 * This does not attempt to actually open the object.
133 smgropen(RelFileNode rnode)
/*
 * The hash table is created lazily on the first call.  Entries are keyed by
 * RelFileNode and each one is an SMgrRelationData.
 */
138 if (SMgrRelationHash == NULL)
140 /* First time through: initialize the hash table */
143 MemSet(&ctl, 0, sizeof(ctl));
144 ctl.keysize = sizeof(RelFileNode);
145 ctl.entrysize = sizeof(SMgrRelationData);
/* NOTE(review): 400 looks like an initial size estimate -- confirm against hash_create(). */
147 SMgrRelationHash = hash_create("smgr relation table", 400,
148 &ctl, HASH_ELEM | HASH_FUNCTION);
151 /* Look up or create an entry */
152 reln = (SMgrRelation) hash_search(SMgrRelationHash,
156 /* Initialize it if not present before */
161 /* hash_search already filled in the lookup key */
162 reln->smgr_owner = NULL;
/* smgr_which is the index into smgrsw[] used by all the dispatch wrappers. */
163 reln->smgr_which = 0; /* we only have md.c at present */
165 /* mark it not open */
/* A NULL md_fd[forknum] is what smgrcreate() later tests to see whether the fork is already open. */
166 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
167 reln->md_fd[forknum] = NULL;
174 * smgrsetowner() -- Establish a long-lived reference to an SMgrRelation object
176 * There can be only one owner at a time; this is sufficient since currently
177 * the only such owners exist in the relcache.
180 smgrsetowner(SMgrRelation *owner, SMgrRelation reln)
183 * First, unhook any old owner. (Normally there shouldn't be any, but it
184 * seems possible that this can happen during swap_relation_files()
185 * depending on the order of processing. It's ok to close the old
186 * relcache entry early in that case.)
188 if (reln->smgr_owner)
189 *(reln->smgr_owner) = NULL; /* previous owner's reference no longer points at reln */
191 /* Now establish the ownership relationship. */
/* reln records where its owner's reference lives; smgrclose() reads this back-pointer. */
192 reln->smgr_owner = owner;
197 * smgrexists() -- Does the underlying file for a fork exist?
200 smgrexists(SMgrRelation reln, ForkNumber forknum)
/* Pure dispatch: returns whatever the smgr_exists entry selected by reln->smgr_which reports. */
202 return (*(smgrsw[reln->smgr_which].smgr_exists)) (reln, forknum);
206 * smgrclose() -- Close and delete an SMgrRelation object.
209 smgrclose(SMgrRelation reln)
/* Close every fork (0 .. MAX_FORKNUM) through the storage manager before discarding the object. */
214 for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
215 (*(smgrsw[reln->smgr_which].smgr_close)) (reln, forknum);
/* Save the owner back-pointer now, before the hash entry for reln is removed below. */
217 owner = reln->smgr_owner;
/* A NULL result from hash_search(HASH_REMOVE) is reported as hashtable corruption. */
219 if (hash_search(SMgrRelationHash,
220 (void *) &(reln->smgr_rnode),
221 HASH_REMOVE, NULL) == NULL)
222 elog(ERROR, "SMgrRelation hashtable corrupted");
225 * Unhook the owner pointer, if any. We do this last since in the remote
226 * possibility of failure above, the SMgrRelation object will still exist.
233 * smgrcloseall() -- Close all existing SMgrRelation objects.
238 HASH_SEQ_STATUS status;
241 /* Nothing to do if hashtable not set up */
242 if (SMgrRelationHash == NULL)
/* Sequential scan over every entry; the loop body is not visible in this view. */
245 hash_seq_init(&status, SMgrRelationHash);
247 while ((reln = (SMgrRelation) hash_seq_search(&status)) != NULL)
252 * smgrclosenode() -- Close SMgrRelation object for given RelFileNode, if one exists.
255 * This has the same effects as smgrclose(smgropen(rnode)), but it avoids
256 * uselessly creating a hashtable entry only to drop it again when no
257 * such entry exists already.
260 smgrclosenode(RelFileNode rnode)
/*
 * The visible lines never create the hash table: when SMgrRelationHash is
 * still NULL there is nothing to do.
 */
264 /* Nothing to do if hashtable not set up */
265 if (SMgrRelationHash == NULL)
268 reln = (SMgrRelation) hash_search(SMgrRelationHash,
276 * smgrcreate() -- Create a new relation.
278 * Given an already-created (but presumably unused) SMgrRelation,
279 * cause the underlying disk file or other storage for the fork to be created.
282 * If isRedo is true, it is okay for the underlying file to exist
283 * already because we are in a WAL replay sequence.
286 smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
289 * Exit quickly in WAL replay mode if we've already opened the file. If
290 * it's open, it surely must exist.
292 if (isRedo && reln->md_fd[forknum] != NULL) /* non-NULL md_fd: fork already opened via this object */
296 * We may be using the target table space for the first time in this
297 * database, so create a per-database subdirectory if needed.
299 * XXX this is a fairly ugly violation of module layering, but this seems
300 * to be the best place to put the check. Maybe TablespaceCreateDbspace
301 * should be here and not in commands/tablespace.c? But that would imply
302 * importing a lot of stuff that smgr.c oughtn't know, either.
304 TablespaceCreateDbspace(reln->smgr_rnode.spcNode,
305 reln->smgr_rnode.dbNode,
308 (*(smgrsw[reln->smgr_which].smgr_create)) (reln, forknum, isRedo); /* dispatch; isRedo is passed through unchanged */
312 * smgrdounlink() -- Immediately unlink a relation.
314 * The specified fork of the relation is removed from the store. This
315 * should not be used during transactional operations, since it can't be undone.
318 * If isRedo is true, it is okay for the underlying file to be gone already.
322 smgrdounlink(SMgrRelation reln, ForkNumber forknum, bool isTemp, bool isRedo)
/*
 * rnode and which are copied out of reln up front, so the unlink step works
 * from these local copies rather than from reln after the fork is closed.
 */
324 RelFileNode rnode = reln->smgr_rnode;
325 int which = reln->smgr_which;
/* Close the fork at the storage-manager level before removing it. */
328 (*(smgrsw[which].smgr_close)) (reln, forknum);
330 smgr_internal_unlink(rnode, forknum, which, isTemp, isRedo);
334 * Shared subroutine that actually does the unlink ...
337 smgr_internal_unlink(RelFileNode rnode, ForkNumber forknum,
338 int which, bool isTemp, bool isRedo)
341 * Get rid of any remaining buffers for the relation. bufmgr will just
342 * drop them without bothering to write the contents.
344 DropRelFileNodeBuffers(rnode, forknum, isTemp, 0); /* NOTE(review): 0 presumably means "from the first block" -- confirm */
347 * It'd be nice to tell the stats collector to forget it immediately, too.
348 * But we can't because we don't know the OID (and in cases involving
349 * relfilenode swaps, it's not always clear which table OID to forget,
354 * And delete the physical files.
356 * Note: smgr_unlink must treat deletion failure as a WARNING, not an
357 * ERROR, because we've already decided to commit or abort the current
360 (*(smgrsw[which].smgr_unlink)) (rnode, forknum, isRedo); /* which indexes smgrsw[]; isTemp is not forwarded */
364 * smgrextend() -- Add a new block to a file.
366 * The semantics are nearly the same as smgrwrite(): write at the
367 * specified position. However, this is to be used for the case of
368 * extending a relation (i.e., blocknum is at or beyond the current
369 * EOF). Note that we assume writing a block beyond current EOF
370 * causes intervening file space to become filled with zeroes.
373 smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
374 char *buffer, bool isTemp)
/*
 * Pure dispatch to the smgr_extend entry selected by reln->smgr_which; the
 * trailing arguments continue on a line not shown in this view.
 */
376 (*(smgrsw[reln->smgr_which].smgr_extend)) (reln, forknum, blocknum,
381 * smgrprefetch() -- Initiate asynchronous read of the specified block of a relation.
384 smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum)
/* Pure dispatch to the smgr_prefetch entry; no result is returned to the caller. */
386 (*(smgrsw[reln->smgr_which].smgr_prefetch)) (reln, forknum, blocknum);
390 * smgrread() -- read a particular block from a relation into the supplied buffer.
393 * This routine is called from the buffer manager in order to
394 * instantiate pages in the shared buffer cache. All storage managers
395 * return pages in the format that POSTGRES expects.
398 smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
401 (*(smgrsw[reln->smgr_which].smgr_read)) (reln, forknum, blocknum, buffer); /* pure dispatch via smgrsw[] */
405 * smgrwrite() -- Write the supplied buffer out.
407 * This is to be used only for updating already-existing blocks of a
408 * relation (ie, those before the current EOF). To extend a relation, use smgrextend().
411 * This is not a synchronous write -- the block is not necessarily
412 * on disk at return, only dumped out to the kernel. However,
413 * provisions will be made to fsync the write before the next checkpoint.
415 * isTemp indicates that the relation is a temp table (ie, is managed
416 * by the local-buffer manager). In this case no provisions need be
417 * made to fsync the write before checkpointing.
420 smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
421 char *buffer, bool isTemp)
/*
 * Pure dispatch to the smgr_write entry selected by reln->smgr_which; the
 * trailing arguments continue on a line not shown in this view.
 */
423 (*(smgrsw[reln->smgr_which].smgr_write)) (reln, forknum, blocknum,
428 * smgrnblocks() -- Calculate the number of blocks in the
432 smgrnblocks(SMgrRelation reln, ForkNumber forknum)
/* Pure dispatch: returns the BlockNumber produced by the storage manager's smgr_nblocks entry. */
434 return (*(smgrsw[reln->smgr_which].smgr_nblocks)) (reln, forknum);
438 * smgrtruncate() -- Truncate supplied relation to the specified number of blocks.
442 smgrtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks,
446 * Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
447 * just drop them without bothering to write the contents.
449 DropRelFileNodeBuffers(reln->smgr_rnode, forknum, isTemp, nblocks); /* NOTE(review): nblocks presumably marks the first block whose buffers are dropped -- confirm */
451 /* Do the truncation */
/* Buffers were discarded above; only now is the storage manager asked to shorten the file. */
452 (*(smgrsw[reln->smgr_which].smgr_truncate)) (reln, forknum, nblocks,
457 * smgrimmedsync() -- Force the specified relation to stable storage.
459 * Synchronously force all previous writes to the specified relation
462 * This is useful for building completely new relations (eg, new
463 * indexes). Instead of incrementally WAL-logging the index build
464 * steps, we can just write completed index pages to disk with smgrwrite
465 * or smgrextend, and then fsync the completed index file before
466 * committing the transaction. (This is sufficient for purposes of
467 * crash recovery, since it effectively duplicates forcing a checkpoint
468 * for the completed index. But it is *not* sufficient if one wishes
469 * to use the WAL log for PITR or replication purposes: in that case
470 * we have to make WAL entries as well.)
472 * The preceding writes should specify isTemp = true to avoid
473 * duplicative fsyncs.
475 * Note that you need to do FlushRelationBuffers() first if there is
476 * any possibility that there are dirty buffers for the relation;
477 * otherwise the sync is not very meaningful.
480 smgrimmedsync(SMgrRelation reln, ForkNumber forknum)
/* Pure dispatch to the smgr_immedsync entry for the given fork. */
482 (*(smgrsw[reln->smgr_which].smgr_immedsync)) (reln, forknum);
487 * smgrpreckpt() -- Prepare for checkpoint.
/* Call every storage manager's smgr_pre_ckpt hook; entries that leave it NULL are skipped. */
494 for (i = 0; i < NSmgr; i++)
496 if (smgrsw[i].smgr_pre_ckpt)
497 (*(smgrsw[i].smgr_pre_ckpt)) ();
502 * smgrsync() -- Sync files to disk during checkpoint.
/* Call every storage manager's smgr_sync hook; entries that leave it NULL are skipped. */
509 for (i = 0; i < NSmgr; i++)
511 if (smgrsw[i].smgr_sync)
512 (*(smgrsw[i].smgr_sync)) ();
517 * smgrpostckpt() -- Post-checkpoint cleanup.
/* Call every storage manager's smgr_post_ckpt hook; entries that leave it NULL are skipped. */
524 for (i = 0; i < NSmgr; i++)
526 if (smgrsw[i].smgr_post_ckpt)
527 (*(smgrsw[i].smgr_post_ckpt)) ();