*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.123 2006/11/20 01:07:56 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.124 2007/01/03 18:11:01 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "miscadmin.h"
#include "postmaster/bgwriter.h"
#include "storage/fd.h"
+#include "storage/bufmgr.h"
#include "storage/smgr.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
static HTAB *pendingOpsTable = NULL;
+typedef enum /* behavior for mdopen & _mdfd_getseg */
+{
+ EXTENSION_FAIL, /* ereport if segment not present */
+ EXTENSION_RETURN_NULL, /* return NULL if not present */
+ EXTENSION_CREATE /* create new segments as needed */
+} ExtensionBehavior;
+
/* local routines */
-static MdfdVec *mdopen(SMgrRelation reln, bool allowNotFound);
-static bool register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
+static MdfdVec *mdopen(SMgrRelation reln, ExtensionBehavior behavior);
+static void register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
static MdfdVec *_fdvec_alloc(void);
#ifndef LET_OS_MANAGE_FILESIZE
int oflags);
#endif
static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno,
- bool allowNotFound);
-static BlockNumber _mdnblocks(File file, Size blcksz);
+ bool isTemp, ExtensionBehavior behavior);
+static BlockNumber _mdnblocks(SMgrRelation reln, MdfdVec *seg);
/*
* mdinit() -- Initialize private state for magnetic disk storage manager.
*/
-bool
+void
mdinit(void)
{
MdCxt = AllocSetContextCreate(TopMemoryContext,
&hash_ctl,
HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
}
-
- return true;
}
/*
*
* If isRedo is true, it's okay for the relation to exist already.
*/
-bool
+void
mdcreate(SMgrRelation reln, bool isRedo)
{
char *path;
File fd;
if (isRedo && reln->md_fd != NULL)
- return true; /* created and opened already... */
+ return; /* created and opened already... */
Assert(reln->md_fd == NULL);
if (fd < 0)
{
pfree(path);
- /* be sure to return the error reported by create, not open */
+ /* be sure to report the error reported by create, not open */
errno = save_errno;
- return false;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not create relation %u/%u/%u: %m",
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode)));
}
- errno = 0;
}
pfree(path);
#ifndef LET_OS_MANAGE_FILESIZE
reln->md_fd->mdfd_chain = NULL;
#endif
-
- return true;
}
/*
* there won't be an SMgrRelation hashtable entry anymore.
*
* If isRedo is true, it's okay for the relation to be already gone.
+ * Also, any failure should be reported as WARNING not ERROR, because
+ * we are usually not in a transaction anymore when this is called.
*/
-bool
+void
mdunlink(RelFileNode rnode, bool isRedo)
{
- bool status = true;
- int save_errno = 0;
char *path;
path = relpath(rnode);
if (unlink(path) < 0)
{
if (!isRedo || errno != ENOENT)
- {
- status = false;
- save_errno = errno;
- }
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not remove relation %u/%u/%u: %m",
+ rnode.spcNode,
+ rnode.dbNode,
+ rnode.relNode)));
}
#ifndef LET_OS_MANAGE_FILESIZE
/* Delete the additional segments, if any */
- if (status)
+ else
{
char *segpath = (char *) palloc(strlen(path) + 12);
BlockNumber segno;
{
/* ENOENT is expected after the last segment... */
if (errno != ENOENT)
- {
- status = false;
- save_errno = errno;
- }
+ ereport(WARNING,
+ (errcode_for_file_access(),
+ errmsg("could not remove segment %u of relation %u/%u/%u: %m",
+ segno,
+ rnode.spcNode,
+ rnode.dbNode,
+ rnode.relNode)));
break;
}
}
#endif
pfree(path);
-
- errno = save_errno;
- return status;
}
/*
* mdextend() -- Add a block to the specified relation.
*
- * The semantics are basically the same as mdwrite(): write at the
- * specified position. However, we are expecting to extend the
- * relation (ie, blocknum is >= the current EOF), and so in case of
- * failure we clean up by truncating.
- *
- * This routine returns true or false, with errno set as appropriate.
+ * The semantics are nearly the same as mdwrite(): write at the
+ * specified position. However, this is to be used for the case of
+ * extending a relation (i.e., blocknum is at or beyond the current
+ * EOF). Note that we assume writing a block beyond current EOF
+ * causes intervening file space to become filled with zeroes.
*/
-bool
+void
mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
{
long seekpos;
int nbytes;
MdfdVec *v;
- v = _mdfd_getseg(reln, blocknum, false);
+ /* This assert is too expensive to have on normally ... */
+#ifdef CHECK_WRITE_VS_EXTEND
+ Assert(blocknum >= mdnblocks(reln));
+#endif
+
+ /*
+ * If a relation manages to grow to 2^32-1 blocks, refuse to extend it
+ * any more --- we mustn't create a block whose number
+ * actually is InvalidBlockNumber.
+ */
+ if (blocknum == InvalidBlockNumber)
+ ereport(ERROR,
+ (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+ errmsg("cannot extend relation %u/%u/%u beyond %u blocks",
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode,
+ InvalidBlockNumber)));
+
+ v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_CREATE);
#ifndef LET_OS_MANAGE_FILESIZE
seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
#endif
/*
- * Note: because caller obtained blocknum by calling _mdnblocks, which did
- * a seek(SEEK_END), this seek is often redundant and will be optimized
- * away by fd.c. It's not redundant, however, if there is a partial page
- * at the end of the file. In that case we want to try to overwrite the
- * partial page with a full page. It's also not redundant if bufmgr.c had
- * to dump another buffer of the same file to make room for the new page's
- * buffer.
+ * Note: because caller usually obtained blocknum by calling mdnblocks,
+ * which did a seek(SEEK_END), this seek is often redundant and will be
+ * optimized away by fd.c. It's not redundant, however, if there is a
+ * partial page at the end of the file. In that case we want to try to
+ * overwrite the partial page with a full page. It's also not redundant
+ * if bufmgr.c had to dump another buffer of the same file to make room
+ * for the new page's buffer.
*/
if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
- return false;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not seek to block %u of relation %u/%u/%u: %m",
+ blocknum,
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode)));
if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
{
- if (nbytes > 0)
- {
- int save_errno = errno;
-
- /* Remove the partially-written page */
- FileTruncate(v->mdfd_vfd, seekpos);
- FileSeek(v->mdfd_vfd, seekpos, SEEK_SET);
- errno = save_errno;
- }
- return false;
+ if (nbytes < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not extend relation %u/%u/%u: %m",
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode),
+ errhint("Check free disk space.")));
+ /* short write: complain appropriately */
+ ereport(ERROR,
+ (errcode(ERRCODE_DISK_FULL),
+ errmsg("could not extend relation %u/%u/%u: wrote only %d of %d bytes at block %u",
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode,
+ nbytes, BLCKSZ, blocknum),
+ errhint("Check free disk space.")));
}
if (!isTemp)
- {
- if (!register_dirty_segment(reln, v))
- return false;
- }
+ register_dirty_segment(reln, v);
#ifndef LET_OS_MANAGE_FILESIZE
- Assert(_mdnblocks(v->mdfd_vfd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
+ Assert(_mdnblocks(reln, v) <= ((BlockNumber) RELSEG_SIZE));
#endif
-
- return true;
}
/*
- * mdopen() -- Open the specified relation. ereport's on failure.
- * (Optionally, can return NULL instead of ereport for ENOENT.)
+ * mdopen() -- Open the specified relation.
*
* Note we only open the first segment, when there are multiple segments.
+ *
+ * If first segment is not present, either ereport or return NULL according
+ * to "behavior". We treat EXTENSION_CREATE the same as EXTENSION_FAIL;
+ * EXTENSION_CREATE means it's OK to extend an existing relation, not to
+ * invent one out of whole cloth.
*/
static MdfdVec *
-mdopen(SMgrRelation reln, bool allowNotFound)
+mdopen(SMgrRelation reln, ExtensionBehavior behavior)
{
MdfdVec *mdfd;
char *path;
if (fd < 0)
{
pfree(path);
- if (allowNotFound && errno == ENOENT)
+ if (behavior == EXTENSION_RETURN_NULL && errno == ENOENT)
return NULL;
ereport(ERROR,
(errcode_for_file_access(),
mdfd->mdfd_segno = 0;
#ifndef LET_OS_MANAGE_FILESIZE
mdfd->mdfd_chain = NULL;
- Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
+ Assert(_mdnblocks(reln, mdfd) <= ((BlockNumber) RELSEG_SIZE));
#endif
return mdfd;
/*
* mdclose() -- Close the specified relation, if it isn't closed already.
- *
- * Returns true or false with errno set as appropriate.
*/
-bool
+void
mdclose(SMgrRelation reln)
{
MdfdVec *v = reln->md_fd;
/* No work if already closed */
if (v == NULL)
- return true;
+ return;
reln->md_fd = NULL; /* prevent dangling pointer after error */
FileClose(v->mdfd_vfd);
pfree(v);
#endif
-
- return true;
}
/*
* mdread() -- Read the specified block from a relation.
*/
-bool
+void
mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
{
- bool status;
long seekpos;
int nbytes;
MdfdVec *v;
- v = _mdfd_getseg(reln, blocknum, false);
+ v = _mdfd_getseg(reln, blocknum, false, EXTENSION_FAIL);
#ifndef LET_OS_MANAGE_FILESIZE
seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
#endif
if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
- return false;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not seek to block %u of relation %u/%u/%u: %m",
+ blocknum,
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode)));
- status = true;
if ((nbytes = FileRead(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
{
+ if (nbytes < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not read block %u of relation %u/%u/%u: %m",
+ blocknum,
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode)));
/*
- * If we are at or past EOF, return zeroes without complaining. Also
- * substitute zeroes if we found a partial block at EOF.
- *
- * XXX this is really ugly, bad design. However the current
- * implementation of hash indexes requires it, because hash index
- * pages are initialized out-of-order.
+ * Short read: we are at or past EOF, or we read a partial block at
+ * EOF. Normally this is an error; upper levels should never try to
+ * read a nonexistent block. However, if zero_damaged_pages is ON
+ * or we are InRecovery, we should instead return zeroes without
+ * complaining. This allows, for example, the case of trying to
+ * update a block that was later truncated away.
*/
- if (nbytes == 0 ||
- (nbytes > 0 && mdnblocks(reln) == blocknum))
+ if (zero_damaged_pages || InRecovery)
MemSet(buffer, 0, BLCKSZ);
else
- status = false;
+ ereport(ERROR,
+ (errcode(ERRCODE_DATA_CORRUPTED),
+ errmsg("could not read block %u of relation %u/%u/%u: read only %d of %d bytes",
+ blocknum,
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode,
+ nbytes, BLCKSZ)));
}
-
- return status;
}
/*
* mdwrite() -- Write the supplied block at the appropriate location.
+ *
+ * This is to be used only for updating already-existing blocks of a
+ * relation (ie, those before the current EOF). To extend a relation,
+ * use mdextend().
*/
-bool
+void
mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
{
long seekpos;
+ int nbytes;
MdfdVec *v;
- v = _mdfd_getseg(reln, blocknum, false);
+ /* This assert is too expensive to have on normally ... */
+#ifdef CHECK_WRITE_VS_EXTEND
+ Assert(blocknum < mdnblocks(reln));
+#endif
+
+ v = _mdfd_getseg(reln, blocknum, isTemp, EXTENSION_FAIL);
#ifndef LET_OS_MANAGE_FILESIZE
seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
#endif
if (FileSeek(v->mdfd_vfd, seekpos, SEEK_SET) != seekpos)
- return false;
-
- if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
- return false;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not seek to block %u of relation %u/%u/%u: %m",
+ blocknum,
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode)));
- if (!isTemp)
+ if ((nbytes = FileWrite(v->mdfd_vfd, buffer, BLCKSZ)) != BLCKSZ)
{
- if (!register_dirty_segment(reln, v))
- return false;
+ if (nbytes < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not write block %u of relation %u/%u/%u: %m",
+ blocknum,
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode)));
+ /* short write: complain appropriately */
+ ereport(ERROR,
+ (errcode(ERRCODE_DISK_FULL),
+ errmsg("could not write block %u of relation %u/%u/%u: wrote only %d of %d bytes",
+ blocknum,
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode,
+ nbytes, BLCKSZ),
+ errhint("Check free disk space.")));
}
- return true;
+ if (!isTemp)
+ register_dirty_segment(reln, v);
}
/*
* and added to the mdfd_chain list. If this routine has not been
* called, then only segments up to the last one actually touched
* are present in the chain.
- *
- * Returns # of blocks, or InvalidBlockNumber on error.
*/
BlockNumber
mdnblocks(SMgrRelation reln)
{
- MdfdVec *v = mdopen(reln, false);
+ MdfdVec *v = mdopen(reln, EXTENSION_FAIL);
#ifndef LET_OS_MANAGE_FILESIZE
BlockNumber nblocks;
for (;;)
{
- nblocks = _mdnblocks(v->mdfd_vfd, BLCKSZ);
+ nblocks = _mdnblocks(reln, v);
if (nblocks > ((BlockNumber) RELSEG_SIZE))
elog(FATAL, "segment too big");
if (nblocks < ((BlockNumber) RELSEG_SIZE))
*/
v->mdfd_chain = _mdfd_openseg(reln, segno, O_CREAT);
if (v->mdfd_chain == NULL)
- return InvalidBlockNumber; /* failed? */
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not open segment %u of relation %u/%u/%u: %m",
+ segno,
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode)));
}
v = v->mdfd_chain;
}
#else
- return _mdnblocks(v->mdfd_vfd, BLCKSZ);
+ return _mdnblocks(reln, v);
#endif
}
/*
* mdtruncate() -- Truncate relation to specified number of blocks.
- *
- * Returns # of blocks or InvalidBlockNumber on error.
*/
-BlockNumber
+void
mdtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
{
MdfdVec *v;
* that truncation loop will get them all!
*/
curnblk = mdnblocks(reln);
- if (curnblk == InvalidBlockNumber)
- return InvalidBlockNumber; /* mdnblocks failed */
if (nblocks > curnblk)
- return InvalidBlockNumber; /* bogus request */
+ {
+ /* Bogus request ... but no complaint if InRecovery */
+ if (InRecovery)
+ return;
+ ereport(ERROR,
+ (errmsg("could not truncate relation %u/%u/%u to %u blocks: it's only %u blocks now",
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode,
+ nblocks, curnblk)));
+ }
if (nblocks == curnblk)
- return nblocks; /* no work */
+ return; /* no work */
- v = mdopen(reln, false);
+ v = mdopen(reln, EXTENSION_FAIL);
#ifndef LET_OS_MANAGE_FILESIZE
priorblocks = 0;
* not delete it, for reasons explained in the header comments.
*/
if (FileTruncate(v->mdfd_vfd, 0) < 0)
- return InvalidBlockNumber;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode,
+ nblocks)));
if (!isTemp)
- {
- if (!register_dirty_segment(reln, v))
- return InvalidBlockNumber;
- }
+ register_dirty_segment(reln, v);
v = v->mdfd_chain;
Assert(ov != reln->md_fd); /* we never drop the 1st segment */
pfree(ov);
BlockNumber lastsegblocks = nblocks - priorblocks;
if (FileTruncate(v->mdfd_vfd, lastsegblocks * BLCKSZ) < 0)
- return InvalidBlockNumber;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode,
+ nblocks)));
if (!isTemp)
- {
- if (!register_dirty_segment(reln, v))
- return InvalidBlockNumber;
- }
+ register_dirty_segment(reln, v);
v = v->mdfd_chain;
ov->mdfd_chain = NULL;
}
}
#else
if (FileTruncate(v->mdfd_vfd, nblocks * BLCKSZ) < 0)
- return InvalidBlockNumber;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode,
+ nblocks)));
if (!isTemp)
- {
- if (!register_dirty_segment(reln, v))
- return InvalidBlockNumber;
- }
+ register_dirty_segment(reln, v);
#endif
-
- return nblocks;
}
/*
* Note that only writes already issued are synced; this routine knows
* nothing of dirty buffers that may exist inside the buffer manager.
*/
-bool
+void
mdimmedsync(SMgrRelation reln)
{
MdfdVec *v;
* that fsync loop will get them all!
*/
curnblk = mdnblocks(reln);
- if (curnblk == InvalidBlockNumber)
- return false; /* mdnblocks failed */
- v = mdopen(reln, false);
+ v = mdopen(reln, EXTENSION_FAIL);
#ifndef LET_OS_MANAGE_FILESIZE
while (v != NULL)
{
if (FileSync(v->mdfd_vfd) < 0)
- return false;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
+ v->mdfd_segno,
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode)));
v = v->mdfd_chain;
}
#else
if (FileSync(v->mdfd_vfd) < 0)
- return false;
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
+ v->mdfd_segno,
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode)));
#endif
-
- return true;
}
/*
* This is only called during checkpoints, and checkpoints should only
* occur in processes that have created a pendingOpsTable.
*/
-bool
+void
mdsync(void)
{
HASH_SEQ_STATUS hstat;
int absorb_counter;
if (!pendingOpsTable)
- return false;
+ elog(ERROR, "cannot sync without a pendingOpsTable");
/*
* If we are in the bgwriter, the sync had better include all fsync
*/
seg = _mdfd_getseg(reln,
entry->segno * ((BlockNumber) RELSEG_SIZE),
- true);
+ false, EXTENSION_RETURN_NULL);
if (seg)
{
if (FileSync(seg->mdfd_vfd) < 0 &&
errno != ENOENT)
- {
- ereport(LOG,
+ ereport(ERROR,
(errcode_for_file_access(),
errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
entry->segno,
entry->rnode.spcNode,
entry->rnode.dbNode,
entry->rnode.relNode)));
- return false;
- }
}
}
HASH_REMOVE, NULL) == NULL)
elog(ERROR, "pendingOpsTable corrupted");
}
-
- return true;
}
/*
* to the background writer process. If that fails, just do the fsync
* locally before returning (we expect this will not happen often enough
* to be a performance problem).
- *
- * A false result implies I/O failure during local fsync. errno will be
- * valid for error reporting.
*/
-static bool
+static void
register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
{
if (pendingOpsTable)
entry.segno = seg->mdfd_segno;
(void) hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL);
- return true;
}
else
{
if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
- return true;
- }
+ return; /* passed it off successfully */
- if (FileSync(seg->mdfd_vfd) < 0)
- return false;
- return true;
+ if (FileSync(seg->mdfd_vfd) < 0)
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not fsync segment %u of relation %u/%u/%u: %m",
+ seg->mdfd_segno,
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode)));
+ }
}
/*
v->mdfd_vfd = fd;
v->mdfd_segno = segno;
v->mdfd_chain = NULL;
- Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
+ Assert(_mdnblocks(reln, v) <= ((BlockNumber) RELSEG_SIZE));
/* all done */
return v;
/*
* _mdfd_getseg() -- Find the segment of the relation holding the
- * specified block. ereport's on failure.
- * (Optionally, can return NULL instead of ereport for ENOENT.)
+ * specified block.
+ *
+ * If the segment doesn't exist, we ereport, return NULL, or create the
+ * segment, according to "behavior". Note: isTemp need only be correct
+ * in the EXTENSION_CREATE case.
*/
static MdfdVec *
-_mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool allowNotFound)
+_mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool isTemp,
+ ExtensionBehavior behavior)
{
- MdfdVec *v = mdopen(reln, allowNotFound);
+ MdfdVec *v = mdopen(reln, behavior);
#ifndef LET_OS_MANAGE_FILESIZE
- BlockNumber segstogo;
+ BlockNumber targetseg;
BlockNumber nextsegno;
if (!v)
- return NULL; /* only possible if allowNotFound */
+ return NULL; /* only possible if EXTENSION_RETURN_NULL */
- for (segstogo = blkno / ((BlockNumber) RELSEG_SIZE), nextsegno = 1;
- segstogo > 0;
- nextsegno++, segstogo--)
+ targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
+ for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
{
+ Assert(nextsegno == v->mdfd_segno + 1);
+
if (v->mdfd_chain == NULL)
{
/*
- * We will create the next segment only if the target block is
- * within it. This prevents Sorcerer's Apprentice syndrome if a
- * bug at higher levels causes us to be handed a ridiculously
- * large blkno --- otherwise we could create many thousands of
- * empty segment files before reaching the "target" block. We
- * should never need to create more than one new segment per call,
- * so this restriction seems reasonable.
+ * Normally we will create new segments only if authorized by
+ * the caller (i.e., we are doing mdextend()). But when doing
+ * WAL recovery, create segments anyway; this allows cases such as
+ * replaying WAL data that has a write into a high-numbered
+ * segment of a relation that was later deleted. We want to go
+ * ahead and create the segments so we can finish out the replay.
*
- * BUT: when doing WAL recovery, disable this logic and create
- * segments unconditionally. In this case it seems better to
- * assume the given blkno is good (it presumably came from a
- * CRC-checked WAL record); furthermore this lets us cope in the
- * case where we are replaying WAL data that has a write into a
- * high-numbered segment of a relation that was later deleted. We
- * want to go ahead and create the segments so we can finish out
- * the replay.
+ * We have to maintain the invariant that segments before the
+ * last active segment are of size RELSEG_SIZE; therefore, pad
+ * them out with zeroes if needed. (This only matters if caller
+ * is extending the relation discontiguously, but that can happen
+ * in hash indexes.)
*/
- v->mdfd_chain = _mdfd_openseg(reln,
- nextsegno,
- (segstogo == 1 || InRecovery) ? O_CREAT : 0);
+ if (behavior == EXTENSION_CREATE || InRecovery)
+ {
+ if (_mdnblocks(reln, v) < RELSEG_SIZE)
+ {
+ char *zerobuf = palloc0(BLCKSZ);
+
+ mdextend(reln, nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
+ zerobuf, isTemp);
+ pfree(zerobuf);
+ }
+ v->mdfd_chain = _mdfd_openseg(reln, nextsegno, O_CREAT);
+ }
+ else
+ {
+ /* We won't create segment if not existent */
+ v->mdfd_chain = _mdfd_openseg(reln, nextsegno, 0);
+ }
if (v->mdfd_chain == NULL)
{
- if (allowNotFound && errno == ENOENT)
+ if (behavior == EXTENSION_RETURN_NULL && errno == ENOENT)
return NULL;
ereport(ERROR,
(errcode_for_file_access(),
* Get number of blocks present in a single disk file
*/
static BlockNumber
-_mdnblocks(File file, Size blcksz)
+_mdnblocks(SMgrRelation reln, MdfdVec *seg)
{
long len;
- len = FileSeek(file, 0L, SEEK_END);
+ len = FileSeek(seg->mdfd_vfd, 0L, SEEK_END);
if (len < 0)
- return 0; /* on failure, assume file is empty */
- return (BlockNumber) (len / blcksz);
+ ereport(ERROR,
+ (errcode_for_file_access(),
+ errmsg("could not seek to end of segment %u of relation %u/%u/%u: %m",
+ seg->mdfd_segno,
+ reln->smgr_rnode.spcNode,
+ reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode)));
+ /* note that this calculation will ignore any partial block at EOF */
+ return (BlockNumber) (len / BLCKSZ);
}
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.101 2006/10/04 00:29:58 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.102 2007/01/03 18:11:01 tgl Exp $
*
*-------------------------------------------------------------------------
*/
/*
* This struct of function pointers defines the API between smgr.c and
* any individual storage manager module. Note that smgr subfunctions are
- * generally expected to return TRUE on success, FALSE on error. (For
- * nblocks and truncate we instead say that returning InvalidBlockNumber
- * indicates an error.)
+ * generally expected to report problems via elog(ERROR). An exception is
+ * that smgr_unlink should use elog(WARNING), rather than erroring out,
+ * because we normally unlink relations during post-commit/abort cleanup,
+ * and so it's too late to raise an error. Also, various conditions that
+ * would normally be errors should be allowed during bootstrap and/or WAL
+ * recovery --- see comments in md.c for details.
*/
typedef struct f_smgr
{
- bool (*smgr_init) (void); /* may be NULL */
- bool (*smgr_shutdown) (void); /* may be NULL */
- bool (*smgr_close) (SMgrRelation reln);
- bool (*smgr_create) (SMgrRelation reln, bool isRedo);
- bool (*smgr_unlink) (RelFileNode rnode, bool isRedo);
- bool (*smgr_extend) (SMgrRelation reln, BlockNumber blocknum,
+ void (*smgr_init) (void); /* may be NULL */
+ void (*smgr_shutdown) (void); /* may be NULL */
+ void (*smgr_close) (SMgrRelation reln);
+ void (*smgr_create) (SMgrRelation reln, bool isRedo);
+ void (*smgr_unlink) (RelFileNode rnode, bool isRedo);
+ void (*smgr_extend) (SMgrRelation reln, BlockNumber blocknum,
char *buffer, bool isTemp);
- bool (*smgr_read) (SMgrRelation reln, BlockNumber blocknum,
+ void (*smgr_read) (SMgrRelation reln, BlockNumber blocknum,
char *buffer);
- bool (*smgr_write) (SMgrRelation reln, BlockNumber blocknum,
+ void (*smgr_write) (SMgrRelation reln, BlockNumber blocknum,
char *buffer, bool isTemp);
BlockNumber (*smgr_nblocks) (SMgrRelation reln);
- BlockNumber (*smgr_truncate) (SMgrRelation reln, BlockNumber nblocks,
- bool isTemp);
- bool (*smgr_immedsync) (SMgrRelation reln);
- bool (*smgr_commit) (void); /* may be NULL */
- bool (*smgr_abort) (void); /* may be NULL */
- bool (*smgr_sync) (void); /* may be NULL */
+ void (*smgr_truncate) (SMgrRelation reln, BlockNumber nblocks,
+ bool isTemp);
+ void (*smgr_immedsync) (SMgrRelation reln);
+ void (*smgr_commit) (void); /* may be NULL */
+ void (*smgr_abort) (void); /* may be NULL */
+ void (*smgr_sync) (void); /* may be NULL */
} f_smgr;
for (i = 0; i < NSmgr; i++)
{
if (smgrsw[i].smgr_init)
- {
- if (!(*(smgrsw[i].smgr_init)) ())
- elog(FATAL, "smgr initialization failed on %s: %m",
- DatumGetCString(DirectFunctionCall1(smgrout,
- Int16GetDatum(i))));
- }
+ (*(smgrsw[i].smgr_init)) ();
}
/* register the shutdown proc */
for (i = 0; i < NSmgr; i++)
{
if (smgrsw[i].smgr_shutdown)
- {
- if (!(*(smgrsw[i].smgr_shutdown)) ())
- elog(FATAL, "smgr shutdown failed on %s: %m",
- DatumGetCString(DirectFunctionCall1(smgrout,
- Int16GetDatum(i))));
- }
+ (*(smgrsw[i].smgr_shutdown)) ();
}
}
{
SMgrRelation *owner;
- if (!(*(smgrsw[reln->smgr_which].smgr_close)) (reln))
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not close relation %u/%u/%u: %m",
- reln->smgr_rnode.spcNode,
- reln->smgr_rnode.dbNode,
- reln->smgr_rnode.relNode)));
+ (*(smgrsw[reln->smgr_which].smgr_close)) (reln);
owner = reln->smgr_owner;
reln->smgr_rnode.dbNode,
isRedo);
- if (!(*(smgrsw[reln->smgr_which].smgr_create)) (reln, isRedo))
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not create relation %u/%u/%u: %m",
- reln->smgr_rnode.spcNode,
- reln->smgr_rnode.dbNode,
- reln->smgr_rnode.relNode)));
+ (*(smgrsw[reln->smgr_which].smgr_create)) (reln, isRedo);
if (isRedo)
return;
/*
* And delete the physical files.
*
- * Note: we treat deletion failure as a WARNING, not an error, because
- * we've already decided to commit or abort the current xact.
+ * Note: smgr_unlink must treat deletion failure as a WARNING, not an
+ * ERROR, because we've already decided to commit or abort the current
+ * xact.
*/
- if (!(*(smgrsw[which].smgr_unlink)) (rnode, isRedo))
- ereport(WARNING,
- (errcode_for_file_access(),
- errmsg("could not remove relation %u/%u/%u: %m",
- rnode.spcNode,
- rnode.dbNode,
- rnode.relNode)));
+ (*(smgrsw[which].smgr_unlink)) (rnode, isRedo);
}
/*
* smgrextend() -- Add a new block to a file.
*
- * The semantics are basically the same as smgrwrite(): write at the
- * specified position. However, we are expecting to extend the
- * relation (ie, blocknum is the current EOF), and so in case of
- * failure we clean up by truncating.
+ * The semantics are nearly the same as smgrwrite(): write at the
+ * specified position. However, this is to be used for the case of
+ * extending a relation (i.e., blocknum is at or beyond the current
+ * EOF). Note that we assume writing a block beyond current EOF
+ * causes intervening file space to become filled with zeroes.
*/
void
smgrextend(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
{
- if (!(*(smgrsw[reln->smgr_which].smgr_extend)) (reln, blocknum, buffer,
- isTemp))
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not extend relation %u/%u/%u: %m",
- reln->smgr_rnode.spcNode,
- reln->smgr_rnode.dbNode,
- reln->smgr_rnode.relNode),
- errhint("Check free disk space.")));
+ (*(smgrsw[reln->smgr_which].smgr_extend)) (reln, blocknum, buffer, isTemp);
}
/*
void
smgrread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
{
- if (!(*(smgrsw[reln->smgr_which].smgr_read)) (reln, blocknum, buffer))
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not read block %u of relation %u/%u/%u: %m",
- blocknum,
- reln->smgr_rnode.spcNode,
- reln->smgr_rnode.dbNode,
- reln->smgr_rnode.relNode)));
+ (*(smgrsw[reln->smgr_which].smgr_read)) (reln, blocknum, buffer);
}
/*
* smgrwrite() -- Write the supplied buffer out.
*
+ * This is to be used only for updating already-existing blocks of a
+ * relation (ie, those before the current EOF). To extend a relation,
+ * use smgrextend().
+ *
* This is not a synchronous write -- the block is not necessarily
* on disk at return, only dumped out to the kernel. However,
* provisions will be made to fsync the write before the next checkpoint.
void
smgrwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer, bool isTemp)
{
- if (!(*(smgrsw[reln->smgr_which].smgr_write)) (reln, blocknum, buffer,
- isTemp))
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not write block %u of relation %u/%u/%u: %m",
- blocknum,
- reln->smgr_rnode.spcNode,
- reln->smgr_rnode.dbNode,
- reln->smgr_rnode.relNode)));
+ (*(smgrsw[reln->smgr_which].smgr_write)) (reln, blocknum, buffer, isTemp);
}
/*
* smgrnblocks() -- Calculate the number of blocks in the
* supplied relation.
- *
- * Returns the number of blocks on success, aborts the current
- * transaction on failure.
*/
BlockNumber
smgrnblocks(SMgrRelation reln)
{
- BlockNumber nblocks;
-
- nblocks = (*(smgrsw[reln->smgr_which].smgr_nblocks)) (reln);
-
- /*
- * NOTE: if a relation ever did grow to 2^32-1 blocks, this code would
- * fail --- but that's a good thing, because it would stop us from
- * extending the rel another block and having a block whose number
- * actually is InvalidBlockNumber.
- */
- if (nblocks == InvalidBlockNumber)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not count blocks of relation %u/%u/%u: %m",
- reln->smgr_rnode.spcNode,
- reln->smgr_rnode.dbNode,
- reln->smgr_rnode.relNode)));
-
- return nblocks;
+ return (*(smgrsw[reln->smgr_which].smgr_nblocks)) (reln);
}
/*
* smgrtruncate() -- Truncate supplied relation to the specified number
* of blocks
- *
- * Returns the number of blocks on success, aborts the current
- * transaction on failure.
*/
-BlockNumber
+void
smgrtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
{
- BlockNumber newblks;
-
/*
* Get rid of any buffers for the about-to-be-deleted blocks. bufmgr will
* just drop them without bothering to write the contents.
FreeSpaceMapTruncateRel(&reln->smgr_rnode, nblocks);
/* Do the truncation */
- newblks = (*(smgrsw[reln->smgr_which].smgr_truncate)) (reln, nblocks,
- isTemp);
- if (newblks == InvalidBlockNumber)
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
- reln->smgr_rnode.spcNode,
- reln->smgr_rnode.dbNode,
- reln->smgr_rnode.relNode,
- nblocks)));
+ (*(smgrsw[reln->smgr_which].smgr_truncate)) (reln, nblocks, isTemp);
if (!isTemp)
{
XLogRecData rdata;
xl_smgr_truncate xlrec;
- xlrec.blkno = newblks;
+ xlrec.blkno = nblocks;
xlrec.rnode = reln->smgr_rnode;
rdata.data = (char *) &xlrec;
lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLOG_NO_TRAN,
&rdata);
}
-
- return newblks;
}
/*
void
smgrimmedsync(SMgrRelation reln)
{
- if (!(*(smgrsw[reln->smgr_which].smgr_immedsync)) (reln))
- ereport(ERROR,
- (errcode_for_file_access(),
- errmsg("could not sync relation %u/%u/%u: %m",
- reln->smgr_rnode.spcNode,
- reln->smgr_rnode.dbNode,
- reln->smgr_rnode.relNode)));
+ (*(smgrsw[reln->smgr_which].smgr_immedsync)) (reln);
}
for (i = 0; i < NSmgr; i++)
{
if (smgrsw[i].smgr_commit)
- {
- if (!(*(smgrsw[i].smgr_commit)) ())
- elog(ERROR, "transaction commit failed on %s: %m",
- DatumGetCString(DirectFunctionCall1(smgrout,
- Int16GetDatum(i))));
- }
+ (*(smgrsw[i].smgr_commit)) ();
}
}
for (i = 0; i < NSmgr; i++)
{
if (smgrsw[i].smgr_abort)
- {
- if (!(*(smgrsw[i].smgr_abort)) ())
- elog(ERROR, "transaction abort failed on %s: %m",
- DatumGetCString(DirectFunctionCall1(smgrout,
- Int16GetDatum(i))));
- }
+ (*(smgrsw[i].smgr_abort)) ();
}
}
for (i = 0; i < NSmgr; i++)
{
if (smgrsw[i].smgr_sync)
- {
- if (!(*(smgrsw[i].smgr_sync)) ())
- elog(ERROR, "storage sync failed on %s: %m",
- DatumGetCString(DirectFunctionCall1(smgrout,
- Int16GetDatum(i))));
- }
+ (*(smgrsw[i].smgr_sync)) ();
}
}
{
xl_smgr_truncate *xlrec = (xl_smgr_truncate *) XLogRecGetData(record);
SMgrRelation reln;
- BlockNumber newblks;
reln = smgropen(xlrec->rnode);
FreeSpaceMapTruncateRel(&reln->smgr_rnode, xlrec->blkno);
/* Do the truncation */
- newblks = (*(smgrsw[reln->smgr_which].smgr_truncate)) (reln,
- xlrec->blkno,
- false);
- if (newblks == InvalidBlockNumber)
- ereport(WARNING,
- (errcode_for_file_access(),
- errmsg("could not truncate relation %u/%u/%u to %u blocks: %m",
- reln->smgr_rnode.spcNode,
- reln->smgr_rnode.dbNode,
- reln->smgr_rnode.relNode,
- xlrec->blkno)));
+ (*(smgrsw[reln->smgr_which].smgr_truncate)) (reln,
+ xlrec->blkno,
+ false);
/* Also tell xlogutils.c about it */
XLogTruncateRelation(xlrec->rnode, xlrec->blkno);