Make the visibility map crash-safe.

[pg-rex/syncrep.git] / src / backend / access / heap / visibilitymap.c
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c

index 1ae92e6..a193520 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -3,18 +3,19 @@
   * visibilitymap.c
   *       bitmap for tracking visibility of heap tuples
   *
- * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
   * Portions Copyright (c) 1994, Regents of the University of California
   *
   *
   * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/heap/visibilitymap.c,v 1.9 2010/02/26 02:00:33 momjian Exp $
+ *       src/backend/access/heap/visibilitymap.c
   *
   * INTERFACE ROUTINES
- *             visibilitymap_clear - clear a bit in the visibility map
- *             visibilitymap_pin       - pin a map page for setting a bit
- *             visibilitymap_set       - set a bit in a previously pinned page
- *             visibilitymap_test      - test if a bit is set
+ *             visibilitymap_clear  - clear a bit in the visibility map
+ *             visibilitymap_pin        - pin a map page for setting a bit
+ *             visibilitymap_pin_ok - check whether correct map page is already pinned
+ *             visibilitymap_set        - set a bit in a previously pinned page
+ *             visibilitymap_test       - test if a bit is set
   *
   * NOTES
   *
@@ -64,32 +65,13 @@
   * It would be nice to use the visibility map to skip visibility checks in
   * index scans.
   *
- * Currently, the visibility map is not 100% correct all the time.
- * During updates, the bit in the visibility map is cleared after releasing
- * the lock on the heap page. During the window between releasing the lock
- * and clearing the bit in the visibility map, the bit in the visibility map
- * is set, but the new insertion or deletion is not yet visible to other
- * backends.
- *
- * That might actually be OK for the index scans, though. The newly inserted
- * tuple wouldn't have an index pointer yet, so all tuples reachable from an
- * index would still be visible to all other backends, and deletions wouldn't
- * be visible to other backends yet.  (But HOT breaks that argument, no?)
- *
- * There's another hole in the way the PD_ALL_VISIBLE flag is set. When
- * vacuum observes that all tuples are visible to all, it sets the flag on
- * the heap page, and also sets the bit in the visibility map. If we then
- * crash, and only the visibility map page was flushed to disk, we'll have
- * a bit set in the visibility map, but the corresponding flag on the heap
- * page is not set. If the heap page is then updated, the updater won't
- * know to clear the bit in the visibility map.  (Isn't that prevented by
- * the LSN interlock?)
- *
   *-------------------------------------------------------------------------
   */
  #include "postgres.h"
  
+#include "access/heapam.h"
  #include "access/visibilitymap.h"
+#include "miscadmin.h"
  #include "storage/bufmgr.h"
  #include "storage/bufpage.h"
  #include "storage/lmgr.h"
@@ -127,38 +109,37 @@ static void vm_extend(Relation rel, BlockNumber nvmblocks);
  /*
   *     visibilitymap_clear - clear a bit in visibility map
   *
- * Clear a bit in the visibility map, marking that not all tuples are
- * visible to all transactions anymore.
+ * You must pass a buffer containing the correct map page to this function.
+ * Call visibilitymap_pin first to pin the right one. This function doesn't do
+ * any I/O.
   */
  void
-visibilitymap_clear(Relation rel, BlockNumber heapBlk)
+visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer buf)
  {
         BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
         int                     mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
         int                     mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
         uint8           mask = 1 << mapBit;
-       Buffer          mapBuffer;
         char       *map;
  
  #ifdef TRACE_VISIBILITYMAP
         elog(DEBUG1, "vm_clear %s %d", RelationGetRelationName(rel), heapBlk);
  #endif
  
-       mapBuffer = vm_readbuf(rel, mapBlock, false);
-       if (!BufferIsValid(mapBuffer))
-               return;                                 /* nothing to do */
+       if (!BufferIsValid(buf) || BufferGetBlockNumber(buf) != mapBlock)
+               elog(ERROR, "wrong buffer passed to visibilitymap_clear");
  
-       LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE);
-       map = PageGetContents(BufferGetPage(mapBuffer));
+       LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
+       map = PageGetContents(BufferGetPage(buf));
  
         if (map[mapByte] & mask)
         {
                 map[mapByte] &= ~mask;
  
-               MarkBufferDirty(mapBuffer);
+               MarkBufferDirty(buf);
         }
  
-       UnlockReleaseBuffer(mapBuffer);
+       LockBuffer(buf, BUFFER_LOCK_UNLOCK);
  }
  
  /*
@@ -173,7 +154,7 @@ visibilitymap_clear(Relation rel, BlockNumber heapBlk)
   * On entry, *buf should be InvalidBuffer or a valid buffer returned by
   * an earlier call to visibilitymap_pin or visibilitymap_test on the same
   * relation. On return, *buf is a valid buffer with the map page containing
- * the the bit for heapBlk.
+ * the bit for heapBlk.
   *
   * If the page doesn't exist in the map file yet, it is extended.
   */
@@ -194,19 +175,36 @@ visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *buf)
  }
  
  /*
+ *     visibilitymap_pin_ok - do we already have the correct page pinned?
+ *
+ * On entry, buf should be InvalidBuffer or a valid buffer returned by
+ * an earlier call to visibilitymap_pin or visibilitymap_test on the same
+ * relation.  The return value indicates whether the buffer covers the
+ * given heapBlk.
+ */
+bool
+visibilitymap_pin_ok(BlockNumber heapBlk, Buffer buf)
+{
+       BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
+
+       return BufferIsValid(buf) && BufferGetBlockNumber(buf) == mapBlock;
+}
+
+/*
   *     visibilitymap_set - set a bit on a previously pinned page
   *
- * recptr is the LSN of the heap page. The LSN of the visibility map page is
- * advanced to that, to make sure that the visibility map doesn't get flushed
- * to disk before the update to the heap page that made all tuples visible.
+ * recptr is the LSN of the XLOG record we're replaying, if we're in recovery,
+ * or InvalidXLogRecPtr in normal running.  In recovery the page LSN is set
+ * to recptr; in normal running we write a new XLOG record and use its LSN.
+ * The page LSN is left untouched if the relation does not need WAL.
   *
- * This is an opportunistic function. It does nothing, unless *buf
- * contains the bit for heapBlk. Call visibilitymap_pin first to pin
- * the right map page. This function doesn't do any I/O.
+ * You must pass a buffer containing the correct map page to this function.
+ * Call visibilitymap_pin first to pin the right one. This function doesn't do
+ * any I/O.
   */
  void
  visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr,
-                                 Buffer *buf)
+                                 Buffer buf)
  {
         BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
         uint32          mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
@@ -218,25 +216,35 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr,
         elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk);
  #endif
  
+       Assert(InRecovery || XLogRecPtrIsInvalid(recptr));
+
         /* Check that we have the right page pinned */
-       if (!BufferIsValid(*buf) || BufferGetBlockNumber(*buf) != mapBlock)
-               return;
+       if (!BufferIsValid(buf) || BufferGetBlockNumber(buf) != mapBlock)
+               elog(ERROR, "wrong buffer passed to visibilitymap_set");
  
-       page = BufferGetPage(*buf);
+       page = BufferGetPage(buf);
         map = PageGetContents(page);
-       LockBuffer(*buf, BUFFER_LOCK_EXCLUSIVE);
+       LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
  
         if (!(map[mapByte] & (1 << mapBit)))
         {
+               START_CRIT_SECTION();
+
                 map[mapByte] |= (1 << mapBit);
+               MarkBufferDirty(buf);
  
-               if (XLByteLT(PageGetLSN(page), recptr))
+               if (RelationNeedsWAL(rel))
+               {
+                       if (XLogRecPtrIsInvalid(recptr))
+                               recptr = log_heap_visible(rel->rd_node, heapBlk, buf);
                         PageSetLSN(page, recptr);
-               PageSetTLI(page, ThisTimeLineID);
-               MarkBufferDirty(*buf);
+                       PageSetTLI(page, ThisTimeLineID);
+               }
+
+               END_CRIT_SECTION();
         }
  
-       LockBuffer(*buf, BUFFER_LOCK_UNLOCK);
+       LockBuffer(buf, BUFFER_LOCK_UNLOCK);
  }
  
  /*
@@ -247,7 +255,7 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr,
   * On entry, *buf should be InvalidBuffer or a valid buffer returned by an
   * earlier call to visibilitymap_pin or visibilitymap_test on the same
   * relation. On return, *buf is a valid buffer with the map page containing
- * the the bit for heapBlk, or InvalidBuffer. The caller is responsible for
+ * the bit for heapBlk, or InvalidBuffer. The caller is responsible for
   * releasing *buf after it's done testing and setting bits.
   */
  bool
@@ -373,8 +381,7 @@ visibilitymap_truncate(Relation rel, BlockNumber nheapblocks)
         }
  
         /* Truncate the unused VM pages, and send smgr inval message */
-       smgrtruncate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, newnblocks,
-                                rel->rd_istemp);
+       smgrtruncate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, newnblocks);
  
         /*
          * We might as well update the local smgr_vm_nblocks setting. smgrtruncate
@@ -478,7 +485,7 @@ vm_extend(Relation rel, BlockNumber vm_nblocks)
         while (vm_nblocks_now < vm_nblocks)
         {
                 smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now,
-                                  (char *) pg, rel->rd_istemp);
+                                  (char *) pg, false);
                 vm_nblocks_now++;
         }