OSDN Git Service

Fix a violation of WAL coding rules in the recent patch to include an
authorTom Lane <tgl@sss.pgh.pa.us>
Mon, 24 Aug 2009 02:18:32 +0000 (02:18 +0000)
committerTom Lane <tgl@sss.pgh.pa.us>
Mon, 24 Aug 2009 02:18:32 +0000 (02:18 +0000)
"all tuples visible" flag in heap page headers.  The flag update *must*
be applied before calling XLogInsert, but heap_update and the tuple
moving routines in VACUUM FULL were ignoring this rule.  A crash and
replay could therefore leave the flag incorrectly set, causing rows
to appear visible in seqscans when they should not be.  This might explain
recent reports of data corruption from Jeff Ross and others.

In passing, do a bit of editorialization on comments in visibilitymap.c.

src/backend/access/heap/heapam.c
src/backend/access/heap/visibilitymap.c
src/backend/commands/vacuum.c
src/backend/commands/vacuumlazy.c
src/include/access/heapam.h
src/include/access/visibilitymap.h

index 2e45c04..148d88b 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.277 2009/06/11 14:48:53 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.278 2009/08/24 02:18:31 tgl Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -78,7 +78,8 @@ static HeapScanDesc heap_beginscan_internal(Relation relation,
                                                bool allow_strat, bool allow_sync,
                                                bool is_bitmapscan);
 static XLogRecPtr log_heap_update(Relation reln, Buffer oldbuf,
-                  ItemPointerData from, Buffer newbuf, HeapTuple newtup, bool move);
+                  ItemPointerData from, Buffer newbuf, HeapTuple newtup, bool move,
+                  bool all_visible_cleared, bool new_all_visible_cleared);
 static bool HeapSatisfiesHOTUpdate(Relation relation, Bitmapset *hot_attrs,
                                           HeapTuple oldtup, HeapTuple newtup);
 
@@ -2760,21 +2761,29 @@ l2:
        /* record address of new tuple in t_ctid of old one */
        oldtup.t_data->t_ctid = heaptup->t_self;
 
+       /* clear PD_ALL_VISIBLE flags */
+       if (PageIsAllVisible(BufferGetPage(buffer)))
+       {
+               all_visible_cleared = true;
+               PageClearAllVisible(BufferGetPage(buffer));
+       }
+       if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
+       {
+               all_visible_cleared_new = true;
+               PageClearAllVisible(BufferGetPage(newbuf));
+       }
+
        if (newbuf != buffer)
                MarkBufferDirty(newbuf);
        MarkBufferDirty(buffer);
 
-       /*
-        * Note: we mustn't clear PD_ALL_VISIBLE flags before writing the WAL
-        * record, because log_heap_update looks at those flags to set the
-        * corresponding flags in the WAL record.
-        */
-
        /* XLOG stuff */
        if (!relation->rd_istemp)
        {
                XLogRecPtr      recptr = log_heap_update(relation, buffer, oldtup.t_self,
-                                                                                        newbuf, heaptup, false);
+                                                                                        newbuf, heaptup, false,
+                                                                                        all_visible_cleared,
+                                                                                        all_visible_cleared_new);
 
                if (newbuf != buffer)
                {
@@ -2785,18 +2794,6 @@ l2:
                PageSetTLI(BufferGetPage(buffer), ThisTimeLineID);
        }
 
-       /* Clear PD_ALL_VISIBLE flags */
-       if (PageIsAllVisible(BufferGetPage(buffer)))
-       {
-               all_visible_cleared = true;
-               PageClearAllVisible(BufferGetPage(buffer));
-       }
-       if (newbuf != buffer && PageIsAllVisible(BufferGetPage(newbuf)))
-       {
-               all_visible_cleared_new = true;
-               PageClearAllVisible(BufferGetPage(newbuf));
-       }
-
        END_CRIT_SECTION();
 
        if (newbuf != buffer)
@@ -3910,7 +3907,8 @@ log_heap_freeze(Relation reln, Buffer buffer,
  */
 static XLogRecPtr
 log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from,
-                               Buffer newbuf, HeapTuple newtup, bool move)
+                               Buffer newbuf, HeapTuple newtup, bool move,
+                               bool all_visible_cleared, bool new_all_visible_cleared)
 {
        /*
         * Note: xlhdr is declared to have adequate size and correct alignment for
@@ -3946,9 +3944,9 @@ log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from,
 
        xlrec.target.node = reln->rd_node;
        xlrec.target.tid = from;
-       xlrec.all_visible_cleared = PageIsAllVisible(BufferGetPage(oldbuf));
+       xlrec.all_visible_cleared = all_visible_cleared;
        xlrec.newtid = newtup->t_self;
-       xlrec.new_all_visible_cleared = PageIsAllVisible(BufferGetPage(newbuf));
+       xlrec.new_all_visible_cleared = new_all_visible_cleared;
 
        rdata[0].data = (char *) &xlrec;
        rdata[0].len = SizeOfHeapUpdate;
@@ -4015,9 +4013,11 @@ log_heap_update(Relation reln, Buffer oldbuf, ItemPointerData from,
  */
 XLogRecPtr
 log_heap_move(Relation reln, Buffer oldbuf, ItemPointerData from,
-                         Buffer newbuf, HeapTuple newtup)
+                         Buffer newbuf, HeapTuple newtup,
+                         bool all_visible_cleared, bool new_all_visible_cleared)
 {
-       return log_heap_update(reln, oldbuf, from, newbuf, newtup, true);
+       return log_heap_update(reln, oldbuf, from, newbuf, newtup, true,
+                                                  all_visible_cleared, new_all_visible_cleared);
 }
 
 /*
@@ -4222,7 +4222,7 @@ heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
        blkno = ItemPointerGetBlockNumber(&(xlrec->target.tid));
 
        /*
-        * The visibility map always needs to be updated, even if the heap page is
+        * The visibility map may need to be fixed even if the heap page is
         * already up-to-date.
         */
        if (xlrec->all_visible_cleared)
@@ -4300,7 +4300,7 @@ heap_xlog_insert(XLogRecPtr lsn, XLogRecord *record)
        blkno = ItemPointerGetBlockNumber(&(xlrec->target.tid));
 
        /*
-        * The visibility map always needs to be updated, even if the heap page is
+        * The visibility map may need to be fixed even if the heap page is
         * already up-to-date.
         */
        if (xlrec->all_visible_cleared)
@@ -4412,7 +4412,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move, bool hot_update)
        Size            freespace;
 
        /*
-        * The visibility map always needs to be updated, even if the heap page is
+        * The visibility map may need to be fixed even if the heap page is
         * already up-to-date.
         */
        if (xlrec->all_visible_cleared)
@@ -4507,7 +4507,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move, bool hot_update)
 newt:;
 
        /*
-        * The visibility map always needs to be updated, even if the heap page is
+        * The visibility map may need to be fixed even if the heap page is
         * already up-to-date.
         */
        if (xlrec->new_all_visible_cleared)
index 3fc26be..50462f2 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/heap/visibilitymap.c,v 1.5 2009/06/18 10:08:08 heikki Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/heap/visibilitymap.c,v 1.6 2009/08/24 02:18:31 tgl Exp $
  *
  * INTERFACE ROUTINES
  *             visibilitymap_clear - clear a bit in the visibility map
  * NOTES
  *
  * The visibility map is a bitmap with one bit per heap page. A set bit means
- * that all tuples on the page are visible to all transactions, and doesn't
- * therefore need to be vacuumed. The map is conservative in the sense that we
- * make sure that whenever a bit is set, we know the condition is true, but if
- * a bit is not set, it might or might not be.
+ * that all tuples on the page are known visible to all transactions, and 
+ * therefore the page doesn't need to be vacuumed. The map is conservative in
+ * the sense that we make sure that whenever a bit is set, we know the
+ * condition is true, but if a bit is not set, it might or might not be true.
  *
  * There's no explicit WAL logging in the functions in this file. The callers
  * must make sure that whenever a bit is cleared, the bit is cleared on WAL
  * make VACUUM skip pages that need vacuuming, until the next anti-wraparound
  * vacuum. The visibility map is not used for anti-wraparound vacuums, because
  * an anti-wraparound vacuum needs to freeze tuples and observe the latest xid
- * present in the table, also on pages that don't have any dead tuples.
+ * present in the table, even on pages that don't have any dead tuples.
  *
  * Although the visibility map is just a hint at the moment, the PD_ALL_VISIBLE
- * flag on heap pages *must* be correct.
+ * flag on heap pages *must* be correct, because it is used to skip visibility
+ * checking.
  *
  * LOCKING
  *
  * When a bit is set, the LSN of the visibility map page is updated to make
  * sure that the visibility map update doesn't get written to disk before the
  * WAL record of the changes that made it possible to set the bit is flushed.
- * But when a bit is cleared, we don't have to do that because it's always OK
- * to clear a bit in the map from correctness point of view.
+ * But when a bit is cleared, we don't have to do that because it's always
+ * safe to clear a bit in the map from correctness point of view.
  *
  * TODO
  *
- * It would be nice to use the visibility map to skip visibility checkes in
+ * It would be nice to use the visibility map to skip visibility checks in
  * index scans.
  *
  * Currently, the visibility map is not 100% correct all the time.
  * During updates, the bit in the visibility map is cleared after releasing
- * the lock on the heap page. During the window after releasing the lock
+ * the lock on the heap page. During the window between releasing the lock
  * and clearing the bit in the visibility map, the bit in the visibility map
  * is set, but the new insertion or deletion is not yet visible to other
  * backends.
@@ -73,7 +74,7 @@
  * That might actually be OK for the index scans, though. The newly inserted
  * tuple wouldn't have an index pointer yet, so all tuples reachable from an
  * index would still be visible to all other backends, and deletions wouldn't
- * be visible to other backends yet.
+ * be visible to other backends yet.  (But HOT breaks that argument, no?)
  *
  * There's another hole in the way the PD_ALL_VISIBLE flag is set. When
  * vacuum observes that all tuples are visible to all, it sets the flag on
@@ -81,7 +82,8 @@
  * crash, and only the visibility map page was flushed to disk, we'll have
  * a bit set in the visibility map, but the corresponding flag on the heap
  * page is not set. If the heap page is then updated, the updater won't
- * know to clear the bit in the visibility map.
+ * know to clear the bit in the visibility map.  (Isn't that prevented by
+ * the LSN interlock?)
  *
  *-------------------------------------------------------------------------
  */
index 732f6d0..1058bd2 100644 (file)
@@ -13,7 +13,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.389 2009/06/11 14:48:56 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.390 2009/08/24 02:18:31 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -2935,6 +2935,8 @@ move_chain_tuple(Relation rel,
        OffsetNumber newoff;
        ItemId          newitemid;
        Size            tuple_len = old_tup->t_len;
+       bool            all_visible_cleared = false;
+       bool            all_visible_cleared_new = false;
 
        /*
         * make a modifiable copy of the source tuple.
@@ -3019,6 +3021,18 @@ move_chain_tuple(Relation rel,
                newtup.t_data->t_ctid = *ctid;
        *ctid = newtup.t_self;
 
+       /* clear PD_ALL_VISIBLE flags */
+       if (PageIsAllVisible(old_page))
+       {
+               all_visible_cleared = true;
+               PageClearAllVisible(old_page);
+       }
+       if (dst_buf != old_buf && PageIsAllVisible(dst_page))
+       {
+               all_visible_cleared_new = true;
+               PageClearAllVisible(dst_page);
+       }
+
        MarkBufferDirty(dst_buf);
        if (dst_buf != old_buf)
                MarkBufferDirty(old_buf);
@@ -3027,7 +3041,9 @@ move_chain_tuple(Relation rel,
        if (!rel->rd_istemp)
        {
                XLogRecPtr      recptr = log_heap_move(rel, old_buf, old_tup->t_self,
-                                                                                  dst_buf, &newtup);
+                                                                                  dst_buf, &newtup,
+                                                                                  all_visible_cleared,
+                                                                                  all_visible_cleared_new);
 
                if (old_buf != dst_buf)
                {
@@ -3040,17 +3056,14 @@ move_chain_tuple(Relation rel,
 
        END_CRIT_SECTION();
 
-       PageClearAllVisible(BufferGetPage(old_buf));
-       if (dst_buf != old_buf)
-               PageClearAllVisible(BufferGetPage(dst_buf));
-
        LockBuffer(dst_buf, BUFFER_LOCK_UNLOCK);
        if (dst_buf != old_buf)
                LockBuffer(old_buf, BUFFER_LOCK_UNLOCK);
 
-       /* Clear the bits in the visibility map. */
-       visibilitymap_clear(rel, BufferGetBlockNumber(old_buf));
-       if (dst_buf != old_buf)
+       /* Clear bits in visibility map */
+       if (all_visible_cleared)
+               visibilitymap_clear(rel, BufferGetBlockNumber(old_buf));
+       if (all_visible_cleared_new)
                visibilitymap_clear(rel, BufferGetBlockNumber(dst_buf));
 
        /* Create index entries for the moved tuple */
@@ -3084,6 +3097,8 @@ move_plain_tuple(Relation rel,
        OffsetNumber newoff;
        ItemId          newitemid;
        Size            tuple_len = old_tup->t_len;
+       bool            all_visible_cleared = false;
+       bool            all_visible_cleared_new = false;
 
        /* copy tuple */
        heap_copytuple_with_tuple(old_tup, &newtup);
@@ -3134,6 +3149,18 @@ move_plain_tuple(Relation rel,
        old_tup->t_data->t_infomask |= HEAP_MOVED_OFF;
        HeapTupleHeaderSetXvac(old_tup->t_data, myXID);
 
+       /* clear PD_ALL_VISIBLE flags */
+       if (PageIsAllVisible(old_page))
+       {
+               all_visible_cleared = true;
+               PageClearAllVisible(old_page);
+       }
+       if (PageIsAllVisible(dst_page))
+       {
+               all_visible_cleared_new = true;
+               PageClearAllVisible(dst_page);
+       }
+
        MarkBufferDirty(dst_buf);
        MarkBufferDirty(old_buf);
 
@@ -3141,7 +3168,9 @@ move_plain_tuple(Relation rel,
        if (!rel->rd_istemp)
        {
                XLogRecPtr      recptr = log_heap_move(rel, old_buf, old_tup->t_self,
-                                                                                  dst_buf, &newtup);
+                                                                                  dst_buf, &newtup,
+                                                                                  all_visible_cleared,
+                                                                                  all_visible_cleared_new);
 
                PageSetLSN(old_page, recptr);
                PageSetTLI(old_page, ThisTimeLineID);
@@ -3151,29 +3180,18 @@ move_plain_tuple(Relation rel,
 
        END_CRIT_SECTION();
 
-       /*
-        * Clear the visible-to-all hint bits on the page, and bits in the
-        * visibility map. Normally we'd release the locks on the heap pages
-        * before updating the visibility map, but doesn't really matter here
-        * because we're holding an AccessExclusiveLock on the relation anyway.
-        */
-       if (PageIsAllVisible(dst_page))
-       {
-               PageClearAllVisible(dst_page);
-               visibilitymap_clear(rel, BufferGetBlockNumber(dst_buf));
-       }
-       if (PageIsAllVisible(old_page))
-       {
-               PageClearAllVisible(old_page);
-               visibilitymap_clear(rel, BufferGetBlockNumber(old_buf));
-       }
-
        dst_vacpage->free = PageGetFreeSpaceWithFillFactor(rel, dst_page);
        LockBuffer(dst_buf, BUFFER_LOCK_UNLOCK);
        LockBuffer(old_buf, BUFFER_LOCK_UNLOCK);
 
        dst_vacpage->offsets_used++;
 
+       /* Clear bits in visibility map */
+       if (all_visible_cleared)
+               visibilitymap_clear(rel, BufferGetBlockNumber(old_buf));
+       if (all_visible_cleared_new)
+               visibilitymap_clear(rel, BufferGetBlockNumber(dst_buf));
+
        /* insert index' tuples if needed */
        if (ec->resultRelInfo->ri_NumIndices > 0)
        {
index 01aa11a..8fb06e4 100644 (file)
@@ -29,7 +29,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.121 2009/06/11 14:48:56 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.122 2009/08/24 02:18:32 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -425,8 +425,8 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
 
                        if (!PageIsAllVisible(page))
                        {
-                               SetBufferCommitInfoNeedsSave(buf);
                                PageSetAllVisible(page);
+                               SetBufferCommitInfoNeedsSave(buf);
                        }
 
                        LockBuffer(buf, BUFFER_LOCK_UNLOCK);
@@ -652,19 +652,20 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
                /* Update the all-visible flag on the page */
                if (!PageIsAllVisible(page) && all_visible)
                {
-                       SetBufferCommitInfoNeedsSave(buf);
                        PageSetAllVisible(page);
+                       SetBufferCommitInfoNeedsSave(buf);
                }
                else if (PageIsAllVisible(page) && !all_visible)
                {
-                       elog(WARNING, "PD_ALL_VISIBLE flag was incorrectly set");
-                       SetBufferCommitInfoNeedsSave(buf);
+                       elog(WARNING, "PD_ALL_VISIBLE flag was incorrectly set in relation \"%s\" page %u",
+                                relname, blkno);
                        PageClearAllVisible(page);
+                       SetBufferCommitInfoNeedsSave(buf);
 
                        /*
                         * Normally, we would drop the lock on the heap page before
-                        * updating the visibility map, but since this is a can't-happen
-                        * case anyway, don't bother.
+                        * updating the visibility map, but since this case shouldn't
+                        * happen anyway, don't worry about that.
                         */
                        visibilitymap_clear(onerel, blkno);
                }
index 459b780..f31a505 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/heapam.h,v 1.143 2009/06/11 14:49:08 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/heapam.h,v 1.144 2009/08/24 02:18:32 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -128,7 +128,8 @@ extern void heap2_desc(StringInfo buf, uint8 xl_info, char *rec);
 
 extern XLogRecPtr log_heap_move(Relation reln, Buffer oldbuf,
                          ItemPointerData from,
-                         Buffer newbuf, HeapTuple newtup);
+                         Buffer newbuf, HeapTuple newtup,
+                         bool all_visible_cleared, bool new_all_visible_cleared);
 extern XLogRecPtr log_heap_clean(Relation reln, Buffer buffer,
                           OffsetNumber *redirected, int nredirected,
                           OffsetNumber *nowdead, int ndead,
index 3f4b3ab..3255519 100644 (file)
@@ -7,17 +7,17 @@
  * Portions Copyright (c) 2007-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/visibilitymap.h,v 1.4 2009/06/11 14:49:09 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/visibilitymap.h,v 1.5 2009/08/24 02:18:32 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #ifndef VISIBILITYMAP_H
 #define VISIBILITYMAP_H
 
-#include "utils/relcache.h"
-#include "storage/buf.h"
-#include "storage/itemptr.h"
 #include "access/xlogdefs.h"
+#include "storage/block.h"
+#include "storage/buf.h"
+#include "utils/relcache.h"
 
 extern void visibilitymap_clear(Relation rel, BlockNumber heapBlk);
 extern void visibilitymap_pin(Relation rel, BlockNumber heapBlk,