OSDN Git Service

Derive latestRemovedXid for btree deletes by reading heap pages. The
author Simon Riggs <simon@2ndQuadrant.com>
Sun, 28 Mar 2010 09:27:02 +0000 (09:27 +0000)
committer Simon Riggs <simon@2ndQuadrant.com>
Sun, 28 Mar 2010 09:27:02 +0000 (09:27 +0000)
WAL record for btree delete contains a list of tids, even when backup
blocks are present. We follow the tids to their heap tuples, taking
care to follow LP_REDIRECT tuples. We ignore LP_DEAD tuples on the
understanding that they will always have xmin/xmax earlier than any
LP_NORMAL tuples referred to by killed index tuples. Iff all tuples
are LP_DEAD we return InvalidTransactionId. The heap relfilenode is
added to the WAL record, requiring API changes to pass down the heap
Relation. XLOG_PAGE_MAGIC updated.

src/backend/access/nbtree/nbtinsert.c
src/backend/access/nbtree/nbtpage.c
src/backend/access/nbtree/nbtree.c
src/backend/access/nbtree/nbtxlog.c
src/include/access/nbtree.h
src/include/access/xlog_internal.h

index de9bd95..cd70a4c 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.177 2010/02/26 02:00:34 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.178 2010/03/28 09:27:01 sriggs Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -57,7 +57,8 @@ static void _bt_findinsertloc(Relation rel,
                                  OffsetNumber *offsetptr,
                                  int keysz,
                                  ScanKey scankey,
-                                 IndexTuple newtup);
+                                 IndexTuple newtup,
+                                 Relation heapRel);
 static void _bt_insertonpg(Relation rel, Buffer buf,
                           BTStack stack,
                           IndexTuple itup,
@@ -78,7 +79,7 @@ static void _bt_pgaddtup(Relation rel, Page page,
                         OffsetNumber itup_off, const char *where);
 static bool _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
                        int keysz, ScanKey scankey);
-static void _bt_vacuum_one_page(Relation rel, Buffer buffer);
+static void _bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel);
 
 
 /*
@@ -175,7 +176,7 @@ top:
        if (checkUnique != UNIQUE_CHECK_EXISTING)
        {
                /* do the insertion */
-               _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup);
+               _bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup, heapRel);
                _bt_insertonpg(rel, buf, stack, itup, offset, false);
        }
        else
@@ -491,7 +492,8 @@ _bt_findinsertloc(Relation rel,
                                  OffsetNumber *offsetptr,
                                  int keysz,
                                  ScanKey scankey,
-                                 IndexTuple newtup)
+                                 IndexTuple newtup,
+                                 Relation heapRel)
 {
        Buffer          buf = *bufptr;
        Page            page = BufferGetPage(buf);
@@ -556,7 +558,7 @@ _bt_findinsertloc(Relation rel,
                 */
                if (P_ISLEAF(lpageop) && P_HAS_GARBAGE(lpageop))
                {
-                       _bt_vacuum_one_page(rel, buf);
+                       _bt_vacuum_one_page(rel, buf, heapRel);
 
                        /*
                         * remember that we vacuumed this page, because that makes the
@@ -1998,7 +2000,7 @@ _bt_isequal(TupleDesc itupdesc, Page page, OffsetNumber offnum,
  * super-exclusive "cleanup" lock (see nbtree/README).
  */
 static void
-_bt_vacuum_one_page(Relation rel, Buffer buffer)
+_bt_vacuum_one_page(Relation rel, Buffer buffer, Relation heapRel)
 {
        OffsetNumber deletable[MaxOffsetNumber];
        int                     ndeletable = 0;
@@ -2025,7 +2027,7 @@ _bt_vacuum_one_page(Relation rel, Buffer buffer)
        }
 
        if (ndeletable > 0)
-               _bt_delitems(rel, buffer, deletable, ndeletable, false, 0);
+               _bt_delitems_delete(rel, buffer, deletable, ndeletable, heapRel);
 
        /*
         * Note: if we didn't find any LP_DEAD items, then the page's
index 4aa7599..cb94c76 100644 (file)
@@ -9,7 +9,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.121 2010/03/19 10:41:21 sriggs Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.122 2010/03/28 09:27:01 sriggs Exp $
  *
  *     NOTES
  *        Postgres btree pages look like ordinary relation pages.      The opaque
@@ -719,15 +719,12 @@ _bt_page_recyclable(Page page)
  * ensure correct locking.
  */
 void
-_bt_delitems(Relation rel, Buffer buf,
-                        OffsetNumber *itemnos, int nitems, bool isVacuum,
-                        BlockNumber lastBlockVacuumed)
+_bt_delitems_vacuum(Relation rel, Buffer buf,
+                        OffsetNumber *itemnos, int nitems, BlockNumber lastBlockVacuumed)
 {
        Page            page = BufferGetPage(buf);
        BTPageOpaque opaque;
 
-       Assert(isVacuum || lastBlockVacuumed == 0);
-
        /* No ereport(ERROR) until changes are logged */
        START_CRIT_SECTION();
 
@@ -759,35 +756,14 @@ _bt_delitems(Relation rel, Buffer buf,
                XLogRecPtr      recptr;
                XLogRecData rdata[2];
 
-               if (isVacuum)
-               {
-                       xl_btree_vacuum xlrec_vacuum;
-
-                       xlrec_vacuum.node = rel->rd_node;
-                       xlrec_vacuum.block = BufferGetBlockNumber(buf);
-
-                       xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed;
-                       rdata[0].data = (char *) &xlrec_vacuum;
-                       rdata[0].len = SizeOfBtreeVacuum;
-               }
-               else
-               {
-                       xl_btree_delete xlrec_delete;
-
-                       xlrec_delete.node = rel->rd_node;
-                       xlrec_delete.block = BufferGetBlockNumber(buf);
+               xl_btree_vacuum xlrec_vacuum;
 
-                       /*
-                        * XXX: We would like to set an accurate latestRemovedXid, but
-                        * there is no easy way of obtaining a useful value. So we punt
-                        * and store InvalidTransactionId, which forces the standby to
-                        * wait for/cancel all currently running transactions.
-                        */
-                       xlrec_delete.latestRemovedXid = InvalidTransactionId;
-                       rdata[0].data = (char *) &xlrec_delete;
-                       rdata[0].len = SizeOfBtreeDelete;
-               }
+               xlrec_vacuum.node = rel->rd_node;
+               xlrec_vacuum.block = BufferGetBlockNumber(buf);
 
+               xlrec_vacuum.lastBlockVacuumed = lastBlockVacuumed;
+               rdata[0].data = (char *) &xlrec_vacuum;
+               rdata[0].len = SizeOfBtreeVacuum;
                rdata[0].buffer = InvalidBuffer;
                rdata[0].next = &(rdata[1]);
 
@@ -810,10 +786,82 @@ _bt_delitems(Relation rel, Buffer buf,
                rdata[1].buffer_std = true;
                rdata[1].next = NULL;
 
-               if (isVacuum)
-                       recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM, rdata);
-               else
-                       recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);
+               recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_VACUUM, rdata);
+
+               PageSetLSN(page, recptr);
+               PageSetTLI(page, ThisTimeLineID);
+       }
+
+       END_CRIT_SECTION();
+}
+
+void
+_bt_delitems_delete(Relation rel, Buffer buf,
+                        OffsetNumber *itemnos, int nitems, Relation heapRel)
+{
+       Page            page = BufferGetPage(buf);
+       BTPageOpaque opaque;
+
+       Assert(nitems > 0);
+
+       /* No ereport(ERROR) until changes are logged */
+       START_CRIT_SECTION();
+
+       /* Fix the page */
+       PageIndexMultiDelete(page, itemnos, nitems);
+
+       /*
+        * We can clear the vacuum cycle ID since this page has certainly been
+        * processed by the current vacuum scan.
+        */
+       opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+       opaque->btpo_cycleid = 0;
+
+       /*
+        * Mark the page as not containing any LP_DEAD items.  This is not
+        * certainly true (there might be some that have recently been marked, but
+        * weren't included in our target-item list), but it will almost always be
+        * true and it doesn't seem worth an additional page scan to check it.
+        * Remember that BTP_HAS_GARBAGE is only a hint anyway.
+        */
+       opaque->btpo_flags &= ~BTP_HAS_GARBAGE;
+
+       MarkBufferDirty(buf);
+
+       /* XLOG stuff */
+       if (!rel->rd_istemp)
+       {
+               XLogRecPtr      recptr;
+               XLogRecData rdata[3];
+
+               xl_btree_delete xlrec_delete;
+
+               xlrec_delete.node = rel->rd_node;
+               xlrec_delete.hnode = heapRel->rd_node;
+               xlrec_delete.block = BufferGetBlockNumber(buf);
+               xlrec_delete.nitems = nitems;
+
+               rdata[0].data = (char *) &xlrec_delete;
+               rdata[0].len = SizeOfBtreeDelete;
+               rdata[0].buffer = InvalidBuffer;
+               rdata[0].next = &(rdata[1]);
+
+               /*
+                * We need the target-offsets array whether or not we store the whole
+                * buffer, to allow us to find the latestRemovedXid on a standby server.
+                */
+               rdata[1].data = (char *) itemnos;
+               rdata[1].len = nitems * sizeof(OffsetNumber);
+               rdata[1].buffer = InvalidBuffer;
+               rdata[1].next = &(rdata[2]);
+
+               rdata[2].data = NULL;
+               rdata[2].len = 0;
+               rdata[2].buffer = buf;
+               rdata[2].buffer_std = true;
+               rdata[2].next = NULL;
+
+               recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_DELETE, rdata);
 
                PageSetLSN(page, recptr);
                PageSetTLI(page, ThisTimeLineID);
index 01899cf..0fcde95 100644 (file)
@@ -12,7 +12,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.176 2010/02/26 02:00:34 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/nbtree/nbtree.c,v 1.177 2010/03/28 09:27:01 sriggs Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -708,7 +708,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
                buf = ReadBufferExtended(rel, MAIN_FORKNUM, num_pages - 1, RBM_NORMAL,
                                                                 info->strategy);
                LockBufferForCleanup(buf);
-               _bt_delitems(rel, buf, NULL, 0, true, vstate.lastBlockVacuumed);
+               _bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
                _bt_relbuf(rel, buf);
        }
 
@@ -889,7 +889,7 @@ restart:
                {
                        BlockNumber lastBlockVacuumed = BufferGetBlockNumber(buf);
 
-                       _bt_delitems(rel, buf, deletable, ndeletable, true, vstate->lastBlockVacuumed);
+                       _bt_delitems_vacuum(rel, buf, deletable, ndeletable, vstate->lastBlockVacuumed);
 
                        /*
                         * Keep track of the block number of the lastBlockVacuumed, so we
index 782778c..5bc710c 100644 (file)
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.63 2010/03/19 10:41:22 sriggs Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.64 2010/03/28 09:27:01 sriggs Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -553,6 +553,139 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record)
        UnlockReleaseBuffer(buffer);
 }
 
+/*
+ * Get the latestRemovedXid from the heap pages pointed at by the index
+ * tuples being deleted, moving that work from the primary into recovery.
+ * Returns the latest xmin/xmax seen on heap tuples that still have storage,
+ * or InvalidTransactionId if the index page or a heap page cannot be read
+ * or if no referenced heap item has storage (e.g. all are LP_DEAD).
+ *
+ * This can generate a fair amount of I/O, since an index block may have
+ * hundreds of tuples being deleted; repeat accesses to the same heap blocks
+ * are common and not yet optimised (XXX: something like XLogPrefetchBuffer()).
+ */
+static TransactionId
+btree_xlog_delete_get_latestRemovedXid(XLogRecord *record)
+{
+       OffsetNumber    *unused;
+       Buffer                  ibuffer, hbuffer;
+       Page                    ipage, hpage;
+       ItemId                  iitemid, hitemid;
+       IndexTuple              itup;
+       HeapTupleHeader htuphdr;
+       BlockNumber     hblkno;
+       OffsetNumber    hoffnum;
+       TransactionId   latestRemovedXid = InvalidTransactionId;
+       TransactionId   htupxid = InvalidTransactionId;
+       int i;
+       /* Diagnostic counters; must start at zero since num_unused is asserted below */
+       int num_unused = 0, num_redirect = 0, num_dead = 0;
+       xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
+
+       /*
+        * Get index page
+        */
+       ibuffer = XLogReadBuffer(xlrec->node, xlrec->block, false);
+       if (!BufferIsValid(ibuffer))
+               return InvalidTransactionId;
+       ipage = (Page) BufferGetPage(ibuffer);
+
+       /*
+        * Loop through the deleted index items to obtain the TransactionId
+        * from the heap items they point to.
+        */
+       unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete);
+
+       for (i = 0; i < xlrec->nitems; i++)
+       {
+               /*
+                * Identify the index tuple about to be deleted
+                */
+               iitemid = PageGetItemId(ipage, unused[i]);
+               itup = (IndexTuple) PageGetItem(ipage, iitemid);
+
+               /*
+                * Locate the heap page that the index tuple points at
+                */
+               hblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
+               hbuffer = XLogReadBuffer(xlrec->hnode, hblkno, false);
+               if (!BufferIsValid(hbuffer))
+               {
+                       UnlockReleaseBuffer(ibuffer);
+                       return InvalidTransactionId;
+               }
+               hpage = (Page) BufferGetPage(hbuffer);
+
+               /*
+                * Look up the heap tuple header that the index tuple points at
+                * by using the heap node supplied with the xlrec. We can't use
+                * heap_fetch, since it uses ReadBuffer rather than XLogReadBuffer.
+                * Note that we are not looking at tuple data here, just headers.
+                */
+               hoffnum = ItemPointerGetOffsetNumber(&(itup->t_tid));
+               hitemid = PageGetItemId(hpage, hoffnum);
+
+               /*
+                * Follow any redirections until we find something useful.
+                */
+               while (ItemIdIsRedirected(hitemid))
+               {
+                       num_redirect++;
+                       hoffnum = ItemIdGetRedirect(hitemid);
+                       hitemid = PageGetItemId(hpage, hoffnum);
+                       CHECK_FOR_INTERRUPTS();
+               }
+
+               /*
+                * If the heap item has storage, then read the header. Some LP_DEAD
+                * items may not be accessible, so we ignore them.
+                */
+               if (ItemIdHasStorage(hitemid))
+               {
+                       htuphdr = (HeapTupleHeader) PageGetItem(hpage, hitemid);
+
+                       /*
+                        * Get the heap tuple's xmin/xmax and ratchet up the latestRemovedXid.
+                        * No need to consider xvac values here.
+                        */
+                       htupxid = HeapTupleHeaderGetXmin(htuphdr);
+                       if (TransactionIdFollows(htupxid, latestRemovedXid))
+                               latestRemovedXid = htupxid;
+
+                       htupxid = HeapTupleHeaderGetXmax(htuphdr);
+                       if (TransactionIdFollows(htupxid, latestRemovedXid))
+                               latestRemovedXid = htupxid;
+               }
+               else if (ItemIdIsDead(hitemid))
+               {
+                       /*
+                        * Conjecture: if hitemid is dead then it had xids before the xids
+                        * marked on LP_NORMAL items. So we just ignore this item and move
+                        * onto the next, for the purposes of calculating latestRemovedXid.
+                        */
+                       num_dead++;
+               }
+               else
+               {
+                       Assert(!ItemIdIsUsed(hitemid));
+                       num_unused++;
+               }
+
+               UnlockReleaseBuffer(hbuffer);
+       }
+
+       UnlockReleaseBuffer(ibuffer);
+
+       Assert(num_unused == 0);
+
+       /*
+        * Note that if all heap tuples were LP_DEAD then we will be
+        * returning InvalidTransactionId here. This seems very unlikely
+        * in practice.
+        */
+       return latestRemovedXid;
+}
+
 static void
 btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
 {
@@ -584,12 +717,10 @@ btree_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
        if (record->xl_len > SizeOfBtreeDelete)
        {
                OffsetNumber *unused;
-               OffsetNumber *unend;
 
                unused = (OffsetNumber *) ((char *) xlrec + SizeOfBtreeDelete);
-               unend = (OffsetNumber *) ((char *) xlrec + record->xl_len);
 
-               PageIndexMultiDelete(page, unused, unend - unused);
+               PageIndexMultiDelete(page, unused, xlrec->nitems);
        }
 
        /*
@@ -830,6 +961,7 @@ btree_redo(XLogRecPtr lsn, XLogRecord *record)
                                 * from individual btree vacuum records on that index.
                                 */
                                {
+                                       TransactionId latestRemovedXid = btree_xlog_delete_get_latestRemovedXid(record);
                                        xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
 
                                        /*
@@ -839,7 +971,7 @@ btree_redo(XLogRecPtr lsn, XLogRecord *record)
                                         * here is worth some thought and possibly some effort to
                                         * improve.
                                         */
-                                       ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node);
+                                       ResolveRecoveryConflictWithSnapshot(latestRemovedXid, xlrec->node);
                                }
                                break;
 
@@ -1012,10 +1144,10 @@ btree_desc(StringInfo buf, uint8 xl_info, char *rec)
                        {
                                xl_btree_delete *xlrec = (xl_btree_delete *) rec;
 
-                               appendStringInfo(buf, "delete: rel %u/%u/%u; blk %u, latestRemovedXid %u",
-                                                                xlrec->node.spcNode, xlrec->node.dbNode,
-                                                                xlrec->node.relNode, xlrec->block,
-                                                                xlrec->latestRemovedXid);
+                               appendStringInfo(buf, "delete: index %u/%u/%u; iblk %u, heap %u/%u/%u;",
+                                                                xlrec->node.spcNode, xlrec->node.dbNode, xlrec->node.relNode,
+                                                                xlrec->block,
+                                                                xlrec->hnode.spcNode, xlrec->hnode.dbNode, xlrec->hnode.relNode);
                                break;
                        }
                case XLOG_BTREE_DELETE_PAGE:
index e00594b..775c47d 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.133 2010/03/20 07:49:48 sriggs Exp $
+ * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.134 2010/03/28 09:27:02 sriggs Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -314,14 +314,15 @@ typedef struct xl_btree_split
  */
 typedef struct xl_btree_delete
 {
-       RelFileNode node;
+       RelFileNode node;               /* RelFileNode of the index */
        BlockNumber block;
-       TransactionId latestRemovedXid;
+       RelFileNode hnode;              /* RelFileNode of the heap the index currently points at */
+       int                     nitems;
 
        /* TARGET OFFSET NUMBERS FOLLOW AT THE END */
 } xl_btree_delete;
 
-#define SizeOfBtreeDelete      (offsetof(xl_btree_delete, latestRemovedXid) + sizeof(TransactionId))
+#define SizeOfBtreeDelete      (offsetof(xl_btree_delete, nitems) + sizeof(int))
 
 /*
  * This is what we need to know about page reuse within btree.
@@ -349,13 +350,12 @@ typedef struct xl_btree_reuse_page
  * heap tuples.
  *
  * Any changes to any one block are registered on just one WAL record. All
- * blocks that we need to run EnsureBlockUnpinned() before we touch the changed
- * block are also given on this record as a variable length array. The array
- * is compressed by way of storing an array of block ranges, rather than an
- * actual array of blockids.
+ * blocks that we need to run EnsureBlockUnpinned() are listed as a block range
+ * starting from the last block vacuumed through until this one. Individual
+ * block numbers aren't given.
  *
  * Note that the *last* WAL record in any vacuum of an index is allowed to
- * have numItems == 0. All other WAL records must have numItems > 0.
+ * have a zero length array of offsets. Earlier records must have at least one.
  */
 typedef struct xl_btree_vacuum
 {
@@ -588,9 +588,10 @@ extern Buffer _bt_relandgetbuf(Relation rel, Buffer obuf,
 extern void _bt_relbuf(Relation rel, Buffer buf);
 extern void _bt_pageinit(Page page, Size size);
 extern bool _bt_page_recyclable(Page page);
-extern void _bt_delitems(Relation rel, Buffer buf,
-                        OffsetNumber *itemnos, int nitems, bool isVacuum,
-                        BlockNumber lastBlockVacuumed);
+extern void _bt_delitems_delete(Relation rel, Buffer buf,
+                        OffsetNumber *itemnos, int nitems, Relation heapRel);
+extern void _bt_delitems_vacuum(Relation rel, Buffer buf,
+                        OffsetNumber *itemnos, int nitems, BlockNumber lastBlockVacuumed);
 extern int     _bt_pagedel(Relation rel, Buffer buf, BTStack stack);
 
 /*
index 0787eb5..c93e384 100644 (file)
@@ -11,7 +11,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/xlog_internal.h,v 1.30 2010/03/19 17:42:10 sriggs Exp $
+ * $PostgreSQL: pgsql/src/include/access/xlog_internal.h,v 1.31 2010/03/28 09:27:02 sriggs Exp $
  */
 #ifndef XLOG_INTERNAL_H
 #define XLOG_INTERNAL_H
@@ -71,7 +71,7 @@ typedef struct XLogContRecord
 /*
  * Each page of XLOG file has a header like this:
  */
-#define XLOG_PAGE_MAGIC 0x9002 /* can be used as WAL version indicator */
+#define XLOG_PAGE_MAGIC 0x9003 /* can be used as WAL version indicator */
 
 typedef struct XLogPageHeaderData
 {