OSDN Git Service

Introduce WAL records to log reuse of btree pages, allowing conflict
authorSimon Riggs <simon@2ndQuadrant.com>
Sat, 13 Feb 2010 00:59:58 +0000 (00:59 +0000)
committerSimon Riggs <simon@2ndQuadrant.com>
Sat, 13 Feb 2010 00:59:58 +0000 (00:59 +0000)
resolution during Hot Standby. Page reuse interlock requested by Tom.
Analysis and patch by me.

src/backend/access/nbtree/nbtpage.c
src/backend/access/nbtree/nbtxlog.c
src/include/access/nbtree.h

index b0eff77..5df975e 100644 (file)
@@ -9,7 +9,7 @@
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.118 2010/02/08 04:33:53 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.119 2010/02/13 00:59:58 sriggs Exp $
  *
  *     NOTES
  *        Postgres btree pages look like ordinary relation pages.      The opaque
@@ -447,6 +447,48 @@ _bt_checkpage(Relation rel, Buffer buf)
 }
 
 /*
+ * Log the reuse of a page from the FSM.
+ */
+static void
+_bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid)
+{
+       if (rel->rd_istemp)
+               return;
+
+       /* No ereport(ERROR) until changes are logged */
+       START_CRIT_SECTION();
+
+       /*
+        * We don't do MarkBufferDirty here because we're about to initialise
+        * the page, and nobody else can see it yet.
+        */
+
+       /* XLOG stuff */
+       {
+               XLogRecPtr      recptr;
+               XLogRecData rdata[1];
+               xl_btree_reuse_page xlrec_reuse;
+
+               xlrec_reuse.node = rel->rd_node;
+               xlrec_reuse.block = blkno;
+               xlrec_reuse.latestRemovedXid = latestRemovedXid;
+               rdata[0].data = (char *) &xlrec_reuse;
+               rdata[0].len = SizeOfBtreeReusePage;
+               rdata[0].buffer = InvalidBuffer;
+               rdata[0].next = NULL;
+
+               recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE, rdata);
+
+               /*
+                * We don't do PageSetLSN or PageSetTLI here because
+                * we're about to initialise the page, so no need.
+                */
+       }
+
+       END_CRIT_SECTION();
+}
+
+/*
  *     _bt_getbuf() -- Get a buffer by block number for read or write.
  *
  *             blkno == P_NEW means to get an unallocated index page.  The page
@@ -510,7 +552,19 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
                        {
                                page = BufferGetPage(buf);
                                if (_bt_page_recyclable(page))
-                               {
+                               {
+                                       /*
+                                        * If we are generating WAL for Hot Standby then create
+                                        * a WAL record that will allow us to conflict with
+                                        * queries running on standby.
+                                        */
+                                       if (XLogStandbyInfoActive())
+                                       {
+                                               BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+                                               _bt_log_reuse_page(rel, blkno, opaque->btpo.xact);
+                                       }
+
                                        /* Okay to use page.  Re-initialize and return it */
                                        _bt_pageinit(page, BufferGetPageSize(buf));
                                        return buf;
index 83a7c98..f5320fb 100644 (file)
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.60 2010/02/08 04:33:53 tgl Exp $
+ *       $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.61 2010/02/13 00:59:58 sriggs Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -814,26 +814,48 @@ btree_redo(XLogRecPtr lsn, XLogRecord *record)
 {
        uint8           info = record->xl_info & ~XLR_INFO_MASK;
 
-       /*
-        * Btree delete records can conflict with standby queries. You might
-        * think that vacuum records would conflict as well, but we've handled
-        * that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
-        * cleaned by the vacuum of the heap and so we can resolve any conflicts
-        * just once when that arrives. After that any we know that no conflicts
-        * exist from individual btree vacuum records on that index.
-        */
-       if (InHotStandby && info == XLOG_BTREE_DELETE)
+       if (InHotStandby)
        {
-               xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
+               switch (info)
+               {
+                       case XLOG_BTREE_DELETE:
+                               /*
+                                * Btree delete records can conflict with standby queries. You might
+                                * think that vacuum records would conflict as well, but we've handled
+                                * that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
+                                * cleaned by the vacuum of the heap and so we can resolve any conflicts
+                                * just once when that arrives. After that we know that no conflicts
+                                * exist from individual btree vacuum records on that index.
+                                */
+                               {
+                                       xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
 
-               /*
-                * XXX Currently we put everybody on death row, because
-                * currently _bt_delitems() supplies InvalidTransactionId.
-                * This can be fairly painful, so providing a better value
-                * here is worth some thought and possibly some effort to
-                * improve.
-                */
-               ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node);
+                                       /*
+                                        * XXX Currently we put everybody on death row, because
+                                        * currently _bt_delitems() supplies InvalidTransactionId.
+                                        * This can be fairly painful, so providing a better value
+                                        * here is worth some thought and possibly some effort to
+                                        * improve.
+                                        */
+                                       ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node);
+                               }
+                               break;
+
+                       case XLOG_BTREE_REUSE_PAGE:
+                               /*
+                                * Btree reuse page records exist to provide a conflict point when we
+                                * reuse pages in the index via the FSM. That's all it does though.
+                                */
+                               {
+                                       xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record);
+
+                                       ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node);
+                               }
+                               return;
+
+                       default:
+                               break;
+               }
        }
 
        /*
index acbb0cb..f3898a4 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.128 2010/02/08 04:33:54 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.129 2010/02/13 00:59:58 sriggs Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -221,6 +221,7 @@ typedef struct BTMetaPageData
 #define XLOG_BTREE_DELETE_PAGE_HALF 0xB0               /* page deletion that makes
                                                                                                 * parent half-dead */
 #define XLOG_BTREE_VACUUM              0xC0    /* delete entries on a page during vacuum */
+#define XLOG_BTREE_REUSE_PAGE  0xD0    /* old page is about to be reused from FSM */
 
 /*
  * All that we need to find changed index tuple
@@ -322,6 +323,18 @@ typedef struct xl_btree_delete
 #define SizeOfBtreeDelete      (offsetof(xl_btree_delete, latestRemovedXid) + sizeof(TransactionId))
 
 /*
+ * This is what we need to know about page reuse within btree.
+ */
+typedef struct xl_btree_reuse_page
+{
+       RelFileNode node;
+       BlockNumber block;
+       TransactionId   latestRemovedXid;
+} xl_btree_reuse_page;
+
+#define SizeOfBtreeReusePage   (sizeof(xl_btree_reuse_page))
+
+/*
  * This is what we need to know about vacuum of individual leaf index tuples.
  * The WAL record can represent deletion of any number of index tuples on a
  * single index page when executed by VACUUM.