OSDN Git Service

Implement function-local GUC parameter settings, as per recent discussion.
[pg-rex/syncrep.git] / src / backend / access / transam / xact.c
index f938cdc..18787d1 100644 (file)
  * xact.c
  *       top level transaction system support routines
  *
- * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
+ * See src/backend/access/transam/README for more information.
+ *
+ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.173 2004/07/28 14:23:27 tgl Exp $
- *
- * NOTES
- *             Transaction aborts can now occur two ways:
- *
- *             1)      system dies from some internal cause  (syntax error, etc..)
- *             2)      user types ABORT
- *
- *             These two cases used to be treated identically, but now
- *             we need to distinguish them.  Why?      consider the following
- *             two situations:
- *
- *                             case 1                                                  case 2
- *                             ------                                                  ------
- *             1) user types BEGIN                             1) user types BEGIN
- *             2) user does something                  2) user does something
- *             3) user does not like what              3) system aborts for some reason
- *                she sees and types ABORT
- *
- *             In case 1, we want to abort the transaction and return to the
- *             default state.  In case 2, there may be more commands coming
- *             our way which are part of the same transaction block and we have
- *             to ignore these commands until we see a COMMIT transaction or
- *             ROLLBACK.
- *
- *             Internal aborts are now handled by AbortTransactionBlock(), just as
- *             they always have been, and user aborts are now handled by
- *             UserAbortTransactionBlock().  Both of them rely on AbortTransaction()
- *             to do all the real work.  The only difference is what state we
- *             enter after AbortTransaction() does its work:
- *
- *             * AbortTransactionBlock() leaves us in TBLOCK_ABORT and
- *             * UserAbortTransactionBlock() leaves us in TBLOCK_ENDABORT
- *
- *             Low-level transaction abort handling is divided into two phases:
- *             * AbortTransaction() executes as soon as we realize the transaction
- *               has failed.  It should release all shared resources (locks etc)
- *               so that we do not delay other backends unnecessarily.
- *             * CleanupTransaction() executes when we finally see a user COMMIT
- *               or ROLLBACK command; it cleans things up and gets us out of
- *               the transaction internally.  In particular, we mustn't destroy
- *               TopTransactionContext until this point.
- *
- *      NOTES
- *             The essential aspects of the transaction system are:
- *
- *                             o  transaction id generation
- *                             o  transaction log updating
- *                             o  memory cleanup
- *                             o  cache invalidation
- *                             o  lock cleanup
- *
- *             Hence, the functional division of the transaction code is
- *             based on which of the above things need to be done during
- *             a start/commit/abort transaction.  For instance, the
- *             routine AtCommit_Memory() takes care of all the memory
- *             cleanup stuff done at commit time.
- *
- *             The code is layered as follows:
- *
- *                             StartTransaction
- *                             CommitTransaction
- *                             AbortTransaction
- *                             CleanupTransaction
- *
- *             are provided to do the lower level work like recording
- *             the transaction status in the log and doing memory cleanup.
- *             above these routines are another set of functions:
- *
- *                             StartTransactionCommand
- *                             CommitTransactionCommand
- *                             AbortCurrentTransaction
- *
- *             These are the routines used in the postgres main processing
- *             loop.  They are sensitive to the current transaction block state
- *             and make calls to the lower level routines appropriately.
- *
- *             Support for transaction blocks is provided via the functions:
- *
- *                             BeginTransactionBlock
- *                             CommitTransactionBlock
- *                             AbortTransactionBlock
- *
- *             These are invoked only in response to a user "BEGIN WORK", "COMMIT",
- *             or "ROLLBACK" command.  The tricky part about these functions
- *             is that they are called within the postgres main loop, in between
- *             the StartTransactionCommand() and CommitTransactionCommand().
- *
- *             For example, consider the following sequence of user commands:
- *
- *             1)              begin
- *             2)              select * from foo
- *             3)              insert into foo (bar = baz)
- *             4)              commit
- *
- *             in the main processing loop, this results in the following
- *             transaction sequence:
- *
- *                     /       StartTransactionCommand();
- *             1) /    ProcessUtility();                               << begin
- *                \            BeginTransactionBlock();
- *                     \       CommitTransactionCommand();
- *
- *                     /       StartTransactionCommand();
- *             2) <    ProcessQuery();                                 << select * from foo
- *                     \       CommitTransactionCommand();
- *
- *                     /       StartTransactionCommand();
- *             3) <    ProcessQuery();                                 << insert into foo (bar = baz)
- *                     \       CommitTransactionCommand();
- *
- *                     /       StartTransactionCommand();
- *             4) /    ProcessUtility();                               << commit
- *                \            CommitTransactionBlock();
- *                     \       CommitTransactionCommand();
- *
- *             The point of this example is to demonstrate the need for
- *             StartTransactionCommand() and CommitTransactionCommand() to
- *             be state smart -- they should do nothing in between the calls
- *             to BeginTransactionBlock() and EndTransactionBlock() and
- *             outside these calls they need to do normal start/commit
- *             processing.
- *
- *             Furthermore, suppose the "select * from foo" caused an abort
- *             condition.      We would then want to abort the transaction and
- *             ignore all subsequent commands up to the "commit".
- *             -cim 3/23/90
+ *       $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.247 2007/09/03 00:39:13 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include <time.h>
 #include <unistd.h>
 
+#include "access/multixact.h"
 #include "access/subtrans.h"
+#include "access/transam.h"
+#include "access/twophase.h"
 #include "access/xact.h"
-#include "catalog/heap.h"
-#include "catalog/index.h"
+#include "access/xlogutils.h"
 #include "catalog/namespace.h"
 #include "commands/async.h"
 #include "commands/tablecmds.h"
 #include "commands/trigger.h"
-#include "commands/user.h"
 #include "executor/spi.h"
 #include "libpq/be-fsstubs.h"
 #include "miscadmin.h"
+#include "pgstat.h"
 #include "storage/fd.h"
-#include "storage/proc.h"
-#include "storage/sinval.h"
+#include "storage/lmgr.h"
+#include "storage/procarray.h"
 #include "storage/smgr.h"
+#include "utils/combocid.h"
+#include "utils/flatfiles.h"
 #include "utils/guc.h"
 #include "utils/inval.h"
 #include "utils/memutils.h"
-#include "utils/portal.h"
-#include "utils/resowner.h"
-#include "pgstat.h"
+#include "utils/relcache.h"
+
 
+/*
+ *     User-tweakable parameters
+ */
+int                    DefaultXactIsoLevel = XACT_READ_COMMITTED;
+int                    XactIsoLevel;
+
+bool           DefaultXactReadOnly = false;
+bool           XactReadOnly;
+
+bool           XactSyncCommit = true;
+
+int                    CommitDelay = 0;        /* precommit delay in microseconds */
+int                    CommitSiblings = 5; /* # concurrent xacts needed to sleep */
 
 
 /*
  */
 typedef enum TransState
 {
-       TRANS_DEFAULT,
-       TRANS_START,
-       TRANS_INPROGRESS,
-       TRANS_COMMIT,
-       TRANS_ABORT
+       TRANS_DEFAULT,                          /* idle */
+       TRANS_START,                            /* transaction starting */
+       TRANS_INPROGRESS,                       /* inside a valid transaction */
+       TRANS_COMMIT,                           /* commit in progress */
+       TRANS_ABORT,                            /* abort in progress */
+       TRANS_PREPARE                           /* prepare in progress */
 } TransState;
 
 /*
  *     transaction block states - transaction state of client queries
+ *
+ * Note: the subtransaction states are used only for non-topmost
+ * transactions; the others appear only in the topmost transaction.
+ *
+ * The enumerators are listed in three groups, marked by the inline group
+ * comments below: not-in-transaction-block states, transaction block
+ * states, and subtransaction states.  Each state's trailing comment
+ * describes the situation it stands for.
  */
 typedef enum TBlockState
 {
        /* not-in-transaction-block states */
-       TBLOCK_DEFAULT,
-       TBLOCK_STARTED,
+       TBLOCK_DEFAULT,                         /* idle */
+       TBLOCK_STARTED,                         /* running single-query transaction */
 
        /* transaction block states */
-       TBLOCK_BEGIN,
-       TBLOCK_INPROGRESS,
-       TBLOCK_END,
-       TBLOCK_ABORT,
-       TBLOCK_ENDABORT,
+       TBLOCK_BEGIN,                           /* starting transaction block */
+       TBLOCK_INPROGRESS,                      /* live transaction */
+       TBLOCK_END,                                     /* COMMIT received */
+       TBLOCK_ABORT,                           /* failed xact, awaiting ROLLBACK */
+       TBLOCK_ABORT_END,                       /* failed xact, ROLLBACK received */
+       TBLOCK_ABORT_PENDING,           /* live xact, ROLLBACK received */
+       TBLOCK_PREPARE,                         /* live xact, PREPARE received */
 
        /* subtransaction states */
-       TBLOCK_SUBBEGIN,
-       TBLOCK_SUBINPROGRESS,
-       TBLOCK_SUBEND,
-       TBLOCK_SUBABORT,
-       TBLOCK_SUBABORT_PENDING,
-       TBLOCK_SUBENDABORT_ALL,
-       TBLOCK_SUBENDABORT_RELEASE,
-       TBLOCK_SUBENDABORT
+       TBLOCK_SUBBEGIN,                        /* starting a subtransaction */
+       TBLOCK_SUBINPROGRESS,           /* live subtransaction */
+       TBLOCK_SUBEND,                          /* RELEASE received */
+       TBLOCK_SUBABORT,                        /* failed subxact, awaiting ROLLBACK */
+       TBLOCK_SUBABORT_END,            /* failed subxact, ROLLBACK received */
+       TBLOCK_SUBABORT_PENDING,        /* live subxact, ROLLBACK received */
+       TBLOCK_SUBRESTART,                      /* live subxact, ROLLBACK TO received */
+       TBLOCK_SUBABORT_RESTART         /* failed subxact, ROLLBACK TO received */
 } TBlockState;
 
 /*
@@ -213,60 +111,24 @@ typedef enum TBlockState
  */
 typedef struct TransactionStateData
 {
-       TransactionId   transactionIdData;              /* my XID */
-       char               *name;                                       /* savepoint name, if any */
-       int                             savepointLevel;                 /* savepoint level */
-       CommandId               commandId;                              /* current CID */
-       TransState              state;                                  /* low-level state */
-       TBlockState             blockState;                             /* high-level state */
-       int                             nestingLevel;                   /* nest depth */
-       MemoryContext   curTransactionContext;  /* my xact-lifetime context */
-       ResourceOwner   curTransactionOwner;    /* my query resources */
-       List               *childXids;                          /* subcommitted child XIDs */
-       AclId                   currentUser;                    /* subxact start current_user */
-       bool                    prevXactReadOnly;               /* entry-time xact r/o state */
-       struct TransactionStateData *parent;    /* back link to parent */
+       TransactionId transactionId;    /* my XID, or Invalid if none */
+       SubTransactionId subTransactionId;      /* my subxact ID */
+       char       *name;                       /* savepoint name, if any */
+       int                     savepointLevel; /* savepoint level */
+       TransState      state;                  /* low-level state */
+       TBlockState blockState;         /* high-level state */
+       int                     nestingLevel;   /* transaction nesting depth */
+       int                     gucNestLevel;   /* GUC context nesting depth */
+       MemoryContext curTransactionContext;            /* my xact-lifetime context */
+       ResourceOwner curTransactionOwner;      /* my query resources */
+       List       *childXids;          /* subcommitted child XIDs */
+       Oid                     currentUser;    /* subxact start current_user */
+       bool            prevXactReadOnly;               /* entry-time xact r/o state */
+       struct TransactionStateData *parent;            /* back link to parent */
 } TransactionStateData;
 
 typedef TransactionStateData *TransactionState;
 
-
-static void AbortTransaction(void);
-static void AtAbort_Memory(void);
-static void AtCleanup_Memory(void);
-static void AtCommit_LocalCache(void);
-static void AtCommit_Memory(void);
-static void AtStart_Cache(void);
-static void AtStart_Memory(void);
-static void AtStart_ResourceOwner(void);
-static void CallEOXactCallbacks(bool isCommit);
-static void CleanupTransaction(void);
-static void CommitTransaction(void);
-static void RecordTransactionAbort(void);
-static void StartTransaction(void);
-
-static void RecordSubTransactionCommit(void);
-static void StartSubTransaction(void);
-static void CommitSubTransaction(void);
-static void AbortSubTransaction(void);
-static void CleanupSubTransaction(void);
-static void StartAbortedSubTransaction(void);
-static void PushTransaction(void);
-static void PopTransaction(void);
-static void CommitTransactionToLevel(int level);
-static char *CleanupAbortedSubTransactions(bool returnName);
-
-static void AtSubAbort_Memory(void);
-static void AtSubCleanup_Memory(void);
-static void AtSubCommit_Memory(void);
-static void AtSubStart_Memory(void);
-static void AtSubStart_ResourceOwner(void);
-
-static void ShowTransactionState(const char *str);
-static void ShowTransactionStateRec(TransactionState state);
-static const char *BlockStateAsString(TBlockState blockState);
-static const char *TransStateAsString(TransState state);
-
 /*
  * CurrentTransactionState always points to the current transaction state
  * block.  It will point to TopTransactionStateData when not in a
@@ -274,13 +136,14 @@ static const char *TransStateAsString(TransState state);
  */
 static TransactionStateData TopTransactionStateData = {
        0,                                                      /* transaction id */
+       0,                                                      /* subtransaction id */
        NULL,                                           /* savepoint name */
        0,                                                      /* savepoint level */
-       FirstCommandId,                         /* command id */
        TRANS_DEFAULT,                          /* transaction state */
        TBLOCK_DEFAULT,                         /* transaction block state from the client
                                                                 * perspective */
-       0,                                                      /* nesting level */
+       0,                                                      /* transaction nesting depth */
+       0,                                                      /* GUC context nesting depth */
        NULL,                                           /* cur transaction context */
        NULL,                                           /* cur transaction resource owner */
        NIL,                                            /* subcommitted child Xids */
@@ -292,42 +155,105 @@ static TransactionStateData TopTransactionStateData = {
 static TransactionState CurrentTransactionState = &TopTransactionStateData;
 
 /*
- * These vars hold the value of now(), ie, the transaction start time.
- * This does not change as we enter and exit subtransactions, so we don't
- * keep it inside the TransactionState stack.
+ * The subtransaction ID and command ID assignment counters are global
+ * to a whole transaction, so we do not keep them in the state stack.
  */
-static AbsoluteTime xactStartTime;                     /* integer part */
-static int             xactStartTimeUsec;                      /* microsecond part */
+static SubTransactionId currentSubTransactionId;
+static CommandId currentCommandId;
 
+/*
+ * xactStartTimestamp is the value of transaction_timestamp().
+ * stmtStartTimestamp is the value of statement_timestamp().
+ * xactStopTimestamp is the time at which we log a commit or abort WAL record.
+ * These do not change as we enter and exit subtransactions, so we don't
+ * keep them inside the TransactionState stack.
+ */
+static TimestampTz xactStartTimestamp;
+static TimestampTz stmtStartTimestamp;
+static TimestampTz xactStopTimestamp;
 
 /*
- *     User-tweakable parameters
+ * GID to be used for preparing the current transaction.  This is also
+ * global to a whole transaction, so we don't keep it in the state stack.
  */
-int                    DefaultXactIsoLevel = XACT_READ_COMMITTED;
-int                    XactIsoLevel;
+static char *prepareGID;
 
-bool           DefaultXactReadOnly = false;
-bool           XactReadOnly;
+/*
+ * Some commands want to force synchronous commit.
+ */
+static bool forceSyncCommit = false;
 
-int                    CommitDelay = 0;        /* precommit delay in microseconds */
-int                    CommitSiblings = 5; /* number of concurrent xacts needed to
-                                                                * sleep */
+/*
+ * Private context for transaction-abort work --- we reserve space for this
+ * at startup to ensure that AbortTransaction and AbortSubTransaction can work
+ * when we've run out of memory.
+ */
+static MemoryContext TransactionAbortContext = NULL;
+
+/*
+ * List of add-on start- and end-of-xact callbacks
+ *
+ * Each item is one node of a singly linked list chained through "next".
+ * It stores a callback function pointer together with an opaque "arg"
+ * pointer.  The list head is the static Xact_callbacks pointer, which
+ * starts out NULL (empty list).
+ */
+typedef struct XactCallbackItem
+{
+       struct XactCallbackItem *next;  /* next item in the list */
+       XactCallback callback;  /* callback function pointer */
+       void       *arg;        /* opaque pointer stored with the callback;
+                                                * presumably handed back to it -- confirm in
+                                                * CallXactCallbacks */
+} XactCallbackItem;
 
+static XactCallbackItem *Xact_callbacks = NULL;
 
 /*
- * List of add-on end-of-xact callbacks
+ * List of add-on start- and end-of-subxact callbacks
+ *
+ * Same shape as the top-level transaction callback list: each item is one
+ * node of a singly linked list chained through "next", and stores a
+ * callback function pointer together with an opaque "arg" pointer.  The
+ * list head is the static SubXact_callbacks pointer, which starts out
+ * NULL (empty list).
  */
-typedef struct EOXactCallbackItem
+typedef struct SubXactCallbackItem
 {
-       struct EOXactCallbackItem *next;
-       EOXactCallback callback;
+       struct SubXactCallbackItem *next;       /* next item in the list */
+       SubXactCallback callback;       /* callback function pointer */
        void       *arg;
-} EOXactCallbackItem;
+} SubXactCallbackItem;
+
+static SubXactCallbackItem *SubXact_callbacks = NULL;
+
 
-static EOXactCallbackItem *EOXact_callbacks = NULL;
+/* local function prototypes */
+static void AssignSubTransactionId(TransactionState s);
+static void AbortTransaction(void);
+static void AtAbort_Memory(void);
+static void AtCleanup_Memory(void);
+static void AtAbort_ResourceOwner(void);
+static void AtCommit_LocalCache(void);
+static void AtCommit_Memory(void);
+static void AtStart_Cache(void);
+static void AtStart_Memory(void);
+static void AtStart_ResourceOwner(void);
+static void CallXactCallbacks(XactEvent event);
+static void CallSubXactCallbacks(SubXactEvent event,
+                                        SubTransactionId mySubid,
+                                        SubTransactionId parentSubid);
+static void CleanupTransaction(void);
+static void CommitTransaction(void);
+static void RecordTransactionAbort(void);
+static void StartTransaction(void);
+
+static void RecordSubTransactionCommit(void);
+static void StartSubTransaction(void);
+static void CommitSubTransaction(void);
+static void AbortSubTransaction(void);
+static void CleanupSubTransaction(void);
+static void PushTransaction(void);
+static void PopTransaction(void);
+
+static void AtSubAbort_Memory(void);
+static void AtSubCleanup_Memory(void);
+static void AtSubAbort_ResourceOwner(void);
+static void AtSubCommit_Memory(void);
+static void AtSubStart_Memory(void);
+static void AtSubStart_ResourceOwner(void);
 
-static void (*_RollbackFunc) (void *) = NULL;
-static void *_RollbackData = NULL;
+static void ShowTransactionState(const char *str);
+static void ShowTransactionStateRec(TransactionState state);
+static const char *BlockStateAsString(TBlockState blockState);
+static const char *TransStateAsString(TransState state);
 
 
 /* ----------------------------------------------------------------
@@ -338,32 +264,22 @@ static void *_RollbackData = NULL;
 /*
  *     IsTransactionState
  *
- *     This returns true if we are currently running a query
- *     within an executing transaction.
+ *     This returns true if we are inside a valid transaction; that is,
+ *     it is safe to initiate database access, take heavyweight locks, etc.
+ *
+ *     Concretely, the result is true exactly when the low-level state of
+ *     the current transaction state entry is TRANS_INPROGRESS, and false
+ *     for every other TransState value.
  */
 bool
 IsTransactionState(void)
 {
        TransactionState s = CurrentTransactionState;
 
-       switch (s->state)
-       {
-               case TRANS_DEFAULT:
-                       return false;
-               case TRANS_START:
-                       return true;
-               case TRANS_INPROGRESS:
-                       return true;
-               case TRANS_COMMIT:
-                       return true;
-               case TRANS_ABORT:
-                       return true;
-       }
-
        /*
-        * Shouldn't get here, but lint is not happy with this...
+        * TRANS_DEFAULT and TRANS_ABORT are obviously unsafe states.  However,
+        * we also reject the startup/shutdown states TRANS_START, TRANS_COMMIT,
+        * TRANS_PREPARE since it might be too soon or too late within those
+        * transition states to do anything interesting.  Hence, the only "valid"
+        * state is TRANS_INPROGRESS.
+        *
+        * Note this is stricter than the code being replaced, which returned
+        * true for every state except TRANS_DEFAULT.
         */
-       return false;
+       return (s->state == TRANS_INPROGRESS);
 }
 
 /*
@@ -377,7 +293,7 @@ IsAbortedTransactionBlockState(void)
 {
        TransactionState s = CurrentTransactionState;
 
-       if (s->blockState == TBLOCK_ABORT || 
+       if (s->blockState == TBLOCK_ABORT ||
                s->blockState == TBLOCK_SUBABORT)
                return true;
 
@@ -389,24 +305,106 @@ IsAbortedTransactionBlockState(void)
  *     GetTopTransactionId
  *
  * Get the ID of the main transaction, even if we are currently inside
- * a subtransaction.
+ * a subtransaction.  If we are not in a transaction at all, or if we
+ * are in transaction startup and haven't yet assigned an XID,
+ * InvalidTransactionId is returned.
  */
 TransactionId
 GetTopTransactionId(void)
 {
-       return TopTransactionStateData.transactionIdData;
+       return TopTransactionStateData.transactionId;
 }
 
 
 /*
  *     GetCurrentTransactionId
+ *
+ * We do not assign XIDs to subtransactions until/unless this is called.
+ * When we do assign an XID to a subtransaction, recursively make sure
+ * its parent has one as well (this maintains the invariant that a child
+ * transaction has an XID following its parent's).
+ *
+ * Returns the XID stored in the current (innermost) transaction state
+ * entry, first calling AssignSubTransactionId() if that entry has no
+ * valid XID yet.
+ *
+ * AssignSubTransactionId() (defined just below) performs the assignment
+ * for state entry "s": it asserts that s has a parent and is in state
+ * TRANS_INPROGRESS, recurses into the parent first if the parent lacks an
+ * XID, obtains a new XID with GetNewTransactionId(true), records the
+ * parent link with SubTransSetParent(), and takes the XID lock via
+ * XactLockTableInsert() with CurrentResourceOwner temporarily switched to
+ * s's own resource owner.
+ *
+ * NOTE(review): because AssignSubTransactionId() asserts
+ * s->parent != NULL, this lazy path presumably never runs for the
+ * top-level entry, i.e. the top transaction is expected to have its XID
+ * already -- confirm against StartTransaction().
  */
 TransactionId
 GetCurrentTransactionId(void)
 {
        TransactionState s = CurrentTransactionState;
 
-       return s->transactionIdData;
+       if (!TransactionIdIsValid(s->transactionId))
+               AssignSubTransactionId(s);
+
+       return s->transactionId;
+}
+
+static void
+AssignSubTransactionId(TransactionState s)
+{
+       ResourceOwner currentOwner;     /* owner to restore when done */
+
+       Assert(s->parent != NULL);
+       Assert(s->state == TRANS_INPROGRESS);
+       if (!TransactionIdIsValid(s->parent->transactionId))
+               AssignSubTransactionId(s->parent);
+
+       /*
+        * Generate a new Xid and record it in PG_PROC and pg_subtrans.
+        *
+        * NB: we must make the subtrans entry BEFORE the Xid appears anywhere in
+        * shared storage other than PG_PROC; because if there's no room for it in
+        * PG_PROC, the subtrans entry is needed to ensure that other backends see
+        * the Xid as "running".  See GetNewTransactionId.
+        */
+       s->transactionId = GetNewTransactionId(true);
+
+       SubTransSetParent(s->transactionId, s->parent->transactionId);
+
+       /*
+        * Acquire lock on the transaction XID.  (We assume this cannot block.) We
+        * have to be sure that the lock is assigned to the transaction's
+        * ResourceOwner.
+        *
+        * To do that we point CurrentResourceOwner at s->curTransactionOwner
+        * for the duration of the XactLockTableInsert() call, and put the
+        * saved value back on both the error path (PG_CATCH, before
+        * re-throwing) and the normal path.
+        */
+       currentOwner = CurrentResourceOwner;
+       PG_TRY();
+       {
+               CurrentResourceOwner = s->curTransactionOwner;
+
+               XactLockTableInsert(s->transactionId);
+       }
+       PG_CATCH();
+       {
+               /* Ensure CurrentResourceOwner is restored on error */
+               CurrentResourceOwner = currentOwner;
+               PG_RE_THROW();
+       }
+       PG_END_TRY();
+       CurrentResourceOwner = currentOwner;
+}
+
+
+/*
+ *     GetCurrentTransactionIdIfAny
+ *
+ * Unlike GetCurrentTransactionId, this will return InvalidTransactionId
+ * if we are currently not in a transaction, or in a transaction or
+ * subtransaction that has not yet assigned itself an XID.
+ *
+ * This is a pure read: it returns the transactionId field of the current
+ * transaction state entry as-is and never assigns an XID.
+ */
+TransactionId
+GetCurrentTransactionIdIfAny(void)
+{
+       TransactionState s = CurrentTransactionState;
+
+       return s->transactionId;
+}
+
+
+/*
+ *     GetCurrentSubTransactionId
+ *
+ * Returns the subTransactionId field of the current transaction state
+ * entry (the one CurrentTransactionState points at).
+ */
+SubTransactionId
+GetCurrentSubTransactionId(void)
+{
+       TransactionState s = CurrentTransactionState;
+
+       return s->subTransactionId;
+}
 
 
@@ -416,32 +414,59 @@ GetCurrentTransactionId(void)
 CommandId
 GetCurrentCommandId(void)
 {
-       TransactionState s = CurrentTransactionState;
-
-       return s->commandId;
+       /* this is global to a transaction, not subtransaction-local */
+       return currentCommandId;
 }
 
+/*
+ *     GetCurrentTransactionStartTimestamp
+ *
+ * Returns xactStartTimestamp, i.e. the value of transaction_timestamp().
+ * That variable is kept outside the TransactionState stack because it
+ * does not change as subtransactions are entered and exited.
+ */
+TimestampTz
+GetCurrentTransactionStartTimestamp(void)
+{
+       return xactStartTimestamp;
+}
 
 /*
- *     GetCurrentTransactionStartTime
+ *     GetCurrentStatementStartTimestamp
+ *
+ * Returns stmtStartTimestamp, i.e. the value of statement_timestamp().
+ * The value is whatever SetCurrentStatementStartTimestamp() last stored.
  */
-AbsoluteTime
-GetCurrentTransactionStartTime(void)
+TimestampTz
+GetCurrentStatementStartTimestamp(void)
 {
-       return xactStartTime;
+       return stmtStartTimestamp;
 }
 
+/*
+ *     GetCurrentTransactionStopTimestamp
+ *
+ * We return current time if the transaction stop time hasn't been set
+ * (which can happen if we decide we don't need to log an XLOG record).
+ *
+ * A stored xactStopTimestamp of 0 is treated as "not set"; in that case
+ * the result comes from GetCurrentTimestamp() and is not stored.
+ */
+TimestampTz
+GetCurrentTransactionStopTimestamp(void)
+{
+       if (xactStopTimestamp != 0)
+               return xactStopTimestamp;
+       return GetCurrentTimestamp();
+}
 
 /*
- *     GetCurrentTransactionStartTimeUsec
+ *     SetCurrentStatementStartTimestamp
+ *
+ * Stores the current time, as returned by GetCurrentTimestamp(), in
+ * stmtStartTimestamp (the value of statement_timestamp()).
  */
-AbsoluteTime
-GetCurrentTransactionStartTimeUsec(int *msec)
+void
+SetCurrentStatementStartTimestamp(void)
 {
-       *msec = xactStartTimeUsec;
-       return xactStartTime;
+       stmtStartTimestamp = GetCurrentTimestamp();
 }
 
+/*
+ *     SetCurrentTransactionStopTimestamp
+ *
+ * File-local helper: stores the current time, as returned by
+ * GetCurrentTimestamp(), in xactStopTimestamp (the time at which we log a
+ * commit or abort WAL record).
+ */
+static inline void
+SetCurrentTransactionStopTimestamp(void)
+{
+       xactStopTimestamp = GetCurrentTimestamp();
+}
 
 /*
  *     GetCurrentTransactionNestLevel
@@ -460,41 +485,50 @@ GetCurrentTransactionNestLevel(void)
 
 /*
  *     TransactionIdIsCurrentTransactionId
- *
- *     During bootstrap, we cheat and say "it's not my transaction ID" even though
- *     it is.  Along with transam.c's cheat to say that the bootstrap XID is
- *     already committed, this causes the tqual.c routines to see previously
- *     inserted tuples as committed, which is what we need during bootstrap.
  */
 bool
 TransactionIdIsCurrentTransactionId(TransactionId xid)
 {
-       TransactionState s = CurrentTransactionState;
+       TransactionState s;
 
-       if (AMI_OVERRIDE)
-       {
-               Assert(xid == BootstrapTransactionId);
+       /*
+        * We always say that BootstrapTransactionId is "not my transaction ID"
+        * even when it is (ie, during bootstrap).      Along with the fact that
+        * transam.c always treats BootstrapTransactionId as already committed,
+        * this causes the tqual.c routines to see all tuples as committed, which
+        * is what we need during bootstrap.  (Bootstrap mode only inserts tuples,
+        * it never updates or deletes them, so all tuples can be presumed good
+        * immediately.)
+        *
+        * Likewise, InvalidTransactionId and FrozenTransactionId are certainly
+        * not my transaction ID, so we can just return "false" immediately for
+        * any non-normal XID.
+        */
+       if (!TransactionIdIsNormal(xid))
                return false;
-       }
 
        /*
-        * We will return true for the Xid of the current subtransaction,
-        * any of its subcommitted children, any of its parents, or any of
-        * their previously subcommitted children.
+        * We will return true for the Xid of the current subtransaction, any of
+        * its subcommitted children, any of its parents, or any of their
+        * previously subcommitted children.  However, a transaction being aborted
+        * is no longer "current", even though it may still have an entry on the
+        * state stack.
         */
-       while (s != NULL)
+       for (s = CurrentTransactionState; s != NULL; s = s->parent)
        {
-               ListCell *cell;
+               ListCell   *cell;
 
-               if (TransactionIdEquals(xid, s->transactionIdData))
+               if (s->state == TRANS_ABORT)
+                       continue;
+               if (!TransactionIdIsValid(s->transactionId))
+                       continue;                       /* it can't have any child XIDs either */
+               if (TransactionIdEquals(xid, s->transactionId))
                        return true;
                foreach(cell, s->childXids)
                {
-                       if (TransactionIdEquals(xid, lfirst_int(cell)))
+                       if (TransactionIdEquals(xid, lfirst_xid(cell)))
                                return true;
                }
-
-               s = s->parent;
        }
 
        return false;
@@ -507,19 +541,20 @@ TransactionIdIsCurrentTransactionId(TransactionId xid)
 void
 CommandCounterIncrement(void)
 {
-       TransactionState s = CurrentTransactionState;
-
-       s->commandId += 1;
-       if (s->commandId == FirstCommandId) /* check for overflow */
+       currentCommandId += 1;
+       if (currentCommandId == FirstCommandId)         /* check for overflow */
+       {
+               currentCommandId -= 1;
                ereport(ERROR,
                                (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
-                                errmsg("cannot have more than 2^32-1 commands in a transaction")));
+                 errmsg("cannot have more than 2^32-1 commands in a transaction")));
+       }
 
-       /* Propagate new command ID into query snapshots, if set */
-       if (QuerySnapshot)
-               QuerySnapshot->curcid = s->commandId;
+       /* Propagate new command ID into static snapshots, if set */
        if (SerializableSnapshot)
-               SerializableSnapshot->curcid = s->commandId;
+               SerializableSnapshot->curcid = currentCommandId;
+       if (LatestSnapshot)
+               LatestSnapshot->curcid = currentCommandId;
 
        /*
         * make cache changes visible to me.
@@ -528,6 +563,18 @@ CommandCounterIncrement(void)
        AtStart_Cache();
 }
 
+/*
+ * ForceSyncCommit
+ *
+ * Interface routine to allow commands to force a synchronous commit of the
+ * current top-level transaction
+ */
+void
+ForceSyncCommit(void)
+{
+       forceSyncCommit = true;
+}
+
 
 /* ----------------------------------------------------------------
  *                                             StartTransaction stuff
@@ -552,6 +599,21 @@ AtStart_Memory(void)
        TransactionState s = CurrentTransactionState;
 
        /*
+        * If this is the first time through, create a private context for
+        * AbortTransaction to work in.  By reserving some space now, we can
+        * insulate AbortTransaction from out-of-memory scenarios.  Like
+        * ErrorContext, we set it up with slow growth rate and a nonzero
+        * minimum size, so that space will be reserved immediately.
+        */
+       if (TransactionAbortContext == NULL)
+               TransactionAbortContext =
+                       AllocSetContextCreate(TopMemoryContext,
+                                                                 "TransactionAbortContext",
+                                                                 32 * 1024,
+                                                                 32 * 1024,
+                                                                 32 * 1024);
+
+       /*
         * We shouldn't have a transaction context already.
         */
        Assert(TopTransactionContext == NULL);
@@ -642,8 +704,8 @@ AtSubStart_ResourceOwner(void)
        Assert(s->parent != NULL);
 
        /*
-        * Create a resource owner for the subtransaction.  We make it a
-        * child of the immediate parent's resource owner.
+        * Create a resource owner for the subtransaction.      We make it a child of
+        * the immediate parent's resource owner.
         */
        s->curTransactionOwner =
                ResourceOwnerCreate(s->parent->curTransactionOwner,
@@ -665,24 +727,25 @@ void
 RecordTransactionCommit(void)
 {
        int                     nrels;
-       RelFileNode *rptr;
+       RelFileNode *rels;
        int                     nchildren;
        TransactionId *children;
 
        /* Get data needed for commit record */
-       nrels = smgrGetPendingDeletes(true, &rptr);
+       nrels = smgrGetPendingDeletes(true, &rels);
        nchildren = xactGetCommittedChildren(&children);
 
        /*
-        * If we made neither any XLOG entries nor any temp-rel updates,
-        * and have no files to be deleted, we can omit recording the transaction
-        * commit at all.  (This test includes the effects of subtransactions,
-        * so the presence of committed subxacts need not alone force a write.)
+        * If we made neither any XLOG entries nor any temp-rel updates, and have
+        * no files to be deleted, we can omit recording the transaction commit at
+        * all.  (This test includes the effects of subtransactions, so the
+        * presence of committed subxacts need not alone force a write.)
         */
        if (MyXactMadeXLogEntry || MyXactMadeTempRelUpdate || nrels > 0)
        {
                TransactionId xid = GetCurrentTransactionId();
                bool            madeTCentries;
+               bool            isAsyncCommit = false;
                XLogRecPtr      recptr;
 
                /* Tell bufmgr and smgr to prepare for commit */
@@ -691,11 +754,8 @@ RecordTransactionCommit(void)
                START_CRIT_SECTION();
 
                /*
-                * We only need to log the commit in XLOG if the transaction made
-                * any transaction-controlled XLOG entries or will delete files.
-                * (If it made no transaction-controlled XLOG entries, its XID
-                * appears nowhere in permanent storage, so no one else will ever care
-                * if it committed.)
+                * We only need to log the commit in XLOG if the transaction made any
+                * transaction-controlled XLOG entries or will delete files.
                 */
                madeTCentries = (MyLastRecPtr.xrecoff != 0);
                if (madeTCentries || nrels > 0)
@@ -704,28 +764,49 @@ RecordTransactionCommit(void)
                        int                     lastrdata = 0;
                        xl_xact_commit xlrec;
 
-                       xlrec.xtime = time(NULL);
+                       /*
+                        * Mark ourselves as within our "commit critical section".  This
+                        * forces any concurrent checkpoint to wait until we've updated
+                        * pg_clog.  Without this, it is possible for the checkpoint to
+                        * set REDO after the XLOG record but fail to flush the pg_clog
+                        * update to disk, leading to loss of the transaction commit if
+                        * the system crashes a little later.
+                        *
+                        * Note: we could, but don't bother to, set this flag in
+                        * RecordTransactionAbort.  That's because loss of a transaction
+                        * abort is noncritical; the presumption would be that it aborted,
+                        * anyway.
+                        *
+                        * It's safe to change the inCommit flag of our own backend
+                        * without holding the ProcArrayLock, since we're the only one
+                        * modifying it.  This makes checkpoint's determination of which
+                        * xacts are inCommit a bit fuzzy, but it doesn't matter.
+                        */
+                       MyProc->inCommit = true;
+
+                       SetCurrentTransactionStopTimestamp();
+                       xlrec.xact_time = xactStopTimestamp;
                        xlrec.nrels = nrels;
                        xlrec.nsubxacts = nchildren;
-                       rdata[0].buffer = InvalidBuffer;
                        rdata[0].data = (char *) (&xlrec);
                        rdata[0].len = MinSizeOfXactCommit;
+                       rdata[0].buffer = InvalidBuffer;
                        /* dump rels to delete */
                        if (nrels > 0)
                        {
                                rdata[0].next = &(rdata[1]);
-                               rdata[1].buffer = InvalidBuffer;
-                               rdata[1].data = (char *) rptr;
+                               rdata[1].data = (char *) rels;
                                rdata[1].len = nrels * sizeof(RelFileNode);
+                               rdata[1].buffer = InvalidBuffer;
                                lastrdata = 1;
                        }
                        /* dump committed child Xids */
                        if (nchildren > 0)
                        {
                                rdata[lastrdata].next = &(rdata[2]);
-                               rdata[2].buffer = InvalidBuffer;
                                rdata[2].data = (char *) children;
                                rdata[2].len = nchildren * sizeof(TransactionId);
+                               rdata[2].buffer = InvalidBuffer;
                                lastrdata = 2;
                        }
                        rdata[lastrdata].next = NULL;
@@ -739,11 +820,11 @@ RecordTransactionCommit(void)
                }
 
                /*
-                * We must flush our XLOG entries to disk if we made any XLOG
-                * entries, whether in or out of transaction control.  For
-                * example, if we reported a nextval() result to the client, this
-                * ensures that any XLOG record generated by nextval will hit the
-                * disk before we report the transaction committed.
+                * We must flush our XLOG entries to disk if we made any XLOG entries,
+                * whether in or out of transaction control.  For example, if we
+                * reported a nextval() result to the client, this ensures that any
+                * XLOG record generated by nextval will hit the disk before we report
+                * the transaction committed.
                 *
                 * Note: if we generated a commit record above, MyXactMadeXLogEntry
                 * will certainly be set now.
@@ -751,40 +832,77 @@ RecordTransactionCommit(void)
                if (MyXactMadeXLogEntry)
                {
                        /*
-                        * Sleep before flush! So we can flush more than one commit
-                        * records per single fsync.  (The idea is some other backend
-                        * may do the XLogFlush while we're sleeping.  This needs work
-                        * still, because on most Unixen, the minimum select() delay
-                        * is 10msec or more, which is way too long.)
-                        *
-                        * We do not sleep if enableFsync is not turned on, nor if there
-                        * are fewer than CommitSiblings other backends with active
-                        * transactions.
+                        * If the user has set synchronous_commit = off, and we're
+                        * not doing cleanup of any rels nor committing any command
+                        * that wanted to force sync commit, then we can defer fsync.
                         */
-                       if (CommitDelay > 0 && enableFsync &&
-                               CountActiveBackends() >= CommitSiblings)
-                               pg_usleep(CommitDelay);
+                       if (XactSyncCommit || forceSyncCommit || nrels > 0)
+                       {
+                               /*
+                                * Synchronous commit case.
+                                *
+                                * Sleep before flush, so that we can flush more than one commit
+                                * record per single fsync.  (The idea is some other backend
+                                * may do the XLogFlush while we're sleeping.  This needs work
+                                * still, because on most Unixen, the minimum select() delay
+                                * is 10msec or more, which is way too long.)
+                                *
+                                * We do not sleep if enableFsync is not turned on, nor if
+                                * there are fewer than CommitSiblings other backends with
+                                * active transactions.
+                                */
+                               if (CommitDelay > 0 && enableFsync &&
+                                       CountActiveBackends() >= CommitSiblings)
+                                       pg_usleep(CommitDelay);
+
+                               XLogFlush(recptr);
+                       }
+                       else
+                       {
+                               /*
+                                * Asynchronous commit case.
+                                */
+                               isAsyncCommit = true;
 
-                       XLogFlush(recptr);
+                               /*
+                                * Report the latest async commit LSN, so that
+                                * the WAL writer knows to flush this commit.
+                                */
+                               XLogSetAsyncCommitLSN(recptr);
+                       }
                }
 
                /*
-                * We must mark the transaction committed in clog if its XID
-                * appears either in permanent rels or in local temporary rels. We
-                * test this by seeing if we made transaction-controlled entries
-                * *OR* local-rel tuple updates.  Note that if we made only the
-                * latter, we have not emitted an XLOG record for our commit, and
-                * so in the event of a crash the clog update might be lost.  This
-                * is okay because no one else will ever care whether we
-                * committed.
+                * We must mark the transaction committed in clog if its XID appears
+                * either in permanent rels or in local temporary rels. We test this
+                * by seeing if we made transaction-controlled entries *OR* local-rel
+                * tuple updates.  Note that if we made only the latter, we have not
+                * emitted an XLOG record for our commit, and so in the event of a
+                * crash the clog update might be lost.  This is okay because no one
+                * else will ever care whether we committed.
+                *
+                * The recptr here refers to the last xlog entry by this transaction
+                * so is the correct value to use for setting the clog.
                 */
                if (madeTCentries || MyXactMadeTempRelUpdate)
                {
-                       TransactionIdCommit(xid);
-                       /* to avoid race conditions, the parent must commit first */
-                       TransactionIdCommitTree(nchildren, children);
+                       if (isAsyncCommit)
+                       {
+                               TransactionIdAsyncCommit(xid, recptr);
+                               /* to avoid race conditions, the parent must commit first */
+                               TransactionIdAsyncCommitTree(nchildren, children, recptr);
+                       }
+                       else
+                       {
+                               TransactionIdCommit(xid);
+                               /* to avoid race conditions, the parent must commit first */
+                               TransactionIdCommitTree(nchildren, children);
+                       }
                }
 
+               /* Checkpoint can proceed now */
+               MyProc->inCommit = false;
+
                END_CRIT_SECTION();
        }
 
@@ -793,12 +911,9 @@ RecordTransactionCommit(void)
        MyXactMadeXLogEntry = false;
        MyXactMadeTempRelUpdate = false;
 
-       /* Show myself as out of the transaction in PGPROC array */
-       MyProc->logRec.xrecoff = 0;
-
        /* And clean up local data */
-       if (rptr)
-               pfree(rptr);
+       if (rels)
+               pfree(rels);
        if (children)
                pfree(children);
 }
@@ -823,9 +938,8 @@ static void
 AtCommit_Memory(void)
 {
        /*
-        * Now that we're "out" of a transaction, have the system allocate
-        * things in the top memory context instead of per-transaction
-        * contexts.
+        * Now that we're "out" of a transaction, have the system allocate things
+        * in the top memory context instead of per-transaction contexts.
         */
        MemoryContextSwitchTo(TopMemoryContext);
 
@@ -846,9 +960,6 @@ AtCommit_Memory(void)
 
 /*
  * AtSubCommit_Memory
- *
- * We do not throw away the child's CurTransactionContext, since the data
- * it contains will be needed at upper commit.
  */
 static void
 AtSubCommit_Memory(void)
@@ -860,10 +971,22 @@ AtSubCommit_Memory(void)
        /* Return to parent transaction level's memory context. */
        CurTransactionContext = s->parent->curTransactionContext;
        MemoryContextSwitchTo(CurTransactionContext);
-}
 
-/*
- * AtSubCommit_childXids
+       /*
+        * Ordinarily we cannot throw away the child's CurTransactionContext,
+        * since the data it contains will be needed at upper commit.  However, if
+        * there isn't actually anything in it, we can throw it away.  This avoids
+        * a small memory leak in the common case of "trivial" subxacts.
+        */
+       if (MemoryContextIsEmpty(s->curTransactionContext))
+       {
+               MemoryContextDelete(s->curTransactionContext);
+               s->curTransactionContext = NULL;
+       }
+}
+
+/*
+ * AtSubCommit_childXids
  *
  * Pass my own XID and my child XIDs up to my parent as committed children.
  */
@@ -875,13 +998,28 @@ AtSubCommit_childXids(void)
 
        Assert(s->parent != NULL);
 
-       old_cxt = MemoryContextSwitchTo(s->parent->curTransactionContext);
+       /*
+        * We keep the child-XID lists in TopTransactionContext; this avoids
+        * setting up child-transaction contexts for what might be just a few
+        * bytes of grandchild XIDs.
+        */
+       old_cxt = MemoryContextSwitchTo(TopTransactionContext);
 
-       s->parent->childXids = list_concat(s->parent->childXids, s->childXids);
-       s->childXids = NIL;                     /* ensure list not doubly referenced */
+       s->parent->childXids = lappend_xid(s->parent->childXids,
+                                                                          s->transactionId);
+
+       if (s->childXids != NIL)
+       {
+               s->parent->childXids = list_concat(s->parent->childXids,
+                                                                                  s->childXids);
 
-       s->parent->childXids = lappend_int(s->parent->childXids,
-                                                                          s->transactionIdData);
+               /*
+                * list_concat doesn't free the list header for the second list; do so
+                * here to avoid memory leakage (kluge)
+                */
+               pfree(s->childXids);
+               s->childXids = NIL;
+       }
 
        MemoryContextSwitchTo(old_cxt);
 }
@@ -893,21 +1031,21 @@ static void
 RecordSubTransactionCommit(void)
 {
        /*
-        * We do not log the subcommit in XLOG; it doesn't matter until
-        * the top-level transaction commits.
+        * We do not log the subcommit in XLOG; it doesn't matter until the
+        * top-level transaction commits.
         *
-        * We must mark the subtransaction subcommitted in clog if its XID
-        * appears either in permanent rels or in local temporary rels. We
-        * test this by seeing if we made transaction-controlled entries
-        * *OR* local-rel tuple updates.  (The test here actually covers the
-        * entire transaction tree so far, so it may mark subtransactions that
-        * don't really need it, but it's probably not worth being tenser.
-        * Note that if a prior subtransaction dirtied these variables, then
-        * RecordTransactionCommit will have to do the full pushup anyway...)
+        * We must mark the subtransaction subcommitted in clog if its XID appears
+        * either in permanent rels or in local temporary rels. We test this by
+        * seeing if we made transaction-controlled entries *OR* local-rel tuple
+        * updates.  (The test here actually covers the entire transaction tree so
+        * far, so it may mark subtransactions that don't really need it, but it's
+        * probably not worth being more precise. Note that if a prior subtransaction
+        * dirtied these variables, then RecordTransactionCommit will have to do
+        * the full pushup anyway...)
         */
        if (MyLastRecPtr.xrecoff != 0 || MyXactMadeTempRelUpdate)
        {
-               TransactionId   xid = GetCurrentTransactionId();
+               TransactionId xid = GetCurrentTransactionId();
 
                /* XXX does this really need to be a critical section? */
                START_CRIT_SECTION();
@@ -931,23 +1069,23 @@ static void
 RecordTransactionAbort(void)
 {
        int                     nrels;
-       RelFileNode *rptr;
-       int                     nchildren;
-       TransactionId  *children;
+       RelFileNode *rels;
+       int                     nchildren;
+       TransactionId *children;
 
        /* Get data needed for abort record */
-       nrels = smgrGetPendingDeletes(false, &rptr);
+       nrels = smgrGetPendingDeletes(false, &rels);
        nchildren = xactGetCommittedChildren(&children);
 
        /*
         * If we made neither any transaction-controlled XLOG entries nor any
         * temp-rel updates, and are not going to delete any files, we can omit
-        * recording the transaction abort at all.  No one will ever care that
-        * it aborted.  (These tests cover our whole transaction tree.)
+        * recording the transaction abort at all.      No one will ever care that it
+        * aborted.  (These tests cover our whole transaction tree.)
         */
        if (MyLastRecPtr.xrecoff != 0 || MyXactMadeTempRelUpdate || nrels > 0)
        {
-               TransactionId   xid = GetCurrentTransactionId();
+               TransactionId xid = GetCurrentTransactionId();
 
                /*
                 * Catch the scenario where we aborted partway through
@@ -959,14 +1097,16 @@ RecordTransactionAbort(void)
                START_CRIT_SECTION();
 
                /*
-                * We only need to log the abort in XLOG if the transaction made
-                * any transaction-controlled XLOG entries or will delete files.
-                * (If it made no transaction-controlled XLOG entries, its XID
-                * appears nowhere in permanent storage, so no one else will ever care
-                * if it committed.)
+                * We only need to log the abort in XLOG if the transaction made any
+                * transaction-controlled XLOG entries or will delete files. (If it
+                * made no transaction-controlled XLOG entries, its XID appears
+                * nowhere in permanent storage, so no one else will ever care if it
+                * committed.)
                 *
                 * We do not flush XLOG to disk unless deleting files, since the
                 * default assumption after a crash would be that we aborted, anyway.
+                * For the same reason, we don't need to worry about interlocking
+                * against checkpoint start.
                 */
                if (MyLastRecPtr.xrecoff != 0 || nrels > 0)
                {
@@ -975,28 +1115,29 @@ RecordTransactionAbort(void)
                        xl_xact_abort xlrec;
                        XLogRecPtr      recptr;
 
-                       xlrec.xtime = time(NULL);
+                       SetCurrentTransactionStopTimestamp();
+                       xlrec.xact_time = xactStopTimestamp;
                        xlrec.nrels = nrels;
                        xlrec.nsubxacts = nchildren;
-                       rdata[0].buffer = InvalidBuffer;
                        rdata[0].data = (char *) (&xlrec);
                        rdata[0].len = MinSizeOfXactAbort;
+                       rdata[0].buffer = InvalidBuffer;
                        /* dump rels to delete */
                        if (nrels > 0)
                        {
                                rdata[0].next = &(rdata[1]);
-                               rdata[1].buffer = InvalidBuffer;
-                               rdata[1].data = (char *) rptr;
+                               rdata[1].data = (char *) rels;
                                rdata[1].len = nrels * sizeof(RelFileNode);
+                               rdata[1].buffer = InvalidBuffer;
                                lastrdata = 1;
                        }
                        /* dump committed child Xids */
                        if (nchildren > 0)
                        {
                                rdata[lastrdata].next = &(rdata[2]);
-                               rdata[2].buffer = InvalidBuffer;
                                rdata[2].data = (char *) children;
                                rdata[2].len = nchildren * sizeof(TransactionId);
+                               rdata[2].buffer = InvalidBuffer;
                                lastrdata = 2;
                        }
                        rdata[lastrdata].next = NULL;
@@ -1013,12 +1154,12 @@ RecordTransactionAbort(void)
                 * necessary but we may as well do it while we are here.
                 *
                 * The ordering here isn't critical but it seems best to mark the
-                * parent last.  That reduces the chance that concurrent
-                * TransactionIdDidAbort calls will decide they need to do redundant
-                * work.
+                * parent first.  This assures an atomic transition of all the
+                * subtransactions to aborted state from the point of view of
+                * concurrent TransactionIdDidAbort calls.
                 */
-               TransactionIdAbortTree(nchildren, children);
                TransactionIdAbort(xid);
+               TransactionIdAbortTree(nchildren, children);
 
                END_CRIT_SECTION();
        }
@@ -1028,12 +1169,9 @@ RecordTransactionAbort(void)
        MyXactMadeXLogEntry = false;
        MyXactMadeTempRelUpdate = false;
 
-       /* Show myself as out of the transaction in PGPROC array */
-       MyProc->logRec.xrecoff = 0;
-
        /* And clean up local data */
-       if (rptr)
-               pfree(rptr);
+       if (rels)
+               pfree(rels);
        if (children)
                pfree(children);
 }
@@ -1045,34 +1183,72 @@ static void
 AtAbort_Memory(void)
 {
        /*
-        * Make sure we are in a valid context (not a child of
-        * TopTransactionContext...).  Note that it is possible for this code
-        * to be called when we aren't in a transaction at all; go directly to
-        * TopMemoryContext in that case.
+        * Switch into TransactionAbortContext, which should have some free
+        * space even if nothing else does.  We'll work in this context until
+        * we've finished cleaning up.
+        *
+        * It is barely possible to get here when we've not been able to create
+        * TransactionAbortContext yet; if so use TopMemoryContext.
         */
-       if (TopTransactionContext != NULL)
-       {
-               MemoryContextSwitchTo(TopTransactionContext);
-
-               /*
-                * We do not want to destroy the transaction's global state yet,
-                * so we can't free any memory here.
-                */
-       }
+       if (TransactionAbortContext != NULL)
+               MemoryContextSwitchTo(TransactionAbortContext);
        else
                MemoryContextSwitchTo(TopMemoryContext);
 }
 
-
 /*
  * AtSubAbort_Memory
  */
 static void
 AtSubAbort_Memory(void)
 {
-       Assert(TopTransactionContext != NULL);
+       Assert(TransactionAbortContext != NULL);
+
+       MemoryContextSwitchTo(TransactionAbortContext);
+}
+
+
+/*
+ *     AtAbort_ResourceOwner
+ */
+static void
+AtAbort_ResourceOwner(void)
+{
+       /*
+        * Make sure we have a valid ResourceOwner, if possible (else it will be
+        * NULL, which is OK)
+        */
+       CurrentResourceOwner = TopTransactionResourceOwner;
+}
+
+/*
+ * AtSubAbort_ResourceOwner
+ */
+static void
+AtSubAbort_ResourceOwner(void)
+{
+       TransactionState s = CurrentTransactionState;
+
+       /* Make sure we have a valid ResourceOwner */
+       CurrentResourceOwner = s->curTransactionOwner;
+}
+
+
+/*
+ * AtSubAbort_childXids
+ */
+static void
+AtSubAbort_childXids(void)
+{
+       TransactionState s = CurrentTransactionState;
 
-       MemoryContextSwitchTo(TopTransactionContext);
+       /*
+        * We keep the child-XID lists in TopTransactionContext (see
+        * AtSubCommit_childXids).      This means we'd better free the list
+        * explicitly at abort to avoid leakage.
+        */
+       list_free(s->childXids);
+       s->childXids = NIL;
 }
 
 /*
@@ -1082,21 +1258,22 @@ static void
 RecordSubTransactionAbort(void)
 {
        int                     nrels;
-       RelFileNode *rptr;
-       int                     nchildren;
-       TransactionId  *children;
+       RelFileNode *rels;
+       TransactionId xid = GetCurrentTransactionId();
+       int                     nchildren;
+       TransactionId *children;
 
        /* Get data needed for abort record */
-       nrels = smgrGetPendingDeletes(false, &rptr);
+       nrels = smgrGetPendingDeletes(false, &rels);
        nchildren = xactGetCommittedChildren(&children);
 
        /*
         * If we made neither any transaction-controlled XLOG entries nor any
         * temp-rel updates, and are not going to delete any files, we can omit
-        * recording the transaction abort at all.  No one will ever care that
-        * it aborted.  (These tests cover our whole transaction tree, and
-        * therefore may mark subxacts that don't really need it, but it's
-        * probably not worth being tenser.)
+        * recording the transaction abort at all.      No one will ever care that it
+        * aborted.  (These tests cover our whole transaction tree, and therefore
+        * may mark subxacts that don't really need it, but it's probably not
+        * worth being more precise.)
         *
         * In this case we needn't worry about marking subcommitted children as
         * aborted, because they didn't mark themselves as subcommitted in the
@@ -1104,43 +1281,41 @@ RecordSubTransactionAbort(void)
         */
        if (MyLastRecPtr.xrecoff != 0 || MyXactMadeTempRelUpdate || nrels > 0)
        {
-               TransactionId   xid = GetCurrentTransactionId();
-
                START_CRIT_SECTION();
 
                /*
-                * We only need to log the abort in XLOG if the transaction made
-                * any transaction-controlled XLOG entries or will delete files.
+                * We only need to log the abort in XLOG if the transaction made any
+                * transaction-controlled XLOG entries or will delete files.
                 */
                if (MyLastRecPtr.xrecoff != 0 || nrels > 0)
                {
                        XLogRecData rdata[3];
-                       int lastrdata = 0;
+                       int                     lastrdata = 0;
                        xl_xact_abort xlrec;
-                       XLogRecPtr      recptr;
+                       XLogRecPtr      recptr;
 
-                       xlrec.xtime = time(NULL);
+                       xlrec.xact_time = GetCurrentTimestamp();
                        xlrec.nrels = nrels;
                        xlrec.nsubxacts = nchildren;
-                       rdata[0].buffer = InvalidBuffer;
                        rdata[0].data = (char *) (&xlrec);
                        rdata[0].len = MinSizeOfXactAbort;
+                       rdata[0].buffer = InvalidBuffer;
                        /* dump rels to delete */
                        if (nrels > 0)
                        {
                                rdata[0].next = &(rdata[1]);
-                               rdata[1].buffer = InvalidBuffer;
-                               rdata[1].data = (char *) rptr;
+                               rdata[1].data = (char *) rels;
                                rdata[1].len = nrels * sizeof(RelFileNode);
+                               rdata[1].buffer = InvalidBuffer;
                                lastrdata = 1;
                        }
                        /* dump committed child Xids */
                        if (nchildren > 0)
                        {
                                rdata[lastrdata].next = &(rdata[2]);
-                               rdata[2].buffer = InvalidBuffer;
                                rdata[2].data = (char *) children;
                                rdata[2].len = nchildren * sizeof(TransactionId);
+                               rdata[2].buffer = InvalidBuffer;
                                lastrdata = 2;
                        }
                        rdata[lastrdata].next = NULL;
@@ -1154,17 +1329,26 @@ RecordSubTransactionAbort(void)
 
                /*
                 * Mark the transaction aborted in clog.  This is not absolutely
-                * necessary but we may as well do it while we are here.
+                * necessary but XactLockTableWait makes use of it to avoid waiting
+                * for already-aborted subtransactions.
                 */
-               TransactionIdAbortTree(nchildren, children);
                TransactionIdAbort(xid);
+               TransactionIdAbortTree(nchildren, children);
 
                END_CRIT_SECTION();
        }
 
+       /*
+        * We can immediately remove failed XIDs from PGPROC's cache of running
+        * child XIDs. It's easiest to do it here while we have the child XID
+        * array at hand, even though in the main-transaction case the equivalent
+        * work happens just after return from RecordTransactionAbort.
+        */
+       XidCacheRemoveRunningXids(xid, nchildren, children);
+
        /* And clean up local data */
-       if (rptr)
-               pfree(rptr);
+       if (rels)
+               pfree(rels);
        if (children)
                pfree(children);
 }
@@ -1180,14 +1364,19 @@ RecordSubTransactionAbort(void)
 static void
 AtCleanup_Memory(void)
 {
+       Assert(CurrentTransactionState->parent == NULL);
+
        /*
-        * Now that we're "out" of a transaction, have the system allocate
-        * things in the top memory context instead of per-transaction
-        * contexts.
+        * Now that we're "out" of a transaction, have the system allocate things
+        * in the top memory context instead of per-transaction contexts.
         */
        MemoryContextSwitchTo(TopMemoryContext);
 
-       Assert(CurrentTransactionState->parent == NULL);
+       /*
+        * Clear the special abort context for next time.
+        */
+       if (TransactionAbortContext != NULL)
+               MemoryContextResetAndDeleteChildren(TransactionAbortContext);
 
        /*
         * Release all transaction-local memory.
@@ -1220,11 +1409,19 @@ AtSubCleanup_Memory(void)
        CurTransactionContext = s->parent->curTransactionContext;
 
        /*
-        * Delete the subxact local memory contexts. Its CurTransactionContext
-        * can go too (note this also kills CurTransactionContexts from any
-        * children of the subxact).
+        * Clear the special abort context for next time.
+        */
+       if (TransactionAbortContext != NULL)
+               MemoryContextResetAndDeleteChildren(TransactionAbortContext);
+
+       /*
+        * Delete the subxact local memory contexts. Its CurTransactionContext can
+        * go too (note this also kills CurTransactionContexts from any children
+        * of the subxact).
         */
-       MemoryContextDelete(s->curTransactionContext);
+       if (s->curTransactionContext)
+               MemoryContextDelete(s->curTransactionContext);
+       s->curTransactionContext = NULL;
 }
 
 /* ----------------------------------------------------------------
@@ -1238,7 +1435,13 @@ AtSubCleanup_Memory(void)
 static void
 StartTransaction(void)
 {
-       TransactionState s = CurrentTransactionState;
+       TransactionState s;
+
+       /*
+        * Reset to the top-level state struct so the state stack starts out empty
+        */
+       s = &TopTransactionStateData;
+       CurrentTransactionState = s;
 
        /*
         * check the current transaction state
@@ -1252,6 +1455,7 @@ StartTransaction(void)
         * start processing
         */
        s->state = TRANS_START;
+       s->transactionId = InvalidTransactionId;        /* until assigned */
 
        /*
         * Make sure we've freed any old snapshot, and reset xact state variables
@@ -1259,6 +1463,14 @@ StartTransaction(void)
        FreeXactSnapshot();
        XactIsoLevel = DefaultXactIsoLevel;
        XactReadOnly = DefaultXactReadOnly;
+       forceSyncCommit = false;
+
+       /*
+        * reinitialize within-transaction counters
+        */
+       s->subTransactionId = TopSubTransactionId;
+       currentSubTransactionId = TopSubTransactionId;
+       currentCommandId = FirstCommandId;
 
        /*
         * must initialize resource-management stuff first
@@ -1269,27 +1481,34 @@ StartTransaction(void)
        /*
         * generate a new transaction id
         */
-       s->transactionIdData = GetNewTransactionId(false);
+       s->transactionId = GetNewTransactionId(false);
+
+       XactLockTableInsert(s->transactionId);
 
-       XactLockTableInsert(s->transactionIdData);
+       PG_TRACE1(transaction__start, s->transactionId);
 
        /*
-        * set now()
+        * set transaction_timestamp() (a/k/a now()).  We want this to be the same
+        * as the first command's statement_timestamp(), so don't do a fresh
+        * GetCurrentTimestamp() call (which'd be expensive anyway).  Also,
+        * mark xactStopTimestamp as unset.
         */
-       xactStartTime = GetCurrentAbsoluteTimeUsec(&(xactStartTimeUsec));
+       xactStartTimestamp = stmtStartTimestamp;
+       xactStopTimestamp = 0;
+       pgstat_report_txn_timestamp(xactStartTimestamp);
 
        /*
         * initialize current transaction state fields
         */
-       s->commandId = FirstCommandId;
        s->nestingLevel = 1;
+       s->gucNestLevel = 1;
        s->childXids = NIL;
 
        /*
-        * You might expect to see "s->currentUser = GetUserId();" here, but
-        * you won't because it doesn't work during startup; the userid isn't
-        * set yet during a backend's first transaction start.  We only use
-        * the currentUser field in sub-transaction state structs.
+        * You might expect to see "s->currentUser = GetUserId();" here, but you
+        * won't because it doesn't work during startup; the userid isn't set yet
+        * during a backend's first transaction start.  We only use the
+        * currentUser field in sub-transaction state structs.
         *
         * prevXactReadOnly is also valid only in sub-transactions.
         */
@@ -1297,9 +1516,10 @@ StartTransaction(void)
        /*
         * initialize other subsystems for new transaction
         */
+       AtStart_GUC();
        AtStart_Inval();
        AtStart_Cache();
-       DeferredTriggerBeginXact();
+       AfterTriggerBeginXact();
 
        /*
         * done with start processing, set current transaction state to "in
@@ -1310,8 +1530,11 @@ StartTransaction(void)
        ShowTransactionState("StartTransaction");
 }
 
+
 /*
  *     CommitTransaction
+ *
+ * NB: if you change this routine, better look at PrepareTransaction too!
  */
 static void
 CommitTransaction(void)
@@ -1329,33 +1552,40 @@ CommitTransaction(void)
        Assert(s->parent == NULL);
 
        /*
-        * Tell the trigger manager that this transaction is about to be
-        * committed. He'll invoke all trigger deferred until XACT before we
-        * really start on committing the transaction.
+        * Do pre-commit processing (most of this stuff requires database access,
+        * and in fact could still cause an error...)
+        *
+        * It is possible for CommitHoldablePortals to invoke functions that queue
+        * deferred triggers, and it's also possible that triggers create holdable
+        * cursors.  So we have to loop until there's nothing left to do.
         */
-       DeferredTriggerEndXact();
+       for (;;)
+       {
+               /*
+                * Fire all currently pending deferred triggers.
+                */
+               AfterTriggerFireDeferred();
 
-       /*
-        * Similarly, let ON COMMIT management do its thing before we start to
-        * commit.
-        */
-       PreCommit_on_commit_actions();
+               /*
+                * Convert any open holdable cursors into static portals.  If there
+                * weren't any, we are done ... otherwise loop back to check if they
+                * queued deferred triggers.  Lather, rinse, repeat.
+                */
+               if (!CommitHoldablePortals())
+                       break;
+       }
 
-       /* Prevent cancel/die interrupt while cleaning up */
-       HOLD_INTERRUPTS();
+       /* Now we can shut down the deferred-trigger manager */
+       AfterTriggerEndXact(true);
 
-       /*
-        * set the current transaction state information appropriately during
-        * the abort processing
-        */
-       s->state = TRANS_COMMIT;
+       /* Close any open regular cursors */
+       AtCommit_Portals();
 
        /*
-        * Do pre-commit processing (most of this stuff requires database
-        * access, and in fact could still cause an error...)
+        * Let ON COMMIT management do its thing (must happen after closing
+        * cursors, to avoid dangling-reference problems)
         */
-
-       AtCommit_Portals();
+       PreCommit_on_commit_actions();
 
        /* close large objects before lower-level cleanup */
        AtEOXact_LargeObject(true);
@@ -1363,67 +1593,106 @@ CommitTransaction(void)
        /* NOTIFY commit must come before lower-level cleanup */
        AtCommit_Notify();
 
-       /* Update the flat password file if we changed pg_shadow or pg_group */
-       /* This should be the last step before commit */
-       AtEOXact_UpdatePasswordFile(true);
+       /*
+        * Update flat files if we changed pg_database, pg_authid or
+        * pg_auth_members.  This should be the last step before commit.
+        */
+       AtEOXact_UpdateFlatFiles(true);
+
+       /* Prevent cancel/die interrupt while cleaning up */
+       HOLD_INTERRUPTS();
+
+       /*
+        * set the current transaction state information appropriately during
+        * commit processing
+        */
+       s->state = TRANS_COMMIT;
 
        /*
         * Here is where we really truly commit.
         */
        RecordTransactionCommit();
 
-       /*
+       /*----------
         * Let others know about no transaction in progress by me. Note that
         * this must be done _before_ releasing locks we hold and _after_
         * RecordTransactionCommit.
         *
-        * LWLockAcquire(SInvalLock) is required: UPDATE with xid 0 is blocked by
-        * xid 1' UPDATE, xid 1 is doing commit while xid 2 gets snapshot - if
-        * xid 2' GetSnapshotData sees xid 1 as running then it must see xid 0
-        * as running as well or it will see two tuple versions - one deleted
-        * by xid 1 and one inserted by xid 0.  See notes in GetSnapshotData.
+        * LWLockAcquire(ProcArrayLock) is required; consider this example:
+        *              UPDATE with xid 0 is blocked by xid 1's UPDATE.
+        *              xid 1 is doing commit while xid 2 gets snapshot.
+        * If xid 2's GetSnapshotData sees xid 1 as running then it must see
+        * xid 0 as running as well, or it will be able to see two tuple versions
+        * - one deleted by xid 1 and one inserted by xid 0.  See notes in
+        * GetSnapshotData.
+        *
+        * Note: MyProc may be null during bootstrap.
+        *----------
         */
        if (MyProc != NULL)
        {
-               /* Lock SInvalLock because that's what GetSnapshotData uses. */
-               LWLockAcquire(SInvalLock, LW_EXCLUSIVE);
+               /* Lock ProcArrayLock because that's what GetSnapshotData uses. */
+               LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
                MyProc->xid = InvalidTransactionId;
                MyProc->xmin = InvalidTransactionId;
-               LWLockRelease(SInvalLock);
+               MyProc->inVacuum = false;               /* must be cleared with xid/xmin */
+
+               /* Clear the subtransaction-XID cache too while holding the lock */
+               MyProc->subxids.nxids = 0;
+               MyProc->subxids.overflowed = false;
+
+               LWLockRelease(ProcArrayLock);
        }
 
+       PG_TRACE1(transaction__commit, s->transactionId);
+
        /*
-        * This is all post-commit cleanup.  Note that if an error is raised
-        * here, it's too late to abort the transaction.  This should be just
+        * This is all post-commit cleanup.  Note that if an error is raised here,
+        * it's too late to abort the transaction.  This should be just
         * noncritical resource releasing.
         *
         * The ordering of operations is not entirely random.  The idea is:
-        * release resources visible to other backends (eg, files, buffer
-        * pins); then release locks; then release backend-local resources. We
-        * want to release locks at the point where any backend waiting for us
-        * will see our transaction as being fully cleaned up.
+        * release resources visible to other backends (eg, files, buffer pins);
+        * then release locks; then release backend-local resources. We want to
+        * release locks at the point where any backend waiting for us will see
+        * our transaction as being fully cleaned up.
         *
-        * Resources that can be associated with individual queries are
-        * handled by the ResourceOwner mechanism.  The other calls here
-        * are for backend-wide state.
+        * Resources that can be associated with individual queries are handled by
+        * the ResourceOwner mechanism.  The other calls here are for backend-wide
+        * state.
         */
 
-       smgrDoPendingDeletes(true);
-       /* smgrcommit already done */
+       CallXactCallbacks(XACT_EVENT_COMMIT);
 
        ResourceOwnerRelease(TopTransactionResourceOwner,
                                                 RESOURCE_RELEASE_BEFORE_LOCKS,
                                                 true, true);
 
+       /* Check we've released all buffer pins */
+       AtEOXact_Buffers(true);
+
+       /* Clean up the relation cache */
+       AtEOXact_RelationCache(true);
+
        /*
-        * Make catalog changes visible to all backends.  This has to happen
-        * after relcache references are dropped (see comments for
-        * AtEOXact_RelationCache), but before locks are released (if anyone
-        * is waiting for lock on a relation we've modified, we want them to
-        * know about the catalog change before they start using the relation).
+        * Make catalog changes visible to all backends.  This has to happen after
+        * relcache references are dropped (see comments for
+        * AtEOXact_RelationCache), but before locks are released (if anyone is
+        * waiting for lock on a relation we've modified, we want them to know
+        * about the catalog change before they start using the relation).
         */
        AtEOXact_Inval(true);
 
+       /*
+        * Likewise, dropping of files deleted during the transaction is best done
+        * after releasing relcache and buffer pins.  (This is not strictly
+        * necessary during commit, since such pins should have been released
+        * already, but this ordering is definitely critical during abort.)
+        */
+       smgrDoPendingDeletes(true);
+
+       AtEOXact_MultiXact();
+
        ResourceOwnerRelease(TopTransactionResourceOwner,
                                                 RESOURCE_RELEASE_LOCKS,
                                                 true, true);
@@ -1431,13 +1700,19 @@ CommitTransaction(void)
                                                 RESOURCE_RELEASE_AFTER_LOCKS,
                                                 true, true);
 
-       CallEOXactCallbacks(true);
-       AtEOXact_GUC(true, false);
+       /* Check we've released all catcache entries */
+       AtEOXact_CatCache(true);
+
+       AtEOXact_GUC(true, 1);
        AtEOXact_SPI(true);
-       AtEOXact_on_commit_actions(true, s->transactionIdData);
+       AtEOXact_on_commit_actions(true);
        AtEOXact_Namespace(true);
+       /* smgrcommit already done */
        AtEOXact_Files();
-       pgstat_count_xact_commit();
+       AtEOXact_ComboCid();
+       AtEOXact_HashTables(true);
+       AtEOXact_PgStat(true);
+       pgstat_report_txn_timestamp(0);
 
        CurrentResourceOwner = NULL;
        ResourceOwnerDelete(TopTransactionResourceOwner);
@@ -1447,7 +1722,10 @@ CommitTransaction(void)
 
        AtCommit_Memory();
 
+       s->transactionId = InvalidTransactionId;
+       s->subTransactionId = InvalidSubTransactionId;
        s->nestingLevel = 0;
+       s->gucNestLevel = 0;
        s->childXids = NIL;
 
        /*
@@ -1459,119 +1737,362 @@ CommitTransaction(void)
        RESUME_INTERRUPTS();
 }
 
+
 /*
- *     AbortTransaction
+ *     PrepareTransaction
+ *
+ * NB: if you change this routine, better look at CommitTransaction too!
  */
 static void
-AbortTransaction(void)
+PrepareTransaction(void)
 {
        TransactionState s = CurrentTransactionState;
+       TransactionId xid = GetCurrentTransactionId();
+       GlobalTransaction gxact;
+       TimestampTz prepared_at;
 
-       /* Prevent cancel/die interrupt while cleaning up */
-       HOLD_INTERRUPTS();
+       ShowTransactionState("PrepareTransaction");
 
        /*
-        * Release any LW locks we might be holding as quickly as possible.
-        * (Regular locks, however, must be held till we finish aborting.)
-        * Releasing LW locks is critical since we might try to grab them
-        * again while cleaning up!
+        * check the current transaction state
         */
-       LWLockReleaseAll();
-
-       /* Clean up buffer I/O and buffer context locks, too */
-       AbortBufferIO();
-       UnlockBuffers();
+       if (s->state != TRANS_INPROGRESS)
+               elog(WARNING, "PrepareTransaction while in %s state",
+                        TransStateAsString(s->state));
+       Assert(s->parent == NULL);
 
        /*
-        * Also clean up any open wait for lock, since the lock manager will
-        * choke if we try to wait for another lock before doing this.
+        * Do pre-commit processing (most of this stuff requires database access,
+        * and in fact could still cause an error...)
+        *
+        * It is possible for PrepareHoldablePortals to invoke functions that
+        * queue deferred triggers, and it's also possible that triggers create
+        * holdable cursors.  So we have to loop until there's nothing left to do.
         */
-       LockWaitCancel();
+       for (;;)
+       {
+               /*
+                * Fire all currently pending deferred triggers.
+                */
+               AfterTriggerFireDeferred();
+
+               /*
+                * Convert any open holdable cursors into static portals.  If there
+                * weren't any, we are done ... otherwise loop back to check if they
+                * queued deferred triggers.  Lather, rinse, repeat.
+                */
+               if (!PrepareHoldablePortals())
+                       break;
+       }
+
+       /* Now we can shut down the deferred-trigger manager */
+       AfterTriggerEndXact(true);
+
+       /* Close any open regular cursors */
+       AtCommit_Portals();
 
        /*
-        * check the current transaction state
+        * Let ON COMMIT management do its thing (must happen after closing
+        * cursors, to avoid dangling-reference problems)
         */
-       if (s->state != TRANS_INPROGRESS)
-               elog(WARNING, "AbortTransaction while in %s state",
-                        TransStateAsString(s->state));
-       Assert(s->parent == NULL);
+       PreCommit_on_commit_actions();
+
+       /* close large objects before lower-level cleanup */
+       AtEOXact_LargeObject(true);
+
+       /* NOTIFY and flatfiles will be handled below */
+
+       /* Prevent cancel/die interrupt while cleaning up */
+       HOLD_INTERRUPTS();
 
        /*
         * set the current transaction state information appropriately during
-        * the abort processing
+        * prepare processing
         */
-       s->state = TRANS_ABORT;
+       s->state = TRANS_PREPARE;
 
-       /* Make sure we are in a valid memory context */
-       AtAbort_Memory();
+       prepared_at = GetCurrentTimestamp();
+
+       /* Tell bufmgr and smgr to prepare for commit */
+       BufmgrCommit();
+
+       /*
+        * Reserve the GID for this transaction. This could fail if the requested
+        * GID is invalid or already in use.
+        */
+       gxact = MarkAsPreparing(xid, prepareGID, prepared_at,
+                                                       GetUserId(), MyDatabaseId);
+       prepareGID = NULL;
 
        /*
-        * Reset user id which might have been changed transiently.  We cannot
-        * use s->currentUser, but must get the session userid from miscinit.c.
+        * Collect data for the 2PC state file.  Note that in general, no actual
+        * state change should happen in the called modules during this step,
+        * since it's still possible to fail before commit, and in that case we
+        * want transaction abort to be able to clean up.  (In particular, the
+        * AtPrepare routines may error out if they find cases they cannot
+        * handle.)  State cleanup should happen in the PostPrepare routines
+        * below.  However, some modules can go ahead and clear state here because
+        * they wouldn't do anything with it during abort anyway.
         *
-        * (Note: it is not necessary to restore session authorization here
-        * because that can only be changed via GUC, and GUC will take care of
-        * rolling it back if need be.  However, an error within a SECURITY
-        * DEFINER function could send control here with the wrong current
-        * userid.)
+        * Note: because the 2PC state file records will be replayed in the same
+        * order they are made, the order of these calls has to match the order in
+        * which we want things to happen during COMMIT PREPARED or ROLLBACK
+        * PREPARED; in particular, pay attention to whether things should happen
+        * before or after releasing the transaction's locks.
         */
-       SetUserId(GetSessionUserId());
+       StartPrepare(gxact);
+
+       AtPrepare_Notify();
+       AtPrepare_UpdateFlatFiles();
+       AtPrepare_Inval();
+       AtPrepare_Locks();
+       AtPrepare_PgStat();
 
        /*
-        * do abort processing
+        * Here is where we really truly prepare.
+        *
+        * We have to record transaction prepares even if we didn't make any
+        * updates, because the transaction manager might get confused if we lose
+        * a global transaction.
+        */
+       EndPrepare(gxact);
+
+       /*
+        * Now we clean up backend-internal state and release internal resources.
         */
-       DeferredTriggerAbortXact();
-       AtAbort_Portals();
-       AtEOXact_LargeObject(false);                    /* 'false' means it's abort */
-       AtAbort_Notify();
-       AtEOXact_UpdatePasswordFile(false);
 
-       /* Advertise the fact that we aborted in pg_clog. */
-       RecordTransactionAbort();
+       /* Break the chain of back-links in the XLOG records I output */
+       MyLastRecPtr.xrecoff = 0;
+       MyXactMadeXLogEntry = false;
+       MyXactMadeTempRelUpdate = false;
 
        /*
-        * Let others know about no transaction in progress by me. Note that
-        * this must be done _before_ releasing locks we hold and _after_
-        * RecordTransactionAbort.
+        * Let others know about no transaction in progress by me.  This has to be
+        * done *after* the prepared transaction has been marked valid, else
+        * someone may think it is unlocked and recyclable.
         */
-       if (MyProc != NULL)
-       {
-               /* Lock SInvalLock because that's what GetSnapshotData uses. */
-               LWLockAcquire(SInvalLock, LW_EXCLUSIVE);
-               MyProc->xid = InvalidTransactionId;
-               MyProc->xmin = InvalidTransactionId;
-               LWLockRelease(SInvalLock);
-       }
+
+       /* Lock ProcArrayLock because that's what GetSnapshotData uses. */
+       LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+       MyProc->xid = InvalidTransactionId;
+       MyProc->xmin = InvalidTransactionId;
+       MyProc->inVacuum = false;       /* must be cleared with xid/xmin */
+
+       /* Clear the subtransaction-XID cache too while holding the lock */
+       MyProc->subxids.nxids = 0;
+       MyProc->subxids.overflowed = false;
+
+       LWLockRelease(ProcArrayLock);
 
        /*
-        * Post-abort cleanup.  See notes in CommitTransaction() concerning
-        * ordering.
+        * This is all post-transaction cleanup.  Note that if an error is raised
+        * here, it's too late to abort the transaction.  This should be just
+        * noncritical resource releasing.  See notes in CommitTransaction.
         */
 
-       smgrDoPendingDeletes(false);
-       smgrabort();
+       CallXactCallbacks(XACT_EVENT_PREPARE);
 
        ResourceOwnerRelease(TopTransactionResourceOwner,
                                                 RESOURCE_RELEASE_BEFORE_LOCKS,
-                                                false, true);
-       AtEOXact_Inval(false);
+                                                true, true);
+
+       /* Check we've released all buffer pins */
+       AtEOXact_Buffers(true);
+
+       /* Clean up the relation cache */
+       AtEOXact_RelationCache(true);
+
+       /* notify and flatfiles don't need a postprepare call */
+
+       PostPrepare_PgStat();
+
+       PostPrepare_Inval();
+
+       PostPrepare_smgr();
+
+       AtEOXact_MultiXact();
+
+       PostPrepare_Locks(xid);
+
        ResourceOwnerRelease(TopTransactionResourceOwner,
                                                 RESOURCE_RELEASE_LOCKS,
-                                                false, true);
+                                                true, true);
        ResourceOwnerRelease(TopTransactionResourceOwner,
                                                 RESOURCE_RELEASE_AFTER_LOCKS,
-                                                false, true);
+                                                true, true);
 
-       CallEOXactCallbacks(false);
-       AtEOXact_GUC(false, false);
-       AtEOXact_SPI(false);
-       AtEOXact_on_commit_actions(false, s->transactionIdData);
-       AtEOXact_Namespace(false);
+       /* Check we've released all catcache entries */
+       AtEOXact_CatCache(true);
+
+       /* PREPARE acts the same as COMMIT as far as GUC is concerned */
+       AtEOXact_GUC(true, 1);
+       AtEOXact_SPI(true);
+       AtEOXact_on_commit_actions(true);
+       AtEOXact_Namespace(true);
+       /* smgrcommit already done */
        AtEOXact_Files();
-       SetReindexProcessing(InvalidOid, InvalidOid);
-       pgstat_count_xact_rollback();
+       AtEOXact_ComboCid();
+       AtEOXact_HashTables(true);
+       /* don't call AtEOXact_PgStat here */
 
-       /*
+       CurrentResourceOwner = NULL;
+       ResourceOwnerDelete(TopTransactionResourceOwner);
+       s->curTransactionOwner = NULL;
+       CurTransactionResourceOwner = NULL;
+       TopTransactionResourceOwner = NULL;
+
+       AtCommit_Memory();
+
+       s->transactionId = InvalidTransactionId;
+       s->subTransactionId = InvalidSubTransactionId;
+       s->nestingLevel = 0;
+       s->gucNestLevel = 0;
+       s->childXids = NIL;
+
+       /*
+        * done with 1st phase commit processing, set current transaction state
+        * back to default
+        */
+       s->state = TRANS_DEFAULT;
+
+       RESUME_INTERRUPTS();
+}
+
+
+/*
+ *     AbortTransaction
+ */
+static void
+AbortTransaction(void)
+{
+       TransactionState s = CurrentTransactionState;
+
+       /* Prevent cancel/die interrupt while cleaning up */
+       HOLD_INTERRUPTS();
+
+       /* Make sure we have a valid memory context and resource owner */
+       AtAbort_Memory();
+       AtAbort_ResourceOwner();
+
+       /*
+        * Release any LW locks we might be holding as quickly as possible.
+        * (Regular locks, however, must be held till we finish aborting.)
+        * Releasing LW locks is critical since we might try to grab them again
+        * while cleaning up!
+        */
+       LWLockReleaseAll();
+
+       /* Clean up buffer I/O and buffer context locks, too */
+       AbortBufferIO();
+       UnlockBuffers();
+
+       /*
+        * Also clean up any open wait for lock, since the lock manager will choke
+        * if we try to wait for another lock before doing this.
+        */
+       LockWaitCancel();
+
+       /*
+        * check the current transaction state
+        */
+       if (s->state != TRANS_INPROGRESS && s->state != TRANS_PREPARE)
+               elog(WARNING, "AbortTransaction while in %s state",
+                        TransStateAsString(s->state));
+       Assert(s->parent == NULL);
+
+       /*
+        * set the current transaction state information appropriately during the
+        * abort processing
+        */
+       s->state = TRANS_ABORT;
+
+       /*
+        * Reset user id which might have been changed transiently.  We cannot use
+        * s->currentUser, since it may not be set yet; instead rely on internal
+        * state of miscinit.c.
+        *
+        * (Note: it is not necessary to restore session authorization here
+        * because that can only be changed via GUC, and GUC will take care of
+        * rolling it back if need be.  However, an error within a SECURITY
+        * DEFINER function could send control here with the wrong current
+        * userid.)
+        */
+       AtAbort_UserId();
+
+       /*
+        * do abort processing
+        */
+       AfterTriggerEndXact(false);
+       AtAbort_Portals();
+       AtEOXact_LargeObject(false);    /* 'false' means it's abort */
+       AtAbort_Notify();
+       AtEOXact_UpdateFlatFiles(false);
+
+       /*
+        * Advertise the fact that we aborted in pg_clog (assuming that we got as
+        * far as assigning an XID to advertise).
+        */
+       if (TransactionIdIsValid(s->transactionId))
+               RecordTransactionAbort();
+
+       /*
+        * Let others know about no transaction in progress by me. Note that this
+        * must be done _before_ releasing locks we hold and _after_
+        * RecordTransactionAbort.
+        */
+       if (MyProc != NULL)
+       {
+               /* Lock ProcArrayLock because that's what GetSnapshotData uses. */
+               LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+               MyProc->xid = InvalidTransactionId;
+               MyProc->xmin = InvalidTransactionId;
+               MyProc->inVacuum = false;               /* must be cleared with xid/xmin */
+               MyProc->inCommit = false;               /* be sure this gets cleared */
+
+               /* Clear the subtransaction-XID cache too while holding the lock */
+               MyProc->subxids.nxids = 0;
+               MyProc->subxids.overflowed = false;
+
+               LWLockRelease(ProcArrayLock);
+       }
+
+       PG_TRACE1(transaction__abort, s->transactionId);
+
+       /*
+        * Post-abort cleanup.  See notes in CommitTransaction() concerning
+        * ordering.
+        */
+
+       CallXactCallbacks(XACT_EVENT_ABORT);
+
+       ResourceOwnerRelease(TopTransactionResourceOwner,
+                                                RESOURCE_RELEASE_BEFORE_LOCKS,
+                                                false, true);
+       AtEOXact_Buffers(false);
+       AtEOXact_RelationCache(false);
+       AtEOXact_Inval(false);
+       smgrDoPendingDeletes(false);
+       AtEOXact_MultiXact();
+       ResourceOwnerRelease(TopTransactionResourceOwner,
+                                                RESOURCE_RELEASE_LOCKS,
+                                                false, true);
+       ResourceOwnerRelease(TopTransactionResourceOwner,
+                                                RESOURCE_RELEASE_AFTER_LOCKS,
+                                                false, true);
+       AtEOXact_CatCache(false);
+
+       AtEOXact_GUC(false, 1);
+       AtEOXact_SPI(false);
+       AtEOXact_on_commit_actions(false);
+       AtEOXact_Namespace(false);
+       smgrabort();
+       AtEOXact_Files();
+       AtEOXact_ComboCid();
+       AtEOXact_HashTables(false);
+       AtEOXact_PgStat(false);
+       pgstat_report_txn_timestamp(0);
+
+       /*
         * State remains TRANS_ABORT until CleanupTransaction().
         */
        RESUME_INTERRUPTS();
@@ -1589,22 +2110,27 @@ CleanupTransaction(void)
         * State should still be TRANS_ABORT from AbortTransaction().
         */
        if (s->state != TRANS_ABORT)
-               elog(FATAL, "CleanupTransaction and not in abort state");
+               elog(FATAL, "CleanupTransaction: unexpected state %s",
+                        TransStateAsString(s->state));
 
        /*
         * do abort cleanup processing
         */
        AtCleanup_Portals();            /* now safe to release portal memory */
 
-       CurrentResourceOwner = NULL; /* and resource owner */
-       ResourceOwnerDelete(TopTransactionResourceOwner);
+       CurrentResourceOwner = NULL;    /* and resource owner */
+       if (TopTransactionResourceOwner)
+               ResourceOwnerDelete(TopTransactionResourceOwner);
        s->curTransactionOwner = NULL;
        CurTransactionResourceOwner = NULL;
        TopTransactionResourceOwner = NULL;
 
        AtCleanup_Memory();                     /* and transaction memory */
 
+       s->transactionId = InvalidTransactionId;
+       s->subTransactionId = InvalidSubTransactionId;
        s->nestingLevel = 0;
+       s->gucNestLevel = 0;
        s->childXids = NIL;
 
        /*
@@ -1625,8 +2151,8 @@ StartTransactionCommand(void)
        switch (s->blockState)
        {
                        /*
-                        * if we aren't in a transaction block, we just do our usual
-                        * start transaction.
+                        * if we aren't in a transaction block, we just do our usual start
+                        * transaction.
                         */
                case TBLOCK_DEFAULT:
                        StartTransaction();
@@ -1634,19 +2160,23 @@ StartTransactionCommand(void)
                        break;
 
                        /*
-                        * This is the case when we are somewhere in a transaction block
-                        * and about to start a new command.  For now we do nothing
-                        * but someday we may do command-local resource initialization.
+                        * We are somewhere in a transaction block or subtransaction and
+                        * about to start a new command.  For now we do nothing, but
+                        * someday we may do command-local resource initialization. (Note
+                        * that any needed CommandCounterIncrement was done by the
+                        * previous CommitTransactionCommand.)
                         */
                case TBLOCK_INPROGRESS:
                case TBLOCK_SUBINPROGRESS:
                        break;
 
                        /*
-                        * Here we are in the middle of a transaction block but one of
-                        * the commands caused an abort so we do nothing but remain in
-                        * the abort state.  Eventually we will get to the "END
-                        * TRANSACTION" which will set things straight.
+                        * Here we are in a failed transaction block (one of the commands
+                        * caused an abort) so we do nothing but remain in the abort
+                        * state.  Eventually we will get a ROLLBACK command which will
+                        * get us out of this state.  (It is up to other code to ensure
+                        * that no commands other than ROLLBACK will be processed in these
+                        * states.)
                         */
                case TBLOCK_ABORT:
                case TBLOCK_SUBABORT:
@@ -1658,12 +2188,14 @@ StartTransactionCommand(void)
                case TBLOCK_SUBBEGIN:
                case TBLOCK_END:
                case TBLOCK_SUBEND:
-               case TBLOCK_SUBENDABORT_ALL:
-               case TBLOCK_SUBENDABORT:
+               case TBLOCK_ABORT_END:
+               case TBLOCK_SUBABORT_END:
+               case TBLOCK_ABORT_PENDING:
                case TBLOCK_SUBABORT_PENDING:
-               case TBLOCK_SUBENDABORT_RELEASE:
-               case TBLOCK_ENDABORT:
-                       elog(FATAL, "StartTransactionCommand: unexpected state %s",
+               case TBLOCK_SUBRESTART:
+               case TBLOCK_SUBABORT_RESTART:
+               case TBLOCK_PREPARE:
+                       elog(ERROR, "StartTransactionCommand: unexpected state %s",
                                 BlockStateAsString(s->blockState));
                        break;
        }
@@ -1689,18 +2221,16 @@ CommitTransactionCommand(void)
                        /*
                         * This shouldn't happen, because it means the previous
                         * StartTransactionCommand didn't set the STARTED state
-                        * appropriately, or we didn't manage previous pending
-                        * abort states.
+                        * appropriately.
                         */
                case TBLOCK_DEFAULT:
-               case TBLOCK_SUBABORT_PENDING:
                        elog(FATAL, "CommitTransactionCommand: unexpected state %s",
                                 BlockStateAsString(s->blockState));
                        break;
 
                        /*
                         * If we aren't in a transaction block, just do our usual
-                        * transaction commit.
+                        * transaction commit, and return to the idle state.
                         */
                case TBLOCK_STARTED:
                        CommitTransaction();
@@ -1708,10 +2238,10 @@ CommitTransactionCommand(void)
                        break;
 
                        /*
-                        * This is the case right after we get a "BEGIN TRANSACTION"
-                        * command, but the user hasn't done anything else yet, so we
-                        * change to the "transaction block in progress" state and
-                        * return.
+                        * We are completing a "BEGIN TRANSACTION" command, so we change
+                        * to the "transaction block in progress" state and return.  (We
+                        * assume the BEGIN did nothing to the database, so we need no
+                        * CommandCounterIncrement.)
                         */
                case TBLOCK_BEGIN:
                        s->blockState = TBLOCK_INPROGRESS;
@@ -1719,113 +2249,146 @@ CommitTransactionCommand(void)
 
                        /*
                         * This is the case when we have finished executing a command
-                        * someplace within a transaction block.  We increment the
-                        * command counter and return.
+                        * someplace within a transaction block.  We increment the command
+                        * counter and return.
                         */
                case TBLOCK_INPROGRESS:
+               case TBLOCK_SUBINPROGRESS:
                        CommandCounterIncrement();
                        break;
 
                        /*
-                        * This is the case when we just got the "END TRANSACTION"
-                        * statement, so we commit the transaction and go back to the
-                        * default state.
+                        * We are completing a "COMMIT" command.  Do it and return to the
+                        * idle state.
                         */
                case TBLOCK_END:
-                       /* commit all open subtransactions */
-                       if (s->nestingLevel > 1)
-                               CommitTransactionToLevel(2);
-                       s = CurrentTransactionState;
-                       Assert(s->parent == NULL);
-                       /* and now the outer transaction */
                        CommitTransaction();
                        s->blockState = TBLOCK_DEFAULT;
                        break;
 
                        /*
-                        * Here we are in the middle of a transaction block but one of
-                        * the commands caused an abort so we do nothing but remain in
-                        * the abort state.  Eventually we will get to the "END
-                        * TRANSACTION" which will set things straight.
+                        * Here we are in the middle of a transaction block but one of the
+                        * commands caused an abort so we do nothing but remain in the
+                        * abort state.  Eventually we will get a ROLLBACK command.
                         */
                case TBLOCK_ABORT:
+               case TBLOCK_SUBABORT:
                        break;
 
                        /*
-                        * Here we were in an aborted transaction block which just
-                        * processed the "END TRANSACTION" command from the user, so
-                        * clean up and return to the default state.
+                        * Here we were in an aborted transaction block and we just got
+                        * the ROLLBACK command from the user, so clean up the
+                        * already-aborted transaction and return to the idle state.
                         */
-               case TBLOCK_ENDABORT:
+               case TBLOCK_ABORT_END:
                        CleanupTransaction();
                        s->blockState = TBLOCK_DEFAULT;
                        break;
 
                        /*
-                        * Ditto, but in a subtransaction.  AbortOutOfAnyTransaction
-                        * will do the dirty work.
+                        * Here we were in a perfectly good transaction block but the user
+                        * told us to ROLLBACK anyway.  We have to abort the transaction
+                        * and then clean up.
                         */
-               case TBLOCK_SUBENDABORT_ALL:
-                       AbortOutOfAnyTransaction();
-                       s = CurrentTransactionState;            /* changed by AbortOutOfAnyTransaction */
-                       /* AbortOutOfAnyTransaction sets the blockState */
+               case TBLOCK_ABORT_PENDING:
+                       AbortTransaction();
+                       CleanupTransaction();
+                       s->blockState = TBLOCK_DEFAULT;
                        break;
 
                        /*
-                        * We were just issued a SAVEPOINT inside a transaction block.
-                        * Start a subtransaction.  (BeginTransactionBlock already
-                        * did PushTransaction, so as to have someplace to put the
-                        * SUBBEGIN state.)
+                        * We are completing a "PREPARE TRANSACTION" command.  Do it and
+                        * return to the idle state.
                         */
-               case TBLOCK_SUBBEGIN:
-                       StartSubTransaction();
-                       s->blockState = TBLOCK_SUBINPROGRESS;
+               case TBLOCK_PREPARE:
+                       PrepareTransaction();
+                       s->blockState = TBLOCK_DEFAULT;
                        break;
 
                        /*
-                        * Inside a subtransaction, increment the command counter.
+                        * We were just issued a SAVEPOINT inside a transaction block.
+                        * Start a subtransaction.      (DefineSavepoint already did
+                        * PushTransaction, so as to have someplace to put the SUBBEGIN
+                        * state.)
                         */
-               case TBLOCK_SUBINPROGRESS:
-                       CommandCounterIncrement();
+               case TBLOCK_SUBBEGIN:
+                       StartSubTransaction();
+                       s->blockState = TBLOCK_SUBINPROGRESS;
                        break;
 
                        /*
-                        * We were issued a RELEASE command, so we end the current
-                        * subtransaction and return to the parent transaction.
+                        * We were issued a COMMIT or RELEASE command, so we end the
+                        * current subtransaction and return to the parent transaction.
+                        * The parent might be ended too, so repeat till we are all the
+                        * way out or find an INPROGRESS transaction.
                         */
                case TBLOCK_SUBEND:
-                       CommitSubTransaction();
-                       PopTransaction();
-                       s = CurrentTransactionState;            /* changed by pop */
+                       do
+                       {
+                               CommitSubTransaction();
+                               s = CurrentTransactionState;    /* changed by pop */
+                       } while (s->blockState == TBLOCK_SUBEND);
+                       /* If we had a COMMIT command, finish off the main xact too */
+                       if (s->blockState == TBLOCK_END)
+                       {
+                               Assert(s->parent == NULL);
+                               CommitTransaction();
+                               s->blockState = TBLOCK_DEFAULT;
+                       }
+                       else if (s->blockState == TBLOCK_PREPARE)
+                       {
+                               Assert(s->parent == NULL);
+                               PrepareTransaction();
+                               s->blockState = TBLOCK_DEFAULT;
+                       }
+                       else
+                       {
+                               Assert(s->blockState == TBLOCK_INPROGRESS ||
+                                          s->blockState == TBLOCK_SUBINPROGRESS);
+                       }
                        break;
 
                        /*
-                        * If we are in an aborted subtransaction, do nothing.
+                        * The current already-failed subtransaction is ending due to a
+                        * ROLLBACK or ROLLBACK TO command, so pop it and recursively
+                        * examine the parent (which could be in any of several states).
                         */
-               case TBLOCK_SUBABORT:
+               case TBLOCK_SUBABORT_END:
+                       CleanupSubTransaction();
+                       CommitTransactionCommand();
                        break;
 
                        /*
-                        * The current subtransaction is ending.  Do the equivalent
-                        * of a ROLLBACK TO followed by a RELEASE command.
+                        * As above, but it's not dead yet, so abort first.
                         */
-               case TBLOCK_SUBENDABORT_RELEASE:
-                       CleanupAbortedSubTransactions(false);
+               case TBLOCK_SUBABORT_PENDING:
+                       AbortSubTransaction();
+                       CleanupSubTransaction();
+                       CommitTransactionCommand();
                        break;
 
                        /*
-                        * The current subtransaction is ending due to a ROLLBACK
-                        * TO command, so close all savepoints up to the target
-                        * level.  When finished, recreate the savepoint.
+                        * The current subtransaction is the target of a ROLLBACK TO
+                        * command.  Abort and pop it, then start a new subtransaction
+                        * with the same name.
                         */
-               case TBLOCK_SUBENDABORT:
+               case TBLOCK_SUBRESTART:
                        {
-                               char *name = CleanupAbortedSubTransactions(true);
+                               char       *name;
+                               int                     savepointLevel;
+
+                               /* save name and keep Cleanup from freeing it */
+                               name = s->name;
+                               s->name = NULL;
+                               savepointLevel = s->savepointLevel;
+
+                               AbortSubTransaction();
+                               CleanupSubTransaction();
 
-                               Assert(PointerIsValid(name));
-                               DefineSavepoint(name);
-                               s = CurrentTransactionState; /* changed by DefineSavepoint */
-                               pfree(name);
+                               DefineSavepoint(NULL);
+                               s = CurrentTransactionState;    /* changed by push */
+                               s->name = name;
+                               s->savepointLevel = savepointLevel;
 
                                /* This is the same as TBLOCK_SUBBEGIN case */
                                AssertState(s->blockState == TBLOCK_SUBBEGIN);
@@ -1833,54 +2396,35 @@ CommitTransactionCommand(void)
                                s->blockState = TBLOCK_SUBINPROGRESS;
                        }
                        break;
-       }
-}
-
-/*
- * CleanupAbortedSubTransactions
- *
- * Helper function for CommitTransactionCommand.  Aborts and cleans up
- * dead subtransactions after a ROLLBACK TO command.  Optionally returns
- * the name of the last dead subtransaction so it can be reused to redefine
- * the savepoint.  (Caller is responsible for pfree'ing the result.)
- */
-static char *
-CleanupAbortedSubTransactions(bool returnName)
-{
-       TransactionState s = CurrentTransactionState;
-       char *name = NULL;
-       
-       AssertState(PointerIsValid(s->parent));
-       Assert(s->parent->blockState == TBLOCK_SUBINPROGRESS ||
-                  s->parent->blockState == TBLOCK_INPROGRESS ||
-                  s->parent->blockState == TBLOCK_SUBABORT_PENDING);
 
-       /*
-        * Abort everything up to the target level.  The current
-        * subtransaction only needs cleanup.  If we need to save the name,
-        * look for the last subtransaction in TBLOCK_SUBABORT_PENDING state.
-        */
-       if (returnName && s->parent->blockState != TBLOCK_SUBABORT_PENDING)
-               name = MemoryContextStrdup(TopMemoryContext, s->name);
+                       /*
+                        * Same as above, but the subtransaction had already failed, so we
+                        * don't need AbortSubTransaction.
+                        */
+               case TBLOCK_SUBABORT_RESTART:
+                       {
+                               char       *name;
+                               int                     savepointLevel;
 
-       CleanupSubTransaction();
-       PopTransaction();
-       s = CurrentTransactionState;            /* changed by pop */
+                               /* save name and keep Cleanup from freeing it */
+                               name = s->name;
+                               s->name = NULL;
+                               savepointLevel = s->savepointLevel;
 
-       while (s->blockState == TBLOCK_SUBABORT_PENDING)
-       {
-               AbortSubTransaction();
-               if (returnName && s->parent->blockState != TBLOCK_SUBABORT_PENDING)
-                       name = MemoryContextStrdup(TopMemoryContext, s->name);
-               CleanupSubTransaction();
-               PopTransaction();
-               s = CurrentTransactionState;
-       }
+                               CleanupSubTransaction();
 
-       AssertState(s->blockState == TBLOCK_SUBINPROGRESS ||
-                               s->blockState == TBLOCK_INPROGRESS);
+                               DefineSavepoint(NULL);
+                               s = CurrentTransactionState;    /* changed by push */
+                               s->name = name;
+                               s->savepointLevel = savepointLevel;
 
-       return name;
+                               /* This is the same as TBLOCK_SUBBEGIN case */
+                               AssertState(s->blockState == TBLOCK_SUBBEGIN);
+                               StartSubTransaction();
+                               s->blockState = TBLOCK_SUBINPROGRESS;
+                       }
+                       break;
+       }
 }
 
 /*
@@ -1893,15 +2437,30 @@ AbortCurrentTransaction(void)
 
        switch (s->blockState)
        {
-               /*
-                * we aren't in a transaction, so we do nothing.
-                */
                case TBLOCK_DEFAULT:
+                       if (s->state == TRANS_DEFAULT)
+                       {
+                               /* we are idle, so nothing to do */
+                       }
+                       else
+                       {
+                               /*
+                                * We can get here after an error during transaction start
+                                * (state will be TRANS_START).  Need to clean up the
+                                * incompletely started transaction.  First, adjust the
+                                * low-level state to suppress warning message from
+                                * AbortTransaction.
+                                */
+                               if (s->state == TRANS_START)
+                                       s->state = TRANS_INPROGRESS;
+                               AbortTransaction();
+                               CleanupTransaction();
+                       }
                        break;
 
                        /*
-                        * if we aren't in a transaction block, we just do the basic
-                        * abort & cleanup transaction.
+                        * if we aren't in a transaction block, we just do the basic abort
+                        * & cleanup transaction.
                         */
                case TBLOCK_STARTED:
                        AbortTransaction();
@@ -1910,33 +2469,33 @@ AbortCurrentTransaction(void)
                        break;
 
                        /*
-                        * If we are in TBLOCK_BEGIN it means something screwed up
-                        * right after reading "BEGIN TRANSACTION" so we enter the
-                        * abort state.  Eventually an "END TRANSACTION" will fix
-                        * things.
+                        * If we are in TBLOCK_BEGIN it means something screwed up right
+                        * after reading "BEGIN TRANSACTION".  We assume that the user
+                        * will interpret the error as meaning the BEGIN failed to get him
+                        * into a transaction block, so we should abort and return to idle
+                        * state.
                         */
                case TBLOCK_BEGIN:
                        AbortTransaction();
-                       s->blockState = TBLOCK_ABORT;
-                       /* CleanupTransaction happens when we exit TBLOCK_ENDABORT */
+                       CleanupTransaction();
+                       s->blockState = TBLOCK_DEFAULT;
                        break;
 
                        /*
-                        * This is the case when we are somewhere in a transaction block
-                        * and we've gotten a failure, so we abort the transaction and
-                        * set up the persistent ABORT state.  We will stay in ABORT
-                        * until we get an "END TRANSACTION".
+                        * We are somewhere in a transaction block and we've gotten a
+                        * failure, so we abort the transaction and set up the persistent
+                        * ABORT state.  We will stay in ABORT until we get a ROLLBACK.
                         */
                case TBLOCK_INPROGRESS:
                        AbortTransaction();
                        s->blockState = TBLOCK_ABORT;
-                       /* CleanupTransaction happens when we exit TBLOCK_ENDABORT */
+                       /* CleanupTransaction happens when we exit TBLOCK_ABORT_END */
                        break;
 
                        /*
-                        * Here, the system was fouled up just after the user wanted
-                        * to end the transaction block so we abort the transaction
-                        * and return to the default state.
+                        * Here, we failed while trying to COMMIT.      Clean up the
+                        * transaction and return to idle state (we do not want to stay in
+                        * the transaction).
                         */
                case TBLOCK_END:
                        AbortTransaction();
@@ -1945,75 +2504,77 @@ AbortCurrentTransaction(void)
                        break;
 
                        /*
-                        * Here, we are already in an aborted transaction state and
-                        * are waiting for an "END TRANSACTION" to come along and lo
-                        * and behold, we abort again! So we just remain in the abort
-                        * state.
+                        * Here, we are already in an aborted transaction state and are
+                        * waiting for a ROLLBACK, but for some reason we failed again! So
+                        * we just remain in the abort state.
                         */
                case TBLOCK_ABORT:
                case TBLOCK_SUBABORT:
                        break;
 
                        /*
-                        * Here we were in an aborted transaction block which just
-                        * processed the "END TRANSACTION" command but somehow aborted
-                        * again.. since we must have done the abort processing, we
-                        * clean up and return to the default state.
+                        * We are in a failed transaction and we got the ROLLBACK command.
+                        * We have already aborted, we just need to cleanup and go to idle
+                        * state.
                         */
-               case TBLOCK_ENDABORT:
+               case TBLOCK_ABORT_END:
                        CleanupTransaction();
                        s->blockState = TBLOCK_DEFAULT;
                        break;
 
                        /*
-                        * If we are just starting a subtransaction, put it
-                        * in aborted state.
+                        * We are in a live transaction and we got a ROLLBACK command.
+                        * Abort, cleanup, go to idle state.
                         */
-               case TBLOCK_SUBBEGIN:
-                       StartAbortedSubTransaction();
-                       s->blockState = TBLOCK_SUBABORT;
+               case TBLOCK_ABORT_PENDING:
+                       AbortTransaction();
+                       CleanupTransaction();
+                       s->blockState = TBLOCK_DEFAULT;
+                       break;
+
+                       /*
+                        * Here, we failed while trying to PREPARE.  Clean up the
+                        * transaction and return to idle state (we do not want to stay in
+                        * the transaction).
+                        */
+               case TBLOCK_PREPARE:
+                       AbortTransaction();
+                       CleanupTransaction();
+                       s->blockState = TBLOCK_DEFAULT;
                        break;
 
+                       /*
+                        * We got an error inside a subtransaction.  Abort just the
+                        * subtransaction, and go to the persistent SUBABORT state until
+                        * we get ROLLBACK.
+                        */
                case TBLOCK_SUBINPROGRESS:
                        AbortSubTransaction();
                        s->blockState = TBLOCK_SUBABORT;
                        break;
 
                        /*
-                        * If we are aborting an ending transaction,
-                        * we have to abort the parent transaction too.
+                        * If we failed while trying to create a subtransaction, clean up
+                        * the broken subtransaction and abort the parent.      The same
+                        * applies if we get a failure while ending a subtransaction.
                         */
+               case TBLOCK_SUBBEGIN:
                case TBLOCK_SUBEND:
                case TBLOCK_SUBABORT_PENDING:
+               case TBLOCK_SUBRESTART:
                        AbortSubTransaction();
                        CleanupSubTransaction();
-                       PopTransaction();
-                       s = CurrentTransactionState;            /* changed by pop */
-                       Assert(s->blockState != TBLOCK_SUBEND &&
-                                       s->blockState != TBLOCK_SUBENDABORT);
                        AbortCurrentTransaction();
                        break;
 
                        /*
                         * Same as above, except the Abort() was already done.
                         */
-               case TBLOCK_SUBENDABORT:
-               case TBLOCK_SUBENDABORT_RELEASE:
+               case TBLOCK_SUBABORT_END:
+               case TBLOCK_SUBABORT_RESTART:
                        CleanupSubTransaction();
-                       PopTransaction();
-                       s = CurrentTransactionState;            /* changed by pop */
-                       Assert(s->blockState != TBLOCK_SUBEND &&
-                                       s->blockState != TBLOCK_SUBENDABORT);
                        AbortCurrentTransaction();
                        break;
-
-                       /*
-                        * We are already aborting the whole transaction tree.
-                        * Do nothing, CommitTransactionCommand will call
-                        * AbortOutOfAnyTransaction and set things straight.
-                        */
-               case TBLOCK_SUBENDABORT_ALL:
-                       break;
        }
 }
 
@@ -2029,12 +2590,14 @@ AbortCurrentTransaction(void)
  *     could issue more commands and possibly cause a failure after the statement
  *     completes).  Subtransactions are verboten too.
  *
- *     stmtNode: pointer to parameter block for statement; this is used in
- *     a very klugy way to determine whether we are inside a function.
- *     stmtType: statement type name for error messages.
+ *     isTopLevel: passed down from ProcessUtility to determine whether we are
+ *     inside a function or multi-query querystring.  (We will always fail if
+ *     this is false, but it's convenient to centralize the check here instead of
+ *     making callers do it.)
+ *     stmtType: statement type name, for error messages.
  */
 void
-PreventTransactionChain(void *stmtNode, const char *stmtType)
+PreventTransactionChain(bool isTopLevel, const char *stmtType)
 {
        /*
         * xact block already started?
@@ -2057,15 +2620,14 @@ PreventTransactionChain(void *stmtNode, const char *stmtType)
                                                stmtType)));
 
        /*
-        * Are we inside a function call?  If the statement's parameter block
-        * was allocated in QueryContext, assume it is an interactive command.
-        * Otherwise assume it is coming from a function.
+        * inside a function call?
         */
-       if (!MemoryContextContains(QueryContext, stmtNode))
+       if (!isTopLevel)
                ereport(ERROR,
                                (errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
                /* translator: %s represents an SQL statement name */
-                        errmsg("%s cannot be executed from a function", stmtType)));
+                                errmsg("%s cannot be executed from a function or multi-command string",
+                                               stmtType)));
 
        /* If we got past IsTransactionBlock test, should be in default state */
        if (CurrentTransactionState->blockState != TBLOCK_DEFAULT &&
@@ -2087,12 +2649,12 @@ PreventTransactionChain(void *stmtNode, const char *stmtType)
  *     use of the current statement's results.  Likewise subtransactions.
  *     Thus this is an inverse for PreventTransactionChain.
  *
- *     stmtNode: pointer to parameter block for statement; this is used in
- *     a very klugy way to determine whether we are inside a function.
- *     stmtType: statement type name for error messages.
+ *     isTopLevel: passed down from ProcessUtility to determine whether we are
+ *     inside a function.
+ *     stmtType: statement type name, for error messages.
  */
 void
-RequireTransactionChain(void *stmtNode, const char *stmtType)
+RequireTransactionChain(bool isTopLevel, const char *stmtType)
 {
        /*
         * xact block already started?
@@ -2107,16 +2669,15 @@ RequireTransactionChain(void *stmtNode, const char *stmtType)
                return;
 
        /*
-        * Are we inside a function call?  If the statement's parameter block
-        * was allocated in QueryContext, assume it is an interactive command.
-        * Otherwise assume it is coming from a function.
+        * inside a function call?
         */
-       if (!MemoryContextContains(QueryContext, stmtNode))
+       if (!isTopLevel)
                return;
+
        ereport(ERROR,
                        (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION),
        /* translator: %s represents an SQL statement name */
-                        errmsg("%s may only be used in transaction blocks",
+                        errmsg("%s can only be used in transaction blocks",
                                        stmtType)));
 }
 
@@ -2127,11 +2688,11 @@ RequireTransactionChain(void *stmtNode, const char *stmtType)
  *     a transaction block than when running as single commands.  ANALYZE is
  *     currently the only example.
  *
- *     stmtNode: pointer to parameter block for statement; this is used in
- *     a very klugy way to determine whether we are inside a function.
+ *     isTopLevel: passed down from ProcessUtility to determine whether we are
+ *     inside a function.
  */
 bool
-IsInTransactionChain(void *stmtNode)
+IsInTransactionChain(bool isTopLevel)
 {
        /*
         * Return true on same conditions that would make PreventTransactionChain
@@ -2143,7 +2704,7 @@ IsInTransactionChain(void *stmtNode)
        if (IsSubTransaction())
                return true;
 
-       if (!MemoryContextContains(QueryContext, stmtNode))
+       if (!isTopLevel)
                return true;
 
        if (CurrentTransactionState->blockState != TBLOCK_DEFAULT &&
@@ -2155,43 +2716,44 @@ IsInTransactionChain(void *stmtNode)
 
 
 /*
- * Register or deregister callback functions for end-of-xact cleanup
+ * Register or deregister callback functions for start- and end-of-xact
+ * operations.
  *
  * These functions are intended for use by dynamically loaded modules.
  * For built-in modules we generally just hardwire the appropriate calls
  * (mainly because it's easier to control the order that way, where needed).
  *
- * Note that the callback occurs post-commit or post-abort, so the callback
- * functions can only do noncritical cleanup.
+ * At transaction end, the callback occurs post-commit or post-abort, so the
+ * callback functions can only do noncritical cleanup.
  */
 void
-RegisterEOXactCallback(EOXactCallback callback, void *arg)
+RegisterXactCallback(XactCallback callback, void *arg)
 {
-       EOXactCallbackItem *item;
+       XactCallbackItem *item;
 
-       item = (EOXactCallbackItem *)
-               MemoryContextAlloc(TopMemoryContext, sizeof(EOXactCallbackItem));
+       item = (XactCallbackItem *)
+               MemoryContextAlloc(TopMemoryContext, sizeof(XactCallbackItem));
        item->callback = callback;
        item->arg = arg;
-       item->next = EOXact_callbacks;
-       EOXact_callbacks = item;
+       item->next = Xact_callbacks;
+       Xact_callbacks = item;
 }
 
 void
-UnregisterEOXactCallback(EOXactCallback callback, void *arg)
+UnregisterXactCallback(XactCallback callback, void *arg)
 {
-       EOXactCallbackItem *item;
-       EOXactCallbackItem *prev;
+       XactCallbackItem *item;
+       XactCallbackItem *prev;
 
        prev = NULL;
-       for (item = EOXact_callbacks; item; prev = item, item = item->next)
+       for (item = Xact_callbacks; item; prev = item, item = item->next)
        {
                if (item->callback == callback && item->arg == arg)
                {
                        if (prev)
                                prev->next = item->next;
                        else
-                               EOXact_callbacks = item->next;
+                               Xact_callbacks = item->next;
                        pfree(item);
                        break;
                }
@@ -2199,16 +2761,71 @@ UnregisterEOXactCallback(EOXactCallback callback, void *arg)
 }
 
 static void
-CallEOXactCallbacks(bool isCommit)
+CallXactCallbacks(XactEvent event)
+{
+       XactCallbackItem *item;
+
+       for (item = Xact_callbacks; item; item = item->next)
+               (*item->callback) (event, item->arg);
+}
+
+
+/*
+ * Register or deregister callback functions for start- and end-of-subxact
+ * operations.
+ *
+ * Pretty much same as above, but for subtransaction events.
+ *
+ * At subtransaction end, the callback occurs post-subcommit or post-subabort,
+ * so the callback functions can only do noncritical cleanup.  At
+ * subtransaction start, the callback is called when the subtransaction has
+ * finished initializing.
+ */
+void
+RegisterSubXactCallback(SubXactCallback callback, void *arg)
+{
+       SubXactCallbackItem *item;
+
+       item = (SubXactCallbackItem *)
+               MemoryContextAlloc(TopMemoryContext, sizeof(SubXactCallbackItem));
+       item->callback = callback;
+       item->arg = arg;
+       item->next = SubXact_callbacks;
+       SubXact_callbacks = item;
+}
+
+void
+UnregisterSubXactCallback(SubXactCallback callback, void *arg)
 {
-       EOXactCallbackItem *item;
+       SubXactCallbackItem *item;
+       SubXactCallbackItem *prev;
 
-       for (item = EOXact_callbacks; item; item = item->next)
+       prev = NULL;
+       for (item = SubXact_callbacks; item; prev = item, item = item->next)
        {
-               (*item->callback) (isCommit, item->arg);
+               if (item->callback == callback && item->arg == arg)
+               {
+                       if (prev)
+                               prev->next = item->next;
+                       else
+                               SubXact_callbacks = item->next;
+                       pfree(item);
+                       break;
+               }
        }
 }
 
+static void
+CallSubXactCallbacks(SubXactEvent event,
+                                        SubTransactionId mySubid,
+                                        SubTransactionId parentSubid)
+{
+       SubXactCallbackItem *item;
+
+       for (item = SubXact_callbacks; item; item = item->next)
+               (*item->callback) (event, mySubid, parentSubid, item->arg);
+}
+
 
 /* ----------------------------------------------------------------
  *                                        transaction block support
@@ -2227,8 +2844,7 @@ BeginTransactionBlock(void)
        switch (s->blockState)
        {
                        /*
-                        * We are not inside a transaction block, so allow one
-                        * to begin.
+                        * We are not inside a transaction block, so allow one to begin.
                         */
                case TBLOCK_STARTED:
                        s->blockState = TBLOCK_BEGIN;
@@ -2246,17 +2862,19 @@ BeginTransactionBlock(void)
                                         errmsg("there is already a transaction in progress")));
                        break;
 
-                       /* These cases are invalid.  Reject them altogether. */
+                       /* These cases are invalid. */
                case TBLOCK_DEFAULT:
                case TBLOCK_BEGIN:
                case TBLOCK_SUBBEGIN:
-               case TBLOCK_ENDABORT:
                case TBLOCK_END:
-               case TBLOCK_SUBENDABORT_ALL:
-               case TBLOCK_SUBENDABORT:
-               case TBLOCK_SUBABORT_PENDING:
-               case TBLOCK_SUBENDABORT_RELEASE:
                case TBLOCK_SUBEND:
+               case TBLOCK_ABORT_END:
+               case TBLOCK_SUBABORT_END:
+               case TBLOCK_ABORT_PENDING:
+               case TBLOCK_SUBABORT_PENDING:
+               case TBLOCK_SUBRESTART:
+               case TBLOCK_SUBABORT_RESTART:
+               case TBLOCK_PREPARE:
                        elog(FATAL, "BeginTransactionBlock: unexpected state %s",
                                 BlockStateAsString(s->blockState));
                        break;
@@ -2264,77 +2882,167 @@ BeginTransactionBlock(void)
 }
 
 /*
- *     EndTransactionBlock
- *             This executes a COMMIT command.
+ *     PrepareTransactionBlock
+ *             This executes a PREPARE command.
  *
- * Since COMMIT may actually do a ROLLBACK, the result indicates what
- * happened: TRUE for COMMIT, FALSE for ROLLBACK.
+ * Since PREPARE may actually do a ROLLBACK, the result indicates what
+ * happened: TRUE for PREPARE, FALSE for ROLLBACK.
+ *
+ * Note that we don't actually do anything here except change blockState.
+ * The real work will be done in the upcoming PrepareTransaction().
+ * We do it this way because it's not convenient to change memory context,
+ * resource owner, etc while executing inside a Portal.
  */
 bool
-EndTransactionBlock(void)
+PrepareTransactionBlock(char *gid)
 {
-       TransactionState s = CurrentTransactionState;
+       TransactionState s;
+       bool            result;
+
+       /* Set up to commit the current transaction */
+       result = EndTransactionBlock();
+
+       /* If successful, change outer tblock state to PREPARE */
+       if (result)
+       {
+               s = CurrentTransactionState;
+
+               while (s->parent != NULL)
+                       s = s->parent;
+
+               if (s->blockState == TBLOCK_END)
+               {
+                       /* Save GID where PrepareTransaction can find it again */
+                       prepareGID = MemoryContextStrdup(TopTransactionContext, gid);
+
+                       s->blockState = TBLOCK_PREPARE;
+               }
+               else
+               {
+                       /*
+                        * ignore case where we are not in a transaction;
+                        * EndTransactionBlock already issued a warning.
+                        */
+                       Assert(s->blockState == TBLOCK_STARTED);
+                       /* Don't send back a PREPARE result tag... */
+                       result = false;
+               }
+       }
+
+       return result;
+}
+
+/*
+ *     EndTransactionBlock
+ *             This executes a COMMIT command.
+ *
+ * Since COMMIT may actually do a ROLLBACK, the result indicates what
+ * happened: TRUE for COMMIT, FALSE for ROLLBACK.
+ *
+ * Note that we don't actually do anything here except change blockState.
+ * The real work will be done in the upcoming CommitTransactionCommand().
+ * We do it this way because it's not convenient to change memory context,
+ * resource owner, etc while executing inside a Portal.
+ */
+bool
+EndTransactionBlock(void)
+{
+       TransactionState s = CurrentTransactionState;
        bool            result = false;
 
        switch (s->blockState)
        {
-               /*
-                * We are in a transaction block which should commit when we
-                * get to the upcoming CommitTransactionCommand() so we set the
-                * state to "END".      CommitTransactionCommand() will recognize this
-                * and commit the transaction and return us to the default state.
-                */
+                       /*
+                        * We are in a transaction block, so tell CommitTransactionCommand
+                        * to COMMIT.
+                        */
                case TBLOCK_INPROGRESS:
-               case TBLOCK_SUBINPROGRESS:
                        s->blockState = TBLOCK_END;
                        result = true;
                        break;
 
                        /*
-                        * We are in a transaction block which aborted. Since the
-                        * AbortTransaction() was already done, we need only
-                        * change to the special "END ABORT" state.  The upcoming
-                        * CommitTransactionCommand() will recognise this and then put us
-                        * back in the default state.
+                        * We are in a failed transaction block.  Tell
+                        * CommitTransactionCommand it's time to exit the block.
                         */
                case TBLOCK_ABORT:
-                       s->blockState = TBLOCK_ENDABORT;
+                       s->blockState = TBLOCK_ABORT_END;
+                       break;
+
+                       /*
+                        * We are in a live subtransaction block.  Set up to subcommit all
+                        * open subtransactions and then commit the main transaction.
+                        */
+               case TBLOCK_SUBINPROGRESS:
+                       while (s->parent != NULL)
+                       {
+                               if (s->blockState == TBLOCK_SUBINPROGRESS)
+                                       s->blockState = TBLOCK_SUBEND;
+                               else
+                                       elog(FATAL, "EndTransactionBlock: unexpected state %s",
+                                                BlockStateAsString(s->blockState));
+                               s = s->parent;
+                       }
+                       if (s->blockState == TBLOCK_INPROGRESS)
+                               s->blockState = TBLOCK_END;
+                       else
+                               elog(FATAL, "EndTransactionBlock: unexpected state %s",
+                                        BlockStateAsString(s->blockState));
+                       result = true;
                        break;
 
                        /*
-                        * Here we are inside an aborted subtransaction.  Go to the "abort
-                        * the whole tree" state so that CommitTransactionCommand() calls
-                        * AbortOutOfAnyTransaction.
+                        * Here we are inside an aborted subtransaction.  Treat the COMMIT
+                        * as ROLLBACK: set up to abort everything and exit the main
+                        * transaction.
                         */
                case TBLOCK_SUBABORT:
-                       s->blockState = TBLOCK_SUBENDABORT_ALL;
+                       while (s->parent != NULL)
+                       {
+                               if (s->blockState == TBLOCK_SUBINPROGRESS)
+                                       s->blockState = TBLOCK_SUBABORT_PENDING;
+                               else if (s->blockState == TBLOCK_SUBABORT)
+                                       s->blockState = TBLOCK_SUBABORT_END;
+                               else
+                                       elog(FATAL, "EndTransactionBlock: unexpected state %s",
+                                                BlockStateAsString(s->blockState));
+                               s = s->parent;
+                       }
+                       if (s->blockState == TBLOCK_INPROGRESS)
+                               s->blockState = TBLOCK_ABORT_PENDING;
+                       else if (s->blockState == TBLOCK_ABORT)
+                               s->blockState = TBLOCK_ABORT_END;
+                       else
+                               elog(FATAL, "EndTransactionBlock: unexpected state %s",
+                                        BlockStateAsString(s->blockState));
                        break;
 
-               case TBLOCK_STARTED:
                        /*
-                        * here, the user issued COMMIT when not inside a
-                        * transaction. Issue a WARNING and go to abort state.  The
-                        * upcoming call to CommitTransactionCommand() will then put us
-                        * back into the default state.
+                        * The user issued COMMIT when not inside a transaction.  Issue a
+                        * WARNING, staying in TBLOCK_STARTED state.  The upcoming call to
+                        * CommitTransactionCommand() will then close the transaction and
+                        * put us back into the default state.
                         */
+               case TBLOCK_STARTED:
                        ereport(WARNING,
                                        (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION),
                                         errmsg("there is no transaction in progress")));
-                       AbortTransaction();
-                       s->blockState = TBLOCK_ENDABORT;
+                       result = true;
                        break;
 
-                       /* these cases are invalid. */
+                       /* These cases are invalid. */
                case TBLOCK_DEFAULT:
                case TBLOCK_BEGIN:
-               case TBLOCK_ENDABORT:
-               case TBLOCK_END:
                case TBLOCK_SUBBEGIN:
+               case TBLOCK_END:
                case TBLOCK_SUBEND:
-               case TBLOCK_SUBENDABORT_ALL:
-               case TBLOCK_SUBENDABORT:
+               case TBLOCK_ABORT_END:
+               case TBLOCK_SUBABORT_END:
+               case TBLOCK_ABORT_PENDING:
                case TBLOCK_SUBABORT_PENDING:
-               case TBLOCK_SUBENDABORT_RELEASE:
+               case TBLOCK_SUBRESTART:
+               case TBLOCK_SUBABORT_RESTART:
+               case TBLOCK_PREPARE:
                        elog(FATAL, "EndTransactionBlock: unexpected state %s",
                                 BlockStateAsString(s->blockState));
                        break;
@@ -2346,6 +3054,8 @@ EndTransactionBlock(void)
 /*
  *     UserAbortTransactionBlock
  *             This executes a ROLLBACK command.
+ *
+ * As above, we don't actually do anything here except change blockState.
  */
 void
 UserAbortTransactionBlock(void)
@@ -2355,52 +3065,53 @@ UserAbortTransactionBlock(void)
        switch (s->blockState)
        {
                        /*
-                        * We are inside a failed transaction block and we got an
-                        * abort command from the user.  Abort processing is already
-                        * done, we just need to move to the ENDABORT state so we will
-                        * end up in the default state after the upcoming
-                        * CommitTransactionCommand().
-                        */
-               case TBLOCK_ABORT:
-                       s->blockState = TBLOCK_ENDABORT;
-                       break;
-
-                       /*
-                        * We are inside a failed subtransaction and we got an
-                        * abort command from the user.  Abort processing is already
-                        * done, so go to the "abort all" state and
-                        * CommitTransactionCommand will call AbortOutOfAnyTransaction
-                        * to set things straight.
+                        * We are inside a transaction block and we got a ROLLBACK command
+                        * from the user, so tell CommitTransactionCommand to abort and
+                        * exit the transaction block.
                         */
-               case TBLOCK_SUBABORT:
-                       s->blockState = TBLOCK_SUBENDABORT_ALL;
+               case TBLOCK_INPROGRESS:
+                       s->blockState = TBLOCK_ABORT_PENDING;
                        break;
 
                        /*
-                        * We are inside a transaction block and we got an abort
-                        * command from the user, so we move to the ENDABORT state and
-                        * do abort processing so we will end up in the default state
-                        * after the upcoming CommitTransactionCommand().
+                        * We are inside a failed transaction block and we got a ROLLBACK
+                        * command from the user.  Abort processing is already done, so
+                        * CommitTransactionCommand just has to clean up and go back to
+                        * idle state.
                         */
-               case TBLOCK_INPROGRESS:
-                       AbortTransaction();
-                       s->blockState = TBLOCK_ENDABORT;
+               case TBLOCK_ABORT:
+                       s->blockState = TBLOCK_ABORT_END;
                        break;
 
                        /*
-                        * We are inside a subtransaction.  Abort the current
-                        * subtransaction and go to the "abort all" state, so
-                        * CommitTransactionCommand will call AbortOutOfAnyTransaction
-                        * to set things straight.
+                        * We are inside a subtransaction.  Mark everything up to top
+                        * level as exitable.
                         */
                case TBLOCK_SUBINPROGRESS:
-                       AbortSubTransaction();
-                       s->blockState = TBLOCK_SUBENDABORT_ALL;
+               case TBLOCK_SUBABORT:
+                       while (s->parent != NULL)
+                       {
+                               if (s->blockState == TBLOCK_SUBINPROGRESS)
+                                       s->blockState = TBLOCK_SUBABORT_PENDING;
+                               else if (s->blockState == TBLOCK_SUBABORT)
+                                       s->blockState = TBLOCK_SUBABORT_END;
+                               else
+                                       elog(FATAL, "UserAbortTransactionBlock: unexpected state %s",
+                                                BlockStateAsString(s->blockState));
+                               s = s->parent;
+                       }
+                       if (s->blockState == TBLOCK_INPROGRESS)
+                               s->blockState = TBLOCK_ABORT_PENDING;
+                       else if (s->blockState == TBLOCK_ABORT)
+                               s->blockState = TBLOCK_ABORT_END;
+                       else
+                               elog(FATAL, "UserAbortTransactionBlock: unexpected state %s",
+                                        BlockStateAsString(s->blockState));
                        break;
 
                        /*
-                        * The user issued ABORT when not inside a transaction. Issue
-                        * WARNING and go to abort state.  The upcoming call to
+                        * The user issued ABORT when not inside a transaction. Issue a
+                        * WARNING and go to abort state.  The upcoming call to
                         * CommitTransactionCommand() will then put us back into the
                         * default state.
                         */
@@ -2408,21 +3119,22 @@ UserAbortTransactionBlock(void)
                        ereport(WARNING,
                                        (errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION),
                                         errmsg("there is no transaction in progress")));
-                       AbortTransaction();
-                       s->blockState = TBLOCK_ENDABORT;
+                       s->blockState = TBLOCK_ABORT_PENDING;
                        break;
 
                        /* These cases are invalid. */
                case TBLOCK_DEFAULT:
                case TBLOCK_BEGIN:
+               case TBLOCK_SUBBEGIN:
                case TBLOCK_END:
-               case TBLOCK_ENDABORT:
                case TBLOCK_SUBEND:
-               case TBLOCK_SUBENDABORT_ALL:
-               case TBLOCK_SUBENDABORT:
+               case TBLOCK_ABORT_END:
+               case TBLOCK_SUBABORT_END:
+               case TBLOCK_ABORT_PENDING:
                case TBLOCK_SUBABORT_PENDING:
-               case TBLOCK_SUBENDABORT_RELEASE:
-               case TBLOCK_SUBBEGIN:
+               case TBLOCK_SUBRESTART:
+               case TBLOCK_SUBABORT_RESTART:
+               case TBLOCK_PREPARE:
                        elog(FATAL, "UserAbortTransactionBlock: unexpected state %s",
                                 BlockStateAsString(s->blockState));
                        break;
@@ -2436,7 +3148,7 @@ UserAbortTransactionBlock(void)
 void
 DefineSavepoint(char *name)
 {
-       TransactionState        s = CurrentTransactionState;
+       TransactionState s = CurrentTransactionState;
 
        switch (s->blockState)
        {
@@ -2444,31 +3156,33 @@ DefineSavepoint(char *name)
                case TBLOCK_SUBINPROGRESS:
                        /* Normal subtransaction start */
                        PushTransaction();
-                       s = CurrentTransactionState;    /* changed by push */
+                       s = CurrentTransactionState;            /* changed by push */
+
                        /*
-                        * Note that we are allocating the savepoint name in the
-                        * parent transaction's CurTransactionContext, since we
-                        * don't yet have a transaction context for the new guy.
+                        * Savepoint names, like the TransactionState block itself, live
+                        * in TopTransactionContext.
                         */
-                       s->name = MemoryContextStrdup(CurTransactionContext, name);
-                       s->blockState = TBLOCK_SUBBEGIN;
+                       if (name)
+                               s->name = MemoryContextStrdup(TopTransactionContext, name);
                        break;
 
-                       /* These cases are invalid.  Reject them altogether. */
+                       /* These cases are invalid. */
                case TBLOCK_DEFAULT:
                case TBLOCK_STARTED:
                case TBLOCK_BEGIN:
                case TBLOCK_SUBBEGIN:
+               case TBLOCK_END:
+               case TBLOCK_SUBEND:
                case TBLOCK_ABORT:
                case TBLOCK_SUBABORT:
-               case TBLOCK_ENDABORT:
-               case TBLOCK_END:
-               case TBLOCK_SUBENDABORT_ALL:
-               case TBLOCK_SUBENDABORT:
+               case TBLOCK_ABORT_END:
+               case TBLOCK_SUBABORT_END:
+               case TBLOCK_ABORT_PENDING:
                case TBLOCK_SUBABORT_PENDING:
-               case TBLOCK_SUBENDABORT_RELEASE:
-               case TBLOCK_SUBEND:
-                       elog(FATAL, "BeginTransactionBlock: unexpected state %s",
+               case TBLOCK_SUBRESTART:
+               case TBLOCK_SUBABORT_RESTART:
+               case TBLOCK_PREPARE:
+                       elog(FATAL, "DefineSavepoint: unexpected state %s",
                                 BlockStateAsString(s->blockState));
                        break;
        }
@@ -2476,56 +3190,62 @@ DefineSavepoint(char *name)
 
 /*
  * ReleaseSavepoint
- *             This executes a RELEASE command.
+ *             This executes a RELEASE command.
+ *
+ * As above, we don't actually do anything here except change blockState.
  */
 void
 ReleaseSavepoint(List *options)
 {
-       TransactionState        s = CurrentTransactionState;
-       TransactionState        target = s;
-       char                       *name = NULL;
-       ListCell                   *cell;
+       TransactionState s = CurrentTransactionState;
+       TransactionState target,
+                               xact;
+       ListCell   *cell;
+       char       *name = NULL;
 
-       /*
-        * Check valid block state transaction status.
-        */
        switch (s->blockState)
        {
+                       /*
+                        * We can't release a savepoint if there is no savepoint
+                        * defined.
+                        */
                case TBLOCK_INPROGRESS:
-               case TBLOCK_ABORT:
                        ereport(ERROR,
                                        (errcode(ERRCODE_S_E_INVALID_SPECIFICATION),
                                         errmsg("no such savepoint")));
                        break;
 
                        /*
-                        * We are in a non-aborted subtransaction.  This is
-                        * the only valid case.
+                        * We are in a non-aborted subtransaction.  This is the only valid
+                        * case.
                         */
                case TBLOCK_SUBINPROGRESS:
                        break;
 
-                       /* these cases are invalid. */
+                       /* These cases are invalid. */
                case TBLOCK_DEFAULT:
                case TBLOCK_STARTED:
                case TBLOCK_BEGIN:
-               case TBLOCK_ENDABORT:
-               case TBLOCK_END:
-               case TBLOCK_SUBABORT:
                case TBLOCK_SUBBEGIN:
+               case TBLOCK_END:
                case TBLOCK_SUBEND:
-               case TBLOCK_SUBENDABORT_ALL:
-               case TBLOCK_SUBENDABORT:
+               case TBLOCK_ABORT:
+               case TBLOCK_SUBABORT:
+               case TBLOCK_ABORT_END:
+               case TBLOCK_SUBABORT_END:
+               case TBLOCK_ABORT_PENDING:
                case TBLOCK_SUBABORT_PENDING:
-               case TBLOCK_SUBENDABORT_RELEASE:
+               case TBLOCK_SUBRESTART:
+               case TBLOCK_SUBABORT_RESTART:
+               case TBLOCK_PREPARE:
                        elog(FATAL, "ReleaseSavepoint: unexpected state %s",
                                 BlockStateAsString(s->blockState));
                        break;
        }
 
-       foreach (cell, options)
+       foreach(cell, options)
        {
-               DefElem *elem = lfirst(cell);
+               DefElem    *elem = lfirst(cell);
 
                if (strcmp(elem->defname, "savepoint_name") == 0)
                        name = strVal(elem->arg);
@@ -2533,11 +3253,10 @@ ReleaseSavepoint(List *options)
 
        Assert(PointerIsValid(name));
 
-       while (target != NULL)
+       for (target = s; PointerIsValid(target); target = target->parent)
        {
                if (PointerIsValid(target->name) && strcmp(target->name, name) == 0)
                        break;
-               target = target->parent;
        }
 
        if (!PointerIsValid(target))
@@ -2545,30 +3264,52 @@ ReleaseSavepoint(List *options)
                                (errcode(ERRCODE_S_E_INVALID_SPECIFICATION),
                                 errmsg("no such savepoint")));
 
-       CommitTransactionToLevel(target->nestingLevel);
+       /* disallow crossing savepoint level boundaries */
+       if (target->savepointLevel != s->savepointLevel)
+               ereport(ERROR,
+                               (errcode(ERRCODE_S_E_INVALID_SPECIFICATION),
+                                errmsg("no such savepoint")));
+
+       /*
+        * Mark all subtransactions up to and including the target as "commit
+        * pending".  The actual commits will happen when control gets to
+        * CommitTransactionCommand.
+        */
+       xact = CurrentTransactionState;
+       for (;;)
+       {
+               Assert(xact->blockState == TBLOCK_SUBINPROGRESS);
+               xact->blockState = TBLOCK_SUBEND;
+               if (xact == target)
+                       break;
+               xact = xact->parent;
+               Assert(PointerIsValid(xact));
+       }
 }
 
 /*
  * RollbackToSavepoint
- *             This executes a ROLLBACK TO <savepoint> command.
+ *             This executes a ROLLBACK TO <savepoint> command.
+ *
+ * As above, we don't actually do anything here except change blockState.
  */
 void
 RollbackToSavepoint(List *options)
 {
        TransactionState s = CurrentTransactionState;
        TransactionState target,
-                                        xact;
-       ListCell                *cell;
-       char                    *name = NULL;
+                               xact;
+       ListCell   *cell;
+       char       *name = NULL;
 
        switch (s->blockState)
        {
-               /*
-                * We can't rollback to a savepoint if there is no saveopint
-                * defined.
-                */
-               case TBLOCK_ABORT:
+                       /*
+                        * We can't rollback to a savepoint if there is no savepoint
+                        * defined.
+                        */
                case TBLOCK_INPROGRESS:
+               case TBLOCK_ABORT:
                        ereport(ERROR,
                                        (errcode(ERRCODE_S_E_INVALID_SPECIFICATION),
                                         errmsg("no such savepoint")));
@@ -2577,34 +3318,32 @@ RollbackToSavepoint(List *options)
                        /*
                         * There is at least one savepoint, so proceed.
                         */
-               case TBLOCK_SUBABORT:
                case TBLOCK_SUBINPROGRESS:
-                       /*
-                        * Have to do AbortSubTransaction, but first check
-                        * if this is the right subtransaction
-                        */
+               case TBLOCK_SUBABORT:
                        break;
 
-                       /* these cases are invalid. */
+                       /* These cases are invalid. */
                case TBLOCK_DEFAULT:
                case TBLOCK_STARTED:
                case TBLOCK_BEGIN:
+               case TBLOCK_SUBBEGIN:
                case TBLOCK_END:
-               case TBLOCK_ENDABORT:
                case TBLOCK_SUBEND:
-               case TBLOCK_SUBENDABORT_ALL:
-               case TBLOCK_SUBENDABORT:
+               case TBLOCK_ABORT_END:
+               case TBLOCK_SUBABORT_END:
+               case TBLOCK_ABORT_PENDING:
                case TBLOCK_SUBABORT_PENDING:
-               case TBLOCK_SUBENDABORT_RELEASE:
-               case TBLOCK_SUBBEGIN:
+               case TBLOCK_SUBRESTART:
+               case TBLOCK_SUBABORT_RESTART:
+               case TBLOCK_PREPARE:
                        elog(FATAL, "RollbackToSavepoint: unexpected state %s",
                                 BlockStateAsString(s->blockState));
                        break;
        }
 
-       foreach (cell, options)
+       foreach(cell, options)
        {
-               DefElem *elem = lfirst(cell);
+               DefElem    *elem = lfirst(cell);
 
                if (strcmp(elem->defname, "savepoint_name") == 0)
                        name = strVal(elem->arg);
@@ -2612,19 +3351,10 @@ RollbackToSavepoint(List *options)
 
        Assert(PointerIsValid(name));
 
-       target = CurrentTransactionState;
-
-       while (target != NULL)
+       for (target = s; PointerIsValid(target); target = target->parent)
        {
                if (PointerIsValid(target->name) && strcmp(target->name, name) == 0)
                        break;
-               target = target->parent;
-
-               /* we don't cross savepoint level boundaries */
-               if (target->savepointLevel != s->savepointLevel)
-                       ereport(ERROR,
-                                       (errcode(ERRCODE_S_E_INVALID_SPECIFICATION),
-                                        errmsg("no such savepoint")));
        }
 
        if (!PointerIsValid(target))
@@ -2632,45 +3362,173 @@ RollbackToSavepoint(List *options)
                                (errcode(ERRCODE_S_E_INVALID_SPECIFICATION),
                                 errmsg("no such savepoint")));
 
-       /*
-        * Abort the current subtransaction, if needed.  We can't Cleanup the
-        * savepoint yet, so signal CommitTransactionCommand to do it and
-        * close all savepoints up to the target level.
-        */
-       if (s->blockState == TBLOCK_SUBINPROGRESS)
-               AbortSubTransaction();
-       s->blockState = TBLOCK_SUBENDABORT;
+       /* disallow crossing savepoint level boundaries */
+       if (target->savepointLevel != s->savepointLevel)
+               ereport(ERROR,
+                               (errcode(ERRCODE_S_E_INVALID_SPECIFICATION),
+                                errmsg("no such savepoint")));
 
        /*
         * Mark "abort pending" all subtransactions up to the target
-        * subtransaction.  (Except the current subtransaction!)
+        * subtransaction, but not the target itself.  The actual aborts will
+        * happen when control gets to CommitTransactionCommand.
         */
        xact = CurrentTransactionState;
-
-       while (xact != target)
+       for (;;)
        {
+               if (xact == target)
+                       break;
+               if (xact->blockState == TBLOCK_SUBINPROGRESS)
+                       xact->blockState = TBLOCK_SUBABORT_PENDING;
+               else if (xact->blockState == TBLOCK_SUBABORT)
+                       xact->blockState = TBLOCK_SUBABORT_END;
+               else
+                       elog(FATAL, "RollbackToSavepoint: unexpected state %s",
+                                BlockStateAsString(xact->blockState));
                xact = xact->parent;
                Assert(PointerIsValid(xact));
-               Assert(xact->blockState == TBLOCK_SUBINPROGRESS);
-               xact->blockState = TBLOCK_SUBABORT_PENDING;
        }
+
+       /* And mark the target as "restart pending" */
+       if (xact->blockState == TBLOCK_SUBINPROGRESS)
+               xact->blockState = TBLOCK_SUBRESTART;
+       else if (xact->blockState == TBLOCK_SUBABORT)
+               xact->blockState = TBLOCK_SUBABORT_RESTART;
+       else
+               elog(FATAL, "RollbackToSavepoint: unexpected state %s",
+                        BlockStateAsString(xact->blockState));
+}
+
+/*
+ * BeginInternalSubTransaction
+ *             This is the same as DefineSavepoint except it allows TBLOCK_STARTED,
+ *             TBLOCK_END, and TBLOCK_PREPARE states, and therefore it can safely be
+ *             used in functions that might be called when not inside a BEGIN block
+ *             or when running deferred triggers at COMMIT/PREPARE time.  Also, it
+ *             automatically does CommitTransactionCommand/StartTransactionCommand
+ *             instead of expecting the caller to do it.
+ */
+void
+BeginInternalSubTransaction(char *name)
+{
+       TransactionState s = CurrentTransactionState;
+
+       switch (s->blockState)
+       {
+               case TBLOCK_STARTED:
+               case TBLOCK_INPROGRESS:
+               case TBLOCK_END:
+               case TBLOCK_PREPARE:
+               case TBLOCK_SUBINPROGRESS:
+                       /* Normal subtransaction start */
+                       PushTransaction();
+                       s = CurrentTransactionState;            /* changed by push */
+
+                       /*
+                        * Savepoint names, like the TransactionState block itself, live
+                        * in TopTransactionContext.
+                        */
+                       if (name)
+                               s->name = MemoryContextStrdup(TopTransactionContext, name);
+                       break;
+
+                       /* These cases are invalid. */
+               case TBLOCK_DEFAULT:
+               case TBLOCK_BEGIN:
+               case TBLOCK_SUBBEGIN:
+               case TBLOCK_SUBEND:
+               case TBLOCK_ABORT:
+               case TBLOCK_SUBABORT:
+               case TBLOCK_ABORT_END:
+               case TBLOCK_SUBABORT_END:
+               case TBLOCK_ABORT_PENDING:
+               case TBLOCK_SUBABORT_PENDING:
+               case TBLOCK_SUBRESTART:
+               case TBLOCK_SUBABORT_RESTART:
+                       elog(FATAL, "BeginInternalSubTransaction: unexpected state %s",
+                                BlockStateAsString(s->blockState));
+                       break;
+       }
+
+       CommitTransactionCommand();
+       StartTransactionCommand();
 }
 
 /*
- * RollbackAndReleaseSavepoint
+ * ReleaseCurrentSubTransaction
  *
- * Executes a ROLLBACK TO command, immediately followed by a RELEASE
- * of the same savepoint.
+ * RELEASE (i.e., commit) the innermost subtransaction, regardless of its
+ * savepoint name (if any).  NB: the commit is performed right here, so
+ * do NOT use CommitTransactionCommand/StartTransactionCommand with this.
  */
 void
-RollbackAndReleaseSavepoint(List *options)
+ReleaseCurrentSubTransaction(void)
 {
-       TransactionState s;
+       TransactionState s = CurrentTransactionState;
 
-       RollbackToSavepoint(options);
-       s = CurrentTransactionState;
-       Assert(s->blockState == TBLOCK_SUBENDABORT);
-       s->blockState = TBLOCK_SUBENDABORT_RELEASE;
+       if (s->blockState != TBLOCK_SUBINPROGRESS)
+               elog(ERROR, "ReleaseCurrentSubTransaction: unexpected state %s",
+                        BlockStateAsString(s->blockState));
+       Assert(s->state == TRANS_INPROGRESS);
+       MemoryContextSwitchTo(CurTransactionContext);
+       CommitSubTransaction();
+       s = CurrentTransactionState;    /* changed by pop */
+       Assert(s->state == TRANS_INPROGRESS);
+}
+
+/*
+ * RollbackAndReleaseCurrentSubTransaction
+ *
+ * ROLLBACK and RELEASE (i.e., abort) the innermost subtransaction, regardless
+ * of its savepoint name (if any).  NB: abort and cleanup happen right here, so
+ * do NOT use CommitTransactionCommand/StartTransactionCommand with this.
+ */
+void
+RollbackAndReleaseCurrentSubTransaction(void)
+{
+       TransactionState s = CurrentTransactionState;
+
+       switch (s->blockState)
+       {
+                       /* Must be in a subtransaction */
+               case TBLOCK_SUBINPROGRESS:
+               case TBLOCK_SUBABORT:
+                       break;
+
+                       /* These cases are invalid. */
+               case TBLOCK_DEFAULT:
+               case TBLOCK_STARTED:
+               case TBLOCK_BEGIN:
+               case TBLOCK_SUBBEGIN:
+               case TBLOCK_INPROGRESS:
+               case TBLOCK_END:
+               case TBLOCK_SUBEND:
+               case TBLOCK_ABORT:
+               case TBLOCK_ABORT_END:
+               case TBLOCK_SUBABORT_END:
+               case TBLOCK_ABORT_PENDING:
+               case TBLOCK_SUBABORT_PENDING:
+               case TBLOCK_SUBRESTART:
+               case TBLOCK_SUBABORT_RESTART:
+               case TBLOCK_PREPARE:
+                       elog(FATAL, "RollbackAndReleaseCurrentSubTransaction: unexpected state %s",
+                                BlockStateAsString(s->blockState));
+                       break;
+       }
+
+       /*
+        * Abort the current subtransaction, if needed.
+        */
+       if (s->blockState == TBLOCK_SUBINPROGRESS)
+               AbortSubTransaction();
+
+       /* And clean it up, too */
+       CleanupSubTransaction();
+
+       s = CurrentTransactionState;    /* changed by pop */
+       AssertState(s->blockState == TBLOCK_SUBINPROGRESS ||
+                               s->blockState == TBLOCK_INPROGRESS ||
+                               s->blockState == TBLOCK_STARTED);
 }
 
 /*
@@ -2688,7 +3546,8 @@ AbortOutOfAnyTransaction(void)
        /*
         * Get out of any transaction or nested transaction
         */
-       do {
+       do
+       {
                switch (s->blockState)
                {
                        case TBLOCK_DEFAULT:
@@ -2698,42 +3557,39 @@ AbortOutOfAnyTransaction(void)
                        case TBLOCK_BEGIN:
                        case TBLOCK_INPROGRESS:
                        case TBLOCK_END:
+                       case TBLOCK_ABORT_PENDING:
+                       case TBLOCK_PREPARE:
                                /* In a transaction, so clean up */
                                AbortTransaction();
                                CleanupTransaction();
                                s->blockState = TBLOCK_DEFAULT;
                                break;
                        case TBLOCK_ABORT:
-                       case TBLOCK_ENDABORT:
+                       case TBLOCK_ABORT_END:
                                /* AbortTransaction already done, still need Cleanup */
                                CleanupTransaction();
                                s->blockState = TBLOCK_DEFAULT;
                                break;
-                       case TBLOCK_SUBBEGIN:
+
                                /*
-                                * We didn't get as far as starting the subxact, so there's
-                                * nothing to abort.  Just pop back to parent.
+                                * In a subtransaction, so clean it up and abort parent too
                                 */
-                               PopTransaction();
-                               s = CurrentTransactionState;            /* changed by pop */
-                               break;
+                       case TBLOCK_SUBBEGIN:
                        case TBLOCK_SUBINPROGRESS:
                        case TBLOCK_SUBEND:
                        case TBLOCK_SUBABORT_PENDING:
-                               /* In a subtransaction, so clean it up and abort parent too */
+                       case TBLOCK_SUBRESTART:
                                AbortSubTransaction();
                                CleanupSubTransaction();
-                               PopTransaction();
-                               s = CurrentTransactionState;            /* changed by pop */
+                               s = CurrentTransactionState;    /* changed by pop */
                                break;
+
                        case TBLOCK_SUBABORT:
-                       case TBLOCK_SUBENDABORT_ALL:
-                       case TBLOCK_SUBENDABORT:
-                       case TBLOCK_SUBENDABORT_RELEASE:
+                       case TBLOCK_SUBABORT_END:
+                       case TBLOCK_SUBABORT_RESTART:
                                /* As above, but AbortSubTransaction already done */
                                CleanupSubTransaction();
-                               PopTransaction();
-                               s = CurrentTransactionState;            /* changed by pop */
+                               s = CurrentTransactionState;    /* changed by pop */
                                break;
                }
        } while (s->blockState != TBLOCK_DEFAULT);
@@ -2743,28 +3599,6 @@ AbortOutOfAnyTransaction(void)
 }
 
 /*
- * CommitTransactionToLevel
- *
- * Commit everything from the current transaction level
- * up to the specified level (inclusive).
- */
-void
-CommitTransactionToLevel(int level)
-{
-       TransactionState s = CurrentTransactionState;
-
-       Assert(s->state == TRANS_INPROGRESS);
-
-       while (s->nestingLevel >= level)
-       {
-               CommitSubTransaction();
-               PopTransaction();
-               s = CurrentTransactionState;                            /* changed by pop */
-               Assert(s->state == TRANS_INPROGRESS);
-       }
-}
-
-/*
  * IsTransactionBlock --- are we within a transaction block?
  */
 bool
@@ -2780,7 +3614,7 @@ IsTransactionBlock(void)
 
 /*
  * IsTransactionOrTransactionBlock --- are we within either a transaction
- * or a transaction block?  (The backend is only really "idle" when this
+ * or a transaction block?     (The backend is only really "idle" when this
  * returns false.)
  *
  * This should match up with IsTransactionBlock and IsTransactionState.
@@ -2810,19 +3644,21 @@ TransactionBlockStatusCode(void)
                case TBLOCK_STARTED:
                        return 'I';                     /* idle --- not in transaction */
                case TBLOCK_BEGIN:
+               case TBLOCK_SUBBEGIN:
                case TBLOCK_INPROGRESS:
-               case TBLOCK_END:
                case TBLOCK_SUBINPROGRESS:
-               case TBLOCK_SUBBEGIN:
+               case TBLOCK_END:
                case TBLOCK_SUBEND:
+               case TBLOCK_PREPARE:
                        return 'T';                     /* in transaction */
                case TBLOCK_ABORT:
-               case TBLOCK_ENDABORT:
                case TBLOCK_SUBABORT:
-               case TBLOCK_SUBENDABORT_ALL:
-               case TBLOCK_SUBENDABORT:
+               case TBLOCK_ABORT_END:
+               case TBLOCK_SUBABORT_END:
+               case TBLOCK_ABORT_PENDING:
                case TBLOCK_SUBABORT_PENDING:
-               case TBLOCK_SUBENDABORT_RELEASE:
+               case TBLOCK_SUBRESTART:
+               case TBLOCK_SUBABORT_RESTART:
                        return 'E';                     /* in failed transaction */
        }
 
@@ -2839,36 +3675,24 @@ bool
 IsSubTransaction(void)
 {
        TransactionState s = CurrentTransactionState;
-       
-       switch (s->blockState)
-       {
-               case TBLOCK_DEFAULT:
-               case TBLOCK_STARTED:
-               case TBLOCK_BEGIN:
-               case TBLOCK_INPROGRESS:
-               case TBLOCK_END:
-               case TBLOCK_ABORT:
-               case TBLOCK_ENDABORT:
-                       return false;
-               case TBLOCK_SUBBEGIN:
-               case TBLOCK_SUBINPROGRESS:
-               case TBLOCK_SUBABORT:
-               case TBLOCK_SUBEND:
-               case TBLOCK_SUBENDABORT_ALL:
-               case TBLOCK_SUBENDABORT:
-               case TBLOCK_SUBABORT_PENDING:
-               case TBLOCK_SUBENDABORT_RELEASE:
-                       return true;
-       }
 
-       /* should never get here */
-       elog(FATAL, "invalid transaction block state: %s",
-                BlockStateAsString(s->blockState));
-       return false;                           /* keep compiler quiet */
+       if (s->nestingLevel >= 2)
+               return true;
+
+       return false;
 }
 
 /*
  * StartSubTransaction
+ *
+ * If you're wondering why this is separate from PushTransaction: it's because
+ * we can't conveniently do this stuff right inside DefineSavepoint.  The
+ * SAVEPOINT utility command will be executed inside a Portal, and if we
+ * muck with CurrentMemoryContext or CurrentResourceOwner then exit from
+ * the Portal will undo those settings.  So we make DefineSavepoint just
+ * push a dummy transaction block, and when control returns to the main
+ * idle loop, CommitTransactionCommand will be called, and we'll come here
+ * to finish starting the subtransaction.
  */
 static void
 StartSubTransaction(void)
@@ -2882,40 +3706,32 @@ StartSubTransaction(void)
        s->state = TRANS_START;
 
        /*
+        * Initialize subsystems for new subtransaction
+        *
         * must initialize resource-management stuff first
         */
        AtSubStart_Memory();
        AtSubStart_ResourceOwner();
-
-       /*
-        * Generate a new Xid and record it in pg_subtrans.
-        */
-       s->transactionIdData = GetNewTransactionId(true);
-
-       SubTransSetParent(s->transactionIdData, s->parent->transactionIdData);
-
-       XactLockTableInsert(s->transactionIdData);
-
-       /*
-        * Finish setup of other transaction state fields.
-        */
-       s->currentUser = GetUserId();
-       s->prevXactReadOnly = XactReadOnly;
-       
-       /*
-        * Initialize other subsystems for new subtransaction
-        */
        AtSubStart_Inval();
        AtSubStart_Notify();
-       DeferredTriggerBeginSubXact();
+       AfterTriggerBeginSubXact();
 
        s->state = TRANS_INPROGRESS;
 
+       /*
+        * Call start-of-subxact callbacks
+        */
+       CallSubXactCallbacks(SUBXACT_EVENT_START_SUB, s->subTransactionId,
+                                                s->parent->subTransactionId);
+
        ShowTransactionState("StartSubTransaction");
 }
 
 /*
  * CommitSubTransaction
+ *
+ *     Callers holding a local copy of CurrentTransactionState must re-read it
+ *     after this function returns, because it pops the transaction stack.
  */
 static void
 CommitSubTransaction(void)
@@ -2928,67 +3744,85 @@ CommitSubTransaction(void)
                elog(WARNING, "CommitSubTransaction while in %s state",
                         TransStateAsString(s->state));
 
-       /* Pre-commit processing */
-       AtSubCommit_Portals(s->parent->transactionIdData,
-                                               s->parent->curTransactionOwner);
-       DeferredTriggerEndSubXact(true);
+       /* Pre-commit processing goes here -- nothing to do at the moment */
 
        s->state = TRANS_COMMIT;
 
-       /* Mark subtransaction as subcommitted */
+       /* Must CCI to ensure commands of subtransaction are seen as done */
        CommandCounterIncrement();
-       RecordSubTransactionCommit();
-       AtSubCommit_childXids();
+
+       /* Mark subtransaction as subcommitted */
+       if (TransactionIdIsValid(s->transactionId))
+       {
+               RecordSubTransactionCommit();
+               AtSubCommit_childXids();
+       }
 
        /* Post-commit cleanup */
-       AtSubCommit_smgr();
+       AfterTriggerEndSubXact(true);
+       AtSubCommit_Portals(s->subTransactionId,
+                                               s->parent->subTransactionId,
+                                               s->parent->curTransactionOwner);
+       AtEOSubXact_LargeObject(true, s->subTransactionId,
+                                                       s->parent->subTransactionId);
+       AtSubCommit_Notify();
+       AtEOSubXact_UpdateFlatFiles(true, s->subTransactionId,
+                                                               s->parent->subTransactionId);
 
-       AtEOSubXact_Inval(true);
-       AtEOSubXact_SPI(true, s->transactionIdData);
+       CallSubXactCallbacks(SUBXACT_EVENT_COMMIT_SUB, s->subTransactionId,
+                                                s->parent->subTransactionId);
 
-       AtEOSubXact_LargeObject(true, s->transactionIdData,
-                                                       s->parent->transactionIdData);
-       AtEOSubXact_UpdatePasswordFile(true, s->transactionIdData,
-                                                                  s->parent->transactionIdData);
-       AtEOSubXact_Files(true, s->transactionIdData,
-                                         s->parent->transactionIdData);
-       AtEOSubXact_Namespace(true, s->transactionIdData,
-                                                 s->parent->transactionIdData);
+       ResourceOwnerRelease(s->curTransactionOwner,
+                                                RESOURCE_RELEASE_BEFORE_LOCKS,
+                                                true, false);
+       AtEOSubXact_RelationCache(true, s->subTransactionId,
+                                                         s->parent->subTransactionId);
+       AtEOSubXact_Inval(true);
+       AtSubCommit_smgr();
 
        /*
-        * Note that we just release the resource owner's resources and don't
-        * delete it.  This is because locks are not actually released here.
-        * The owner object continues to exist as a child of its parent owner
-        * (namely my parent transaction's resource owner), and the locks
-        * effectively become that owner object's responsibility.
+        * The only lock we actually release here is the subtransaction XID lock.
+        * The rest just get transferred to the parent resource owner.
         */
+       CurrentResourceOwner = s->curTransactionOwner;
+       if (TransactionIdIsValid(s->transactionId))
+               XactLockTableDelete(s->transactionId);
+
        ResourceOwnerRelease(s->curTransactionOwner,
-                                                RESOURCE_RELEASE_BEFORE_LOCKS,
+                                                RESOURCE_RELEASE_LOCKS,
                                                 true, false);
-       /* we can skip the LOCKS phase */
        ResourceOwnerRelease(s->curTransactionOwner,
                                                 RESOURCE_RELEASE_AFTER_LOCKS,
                                                 true, false);
 
-       AtSubCommit_Notify();
-       AtEOXact_GUC(true, true);
-       AtEOSubXact_on_commit_actions(true, s->transactionIdData,
-                                                                 s->parent->transactionIdData);
+       AtEOXact_GUC(true, s->gucNestLevel);
+       AtEOSubXact_SPI(true, s->subTransactionId);
+       AtEOSubXact_on_commit_actions(true, s->subTransactionId,
+                                                                 s->parent->subTransactionId);
+       AtEOSubXact_Namespace(true, s->subTransactionId,
+                                                 s->parent->subTransactionId);
+       AtEOSubXact_Files(true, s->subTransactionId,
+                                         s->parent->subTransactionId);
+       AtEOSubXact_HashTables(true, s->nestingLevel);
+       AtEOSubXact_PgStat(true, s->nestingLevel);
 
        /*
-        * We need to restore the upper transaction's read-only state,
-        * in case the upper is read-write while the child is read-only;
-        * GUC will incorrectly think it should leave the child state in place.
+        * We need to restore the upper transaction's read-only state, in case the
+        * upper is read-write while the child is read-only; GUC will incorrectly
+        * think it should leave the child state in place.
         */
        XactReadOnly = s->prevXactReadOnly;
 
        CurrentResourceOwner = s->parent->curTransactionOwner;
        CurTransactionResourceOwner = s->parent->curTransactionOwner;
+       ResourceOwnerDelete(s->curTransactionOwner);
        s->curTransactionOwner = NULL;
 
        AtSubCommit_Memory();
 
        s->state = TRANS_DEFAULT;
+
+       PopTransaction();
 }
 
 /*
@@ -2999,21 +3833,18 @@ AbortSubTransaction(void)
 {
        TransactionState s = CurrentTransactionState;
 
-       ShowTransactionState("AbortSubTransaction");
-
-       if (s->state != TRANS_INPROGRESS)
-               elog(WARNING, "AbortSubTransaction while in %s state",
-                        TransStateAsString(s->state));
-
+       /* Prevent cancel/die interrupt while cleaning up */
        HOLD_INTERRUPTS();
 
-       s->state = TRANS_ABORT;
+       /* Make sure we have a valid memory context and resource owner */
+       AtSubAbort_Memory();
+       AtSubAbort_ResourceOwner();
 
        /*
         * Release any LW locks we might be holding as quickly as possible.
         * (Regular locks, however, must be held till we finish aborting.)
-        * Releasing LW locks is critical since we might try to grab them
-        * again while cleaning up!
+        * Releasing LW locks is critical since we might try to grab them again
+        * while cleaning up!
         *
         * FIXME This may be incorrect --- Are there some locks we should keep?
         * Buffer locks, for example?  I don't think so but I'm not sure.
@@ -3025,75 +3856,99 @@ AbortSubTransaction(void)
 
        LockWaitCancel();
 
-       AtSubAbort_Memory();
-
        /*
-        * do abort processing
+        * check the current transaction state
         */
+       ShowTransactionState("AbortSubTransaction");
 
-       RecordSubTransactionAbort();
-
-       /* Post-abort cleanup */
-       AtSubAbort_smgr();
-
-       DeferredTriggerEndSubXact(false);
-       AtEOSubXact_SPI(false, s->transactionIdData);
-       AtSubAbort_Portals(s->parent->transactionIdData,
-                                          s->parent->curTransactionOwner);
-       AtEOSubXact_Inval(false);
+       if (s->state != TRANS_INPROGRESS)
+               elog(WARNING, "AbortSubTransaction while in %s state",
+                        TransStateAsString(s->state));
 
-       AtEOSubXact_LargeObject(false, s->transactionIdData,
-                                                       s->parent->transactionIdData);
-       AtEOSubXact_UpdatePasswordFile(false, s->transactionIdData,
-                                                                  s->parent->transactionIdData);
-       AtEOSubXact_Files(false, s->transactionIdData,
-                                         s->parent->transactionIdData);
-       AtEOSubXact_Namespace(false, s->transactionIdData,
-                                                 s->parent->transactionIdData);
+       s->state = TRANS_ABORT;
 
-       ResourceOwnerRelease(s->curTransactionOwner,
-                                                RESOURCE_RELEASE_BEFORE_LOCKS,
-                                                false, false);
-       ResourceOwnerRelease(s->curTransactionOwner,
-                                                RESOURCE_RELEASE_LOCKS,
-                                                false, false);
-       ResourceOwnerRelease(s->curTransactionOwner,
-                                                RESOURCE_RELEASE_AFTER_LOCKS,
-                                                false, false);
+       /*
+        * We can skip all this stuff if the subxact failed before creating a
+        * ResourceOwner...
+        */
+       if (s->curTransactionOwner)
+       {
+               AfterTriggerEndSubXact(false);
+               AtSubAbort_Portals(s->subTransactionId,
+                                                  s->parent->subTransactionId,
+                                                  s->parent->curTransactionOwner);
+               AtEOSubXact_LargeObject(false, s->subTransactionId,
+                                                               s->parent->subTransactionId);
+               AtSubAbort_Notify();
+               AtEOSubXact_UpdateFlatFiles(false, s->subTransactionId,
+                                                                       s->parent->subTransactionId);
+
+               /* Advertise the fact that we aborted in pg_clog. */
+               if (TransactionIdIsValid(s->transactionId))
+               {
+                       RecordSubTransactionAbort();
+                       AtSubAbort_childXids();
+               }
 
-       AtSubAbort_Notify();
-       AtEOXact_GUC(false, true);
-       AtEOSubXact_on_commit_actions(false, s->transactionIdData,
-                                                                 s->parent->transactionIdData);
+               /* Post-abort cleanup */
+               CallSubXactCallbacks(SUBXACT_EVENT_ABORT_SUB, s->subTransactionId,
+                                                        s->parent->subTransactionId);
+
+               ResourceOwnerRelease(s->curTransactionOwner,
+                                                        RESOURCE_RELEASE_BEFORE_LOCKS,
+                                                        false, false);
+               AtEOSubXact_RelationCache(false, s->subTransactionId,
+                                                                 s->parent->subTransactionId);
+               AtEOSubXact_Inval(false);
+               AtSubAbort_smgr();
+               ResourceOwnerRelease(s->curTransactionOwner,
+                                                        RESOURCE_RELEASE_LOCKS,
+                                                        false, false);
+               ResourceOwnerRelease(s->curTransactionOwner,
+                                                        RESOURCE_RELEASE_AFTER_LOCKS,
+                                                        false, false);
+
+               AtEOXact_GUC(false, s->gucNestLevel);
+               AtEOSubXact_SPI(false, s->subTransactionId);
+               AtEOSubXact_on_commit_actions(false, s->subTransactionId,
+                                                                         s->parent->subTransactionId);
+               AtEOSubXact_Namespace(false, s->subTransactionId,
+                                                         s->parent->subTransactionId);
+               AtEOSubXact_Files(false, s->subTransactionId,
+                                                 s->parent->subTransactionId);
+               AtEOSubXact_HashTables(false, s->nestingLevel);
+               AtEOSubXact_PgStat(false, s->nestingLevel);
+       }
 
        /*
-        * Reset user id which might have been changed transiently.  Here we
-        * want to restore to the userid that was current at subxact entry.
-        * (As in AbortTransaction, we need not worry about the session userid.)
+        * Reset user id which might have been changed transiently.  Here we want
+        * to restore to the userid that was current at subxact entry. (As in
+        * AbortTransaction, we need not worry about the session userid.)
         *
-        * Must do this after AtEOXact_GUC to handle the case where we entered
-        * the subxact inside a SECURITY DEFINER function (hence current and
-        * session userids were different) and then session auth was changed
-        * inside the subxact.  GUC will reset both current and session userids
-        * to the entry-time session userid.  This is right in every other
-        * scenario so it seems simplest to let GUC do that and fix it here.
+        * Must do this after AtEOXact_GUC to handle the case where we entered the
+        * subxact inside a SECURITY DEFINER function (hence current and session
+        * userids were different) and then session auth was changed inside the
+        * subxact.  GUC will reset both current and session userids to the
+        * entry-time session userid.  This is right in every other scenario so it
+        * seems simplest to let GUC do that and fix it here.
         */
        SetUserId(s->currentUser);
 
        /*
-        * Restore the upper transaction's read-only state, too.  This should
-        * be redundant with GUC's cleanup but we may as well do it for
-        * consistency with the commit case.
+        * Restore the upper transaction's read-only state, too.  This should be
+        * redundant with GUC's cleanup but we may as well do it for consistency
+        * with the commit case.
         */
        XactReadOnly = s->prevXactReadOnly;
 
-       CommandCounterIncrement();
-
        RESUME_INTERRUPTS();
 }
 
 /*
  * CleanupSubTransaction
+ *
+ *     Callers holding a local copy of CurrentTransactionState must re-read it
+ *     after this function returns, since the current state has been popped.
  */
 static void
 CleanupSubTransaction(void)
@@ -3106,66 +3961,24 @@ CleanupSubTransaction(void)
                elog(WARNING, "CleanupSubTransaction while in %s state",
                         TransStateAsString(s->state));
 
-       AtSubCleanup_Portals();
+       AtSubCleanup_Portals(s->subTransactionId);
 
        CurrentResourceOwner = s->parent->curTransactionOwner;
        CurTransactionResourceOwner = s->parent->curTransactionOwner;
-       ResourceOwnerDelete(s->curTransactionOwner);
+       if (s->curTransactionOwner)
+               ResourceOwnerDelete(s->curTransactionOwner);
        s->curTransactionOwner = NULL;
 
        AtSubCleanup_Memory();
 
        s->state = TRANS_DEFAULT;
-}
-
-/*
- * StartAbortedSubTransaction
- *
- * This function is used to start a subtransaction and put it immediately
- * into aborted state.  The end result should be equivalent to
- * StartSubTransaction immediately followed by AbortSubTransaction.
- * The reason we don't implement it just that way is that many of the backend
- * modules aren't designed to handle starting a subtransaction when not
- * inside a valid transaction.  Rather than making them all capable of
- * doing that, we just omit the paired start and abort calls in this path.
- */
-static void
-StartAbortedSubTransaction(void)
-{
-       TransactionState s = CurrentTransactionState;
-
-       if (s->state != TRANS_DEFAULT)
-               elog(WARNING, "StartAbortedSubTransaction while in %s state",
-                        TransStateAsString(s->state));
-
-       s->state = TRANS_START;
-
-       /*
-        * We don't bother to generate a new Xid, so the end state is not
-        * *exactly* like we had done a full Start/AbortSubTransaction...
-        */
-       s->transactionIdData = InvalidTransactionId;
 
-       /* Make sure currentUser is reasonably valid */
-       Assert(s->parent != NULL);
-       s->currentUser = s->parent->currentUser;
-       
-       /*
-        * Initialize only what has to be there for CleanupSubTransaction to work.
-        */
-       AtSubStart_Memory();
-       AtSubStart_ResourceOwner();
-
-       s->state = TRANS_ABORT;
-
-       AtSubAbort_Memory();
-
-       ShowTransactionState("StartAbortedSubTransaction");
+       PopTransaction();
 }
 
 /*
  * PushTransaction
- *             Set up transaction state for a subtransaction
+ *             Create transaction state stack entry for a subtransaction
  *
  *     The caller has to make sure to always reassign CurrentTransactionState
  *     if it has a local pointer to it after calling this function.
@@ -3173,8 +3986,15 @@ StartAbortedSubTransaction(void)
 static void
 PushTransaction(void)
 {
-       TransactionState    p = CurrentTransactionState;
-       TransactionState    s;
+       TransactionState p = CurrentTransactionState;
+       TransactionState s;
+       Oid                     currentUser;
+
+       /*
+        * At present, GetUserId cannot fail, but let's not assume that.  Get the
+        * ID before entering the critical code sequence.
+        */
+       currentUser = GetUserId();
 
        /*
         * We keep subtransaction state nodes in TopTransactionContext.
@@ -3182,25 +4002,43 @@ PushTransaction(void)
        s = (TransactionState)
                MemoryContextAllocZero(TopTransactionContext,
                                                           sizeof(TransactionStateData));
+
+       /*
+        * Assign a subtransaction ID, watching out for counter wraparound.
+        */
+       currentSubTransactionId += 1;
+       if (currentSubTransactionId == InvalidSubTransactionId)
+       {
+               currentSubTransactionId -= 1;
+               pfree(s);
+               ereport(ERROR,
+                               (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
+                                errmsg("cannot have more than 2^32-1 subtransactions in a transaction")));
+       }
+
+       /*
+        * We can now stack a minimally valid subtransaction without fear of
+        * failure.
+        */
+       s->transactionId = InvalidTransactionId;        /* until assigned */
+       s->subTransactionId = currentSubTransactionId;
        s->parent = p;
        s->nestingLevel = p->nestingLevel + 1;
+       s->gucNestLevel = NewGUCNestLevel();
        s->savepointLevel = p->savepointLevel;
        s->state = TRANS_DEFAULT;
        s->blockState = TBLOCK_SUBBEGIN;
+       s->currentUser = currentUser;
+       s->prevXactReadOnly = XactReadOnly;
 
-       /* Command IDs count in a continuous sequence through subtransactions */
-       s->commandId = p->commandId;
+       CurrentTransactionState = s;
 
        /*
-        * Copy down some other data so that we will have valid state until
-        * StartSubTransaction runs.
+        * AbortSubTransaction and CleanupSubTransaction have to be able to cope
+        * with the subtransaction from here on out; in particular they should not
+        * assume that it necessarily has a transaction context, resource owner,
+        * or XID.
         */
-       s->transactionIdData = p->transactionIdData;
-       s->curTransactionContext = p->curTransactionContext;
-       s->curTransactionOwner = p->curTransactionOwner;
-       s->currentUser = p->currentUser;
-
-       CurrentTransactionState = s;
 }
 
 /*
@@ -3222,9 +4060,6 @@ PopTransaction(void)
        if (s->parent == NULL)
                elog(FATAL, "PopTransaction with no parent");
 
-       /* Command IDs count in a continuous sequence through subtransactions */
-       s->parent->commandId = s->commandId;
-
        CurrentTransactionState = s->parent;
 
        /* Let's just make sure CurTransactionContext is good */
@@ -3249,9 +4084,9 @@ static void
 ShowTransactionState(const char *str)
 {
        /* skip work if message will definitely not be printed */
-       if (log_min_messages <= DEBUG2 || client_min_messages <= DEBUG2)
+       if (log_min_messages <= DEBUG3 || client_min_messages <= DEBUG3)
        {
-               elog(DEBUG2, "%s", str);
+               elog(DEBUG3, "%s", str);
                ShowTransactionStateRec(CurrentTransactionState);
        }
 }
@@ -3267,13 +4102,14 @@ ShowTransactionStateRec(TransactionState s)
                ShowTransactionStateRec(s->parent);
 
        /* use ereport to suppress computation if msg will not be printed */
-       ereport(DEBUG2,
-                       (errmsg_internal("name: %s; blockState: %13s; state: %7s, xid/cid: %u/%02u, nestlvl: %d, children: %s",
+       ereport(DEBUG3,
+                       (errmsg_internal("name: %s; blockState: %13s; state: %7s, xid/subid/cid: %u/%u/%u, nestlvl: %d, children: %s",
                                                         PointerIsValid(s->name) ? s->name : "unnamed",
                                                         BlockStateAsString(s->blockState),
                                                         TransStateAsString(s->state),
-                                                        (unsigned int) s->transactionIdData,
-                                                        (unsigned int) s->commandId,
+                                                        (unsigned int) s->transactionId,
+                                                        (unsigned int) s->subTransactionId,
+                                                        (unsigned int) currentCommandId,
                                                         s->nestingLevel,
                                                         nodeToString(s->childXids))));
 }
@@ -3299,8 +4135,12 @@ BlockStateAsString(TBlockState blockState)
                        return "END";
                case TBLOCK_ABORT:
                        return "ABORT";
-               case TBLOCK_ENDABORT:
-                       return "ENDABORT";
+               case TBLOCK_ABORT_END:
+                       return "ABORT END";
+               case TBLOCK_ABORT_PENDING:
+                       return "ABORT PEND";
+               case TBLOCK_PREPARE:
+                       return "PREPARE";
                case TBLOCK_SUBBEGIN:
                        return "SUB BEGIN";
                case TBLOCK_SUBINPROGRESS:
@@ -3309,14 +4149,14 @@ BlockStateAsString(TBlockState blockState)
                        return "SUB END";
                case TBLOCK_SUBABORT:
                        return "SUB ABORT";
-               case TBLOCK_SUBENDABORT_ALL:
-                       return "SUB ENDAB ALL";
-               case TBLOCK_SUBENDABORT:
-                       return "SUB ENDAB";
+               case TBLOCK_SUBABORT_END:
+                       return "SUB ABORT END";
                case TBLOCK_SUBABORT_PENDING:
                        return "SUB ABRT PEND";
-               case TBLOCK_SUBENDABORT_RELEASE:
-                       return "SUB ENDAB REL";
+               case TBLOCK_SUBRESTART:
+                       return "SUB RESTART";
+               case TBLOCK_SUBABORT_RESTART:
+                       return "SUB AB RESTRT";
        }
        return "UNRECOGNIZED";
 }
@@ -3334,12 +4174,14 @@ TransStateAsString(TransState state)
                        return "DEFAULT";
                case TRANS_START:
                        return "START";
+               case TRANS_INPROGRESS:
+                       return "INPROGR";
                case TRANS_COMMIT:
                        return "COMMIT";
                case TRANS_ABORT:
                        return "ABORT";
-               case TRANS_INPROGRESS:
-                       return "INPROGR";
+               case TRANS_PREPARE:
+                       return "PREPARE";
        }
        return "UNRECOGNIZED";
 }
@@ -3347,7 +4189,7 @@ TransStateAsString(TransState state)
 /*
  * xactGetCommittedChildren
  *
- * Gets the list of committed children of the current transaction.  The return
+ * Gets the list of committed children of the current transaction.     The return
  * value is the number of child transactions.  *children is set to point to a
  * palloc'd array of TransactionIds.  If there are no subxacts, *children is
  * set to NULL.
@@ -3355,10 +4197,10 @@ TransStateAsString(TransState state)
 int
 xactGetCommittedChildren(TransactionId **ptr)
 {
-       TransactionState        s = CurrentTransactionState;
-       int                                     nchildren;
-       TransactionId      *children;
-       ListCell                   *p;
+       TransactionState s = CurrentTransactionState;
+       int                     nchildren;
+       TransactionId *children;
+       ListCell   *p;
 
        nchildren = list_length(s->childXids);
        if (nchildren == 0)
@@ -3372,7 +4214,7 @@ xactGetCommittedChildren(TransactionId **ptr)
 
        foreach(p, s->childXids)
        {
-               TransactionId child = lfirst_int(p);
+               TransactionId child = lfirst_xid(p);
 
                *children++ = child;
        }
@@ -3384,6 +4226,76 @@ xactGetCommittedChildren(TransactionId **ptr)
  *     XLOG support routines
  */
 
+/*
+ * xact_redo_commit
+ *
+ *     Replay a commit record for transaction "xid"; called from xact_redo.
+ */
+static void
+xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid)
+{
+       TransactionId *subxids;
+       TransactionId newest_xid = xid;
+       int                     n;
+
+       /* Commit the top XID, then the subxact XIDs stored after xnodes[nrels] */
+       TransactionIdCommit(xid);
+       subxids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
+       TransactionIdCommitTree(xlrec->nsubxacts, subxids);
+
+       /* Push nextXid past the newest XID named anywhere in this record */
+       for (n = 0; n < xlrec->nsubxacts; n++)
+               if (TransactionIdPrecedes(newest_xid, subxids[n]))
+                       newest_xid = subxids[n];
+       if (TransactionIdFollowsOrEquals(newest_xid, ShmemVariableCache->nextXid))
+       {
+               ShmemVariableCache->nextXid = newest_xid;
+               TransactionIdAdvance(ShmemVariableCache->nextXid);
+       }
+
+       /* Drop each relation file that the record lists in xnodes[] */
+       for (n = 0; n < xlrec->nrels; n++)
+       {
+               XLogDropRelation(xlrec->xnodes[n]);
+               smgrdounlink(smgropen(xlrec->xnodes[n]), false, true);
+       }
+}
+
+/*
+ * xact_redo_abort
+ *
+ *     Replay an abort record for transaction "xid"; called from xact_redo.
+ */
+static void
+xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid)
+{
+       TransactionId *subxids;
+       TransactionId newest_xid = xid;
+       int                     n;
+
+       /* Abort the top XID, then the subxact XIDs stored after xnodes[nrels] */
+       TransactionIdAbort(xid);
+       subxids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
+       TransactionIdAbortTree(xlrec->nsubxacts, subxids);
+
+       /* Push nextXid past the newest XID named anywhere in this record */
+       for (n = 0; n < xlrec->nsubxacts; n++)
+               if (TransactionIdPrecedes(newest_xid, subxids[n]))
+                       newest_xid = subxids[n];
+       if (TransactionIdFollowsOrEquals(newest_xid, ShmemVariableCache->nextXid))
+       {
+               ShmemVariableCache->nextXid = newest_xid;
+               TransactionIdAdvance(ShmemVariableCache->nextXid);
+       }
+
+       /* Drop each relation file that the record lists in xnodes[] */
+       for (n = 0; n < xlrec->nrels; n++)
+       {
+               XLogDropRelation(xlrec->xnodes[n]);
+               smgrdounlink(smgropen(xlrec->xnodes[n]), false, true);
+       }
+}
+
 void
 xact_redo(XLogRecPtr lsn, XLogRecord *record)
 {
@@ -3392,130 +4304,132 @@ xact_redo(XLogRecPtr lsn, XLogRecord *record)
        if (info == XLOG_XACT_COMMIT)
        {
                xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
-               int             i;
 
-               TransactionIdCommit(record->xl_xid);
-               /* Mark committed subtransactions as committed */
-               TransactionIdCommitTree(xlrec->nsubxacts,
-                                                               (TransactionId *) &(xlrec->xnodes[xlrec->nrels]));
-               /* Make sure files supposed to be dropped are dropped */
-               for (i = 0; i < xlrec->nrels; i++)
-               {
-                       XLogCloseRelation(xlrec->xnodes[i]);
-                       smgrdounlink(smgropen(xlrec->xnodes[i]), false, true);
-               }
+               xact_redo_commit(xlrec, record->xl_xid);
        }
        else if (info == XLOG_XACT_ABORT)
        {
                xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
-               int             i;
 
-               TransactionIdAbort(record->xl_xid);
-               /* mark subtransactions as aborted */
-               TransactionIdAbortTree(xlrec->nsubxacts,
-                                                          (TransactionId *) &(xlrec->xnodes[xlrec->nrels]));
-               /* Make sure files supposed to be dropped are dropped */
+               xact_redo_abort(xlrec, record->xl_xid);
+       }
+       else if (info == XLOG_XACT_PREPARE)
+       {
+               /* the record contents are exactly the 2PC file */
+               RecreateTwoPhaseFile(record->xl_xid,
+                                                        XLogRecGetData(record), record->xl_len);
+       }
+       else if (info == XLOG_XACT_COMMIT_PREPARED)
+       {
+               xl_xact_commit_prepared *xlrec = (xl_xact_commit_prepared *) XLogRecGetData(record);
+
+               xact_redo_commit(&xlrec->crec, xlrec->xid);
+               RemoveTwoPhaseFile(xlrec->xid, false);
+       }
+       else if (info == XLOG_XACT_ABORT_PREPARED)
+       {
+               xl_xact_abort_prepared *xlrec = (xl_xact_abort_prepared *) XLogRecGetData(record);
+
+               xact_redo_abort(&xlrec->arec, xlrec->xid);
+               RemoveTwoPhaseFile(xlrec->xid, false);
+       }
+       else
+               elog(PANIC, "xact_redo: unknown op code %u", info);
+}
+
+static void
+xact_desc_commit(StringInfo buf, xl_xact_commit *xlrec)
+{
+       int                     i;
+
+       appendStringInfoString(buf, timestamptz_to_str(xlrec->xact_time));
+       if (xlrec->nrels > 0)
+       {
+               appendStringInfo(buf, "; rels:");
                for (i = 0; i < xlrec->nrels; i++)
                {
-                       XLogCloseRelation(xlrec->xnodes[i]);
-                       smgrdounlink(smgropen(xlrec->xnodes[i]), false, true);
+                       RelFileNode rnode = xlrec->xnodes[i];
+
+                       appendStringInfo(buf, " %u/%u/%u",
+                                                        rnode.spcNode, rnode.dbNode, rnode.relNode);
                }
        }
-       else
-               elog(PANIC, "xact_redo: unknown op code %u", info);
+       if (xlrec->nsubxacts > 0)
+       {
+               TransactionId *xacts = (TransactionId *)
+               &xlrec->xnodes[xlrec->nrels];
+
+               appendStringInfo(buf, "; subxacts:");
+               for (i = 0; i < xlrec->nsubxacts; i++)
+                       appendStringInfo(buf, " %u", xacts[i]);
+       }
 }
 
-void
-xact_undo(XLogRecPtr lsn, XLogRecord *record)
+static void
+xact_desc_abort(StringInfo buf, xl_xact_abort *xlrec)
 {
-       uint8           info = record->xl_info & ~XLR_INFO_MASK;
+       int                     i;
 
-       if (info == XLOG_XACT_COMMIT)           /* shouldn't be called by XLOG */
-               elog(PANIC, "xact_undo: can't undo committed xaction");
-       else if (info != XLOG_XACT_ABORT)
-               elog(PANIC, "xact_redo: unknown op code %u", info);
+       appendStringInfoString(buf, timestamptz_to_str(xlrec->xact_time));
+       if (xlrec->nrels > 0)
+       {
+               appendStringInfo(buf, "; rels:");
+               for (i = 0; i < xlrec->nrels; i++)
+               {
+                       RelFileNode rnode = xlrec->xnodes[i];
+
+                       appendStringInfo(buf, " %u/%u/%u",
+                                                        rnode.spcNode, rnode.dbNode, rnode.relNode);
+               }
+       }
+       if (xlrec->nsubxacts > 0)
+       {
+               TransactionId *xacts = (TransactionId *)
+               &xlrec->xnodes[xlrec->nrels];
+
+               appendStringInfo(buf, "; subxacts:");
+               for (i = 0; i < xlrec->nsubxacts; i++)
+                       appendStringInfo(buf, " %u", xacts[i]);
+       }
 }
 
 void
-xact_desc(char *buf, uint8 xl_info, char *rec)
+xact_desc(StringInfo buf, uint8 xl_info, char *rec)
 {
        uint8           info = xl_info & ~XLR_INFO_MASK;
-       int i;
 
        if (info == XLOG_XACT_COMMIT)
        {
                xl_xact_commit *xlrec = (xl_xact_commit *) rec;
-               struct tm  *tm = localtime(&xlrec->xtime);
-
-               sprintf(buf + strlen(buf), "commit: %04u-%02u-%02u %02u:%02u:%02u",
-                               tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday,
-                               tm->tm_hour, tm->tm_min, tm->tm_sec);
-               if (xlrec->nrels > 0)
-               {
-                       sprintf(buf + strlen(buf), "; rels:");
-                       for (i = 0; i < xlrec->nrels; i++)
-                       {
-                               RelFileNode rnode = xlrec->xnodes[i];
-                               sprintf(buf + strlen(buf), " %u/%u/%u",
-                                               rnode.spcNode, rnode.dbNode, rnode.relNode);
-                       }
-               }
-               if (xlrec->nsubxacts > 0)
-               {
-                       TransactionId *xacts = (TransactionId *)
-                               &xlrec->xnodes[xlrec->nrels];
 
-                       sprintf(buf + strlen(buf), "; subxacts:");
-                       for (i = 0; i < xlrec->nsubxacts; i++)
-                               sprintf(buf + strlen(buf), " %u", xacts[i]);
-               }
+               appendStringInfo(buf, "commit: ");
+               xact_desc_commit(buf, xlrec);
        }
        else if (info == XLOG_XACT_ABORT)
        {
                xl_xact_abort *xlrec = (xl_xact_abort *) rec;
-               struct tm  *tm = localtime(&xlrec->xtime);
 
-               sprintf(buf + strlen(buf), "abort: %04u-%02u-%02u %02u:%02u:%02u",
-                               tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday,
-                               tm->tm_hour, tm->tm_min, tm->tm_sec);
-               if (xlrec->nrels > 0)
-               {
-                       sprintf(buf + strlen(buf), "; rels:");
-                       for (i = 0; i < xlrec->nrels; i++)
-                       {
-                               RelFileNode rnode = xlrec->xnodes[i];
-                               sprintf(buf + strlen(buf), " %u/%u/%u",
-                                               rnode.spcNode, rnode.dbNode, rnode.relNode);
-                       }
-               }
-               if (xlrec->nsubxacts > 0)
-               {
-                       TransactionId *xacts = (TransactionId *)
-                               &xlrec->xnodes[xlrec->nrels];
-
-                       sprintf(buf + strlen(buf), "; subxacts:");
-                       for (i = 0; i < xlrec->nsubxacts; i++)
-                               sprintf(buf + strlen(buf), " %u", xacts[i]);
-               }
+               appendStringInfo(buf, "abort: ");
+               xact_desc_abort(buf, xlrec);
        }
-       else
-               strcat(buf, "UNKNOWN");
-}
-
-void
-XactPushRollback(void (*func) (void *), void *data)
-{
-#ifdef XLOG_II
-       if (_RollbackFunc != NULL)
-               elog(PANIC, "XactPushRollback: already installed");
-#endif
+       else if (info == XLOG_XACT_PREPARE)
+       {
+               appendStringInfo(buf, "prepare");
+       }
+       else if (info == XLOG_XACT_COMMIT_PREPARED)
+       {
+               xl_xact_commit_prepared *xlrec = (xl_xact_commit_prepared *) rec;
 
-       _RollbackFunc = func;
-       _RollbackData = data;
-}
+               appendStringInfo(buf, "commit %u: ", xlrec->xid);
+               xact_desc_commit(buf, &xlrec->crec);
+       }
+       else if (info == XLOG_XACT_ABORT_PREPARED)
+       {
+               xl_xact_abort_prepared *xlrec = (xl_xact_abort_prepared *) rec;
 
-void
-XactPopRollback(void)
-{
-       _RollbackFunc = NULL;
+               appendStringInfo(buf, "abort %u: ", xlrec->xid);
+               xact_desc_abort(buf, &xlrec->arec);
+       }
+       else
+               appendStringInfo(buf, "UNKNOWN");
 }