OSDN Git Service

Add recovery_end_command option to recovery.conf. recovery_end_command
[pg-rex/syncrep.git] / src / backend / access / transam / xlog.c
index 47041c4..09b5075 100644 (file)
@@ -4,10 +4,10 @@
  *             PostgreSQL transaction log manager
  *
  *
- * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.325 2008/12/24 20:41:29 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.338 2009/05/14 20:31:09 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -17,6 +17,7 @@
 #include <ctype.h>
 #include <signal.h>
 #include <time.h>
+#include <fcntl.h>
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <sys/wait.h>
@@ -35,6 +36,7 @@
 #include "catalog/pg_control.h"
 #include "catalog/pg_type.h"
 #include "funcapi.h"
+#include "libpq/pqsignal.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "postmaster/bgwriter.h"
@@ -46,6 +48,7 @@
 #include "storage/smgr.h"
 #include "storage/spin.h"
 #include "utils/builtins.h"
+#include "utils/flatfiles.h"
 #include "utils/guc.h"
 #include "utils/ps_status.h"
 #include "pg_trace.h"
@@ -118,21 +121,36 @@ CheckpointStatsData CheckpointStats;
  */
 TimeLineID     ThisTimeLineID = 0;
 
-/* Are we doing recovery from XLOG? */
+/*
+ * Are we doing recovery from XLOG? 
+ *
+ * This is only ever true in the startup process, even if the system is still
+ * in recovery. Prior to 8.4, all activity during recovery was carried out
+ * by the Startup process. This local variable continues to be used in functions
+ * that need to act differently when called from a redo function (e.g. skip
+ * WAL logging). To check whether the system is in recovery regardless of what
+ * process you're running in, use RecoveryInProgress().
+ */
 bool           InRecovery = false;
 
 /* Are we recovering using offline XLOG archives? */
 static bool InArchiveRecovery = false;
 
+/*
+ * Local copy of SharedRecoveryInProgress variable. True actually means "not
+ * known, need to check the shared state"
+ */
+static bool LocalRecoveryInProgress = true;
+
 /* Was the last xlog file restored from archive, or local? */
 static bool restoredFromArchive = false;
 
 /* options taken from recovery.conf */
 static char *recoveryRestoreCommand = NULL;
+static char *recoveryEndCommand = NULL;
 static bool recoveryTarget = false;
 static bool recoveryTargetExact = false;
 static bool recoveryTargetInclusive = true;
-static bool recoveryLogRestartpoints = false;
 static TransactionId recoveryTargetXid;
 static TimestampTz recoveryTargetTime;
 static TimestampTz recoveryLastXTime = 0;
@@ -241,9 +259,8 @@ static XLogRecPtr RedoRecPtr;
  * ControlFileLock: must be held to read/update control file or create
  * new log file.
  *
- * CheckpointLock: must be held to do a checkpoint (ensures only one
- * checkpointer at a time; currently, with all checkpoints done by the
- * bgwriter, this is just pro forma).
+ * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
+ * only one checkpointer at a time)
  *
  *----------
  */
@@ -312,6 +329,25 @@ typedef struct XLogCtlData
        int                     XLogCacheBlck;  /* highest allocated xlog buffer index */
        TimeLineID      ThisTimeLineID;
 
+       /*
+        * SharedRecoveryInProgress indicates if we're still in crash or archive
+        * recovery.  It's checked by RecoveryInProgress().
+        */
+       bool            SharedRecoveryInProgress;
+
+       /*
+        * During recovery, we keep a copy of the latest checkpoint record
+        * here.  Used by the background writer when it wants to create
+        * a restartpoint.
+        *
+        * Protected by info_lck.
+        */
+       XLogRecPtr      lastCheckPointRecPtr;
+       CheckPoint      lastCheckPoint;
+
+       /* end+1 of the last record replayed (or being replayed) */
+       XLogRecPtr      replayEndRecPtr;
+
        slock_t         info_lck;               /* locks shared variables shown above */
 } XLogCtlData;
 
@@ -386,9 +422,22 @@ static XLogRecPtr ReadRecPtr;      /* start of last record read */
 static XLogRecPtr EndRecPtr;   /* end+1 of last record read */
 static XLogRecord *nextRecord = NULL;
 static TimeLineID lastPageTLI = 0;
+static XLogRecPtr minRecoveryPoint; /* local copy of ControlFile->minRecoveryPoint */
+static bool    updateMinRecoveryPoint = true;
 
 static bool InRedo = false;
 
+/*
+ * Flags set by interrupt handlers for later service in the redo loop.
+ */
+static volatile sig_atomic_t got_SIGHUP = false;
+static volatile sig_atomic_t shutdown_requested = false;
+/*
+ * Flag set when executing a restore command, to tell SIGTERM signal handler
+ * that it's safe to just proc_exit.
+ */
+static volatile sig_atomic_t in_restore_command = false;
+
 
 static void XLogArchiveNotify(const char *xlog);
 static void XLogArchiveNotifySeg(uint32 log, uint32 seg);
@@ -415,10 +464,12 @@ static int        XLogFileRead(uint32 log, uint32 seg, int emode);
 static void XLogFileClose(void);
 static bool RestoreArchivedFile(char *path, const char *xlogfname,
                                        const char *recovername, off_t expectedSize);
+static void ExecuteRecoveryEndCommand(void);
 static void PreallocXlogFiles(XLogRecPtr endptr);
 static void RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr);
 static void ValidateXLOGDirectoryStructure(void);
 static void CleanupBackupHistory(void);
+static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
 static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode);
 static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
 static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
@@ -483,6 +534,10 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
        bool            doPageWrites;
        bool            isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
 
+       /* cross-check on whether we should be here or not */
+       if (RecoveryInProgress())
+               elog(FATAL, "cannot make new WAL entries during recovery");
+
        /* info's high bits are reserved for use by me */
        if (info & XLR_INFO_MASK)
                elog(PANIC, "invalid xlog info mask %02X", info);
@@ -1717,6 +1772,63 @@ XLogSetAsyncCommitLSN(XLogRecPtr asyncCommitLSN)
 }
 
 /*
+ * Advance minRecoveryPoint in control file.
+ *
+ * If we crash during recovery, we must reach this point again before the
+ * database is consistent. 
+ * 
+ * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
+ * is only updated if it's not already greater than or equal to 'lsn'.
+ */
+static void
+UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
+{
+       /* Quick check using our local copy of the variable */
+       if (!updateMinRecoveryPoint || (!force && XLByteLE(lsn, minRecoveryPoint)))
+               return;
+
+       LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+
+       /* update local copy */
+       minRecoveryPoint = ControlFile->minRecoveryPoint;
+
+       /*
+        * An invalid minRecoveryPoint means that we need to recover all the WAL,
+        * ie. crash recovery. Don't update the control file in that case.
+        */
+       if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
+               updateMinRecoveryPoint = false;
+       else if (force || XLByteLT(minRecoveryPoint, lsn))
+       {
+               /* use volatile pointer to prevent code rearrangement */
+               volatile XLogCtlData *xlogctl = XLogCtl;
+               XLogRecPtr newMinRecoveryPoint;
+
+               /*
+                * To avoid having to update the control file too often, we update it
+                * all the way to the last record being replayed, even though 'lsn'
+                * would suffice for correctness.
+                */
+               SpinLockAcquire(&xlogctl->info_lck);
+               newMinRecoveryPoint = xlogctl->replayEndRecPtr;
+               SpinLockRelease(&xlogctl->info_lck);
+
+               /* update control file */
+               if (XLByteLT(ControlFile->minRecoveryPoint, newMinRecoveryPoint))
+               {
+                       ControlFile->minRecoveryPoint = newMinRecoveryPoint;
+                       UpdateControlFile();
+                       minRecoveryPoint = newMinRecoveryPoint;
+
+                       ereport(DEBUG2,
+                                       (errmsg("updated min recovery point to %X/%X",
+                                               minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff)));
+               }
+       }
+       LWLockRelease(ControlFileLock);
+}
+
+/*
  * Ensure that all XLOG data through the given position is flushed to disk.
  *
  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
@@ -1728,9 +1840,15 @@ XLogFlush(XLogRecPtr record)
        XLogRecPtr      WriteRqstPtr;
        XLogwrtRqst WriteRqst;
 
-       /* Disabled during REDO */
-       if (InRedo)
+       /*
+        * During REDO, we don't try to flush the WAL, but update minRecoveryPoint
+        * instead.
+        */
+       if (RecoveryInProgress())
+       {
+               UpdateMinRecoveryPoint(record, false);
                return;
+       }
 
        /* Quick exit if already known flushed */
        if (XLByteLE(record, LogwrtResult.Flush))
@@ -1817,9 +1935,9 @@ XLogFlush(XLogRecPtr record)
         * the bad page is encountered again during recovery then we would be
         * unable to restart the database at all!  (This scenario has actually
         * happened in the field several times with 7.1 releases. Note that we
-        * cannot get here while InRedo is true, but if the bad page is brought in
-        * and marked dirty during recovery then CreateCheckPoint will try to
-        * flush it at the end of recovery.)
+        * cannot get here while RecoveryInProgress(), but if the bad page is
+        * brought in and marked dirty during recovery then a checkpoint
+        * performed at the end of recovery will try to flush it.)
         *
         * The current approach is to ERROR under normal conditions, but only
         * WARNING during recovery, so that the system can be brought up even if
@@ -1856,6 +1974,10 @@ XLogBackgroundFlush(void)
        XLogRecPtr      WriteRqstPtr;
        bool            flexible = true;
 
+       /* XLOG doesn't need flushing during recovery */
+       if (RecoveryInProgress())
+               return;
+
        /* read LogwrtResult and update local state */
        {
                /* use volatile pointer to prevent code rearrangement */
@@ -1927,6 +2049,10 @@ XLogAsyncCommitFlush(void)
        /* use volatile pointer to prevent code rearrangement */
        volatile XLogCtlData *xlogctl = XLogCtl;
 
+       /* There are no asynchronously committed transactions during recovery */
+       if (RecoveryInProgress())
+               return;
+
        SpinLockAcquire(&xlogctl->info_lck);
        WriteRqstPtr = xlogctl->asyncCommitLSN;
        SpinLockRelease(&xlogctl->info_lck);
@@ -1943,6 +2069,10 @@ XLogAsyncCommitFlush(void)
 bool
 XLogNeedsFlush(XLogRecPtr record)
 {
+       /* XLOG doesn't need flushing during recovery */
+       if (RecoveryInProgress())
+               return false;
+
        /* Quick exit if already known flushed */
        if (XLByteLE(record, LogwrtResult.Flush))
                return false;
@@ -2436,29 +2566,17 @@ XLogFileClose(void)
        Assert(openLogFile >= 0);
 
        /*
-        * posix_fadvise is problematic on many platforms: on older x86 Linux it
-        * just dumps core, and there are reports of problems on PPC platforms as
-        * well.  The following is therefore disabled for the time being. We could
-        * consider some kind of configure test to see if it's safe to use, but
-        * since we lack hard evidence that there's any useful performance gain to
-        * be had, spending time on that seems unprofitable for now.
-        */
-#ifdef NOT_USED
-
-       /*
         * WAL segment files will not be re-read in normal operation, so we advise
-        * OS to release any cached pages.      But do not do so if WAL archiving is
-        * active, because archiver process could use the cache to read the WAL
-        * segment.
-        *
-        * While O_DIRECT works for O_SYNC, posix_fadvise() works for fsync() and
-        * O_SYNC, and some platforms only have posix_fadvise().
-        */
-#if defined(HAVE_DECL_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
-       if (!XLogArchivingActive())
-               posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
+        * the OS to release any cached pages.  But do not do so if WAL archiving
+        * is active, because archiver process could use the cache to read the WAL
+        * segment.  Also, don't bother with it if we are using O_DIRECT, since
+        * the kernel is presumably not caching in that case.
+        */
+#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
+       if (!XLogArchivingActive() &&
+               (get_sync_bit(sync_method) & PG_O_DIRECT) == 0)
+               (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
 #endif
-#endif   /* NOT_USED */
 
        if (close(openLogFile))
                ereport(PANIC,
@@ -2630,9 +2748,22 @@ RestoreArchivedFile(char *path, const char *xlogfname,
                                                         xlogRestoreCmd)));
 
        /*
+        * Set in_restore_command to tell the signal handler that we should exit
+        * right away on SIGTERM. We know that we're at a safe point to do that.
+        * Check if we had already received the signal, so that we don't miss a
+        * shutdown request received just before this.
+        */
+       in_restore_command = true;
+       if (shutdown_requested)
+               proc_exit(1);
+
+       /*
         * Copy xlog from archival storage to XLOGDIR
         */
        rc = system(xlogRestoreCmd);
+
+       in_restore_command = false;
+
        if (rc == 0)
        {
                /*
@@ -2685,14 +2816,24 @@ RestoreArchivedFile(char *path, const char *xlogfname,
         * assume that recovery is complete and start up the database!) It's
         * essential to abort on child SIGINT and SIGQUIT, because per spec
         * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
-        * those it's a good bet we should have gotten it too.  Aborting on other
-        * signals such as SIGTERM seems a good idea as well.
+        * those it's a good bet we should have gotten it too.
+        *
+        * On SIGTERM, assume we have received a fast shutdown request, and exit
+        * cleanly. It's pure chance whether we receive the SIGTERM first, or the
+        * child process. If we receive it first, the signal handler will call
+        * proc_exit, otherwise we do it here. If we or the child process
+        * received SIGTERM for any other reason than a fast shutdown request,
+        * postmaster will perform an immediate shutdown when it sees us exiting
+        * unexpectedly.
         *
         * Per the Single Unix Spec, shells report exit status > 128 when a called
         * command died on a signal.  Also, 126 and 127 are used to report
         * problems such as an unfindable command; treat those as fatal errors
         * too.
         */
+       if (WTERMSIG(rc) == SIGTERM)
+               proc_exit(1);
+
        signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
 
        ereport(signaled ? FATAL : DEBUG2,
@@ -2711,6 +2852,114 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 }
 
 /*
+ * Attempt to execute the recovery_end_command.
+ */
+static void
+ExecuteRecoveryEndCommand(void)
+{
+       char            xlogRecoveryEndCmd[MAXPGPATH];
+       char            lastRestartPointFname[MAXPGPATH];
+       char       *dp;
+       char       *endp;
+       const char *sp;
+       int                     rc;
+       bool            signaled;
+       uint32          restartLog;
+       uint32          restartSeg;
+
+       Assert(recoveryEndCommand);
+
+       /*
+        * Calculate the archive file cutoff point for use during log shipping
+        * replication. All files earlier than this point can be deleted
+        * from the archive, though there is no requirement to do so.
+        *
+        * We initialise this with the filename of an InvalidXLogRecPtr, which
+        * will prevent the deletion of any WAL files from the archive
+        * because of the alphabetic sorting property of WAL filenames. 
+        *
+        * Once we have successfully located the redo pointer of the checkpoint
+        * from which we start recovery we never request a file prior to the redo
+        * pointer of the last restartpoint. When redo begins we know that we
+        * have successfully located it, so there is no need for additional
+        * status flags to signify the point when we can begin deleting WAL files
+        * from the archive. 
+        */
+       if (InRedo)
+       {
+               XLByteToSeg(ControlFile->checkPointCopy.redo,
+                                       restartLog, restartSeg);
+               XLogFileName(lastRestartPointFname,
+                                        ControlFile->checkPointCopy.ThisTimeLineID,
+                                        restartLog, restartSeg);
+       }
+       else
+               XLogFileName(lastRestartPointFname, 0, 0, 0);
+
+       /*
+        * construct the command to be executed
+        */
+       dp = xlogRecoveryEndCmd;
+       endp = xlogRecoveryEndCmd + MAXPGPATH - 1;
+       *endp = '\0';
+
+       for (sp = recoveryEndCommand; *sp; sp++)
+       {
+               if (*sp == '%')
+               {
+                       switch (sp[1])
+                       {
+                               case 'r':
+                                       /* %r: filename of last restartpoint */
+                                       sp++;
+                                       StrNCpy(dp, lastRestartPointFname, endp - dp);
+                                       dp += strlen(dp);
+                                       break;
+                               case '%':
+                                       /* convert %% to a single % */
+                                       sp++;
+                                       if (dp < endp)
+                                               *dp++ = *sp;
+                                       break;
+                               default:
+                                       /* otherwise treat the % as not special */
+                                       if (dp < endp)
+                                               *dp++ = *sp;
+                                       break;
+                       }
+               }
+               else
+               {
+                       if (dp < endp)
+                               *dp++ = *sp;
+               }
+       }
+       *dp = '\0';
+
+       ereport(DEBUG3,
+                       (errmsg_internal("executing recovery end command \"%s\"",
+                                                        xlogRecoveryEndCmd)));
+
+       /*
+        * execute the constructed command
+        */
+       rc = system(xlogRecoveryEndCmd);
+       if (rc != 0)
+       {
+               /*
+                * If the failure was due to any sort of signal, it's best to punt and
+                * abort recovery. See also detailed comments on signals in 
+                * RestoreArchivedFile().
+                */
+               signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
+
+               ereport(signaled ? FATAL : WARNING,
+                               (errmsg("recovery_end_command \"%s\": return code %d",
+                                                               xlogRecoveryEndCmd, rc)));
+       }
+}
+
+/*
  * Preallocate log files beyond the specified log endpoint.
  *
  * XXX this is currently extremely conservative, since it forces only one
@@ -2933,9 +3182,15 @@ CleanupBackupHistory(void)
  * page might not be.  This will force us to replay all subsequent
  * modifications of the page that appear in XLOG, rather than possibly
  * ignoring them as already applied, but that's not a huge drawback.
+ *
+ * If 'cleanup' is true, a cleanup lock is used when restoring blocks.
+ * Otherwise, a normal exclusive lock is used.  At the moment, that's just
+ * pro forma, because there can't be any regular backends in the system
+ * during recovery.  The 'cleanup' argument applies to all backup blocks
+ * in the WAL record; that suffices for now.
  */
-static void
-RestoreBkpBlocks(XLogRecord *record, XLogRecPtr lsn)
+void
+RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup)
 {
        Buffer          buffer;
        Page            page;
@@ -2943,6 +3198,9 @@ RestoreBkpBlocks(XLogRecord *record, XLogRecPtr lsn)
        char       *blk;
        int                     i;
 
+       if (!(record->xl_info & XLR_BKP_BLOCK_MASK))
+               return;
+
        blk = (char *) XLogRecGetData(record) + record->xl_len;
        for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
        {
@@ -2955,6 +3213,11 @@ RestoreBkpBlocks(XLogRecord *record, XLogRecPtr lsn)
                buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
                                                                                RBM_ZERO);
                Assert(BufferIsValid(buffer));
+               if (cleanup)
+                       LockBufferForCleanup(buffer);
+               else
+                       LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
                page = (Page) BufferGetPage(buffer);
 
                if (bkpb.hole_length == 0)
@@ -4042,15 +4305,9 @@ ReadControlFile(void)
                                (errmsg("incorrect checksum in control file")));
 
        /*
-        * Do compatibility checking immediately.  We do this here for 2 reasons:
-        *
-        * (1) if the database isn't compatible with the backend executable, we
-        * want to abort before we can possibly do any damage;
-        *
-        * (2) this code is executed in the postmaster, so the setlocale() will
-        * propagate to forked backends, which aren't going to read this file for
-        * themselves.  (These locale settings are considered critical
-        * compatibility items because they can affect sort order of indexes.)
+        * Do compatibility checking immediately.  If the database isn't
+        * compatible with the backend executable, we want to abort before we
+        * can possibly do any damage.
         */
        if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
                ereport(FATAL,
@@ -4517,6 +4774,13 @@ readRecoveryCommandFile(void)
                                        (errmsg("restore_command = '%s'",
                                                        recoveryRestoreCommand)));
                }
+               else if (strcmp(tok1, "recovery_end_command") == 0)
+               {
+                       recoveryEndCommand = pstrdup(tok2);
+                       ereport(LOG,
+                                       (errmsg("recovery_end_command = '%s'",
+                                                       recoveryEndCommand)));
+               }
                else if (strcmp(tok1, "recovery_target_timeline") == 0)
                {
                        rtliGiven = true;
@@ -4587,18 +4851,6 @@ readRecoveryCommandFile(void)
                        ereport(LOG,
                                        (errmsg("recovery_target_inclusive = %s", tok2)));
                }
-               else if (strcmp(tok1, "log_restartpoints") == 0)
-               {
-                       /*
-                        * does nothing if a recovery_target is not also set
-                        */
-                       if (!parse_bool(tok2, &recoveryLogRestartpoints))
-                                 ereport(ERROR,
-                                                       (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-                                         errmsg("parameter \"log_restartpoints\" requires a Boolean value")));
-                       ereport(LOG,
-                                       (errmsg("log_restartpoints = %s", tok2)));
-               }
                else
                        ereport(FATAL,
                                        (errmsg("unrecognized recovery parameter \"%s\"",
@@ -4715,10 +4967,22 @@ exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
                 * If we are establishing a new timeline, we have to copy data from
                 * the last WAL segment of the old timeline to create a starting WAL
                 * segment for the new timeline.
+                *
+                * Notify the archiver that the last WAL segment of the old timeline
+                * is ready to copy to archival storage. Otherwise, it is not archived
+                * for a while.
                 */
                if (endTLI != ThisTimeLineID)
+               {
                        XLogFileCopy(endLogId, endLogSeg,
                                                 endTLI, endLogId, endLogSeg);
+
+                       if (XLogArchivingActive())
+                       {
+                               XLogFileName(xlogpath, endTLI, endLogId, endLogSeg);
+                               XLogArchiveNotify(xlogpath);
+                       }
+               }
        }
 
        /*
@@ -4880,7 +5144,7 @@ StartupXLOG(void)
        XLogRecPtr      RecPtr,
                                LastRec,
                                checkPointLoc,
-                               minRecoveryLoc,
+                               backupStopLoc,
                                EndOfLog;
        uint32          endLogId;
        uint32          endLogSeg;
@@ -4888,6 +5152,8 @@ StartupXLOG(void)
        uint32          freespace;
        TransactionId oldestActiveXID;
 
+       XLogCtl->SharedRecoveryInProgress = true;
+
        /*
         * Read control file and check XLOG status looks valid.
         *
@@ -4967,7 +5233,7 @@ StartupXLOG(void)
                                                recoveryTargetTLI,
                                                ControlFile->checkPointCopy.ThisTimeLineID)));
 
-       if (read_backup_label(&checkPointLoc, &minRecoveryLoc))
+       if (read_backup_label(&checkPointLoc, &backupStopLoc))
        {
                /*
                 * When a backup_label file is present, we want to roll forward from
@@ -5105,11 +5371,23 @@ StartupXLOG(void)
                ControlFile->prevCheckPoint = ControlFile->checkPoint;
                ControlFile->checkPoint = checkPointLoc;
                ControlFile->checkPointCopy = checkPoint;
-               if (minRecoveryLoc.xlogid != 0 || minRecoveryLoc.xrecoff != 0)
-                       ControlFile->minRecoveryPoint = minRecoveryLoc;
+               if (backupStopLoc.xlogid != 0 || backupStopLoc.xrecoff != 0)
+               {
+                       if (XLByteLT(ControlFile->minRecoveryPoint, backupStopLoc))
+                               ControlFile->minRecoveryPoint = backupStopLoc;
+               }
                ControlFile->time = (pg_time_t) time(NULL);
+               /* No need to hold ControlFileLock yet, we aren't up far enough */
                UpdateControlFile();
 
+               /* update our local copy of minRecoveryPoint */
+               minRecoveryPoint = ControlFile->minRecoveryPoint;
+
+               /*
+                * Reset pgstat data, because it may be invalid after recovery.
+                */
+               pgstat_reset_all();
+
                /*
                 * If there was a backup label file, it's done its job and the info
                 * has now been propagated into pg_control.  We must get rid of the
@@ -5154,12 +5432,41 @@ StartupXLOG(void)
                {
                        bool            recoveryContinue = true;
                        bool            recoveryApply = true;
+                       bool            reachedMinRecoveryPoint = false;
                        ErrorContextCallback errcontext;
+                       /* use volatile pointer to prevent code rearrangement */
+                       volatile XLogCtlData *xlogctl = XLogCtl;
+
+                       /* Update shared replayEndRecPtr */
+                       SpinLockAcquire(&xlogctl->info_lck);
+                       xlogctl->replayEndRecPtr = ReadRecPtr;
+                       SpinLockRelease(&xlogctl->info_lck);
 
                        InRedo = true;
-                       ereport(LOG,
-                                       (errmsg("redo starts at %X/%X",
-                                                       ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
+
+                       if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
+                               ereport(LOG,
+                                               (errmsg("redo starts at %X/%X",
+                                                               ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
+                       else
+                               ereport(LOG,
+                                               (errmsg("redo starts at %X/%X, consistency will be reached at %X/%X",
+                                               ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
+                                               minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff)));
+
+                       /*
+                        * Let postmaster know we've started redo now, so that it can
+                        * launch bgwriter to perform restartpoints.  We don't bother
+                        * during crash recovery as restartpoints can only be performed
+                        * during archive recovery.  And we'd like to keep crash recovery
+                        * simple, to avoid introducing bugs that could keep you from
+                        * recovering after crash.
+                        *
+                        * After this point, we can no longer assume that we're the only
+                        * process in addition to postmaster!
+                        */
+                       if (InArchiveRecovery && IsUnderPostmaster)
+                               SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
 
                        /*
                         * main redo apply loop
@@ -5186,6 +5493,39 @@ StartupXLOG(void)
 #endif
 
                                /*
+                                * Check if we were requested to re-read config file.
+                                */
+                               if (got_SIGHUP)
+                               {
+                                       got_SIGHUP = false;
+                                       ProcessConfigFile(PGC_SIGHUP);
+                               }
+
+                               /*
+                                * Check if we were requested to exit without finishing
+                                * recovery.
+                                */
+                               if (shutdown_requested)
+                                       proc_exit(1);
+
+                               /*
+                                * Have we reached our safe starting point? If so, we can
+                                * tell postmaster that the database is consistent now.
+                                */
+                               if (!reachedMinRecoveryPoint && 
+                                        XLByteLE(minRecoveryPoint, EndRecPtr))
+                               {
+                                       reachedMinRecoveryPoint = true;
+                                       if (InArchiveRecovery)
+                                       {
+                                               ereport(LOG,
+                                                               (errmsg("consistent recovery state reached")));
+                                               if (IsUnderPostmaster)
+                                                       SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
+                                       }
+                               }
+
+                               /*
                                 * Have we reached our recovery target?
                                 */
                                if (recoveryStopsHere(record, &recoveryApply))
@@ -5210,8 +5550,14 @@ StartupXLOG(void)
                                        TransactionIdAdvance(ShmemVariableCache->nextXid);
                                }
 
-                               if (record->xl_info & XLR_BKP_BLOCK_MASK)
-                                       RestoreBkpBlocks(record, EndRecPtr);
+                               /*
+                                * Update shared replayEndRecPtr before replaying this
+                                * record, so that XLogFlush will update minRecoveryPoint
+                                * correctly.
+                                */
+                               SpinLockAcquire(&xlogctl->info_lck);
+                               xlogctl->replayEndRecPtr = EndRecPtr;
+                               SpinLockRelease(&xlogctl->info_lck);
 
                                RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
 
@@ -5256,14 +5602,14 @@ StartupXLOG(void)
         * Complain if we did not roll forward far enough to render the backup
         * dump consistent.
         */
-       if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint))
+       if (InRecovery && XLByteLT(EndOfLog, minRecoveryPoint))
        {
                if (reachedStopPoint)   /* stopped because of stop request */
                        ereport(FATAL,
-                                       (errmsg("requested recovery stop point is before end time of backup dump")));
+                                       (errmsg("requested recovery stop point is before consistent recovery point")));
                else    /* ran off end of WAL */
                        ereport(FATAL,
-                                       (errmsg("WAL ends before end time of backup dump")));
+                                       (errmsg("WAL ends before consistent recovery point")));
        }
 
        /*
@@ -5358,6 +5704,12 @@ StartupXLOG(void)
        /* Pre-scan prepared transactions to find out the range of XIDs present */
        oldestActiveXID = PrescanPreparedTransactions();
 
+       /*
+        * Allow writing WAL for us, so that we can create a checkpoint record.
+        * But not yet for other backends!
+        */
+       LocalRecoveryInProgress = false;
+
        if (InRecovery)
        {
                int                     rmid;
@@ -5378,11 +5730,6 @@ StartupXLOG(void)
                XLogCheckInvalidPages();
 
                /*
-                * Reset pgstat data, because it may be invalid after recovery.
-                */
-               pgstat_reset_all();
-
-               /*
                 * Perform a checkpoint to update all our recovery activity to disk.
                 *
                 * Note that we write a shutdown checkpoint rather than an on-line
@@ -5392,6 +5739,9 @@ StartupXLOG(void)
                 * allows some extra error checking in xlog_redo.
                 */
                CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
+
+               if (recoveryEndCommand)
+                       ExecuteRecoveryEndCommand();
        }
 
        /*
@@ -5404,12 +5754,14 @@ StartupXLOG(void)
         */
        InRecovery = false;
 
+       LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
        ControlFile->state = DB_IN_PRODUCTION;
        ControlFile->time = (pg_time_t) time(NULL);
        UpdateControlFile();
+       LWLockRelease(ControlFileLock);
 
        /* start the archive_timeout timer running */
-       XLogCtl->Write.lastSegSwitchTime = ControlFile->time;
+       XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
 
        /* initialize shared-memory copy of latest checkpoint XID/epoch */
        XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
@@ -5444,6 +5796,45 @@ StartupXLOG(void)
                readRecordBuf = NULL;
                readRecordBufSize = 0;
        }
+
+       /*
+        * All done. Allow others to write WAL.
+        */
+       XLogCtl->SharedRecoveryInProgress = false;
+}
+
+/*
+ * Is the system still in recovery?  Returns the process-local flag, which is
+ * refreshed from XLogCtl->SharedRecoveryInProgress until it is seen false.
+ * As a side-effect, the call that first sees recovery finished initializes
+ * the local TimeLineID and RedoRecPtr variables (via InitXLOGAccess).
+ */
+bool
+RecoveryInProgress(void)
+{
+       /*
+        * Shared state is consulted only while the local flag is still true.
+        * We can't re-enter recovery, so once the local flag is false we rely
+        * on it alone and never look at shared memory again.
+        */
+       if (!LocalRecoveryInProgress)
+               return false;
+       else
+       {
+               /* use volatile pointer to prevent code rearrangement */
+               volatile XLogCtlData *xlogctl = XLogCtl;
+
+               LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
+
+               /*
+                * Initialize TimeLineID and RedoRecPtr the first time we see that
+                * recovery is finished (i.e. the flag just changed to false).
+                */
+               if (!LocalRecoveryInProgress)
+                       InitXLOGAccess();
+
+               return LocalRecoveryInProgress;
+       }
 }
 
 /*
@@ -5575,6 +5966,8 @@ InitXLOGAccess(void)
 {
        /* ThisTimeLineID doesn't change so we need no lock to copy it */
        ThisTimeLineID = XLogCtl->ThisTimeLineID;
+       Assert(ThisTimeLineID != 0);
+
        /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
        (void) GetRedoRecPtr();
 }
@@ -5686,7 +6079,10 @@ ShutdownXLOG(int code, Datum arg)
        ereport(LOG,
                        (errmsg("shutting down")));
 
-       CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
+       if (RecoveryInProgress())
+               CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
+       else
+               CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
        ShutdownCLOG();
        ShutdownSUBTRANS();
        ShutdownMultiXact();
@@ -5699,9 +6095,20 @@ ShutdownXLOG(int code, Datum arg)
  * Log start of a checkpoint.
  */
 static void
-LogCheckpointStart(int flags)
+LogCheckpointStart(int flags, bool restartpoint)
 {
-       elog(LOG, "checkpoint starting:%s%s%s%s%s%s",
+       char *msg;
+
+       /*
+        * XXX: This is hopelessly untranslatable. We could call gettext_noop
+        * for the main message, but what about all the flags?
+        */
+       if (restartpoint)
+               msg = "restartpoint starting:%s%s%s%s%s%s";
+       else
+               msg = "checkpoint starting:%s%s%s%s%s%s";
+
+       elog(LOG, msg,
                 (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
                 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
                 (flags & CHECKPOINT_FORCE) ? " force" : "",
@@ -5714,7 +6121,7 @@ LogCheckpointStart(int flags)
  * Log end of a checkpoint.
  */
 static void
-LogCheckpointEnd(void)
+LogCheckpointEnd(bool restartpoint)
 {
        long            write_secs,
                                sync_secs,
@@ -5737,17 +6144,26 @@ LogCheckpointEnd(void)
                                                CheckpointStats.ckpt_sync_end_t,
                                                &sync_secs, &sync_usecs);
 
-       elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
-                "%d transaction log file(s) added, %d removed, %d recycled; "
-                "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
-                CheckpointStats.ckpt_bufs_written,
-                (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
-                CheckpointStats.ckpt_segs_added,
-                CheckpointStats.ckpt_segs_removed,
-                CheckpointStats.ckpt_segs_recycled,
-                write_secs, write_usecs / 1000,
-                sync_secs, sync_usecs / 1000,
-                total_secs, total_usecs / 1000);
+       if (restartpoint)
+               elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
+                        "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
+                        CheckpointStats.ckpt_bufs_written,
+                        (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
+                        write_secs, write_usecs / 1000,
+                        sync_secs, sync_usecs / 1000,
+                        total_secs, total_usecs / 1000);
+       else
+               elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
+                        "%d transaction log file(s) added, %d removed, %d recycled; "
+                        "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
+                        CheckpointStats.ckpt_bufs_written,
+                        (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
+                        CheckpointStats.ckpt_segs_added,
+                        CheckpointStats.ckpt_segs_removed,
+                        CheckpointStats.ckpt_segs_recycled,
+                        write_secs, write_usecs / 1000,
+                        sync_secs, sync_usecs / 1000,
+                        total_secs, total_usecs / 1000);
 }
 
 /*
@@ -5778,13 +6194,33 @@ CreateCheckPoint(int flags)
        TransactionId *inCommitXids;
        int                     nInCommit;
 
+       /* shouldn't happen */
+       if (RecoveryInProgress())
+               elog(ERROR, "can't create a checkpoint during recovery");
+
        /*
         * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
-        * (This is just pro forma, since in the present system structure there is
-        * only one process that is allowed to issue checkpoints at any given
-        * time.)
+        * During normal operation, bgwriter is the only process that creates
+        * checkpoints, but at the end of archive recovery, the bgwriter can be
+        * busy creating a restartpoint while the startup process tries to perform
+        * the startup checkpoint.
         */
-       LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+       if (!LWLockConditionalAcquire(CheckpointLock, LW_EXCLUSIVE))
+       {
+               Assert(InRecovery);
+
+               /*
+                * A restartpoint is in progress. Wait until it finishes. This can
+                * cause an extra restartpoint to be performed, but that's OK because
+                * we're just about to perform a checkpoint anyway. Flushing the
+                * buffers in this restartpoint can take some time, but that time is
+                * saved from the upcoming checkpoint so the net effect is zero.
+                */
+               ereport(DEBUG2, (errmsg("hurrying in-progress restartpoint")));
+               RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT);
+
+               LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+       }
 
        /*
         * Prepare to accumulate statistics.
@@ -5803,9 +6239,11 @@ CreateCheckPoint(int flags)
 
        if (shutdown)
        {
+               LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
                ControlFile->state = DB_SHUTDOWNING;
                ControlFile->time = (pg_time_t) time(NULL);
                UpdateControlFile();
+               LWLockRelease(ControlFileLock);
        }
 
        /*
@@ -5909,7 +6347,7 @@ CreateCheckPoint(int flags)
         * to log anything if we decided to skip the checkpoint.
         */
        if (log_checkpoints)
-               LogCheckpointStart(flags);
+               LogCheckpointStart(flags, false);
 
        TRACE_POSTGRESQL_CHECKPOINT_START(flags);
 
@@ -6076,12 +6514,13 @@ CreateCheckPoint(int flags)
 
        /* All real work is done, but log before releasing lock. */
        if (log_checkpoints)
-               LogCheckpointEnd();
+               LogCheckpointEnd(false);
 
-        TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
-                                NBuffers, CheckpointStats.ckpt_segs_added,
-                                CheckpointStats.ckpt_segs_removed,
-                                CheckpointStats.ckpt_segs_recycled);
+       TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
+                                                                        NBuffers,
+                                                                        CheckpointStats.ckpt_segs_added,
+                                                                        CheckpointStats.ckpt_segs_removed,
+                                                                        CheckpointStats.ckpt_segs_recycled);
 
        LWLockRelease(CheckpointLock);
 }
@@ -6104,32 +6543,17 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
 }
 
 /*
- * Set a recovery restart point if appropriate
- *
- * This is similar to CreateCheckPoint, but is used during WAL recovery
- * to establish a point from which recovery can roll forward without
- * replaying the entire recovery log.  This function is called each time
- * a checkpoint record is read from XLOG; it must determine whether a
- * restartpoint is needed or not.
+ * This is used during WAL recovery to establish a point from which recovery
+ * can roll forward without replaying the entire recovery log.  This function
+ * is called each time a checkpoint record is read from XLOG. The record is
+ * stored in shared memory, so that it can be used as a restartpoint later on.
  */
 static void
 RecoveryRestartPoint(const CheckPoint *checkPoint)
 {
-       int                     elapsed_secs;
        int                     rmid;
-
-       /*
-        * Do nothing if the elapsed time since the last restartpoint is less than
-        * half of checkpoint_timeout.  (We use a value less than
-        * checkpoint_timeout so that variations in the timing of checkpoints on
-        * the master, or speed of transmission of WAL segments to a slave, won't
-        * make the slave skip a restartpoint once it's synced with the master.)
-        * Checking true elapsed time keeps us from doing restartpoints too often
-        * while rapidly scanning large amounts of WAL.
-        */
-       elapsed_secs = (pg_time_t) time(NULL) - ControlFile->time;
-       if (elapsed_secs < CheckPointTimeout / 2)
-               return;
+       /* use volatile pointer to prevent code rearrangement */
+       volatile XLogCtlData *xlogctl = XLogCtl;
 
        /*
         * Is it safe to checkpoint?  We must ask each of the resource managers
@@ -6151,28 +6575,128 @@ RecoveryRestartPoint(const CheckPoint *checkPoint)
        }
 
        /*
-        * OK, force data out to disk
+        * Copy the checkpoint record to shared memory, so that bgwriter can
+        * use it the next time it wants to perform a restartpoint.
         */
-       CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE);
+       SpinLockAcquire(&xlogctl->info_lck);
+       XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
+       memcpy(&XLogCtl->lastCheckPoint, checkPoint, sizeof(CheckPoint));
+       SpinLockRelease(&xlogctl->info_lck);
+}
+
+/*
+ * This is similar to CreateCheckPoint, but is used during WAL recovery
+ * to establish a point from which recovery can roll forward without
+ * replaying the entire recovery log.
+ *
+ * Returns true if a new restartpoint was established. We can only establish
+ * a restartpoint if we have replayed a checkpoint record since the last
+ * restartpoint.  'flags' is passed on to CheckPointGuts().
+ */
+bool
+CreateRestartPoint(int flags)
+{
+       XLogRecPtr lastCheckPointRecPtr;
+       CheckPoint lastCheckPoint;
+       /* use volatile pointer to prevent code rearrangement */
+       volatile XLogCtlData *xlogctl = XLogCtl;
 
        /*
-        * Update pg_control so that any subsequent crash will restart from this
-        * checkpoint.  Note: ReadRecPtr gives the XLOG address of the checkpoint
-        * record itself.
+        * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
+        * happens at a time.
         */
+       LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+
+       /* Get a local copy of the last replayed checkpoint record and its location. */
+       SpinLockAcquire(&xlogctl->info_lck);
+       lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
+       memcpy(&lastCheckPoint, &XLogCtl->lastCheckPoint, sizeof(CheckPoint));
+       SpinLockRelease(&xlogctl->info_lck);
+
+       /*
+        * Check that we're still in recovery mode. It's ok if we exit recovery
+        * mode after this check, the restart point is valid anyway.
+        */
+       if (!RecoveryInProgress())
+       {
+               ereport(DEBUG2,
+                               (errmsg("skipping restartpoint, recovery has already ended")));
+               LWLockRelease(CheckpointLock);
+               return false;
+       }
+
+       /*
+        * If the last checkpoint record we've replayed is already our last
+        * restartpoint, we can't perform a new restart point. We still update
+        * minRecoveryPoint in that case, so that if this is a shutdown restart
+        * point, we won't start up earlier than before. That's not strictly
+        * necessary, but when we get hot standby capability, it would be rather
+        * weird if the database opened up for read-only connections at a
+        * point-in-time before the last shutdown. Such time travel is still
+        * possible in case of immediate shutdown, though.
+        *
+        * We don't explicitly advance minRecoveryPoint when we do create a
+        * restartpoint. It's assumed that flushing the buffers will do that
+        * as a side-effect.
+        */
+       if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
+               XLByteLE(lastCheckPoint.redo, ControlFile->checkPointCopy.redo))
+       {
+               XLogRecPtr InvalidXLogRecPtr = {0, 0};
+               ereport(DEBUG2,
+                               (errmsg("skipping restartpoint, already performed at %X/%X",
+                                               lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
+
+               UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
+               LWLockRelease(CheckpointLock);
+               return false;
+       }
+
+       if (log_checkpoints)
+       {
+               /*
+                * Prepare to accumulate statistics.
+                */
+               MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
+               CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
+
+               LogCheckpointStart(flags, true);
+       }
+
+       CheckPointGuts(lastCheckPoint.redo, flags);
+
+       /*
+        * Update pg_control, using current time
+        */
+       LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
        ControlFile->prevCheckPoint = ControlFile->checkPoint;
-       ControlFile->checkPoint = ReadRecPtr;
-       ControlFile->checkPointCopy = *checkPoint;
+       ControlFile->checkPoint = lastCheckPointRecPtr;
+       ControlFile->checkPointCopy = lastCheckPoint;
        ControlFile->time = (pg_time_t) time(NULL);
        UpdateControlFile();
+       LWLockRelease(ControlFileLock);
+
+       /*
+        * Currently, there is no need to truncate pg_subtrans during recovery.
+        * If we did do that, we will need to have called StartupSUBTRANS()
+        * already and then TruncateSUBTRANS() would go here.
+        */
+
+       /* All real work is done, but log before releasing lock. */
+       if (log_checkpoints)
+               LogCheckpointEnd(true);
 
-       ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
+       ereport((log_checkpoints ? LOG : DEBUG2),
                        (errmsg("recovery restart point at %X/%X",
-                                       checkPoint->redo.xlogid, checkPoint->redo.xrecoff)));
+                                       lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
+
        if (recoveryLastXTime)
-               ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
-                               (errmsg("last completed transaction was at log time %s",
-                                               timestamptz_to_str(recoveryLastXTime))));
+               ereport((log_checkpoints ? LOG : DEBUG2),
+                       (errmsg("last completed transaction was at log time %s",
+                                       timestamptz_to_str(recoveryLastXTime))));
+
+       LWLockRelease(CheckpointLock);
+       return true;
 }
 
 /*
@@ -6238,12 +6762,18 @@ RequestXLogSwitch(void)
 
 /*
  * XLOG resource manager's routines
+ *
+ * Definitions of info values are in include/catalog/pg_control.h, though
+ * not all record types are related to control file processing.
  */
 void
 xlog_redo(XLogRecPtr lsn, XLogRecord *record)
 {
        uint8           info = record->xl_info & ~XLR_INFO_MASK;
 
+       /* Backup blocks are not used in xlog records */
+       Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
+
        if (info == XLOG_NEXTOID)
        {
                Oid                     nextOid;
@@ -6281,9 +6811,9 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
                                                                 (int) checkPoint.ThisTimeLineID))
                                ereport(PANIC,
                                                (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
-                                                               checkPoint.ThisTimeLineID, ThisTimeLineID)));
-                       /* Following WAL records should be run with new TLI */
-                       ThisTimeLineID = checkPoint.ThisTimeLineID;
+                               checkPoint.ThisTimeLineID, ThisTimeLineID)));
+           /* Following WAL records should be run with new TLI */
+           ThisTimeLineID = checkPoint.ThisTimeLineID;
                }
 
                RecoveryRestartPoint(&checkPoint);
@@ -6516,6 +7046,7 @@ Datum
 pg_start_backup(PG_FUNCTION_ARGS)
 {
        text       *backupid = PG_GETARG_TEXT_P(0);
+       bool            fast = PG_GETARG_BOOL(1);
        char       *backupidstr;
        XLogRecPtr      checkpointloc;
        XLogRecPtr      startpoint;
@@ -6576,6 +7107,19 @@ pg_start_backup(PG_FUNCTION_ARGS)
        XLogCtl->Insert.forcePageWrites = true;
        LWLockRelease(WALInsertLock);
 
+       /*
+        * Force an XLOG file switch before the checkpoint, to ensure that the WAL
+        * segment the checkpoint is written to doesn't contain pages with old
+        * timeline IDs. That would otherwise happen if you called
+        * pg_start_backup() right after restoring from a PITR archive: the first
+        * WAL segment containing the startup checkpoint has pages in the
+        * beginning with the old timeline ID. That can cause trouble at recovery:
+        * we won't have a history file covering the old timeline if pg_xlog
+        * directory was not included in the base backup and the WAL archive was
+        * cleared too before starting the backup.
+        */
+       RequestXLogSwitch();
+
        /* Ensure we release forcePageWrites if fail below */
        PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) 0);
        {
@@ -6585,9 +7129,11 @@ pg_start_backup(PG_FUNCTION_ARGS)
                 * have different checkpoint positions and hence different history
                 * file names, even if nothing happened in between.
                 *
-                * We don't use CHECKPOINT_IMMEDIATE, hence this can take awhile.
+                * We use CHECKPOINT_IMMEDIATE only if requested by user (via
+                * passing fast = true).  Otherwise this can take awhile.
                 */
-               RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT);
+               RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
+                                                 (fast ? CHECKPOINT_IMMEDIATE : 0));
 
                /*
                 * Now we need to fetch the checkpoint record location, and also its
@@ -7224,3 +7770,99 @@ CancelBackup(void)
        }
 }
 
+/* ------------------------------------------------------
+ *  Startup Process main entry point and signal handlers
+ * ------------------------------------------------------
+ */
+
+/*
+ * startupproc_quickdie() is the startup process's handler for SIGQUIT.
+ *
+ * The postmaster sends SIGQUIT when some backend has crashed,
+ * so we need to stop what we're doing and exit.
+ */
+static void
+startupproc_quickdie(SIGNAL_ARGS)
+{
+       PG_SETMASK(&BlockSig);
+
+       /*
+        * DO NOT proc_exit() -- we're here because shared memory may be
+        * corrupted, so we don't want to try to clean up our transaction. Just
+        * terminate the process immediately.
+        *
+        * Note we do exit(2) not exit(0).  This is to force the postmaster into a
+        * system reset cycle if someone sends a manual SIGQUIT to a random
+        * backend.  This is necessary precisely because we don't clean up our
+        * shared memory state.
+        */
+       exit(2);
+}
+
+
+/* SIGHUP: set got_SIGHUP; StartupXLOG re-reads the config file when it sees it */
+static void
+StartupProcSigHupHandler(SIGNAL_ARGS)
+{
+       got_SIGHUP = true;
+}
+
+/* SIGTERM: exit at once if in_restore_command, else set shutdown_requested */
+static void
+StartupProcShutdownHandler(SIGNAL_ARGS)
+{
+       if (in_restore_command)
+               proc_exit(1);
+       else
+               shutdown_requested = true;
+}
+
+/* Main entry point for startup process: set up signals, then run StartupXLOG */
+void
+StartupProcessMain(void)
+{
+       /*
+        * If possible, make this process a group leader, so that the postmaster
+        * can signal any child processes too.
+        */
+#ifdef HAVE_SETSID
+       if (setsid() < 0)
+               elog(FATAL, "setsid() failed: %m");
+#endif
+
+       /*
+        * Install handlers for, or ignore, signals the postmaster might send us
+        */
+       pqsignal(SIGHUP, StartupProcSigHupHandler);      /* reload config file */
+       pqsignal(SIGINT, SIG_IGN);                                      /* ignore query cancel */
+       pqsignal(SIGTERM, StartupProcShutdownHandler); /* request shutdown */
+       pqsignal(SIGQUIT, startupproc_quickdie);                /* exit(2) immediately */
+       pqsignal(SIGALRM, SIG_IGN);
+       pqsignal(SIGPIPE, SIG_IGN);
+       pqsignal(SIGUSR1, SIG_IGN);
+       pqsignal(SIGUSR2, SIG_IGN);
+
+       /*
+        * Reset some signals that are accepted by postmaster but not here
+        */
+       pqsignal(SIGCHLD, SIG_DFL);
+       pqsignal(SIGTTIN, SIG_DFL);
+       pqsignal(SIGTTOU, SIG_DFL);
+       pqsignal(SIGCONT, SIG_DFL);
+       pqsignal(SIGWINCH, SIG_DFL);
+
+       /*
+        * Unblock signals (they were blocked when the postmaster forked us)
+        */
+       PG_SETMASK(&UnBlockSig);
+
+       StartupXLOG();  
+
+       BuildFlatFiles(false);
+
+       /*
+        * Exit normally. Exit code 0 tells postmaster that we completed
+        * recovery successfully.
+        */
+       proc_exit(0);
+}