OSDN Git Service

Add recovery_end_command option to recovery.conf. recovery_end_command
[pg-rex/syncrep.git] / src / backend / access / transam / xlog.c
index df1549f..09b5075 100644 (file)
@@ -4,10 +4,10 @@
  *             PostgreSQL transaction log manager
  *
  *
- * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.300 2008/04/24 14:23:43 mha Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.338 2009/05/14 20:31:09 heikki Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
 
 #include <ctype.h>
-#include <fcntl.h>
 #include <signal.h>
 #include <time.h>
+#include <fcntl.h>
 #include <sys/stat.h>
 #include <sys/time.h>
 #include <sys/wait.h>
 #include <unistd.h>
 
 #include "access/clog.h"
-#include "access/heapam.h"
 #include "access/multixact.h"
 #include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/twophase.h"
 #include "access/xact.h"
 #include "access/xlog_internal.h"
-#include "access/xlogdefs.h"
 #include "access/xlogutils.h"
 #include "catalog/catversion.h"
 #include "catalog/pg_control.h"
 #include "catalog/pg_type.h"
 #include "funcapi.h"
+#include "libpq/pqsignal.h"
 #include "miscadmin.h"
 #include "pgstat.h"
 #include "postmaster/bgwriter.h"
-#include "storage/bufpage.h"
+#include "storage/bufmgr.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
 #include "storage/pmsignal.h"
 #include "storage/smgr.h"
 #include "storage/spin.h"
 #include "utils/builtins.h"
-#include "utils/pg_locale.h"
+#include "utils/flatfiles.h"
+#include "utils/guc.h"
 #include "utils/ps_status.h"
+#include "pg_trace.h"
 
 
 /* File path names (all relative to $PGDATA) */
@@ -66,10 +67,9 @@ int                  XLOGbuffers = 8;
 int                    XLogArchiveTimeout = 0;
 bool           XLogArchiveMode = false;
 char      *XLogArchiveCommand = NULL;
-char      *XLOG_sync_method = NULL;
-const char     XLOG_sync_method_default[] = DEFAULT_SYNC_METHOD_STR;
 bool           fullPageWrites = true;
 bool           log_checkpoints = false;
+int            sync_method = DEFAULT_SYNC_METHOD;
 
 #ifdef WAL_DEBUG
 bool           XLOG_DEBUG = false;
@@ -88,13 +88,25 @@ bool                XLOG_DEBUG = false;
  */
 #define XLOGfileslop   (2*CheckPointSegments + 1)
 
-
-/* these are derived from XLOG_sync_method by assign_xlog_sync_method */
-int                    sync_method = DEFAULT_SYNC_METHOD;
-static int     open_sync_bit = DEFAULT_SYNC_FLAGBIT;
-
-#define XLOG_SYNC_BIT  (enableFsync ? open_sync_bit : 0)
-
+/*
+ * GUC support
+ */
+const struct config_enum_entry sync_method_options[] = {
+       {"fsync", SYNC_METHOD_FSYNC, false},
+#ifdef HAVE_FSYNC_WRITETHROUGH
+       {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
+#endif
+#ifdef HAVE_FDATASYNC
+       {"fdatasync", SYNC_METHOD_FDATASYNC, false},
+#endif
+#ifdef OPEN_SYNC_FLAG
+       {"open_sync", SYNC_METHOD_OPEN, false},
+#endif
+#ifdef OPEN_DATASYNC_FLAG
+       {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
+#endif
+       {NULL, 0, false}
+};
 
 /*
  * Statistics for current checkpoint are collected in this global struct.
@@ -109,21 +121,36 @@ CheckpointStatsData CheckpointStats;
  */
 TimeLineID     ThisTimeLineID = 0;
 
-/* Are we doing recovery from XLOG? */
+/*
+ * Are we doing recovery from XLOG? 
+ *
+ * This is only ever true in the startup process, even if the system is still
+ * in recovery. Prior to 8.4, all activity during recovery was carried out
+ * by the startup process. This local variable continues to be used in
+ * functions that need to act differently when called from a redo function
+ * (e.g. skip WAL logging). To check whether the system is in recovery
+ * regardless of what process you're running in, use RecoveryInProgress().
+ */
 bool           InRecovery = false;
 
 /* Are we recovering using offline XLOG archives? */
 static bool InArchiveRecovery = false;
 
+/*
+ * Local copy of SharedRecoveryInProgress variable. True actually means "not
+ * known, need to check the shared state"
+ */
+static bool LocalRecoveryInProgress = true;
+
 /* Was the last xlog file restored from archive, or local? */
 static bool restoredFromArchive = false;
 
 /* options taken from recovery.conf */
 static char *recoveryRestoreCommand = NULL;
+static char *recoveryEndCommand = NULL;
 static bool recoveryTarget = false;
 static bool recoveryTargetExact = false;
 static bool recoveryTargetInclusive = true;
-static bool recoveryLogRestartpoints = false;
 static TransactionId recoveryTargetXid;
 static TimestampTz recoveryTargetTime;
 static TimestampTz recoveryLastXTime = 0;
@@ -232,9 +259,8 @@ static XLogRecPtr RedoRecPtr;
  * ControlFileLock: must be held to read/update control file or create
  * new log file.
  *
- * CheckpointLock: must be held to do a checkpoint (ensures only one
- * checkpointer at a time; currently, with all checkpoints done by the
- * bgwriter, this is just pro forma).
+ * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
+ * only one checkpointer at a time)
  *
  *----------
  */
@@ -303,6 +329,25 @@ typedef struct XLogCtlData
        int                     XLogCacheBlck;  /* highest allocated xlog buffer index */
        TimeLineID      ThisTimeLineID;
 
+       /*
+        * SharedRecoveryInProgress indicates if we're still in crash or archive
+        * recovery.  It's checked by RecoveryInProgress().
+        */
+       bool            SharedRecoveryInProgress;
+
+       /*
+        * During recovery, we keep a copy of the latest checkpoint record
+        * here.  Used by the background writer when it wants to create
+        * a restartpoint.
+        *
+        * Protected by info_lck.
+        */
+       XLogRecPtr      lastCheckPointRecPtr;
+       CheckPoint      lastCheckPoint;
+
+       /* end+1 of the last record replayed (or being replayed) */
+       XLogRecPtr      replayEndRecPtr;
+
        slock_t         info_lck;               /* locks shared variables shown above */
 } XLogCtlData;
 
@@ -377,13 +422,27 @@ static XLogRecPtr ReadRecPtr;     /* start of last record read */
 static XLogRecPtr EndRecPtr;   /* end+1 of last record read */
 static XLogRecord *nextRecord = NULL;
 static TimeLineID lastPageTLI = 0;
+static XLogRecPtr minRecoveryPoint; /* local copy of ControlFile->minRecoveryPoint */
+static bool    updateMinRecoveryPoint = true;
 
 static bool InRedo = false;
 
+/*
+ * Flags set by interrupt handlers for later service in the redo loop.
+ */
+static volatile sig_atomic_t got_SIGHUP = false;
+static volatile sig_atomic_t shutdown_requested = false;
+/*
+ * Flag set when executing a restore command, to tell SIGTERM signal handler
+ * that it's safe to just proc_exit.
+ */
+static volatile sig_atomic_t in_restore_command = false;
+
 
 static void XLogArchiveNotify(const char *xlog);
 static void XLogArchiveNotifySeg(uint32 log, uint32 seg);
-static bool XLogArchiveCheckDone(const char *xlog, bool create_if_missing);
+static bool XLogArchiveCheckDone(const char *xlog);
+static bool XLogArchiveIsBusy(const char *xlog);
 static void XLogArchiveCleanup(const char *xlog);
 static void readRecoveryCommandFile(void);
 static void exitArchiveRecovery(TimeLineID endTLI,
@@ -405,9 +464,12 @@ static int XLogFileRead(uint32 log, uint32 seg, int emode);
 static void XLogFileClose(void);
 static bool RestoreArchivedFile(char *path, const char *xlogfname,
                                        const char *recovername, off_t expectedSize);
+static void ExecuteRecoveryEndCommand(void);
 static void PreallocXlogFiles(XLogRecPtr endptr);
 static void RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr);
+static void ValidateXLOGDirectoryStructure(void);
 static void CleanupBackupHistory(void);
+static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
 static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode);
 static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
 static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
@@ -428,6 +490,7 @@ static void pg_start_backup_callback(int code, Datum arg);
 static bool read_backup_label(XLogRecPtr *checkPointLoc,
                                  XLogRecPtr *minRecoveryLoc);
 static void rm_redo_error_callback(void *arg);
+static int get_sync_bit(int method);
 
 
 /*
@@ -471,10 +534,16 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
        bool            doPageWrites;
        bool            isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
 
+       /* cross-check on whether we should be here or not */
+       if (RecoveryInProgress())
+               elog(FATAL, "cannot make new WAL entries during recovery");
+
        /* info's high bits are reserved for use by me */
        if (info & XLR_INFO_MASK)
                elog(PANIC, "invalid xlog info mask %02X", info);
 
+       TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);
+
        /*
         * In bootstrap mode, we don't actually log anything but XLOG resources;
         * return a phony record pointer.
@@ -903,6 +972,8 @@ begin:;
                XLogwrtRqst FlushRqst;
                XLogRecPtr      OldSegEnd;
 
+               TRACE_POSTGRESQL_XLOG_SWITCH();
+
                LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
 
                /*
@@ -1008,31 +1079,30 @@ static bool
 XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
                                XLogRecPtr *lsn, BkpBlock *bkpb)
 {
-       PageHeader      page;
+       Page            page;
 
-       page = (PageHeader) BufferGetBlock(rdata->buffer);
+       page = BufferGetPage(rdata->buffer);
 
        /*
         * XXX We assume page LSN is first data on *every* page that can be passed
         * to XLogInsert, whether it otherwise has the standard page layout or
         * not.
         */
-       *lsn = page->pd_lsn;
+       *lsn = PageGetLSN(page);
 
        if (doPageWrites &&
-               XLByteLE(page->pd_lsn, RedoRecPtr))
+               XLByteLE(PageGetLSN(page), RedoRecPtr))
        {
                /*
                 * The page needs to be backed up, so set up *bkpb
                 */
-               bkpb->node = BufferGetFileNode(rdata->buffer);
-               bkpb->block = BufferGetBlockNumber(rdata->buffer);
+               BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block);
 
                if (rdata->buffer_std)
                {
                        /* Assume we can omit data between pd_lower and pd_upper */
-                       uint16          lower = page->pd_lower;
-                       uint16          upper = page->pd_upper;
+                       uint16          lower = ((PageHeader) page)->pd_lower;
+                       uint16          upper = ((PageHeader) page)->pd_upper;
 
                        if (lower >= SizeOfPageHeaderData &&
                                upper > lower &&
@@ -1129,7 +1199,7 @@ XLogArchiveNotifySeg(uint32 log, uint32 seg)
  * create <XLOG>.ready fails, we'll retry during subsequent checkpoints.
  */
 static bool
-XLogArchiveCheckDone(const char *xlog, bool create_if_missing)
+XLogArchiveCheckDone(const char *xlog)
 {
        char            archiveStatusPath[MAXPGPATH];
        struct stat stat_buf;
@@ -1154,13 +1224,55 @@ XLogArchiveCheckDone(const char *xlog, bool create_if_missing)
                return true;
 
        /* Retry creation of the .ready file */
-       if (create_if_missing)
-               XLogArchiveNotify(xlog);
-
+       XLogArchiveNotify(xlog);
        return false;
 }
 
 /*
+ * XLogArchiveIsBusy
+ *
+ * Check to see if an XLOG segment file is still unarchived.
+ * This is almost but not quite the inverse of XLogArchiveCheckDone: in
+ * the first place we aren't chartered to recreate the .ready file, and
+ * in the second place we should consider that if the file is already gone
+ * then it's not busy.  (This check is needed to handle the race condition
+ * that a checkpoint already deleted the no-longer-needed file.)
+ */
+static bool
+XLogArchiveIsBusy(const char *xlog)
+{
+       char            archiveStatusPath[MAXPGPATH];
+       struct stat stat_buf;
+
+       /* First check for .done --- this means archiver is done with it */
+       StatusFilePath(archiveStatusPath, xlog, ".done");
+       if (stat(archiveStatusPath, &stat_buf) == 0)
+               return false;
+
+       /* check for .ready --- this means archiver is still busy with it */
+       StatusFilePath(archiveStatusPath, xlog, ".ready");
+       if (stat(archiveStatusPath, &stat_buf) == 0)
+               return true;
+
+       /* Race condition --- maybe archiver just finished, so recheck */
+       StatusFilePath(archiveStatusPath, xlog, ".done");
+       if (stat(archiveStatusPath, &stat_buf) == 0)
+               return false;
+
+       /*
+        * Check to see if the WAL file has been removed by checkpoint,
+        * which implies it has already been archived, and explains why we
+        * can't see a status file for it.
+        */
+       snprintf(archiveStatusPath, MAXPGPATH, XLOGDIR "/%s", xlog);
+       if (stat(archiveStatusPath, &stat_buf) != 0 &&
+               errno == ENOENT)
+               return false;
+
+       return true;
+}
+
+/*
  * XLogArchiveCleanup
  *
  * Cleanup archive notification file(s) for a particular xlog segment
@@ -1261,12 +1373,14 @@ AdvanceXLInsertBuffer(bool new_segment)
                                 * Have to write buffers while holding insert lock. This is
                                 * not good, so only write as much as we absolutely must.
                                 */
+                               TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
                                WriteRqst.Write = OldPageRqstPtr;
                                WriteRqst.Flush.xlogid = 0;
                                WriteRqst.Flush.xrecoff = 0;
                                XLogWrite(WriteRqst, false, false);
                                LWLockRelease(WALWriteLock);
                                Insert->LogwrtResult = LogwrtResult;
+                               TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
                        }
                }
        }
@@ -1601,7 +1715,8 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
                 * have no open file or the wrong one.  However, we do not need to
                 * fsync more than one file.
                 */
-               if (sync_method != SYNC_METHOD_OPEN)
+               if (sync_method != SYNC_METHOD_OPEN &&
+                       sync_method != SYNC_METHOD_OPEN_DSYNC)
                {
                        if (openLogFile >= 0 &&
                                !XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
@@ -1657,6 +1772,63 @@ XLogSetAsyncCommitLSN(XLogRecPtr asyncCommitLSN)
 }
 
 /*
+ * Advance minRecoveryPoint in control file.
+ *
+ * If we crash during recovery, we must reach this point again before the
+ * database is consistent. 
+ * 
+ * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
+ * is only updated if it's not already greater than or equal to 'lsn'.
+ */
+static void
+UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
+{
+       /* Quick check using our local copy of the variable */
+       if (!updateMinRecoveryPoint || (!force && XLByteLE(lsn, minRecoveryPoint)))
+               return;
+
+       LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
+
+       /* update local copy */
+       minRecoveryPoint = ControlFile->minRecoveryPoint;
+
+       /*
+        * An invalid minRecoveryPoint means that we need to recover all the WAL,
+        * ie. crash recovery. Don't update the control file in that case.
+        */
+       if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
+               updateMinRecoveryPoint = false;
+       else if (force || XLByteLT(minRecoveryPoint, lsn))
+       {
+               /* use volatile pointer to prevent code rearrangement */
+               volatile XLogCtlData *xlogctl = XLogCtl;
+               XLogRecPtr newMinRecoveryPoint;
+
+               /*
+                * To avoid having to update the control file too often, we update it
+                * all the way to the last record being replayed, even though 'lsn'
+                * would suffice for correctness.
+                */
+               SpinLockAcquire(&xlogctl->info_lck);
+               newMinRecoveryPoint = xlogctl->replayEndRecPtr;
+               SpinLockRelease(&xlogctl->info_lck);
+
+               /* update control file */
+               if (XLByteLT(ControlFile->minRecoveryPoint, newMinRecoveryPoint))
+               {
+                       ControlFile->minRecoveryPoint = newMinRecoveryPoint;
+                       UpdateControlFile();
+                       minRecoveryPoint = newMinRecoveryPoint;
+
+                       ereport(DEBUG2,
+                                       (errmsg("updated min recovery point to %X/%X",
+                                               minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff)));
+               }
+       }
+       LWLockRelease(ControlFileLock);
+}
+
+/*
  * Ensure that all XLOG data through the given position is flushed to disk.
  *
  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
@@ -1668,9 +1840,15 @@ XLogFlush(XLogRecPtr record)
        XLogRecPtr      WriteRqstPtr;
        XLogwrtRqst WriteRqst;
 
-       /* Disabled during REDO */
-       if (InRedo)
+       /*
+        * During REDO, we don't try to flush the WAL, but update minRecoveryPoint
+        * instead.
+        */
+       if (RecoveryInProgress())
+       {
+               UpdateMinRecoveryPoint(record, false);
                return;
+       }
 
        /* Quick exit if already known flushed */
        if (XLByteLE(record, LogwrtResult.Flush))
@@ -1757,9 +1935,9 @@ XLogFlush(XLogRecPtr record)
         * the bad page is encountered again during recovery then we would be
         * unable to restart the database at all!  (This scenario has actually
         * happened in the field several times with 7.1 releases. Note that we
-        * cannot get here while InRedo is true, but if the bad page is brought in
-        * and marked dirty during recovery then CreateCheckPoint will try to
-        * flush it at the end of recovery.)
+        * cannot get here while RecoveryInProgress(), but if the bad page is
+        * brought in and marked dirty during recovery, then a checkpoint
+        * performed at the end of recovery will try to flush it.)
         *
         * The current approach is to ERROR under normal conditions, but only
         * WARNING during recovery, so that the system can be brought up even if
@@ -1796,6 +1974,10 @@ XLogBackgroundFlush(void)
        XLogRecPtr      WriteRqstPtr;
        bool            flexible = true;
 
+       /* XLOG doesn't need flushing during recovery */
+       if (RecoveryInProgress())
+               return;
+
        /* read LogwrtResult and update local state */
        {
                /* use volatile pointer to prevent code rearrangement */
@@ -1867,6 +2049,10 @@ XLogAsyncCommitFlush(void)
        /* use volatile pointer to prevent code rearrangement */
        volatile XLogCtlData *xlogctl = XLogCtl;
 
+       /* There are no asynchronously committed transactions during recovery */
+       if (RecoveryInProgress())
+               return;
+
        SpinLockAcquire(&xlogctl->info_lck);
        WriteRqstPtr = xlogctl->asyncCommitLSN;
        SpinLockRelease(&xlogctl->info_lck);
@@ -1883,6 +2069,10 @@ XLogAsyncCommitFlush(void)
 bool
 XLogNeedsFlush(XLogRecPtr record)
 {
+       /* XLOG doesn't need flushing during recovery */
+       if (RecoveryInProgress())
+               return false;
+
        /* Quick exit if already known flushed */
        if (XLByteLE(record, LogwrtResult.Flush))
                return false;
@@ -1944,7 +2134,7 @@ XLogFileInit(uint32 log, uint32 seg,
         */
        if (*use_existent)
        {
-               fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
+               fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
                                                   S_IRUSR | S_IWUSR);
                if (fd < 0)
                {
@@ -1970,7 +2160,7 @@ XLogFileInit(uint32 log, uint32 seg,
 
        unlink(tmppath);
 
-       /* do not use XLOG_SYNC_BIT here --- want to fsync only at end of fill */
+       /* do not use get_sync_bit() here --- want to fsync only at end of fill */
        fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
                                           S_IRUSR | S_IWUSR);
        if (fd < 0)
@@ -2048,7 +2238,7 @@ XLogFileInit(uint32 log, uint32 seg,
        *use_existent = false;
 
        /* Now open original target segment (might not be file I just made) */
-       fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
+       fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
                                           S_IRUSR | S_IWUSR);
        if (fd < 0)
                ereport(ERROR,
@@ -2099,7 +2289,7 @@ XLogFileCopy(uint32 log, uint32 seg,
 
        unlink(tmppath);
 
-       /* do not use XLOG_SYNC_BIT here --- want to fsync only at end of fill */
+       /* do not use get_sync_bit() here --- want to fsync only at end of fill */
        fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
                                           S_IRUSR | S_IWUSR);
        if (fd < 0)
@@ -2281,7 +2471,7 @@ XLogFileOpen(uint32 log, uint32 seg)
 
        XLogFilePath(path, ThisTimeLineID, log, seg);
 
-       fd = BasicOpenFile(path, O_RDWR | PG_BINARY | XLOG_SYNC_BIT,
+       fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
                                           S_IRUSR | S_IWUSR);
        if (fd < 0)
                ereport(PANIC,
@@ -2376,29 +2566,17 @@ XLogFileClose(void)
        Assert(openLogFile >= 0);
 
        /*
-        * posix_fadvise is problematic on many platforms: on older x86 Linux it
-        * just dumps core, and there are reports of problems on PPC platforms as
-        * well.  The following is therefore disabled for the time being. We could
-        * consider some kind of configure test to see if it's safe to use, but
-        * since we lack hard evidence that there's any useful performance gain to
-        * be had, spending time on that seems unprofitable for now.
-        */
-#ifdef NOT_USED
-
-       /*
         * WAL segment files will not be re-read in normal operation, so we advise
-        * OS to release any cached pages.      But do not do so if WAL archiving is
-        * active, because archiver process could use the cache to read the WAL
-        * segment.
-        *
-        * While O_DIRECT works for O_SYNC, posix_fadvise() works for fsync() and
-        * O_SYNC, and some platforms only have posix_fadvise().
-        */
-#if defined(HAVE_DECL_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
-       if (!XLogArchivingActive())
-               posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
+        * the OS to release any cached pages.  But do not do so if WAL archiving
+        * is active, because archiver process could use the cache to read the WAL
+        * segment.  Also, don't bother with it if we are using O_DIRECT, since
+        * the kernel is presumably not caching in that case.
+        */
+#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
+       if (!XLogArchivingActive() &&
+               (get_sync_bit(sync_method) & PG_O_DIRECT) == 0)
+               (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
 #endif
-#endif   /* NOT_USED */
 
        if (close(openLogFile))
                ereport(PANIC,
@@ -2484,6 +2662,35 @@ RestoreArchivedFile(char *path, const char *xlogfname,
        }
 
        /*
+        * Calculate the archive file cutoff point for use during log shipping
+        * replication. All files earlier than this point can be deleted
+        * from the archive, though there is no requirement to do so.
+        *
+        * We initialise this with the filename of an InvalidXLogRecPtr, which
+        * will prevent the deletion of any WAL files from the archive
+        * because of the alphabetic sorting property of WAL filenames.
+        *
+        * Once we have successfully located the redo pointer of the checkpoint
+        * from which we start recovery we never request a file prior to the redo
+        * pointer of the last restartpoint. When redo begins we know that we
+        * have successfully located it, so there is no need for additional
+        * status flags to signify the point when we can begin deleting WAL files
+        * from the archive.
+        */
+       if (InRedo)
+       {
+               XLByteToSeg(ControlFile->checkPointCopy.redo,
+                                       restartLog, restartSeg);
+               XLogFileName(lastRestartPointFname,
+                                        ControlFile->checkPointCopy.ThisTimeLineID,
+                                        restartLog, restartSeg);
+               /* we shouldn't need anything earlier than last restart point */
+               Assert(strcmp(lastRestartPointFname, xlogfname) <= 0);
+       }
+       else
+               XLogFileName(lastRestartPointFname, 0, 0, 0);
+
+       /*
         * construct the command to be executed
         */
        dp = xlogRestoreCmd;
@@ -2512,11 +2719,6 @@ RestoreArchivedFile(char *path, const char *xlogfname,
                                case 'r':
                                        /* %r: filename of last restartpoint */
                                        sp++;
-                                       XLByteToSeg(ControlFile->checkPointCopy.redo,
-                                                               restartLog, restartSeg);
-                                       XLogFileName(lastRestartPointFname,
-                                                                ControlFile->checkPointCopy.ThisTimeLineID,
-                                                                restartLog, restartSeg);
                                        StrNCpy(dp, lastRestartPointFname, endp - dp);
                                        dp += strlen(dp);
                                        break;
@@ -2546,9 +2748,22 @@ RestoreArchivedFile(char *path, const char *xlogfname,
                                                         xlogRestoreCmd)));
 
        /*
+        * Set in_restore_command to tell the signal handler that we should exit
+        * right away on SIGTERM. We know that we're at a safe point to do that.
+        * Check if we had already received the signal, so that we don't miss a
+        * shutdown request received just before this.
+        */
+       in_restore_command = true;
+       if (shutdown_requested)
+               proc_exit(1);
+
+       /*
         * Copy xlog from archival storage to XLOGDIR
         */
        rc = system(xlogRestoreCmd);
+
+       in_restore_command = false;
+
        if (rc == 0)
        {
                /*
@@ -2601,14 +2816,24 @@ RestoreArchivedFile(char *path, const char *xlogfname,
         * assume that recovery is complete and start up the database!) It's
         * essential to abort on child SIGINT and SIGQUIT, because per spec
         * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
-        * those it's a good bet we should have gotten it too.  Aborting on other
-        * signals such as SIGTERM seems a good idea as well.
+        * those it's a good bet we should have gotten it too.
+        *
+        * On SIGTERM, assume we have received a fast shutdown request, and exit
+        * cleanly. It's pure chance whether we receive the SIGTERM first, or the
+        * child process. If we receive it first, the signal handler will call
+        * proc_exit, otherwise we do it here. If we or the child process
+        * received SIGTERM for any other reason than a fast shutdown request,
+        * postmaster will perform an immediate shutdown when it sees us exiting
+        * unexpectedly.
         *
         * Per the Single Unix Spec, shells report exit status > 128 when a called
         * command died on a signal.  Also, 126 and 127 are used to report
         * problems such as an unfindable command; treat those as fatal errors
         * too.
         */
+       if (WTERMSIG(rc) == SIGTERM)
+               proc_exit(1);
+
        signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
 
        ereport(signaled ? FATAL : DEBUG2,
@@ -2627,6 +2852,114 @@ RestoreArchivedFile(char *path, const char *xlogfname,
 }
 
 /*
+ * Attempt to execute the recovery_end_command.
+ */
+static void
+ExecuteRecoveryEndCommand(void)
+{
+       char            xlogRecoveryEndCmd[MAXPGPATH];
+       char            lastRestartPointFname[MAXPGPATH];
+       char       *dp;
+       char       *endp;
+       const char *sp;
+       int                     rc;
+       bool            signaled;
+       uint32          restartLog;
+       uint32          restartSeg;
+
+       Assert(recoveryEndCommand);
+
+       /*
+        * Calculate the archive file cutoff point for use during log shipping
+        * replication. All files earlier than this point can be deleted
+        * from the archive, though there is no requirement to do so.
+        *
+        * We initialise this with the filename of an InvalidXLogRecPtr, which
+        * will prevent the deletion of any WAL files from the archive
+        * because of the alphabetic sorting property of WAL filenames. 
+        *
+        * Once we have successfully located the redo pointer of the checkpoint
+        * from which we start recovery we never request a file prior to the redo
+        * pointer of the last restartpoint. When redo begins we know that we
+        * have successfully located it, so there is no need for additional
+        * status flags to signify the point when we can begin deleting WAL files
+        * from the archive. 
+        */
+       if (InRedo)
+       {
+               XLByteToSeg(ControlFile->checkPointCopy.redo,
+                                       restartLog, restartSeg);
+               XLogFileName(lastRestartPointFname,
+                                        ControlFile->checkPointCopy.ThisTimeLineID,
+                                        restartLog, restartSeg);
+       }
+       else
+               XLogFileName(lastRestartPointFname, 0, 0, 0);
+
+       /*
+        * construct the command to be executed
+        */
+       dp = xlogRecoveryEndCmd;
+       endp = xlogRecoveryEndCmd + MAXPGPATH - 1;
+       *endp = '\0';
+
+       for (sp = recoveryEndCommand; *sp; sp++)
+       {
+               if (*sp == '%')
+               {
+                       switch (sp[1])
+                       {
+                               case 'r':
+                                       /* %r: filename of last restartpoint */
+                                       sp++;
+                                       StrNCpy(dp, lastRestartPointFname, endp - dp);
+                                       dp += strlen(dp);
+                                       break;
+                               case '%':
+                                       /* convert %% to a single % */
+                                       sp++;
+                                       if (dp < endp)
+                                               *dp++ = *sp;
+                                       break;
+                               default:
+                                       /* otherwise treat the % as not special */
+                                       if (dp < endp)
+                                               *dp++ = *sp;
+                                       break;
+                       }
+               }
+               else
+               {
+                       if (dp < endp)
+                               *dp++ = *sp;
+               }
+       }
+       *dp = '\0';
+
+       ereport(DEBUG3,
+                       (errmsg_internal("executing recovery end command \"%s\"",
+                                                        xlogRecoveryEndCmd)));
+
+       /*
+        * Execute the constructed recovery end command
+        */
+       rc = system(xlogRecoveryEndCmd);
+       if (rc != 0)
+       {
+               /*
+                * If the failure was due to any sort of signal, it's best to punt and
+                * abort recovery. See also detailed comments on signals in 
+                * RestoreArchivedFile().
+                */
+               signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
+
+               ereport(signaled ? FATAL : WARNING,
+                               (errmsg("recovery_end_command \"%s\": return code %d",
+                                                               xlogRecoveryEndCmd, rc)));
+       }
+}
+
+/*
  * Preallocate log files beyond the specified log endpoint.
  *
  * XXX this is currently extremely conservative, since it forces only one
@@ -2707,7 +3040,7 @@ RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr)
                        strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
                        strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
                {
-                       if (XLogArchiveCheckDone(xlde->d_name, true))
+                       if (XLogArchiveCheckDone(xlde->d_name))
                        {
                                snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
 
@@ -2749,6 +3082,53 @@ RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr)
 }
 
 /*
+ * Verify whether pg_xlog and pg_xlog/archive_status exist.
+ * If the latter does not exist, recreate it.
+ *
+ * It is not the goal of this function to verify the contents of these
+ * directories, but to help in cases where someone has performed a cluster
+ * copy for PITR purposes but omitted pg_xlog from the copy.
+ *
+ * We could also recreate pg_xlog if it doesn't exist, but a deliberate
+ * policy decision was made not to.  It is fairly common for pg_xlog to be
+ * a symlink, and if that was the DBA's intent then automatically making a
+ * plain directory would result in degraded performance with no notice.
+ */
+static void
+ValidateXLOGDirectoryStructure(void)
+{
+       char            path[MAXPGPATH];
+       struct stat     stat_buf;
+
+       /* pg_xlog must exist and be a directory; it is never created here */
+       if (stat(XLOGDIR, &stat_buf) != 0 ||
+               !S_ISDIR(stat_buf.st_mode))
+               ereport(FATAL, 
+                               (errmsg("required WAL directory \"%s\" does not exist",
+                                               XLOGDIR)));
+
+       /* Check for archive_status; any stat() failure is treated as missing */
+       snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
+       if (stat(path, &stat_buf) == 0)
+       {
+               /* It exists but isn't a directory: fail with the same message as above */
+               if (!S_ISDIR(stat_buf.st_mode))
+                       ereport(FATAL, 
+                                       (errmsg("required WAL directory \"%s\" does not exist",
+                                                       path)));
+       }
+       else
+       {
+               ereport(LOG,
+                               (errmsg("creating missing WAL directory \"%s\"", path)));
+               if (mkdir(path, 0700) < 0)
+                       ereport(FATAL, 
+                                       (errmsg("could not create missing directory \"%s\": %m",
+                                                       path)));
+       }
+}
+
+/*
  * Remove previous backup history files.  This also retries creation of
  * .ready files for any backup history files for which XLogArchiveNotify
  * failed earlier.
@@ -2774,7 +3154,7 @@ CleanupBackupHistory(void)
                        strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
                                   ".backup") == 0)
                {
-                       if (XLogArchiveCheckDone(xlde->d_name, true))
+                       if (XLogArchiveCheckDone(xlde->d_name))
                        {
                                ereport(DEBUG2,
                                (errmsg("removing transaction log backup history file \"%s\"",
@@ -2802,17 +3182,25 @@ CleanupBackupHistory(void)
  * page might not be.  This will force us to replay all subsequent
  * modifications of the page that appear in XLOG, rather than possibly
  * ignoring them as already applied, but that's not a huge drawback.
+ *
+ * If 'cleanup' is true, a cleanup lock is used when restoring blocks.
+ * Otherwise, a normal exclusive lock is used.  At the moment, that's just
+ * pro forma, because there can't be any regular backends in the system
+ * during recovery.  The 'cleanup' argument applies to all backup blocks
+ * in the WAL record; that suffices for now.
  */
-static void
-RestoreBkpBlocks(XLogRecord *record, XLogRecPtr lsn)
+void
+RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup)
 {
-       Relation        reln;
        Buffer          buffer;
        Page            page;
        BkpBlock        bkpb;
        char       *blk;
        int                     i;
 
+       if (!(record->xl_info & XLR_BKP_BLOCK_MASK))
+               return;
+
        blk = (char *) XLogRecGetData(record) + record->xl_len;
        for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
        {
@@ -2822,9 +3210,14 @@ RestoreBkpBlocks(XLogRecord *record, XLogRecPtr lsn)
                memcpy(&bkpb, blk, sizeof(BkpBlock));
                blk += sizeof(BkpBlock);
 
-               reln = XLogOpenRelation(bkpb.node);
-               buffer = XLogReadBuffer(reln, bkpb.block, true);
+               buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
+                                                                               RBM_ZERO);
                Assert(BufferIsValid(buffer));
+               if (cleanup)
+                       LockBufferForCleanup(buffer);
+               else
+                       LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+
                page = (Page) BufferGetPage(buffer);
 
                if (bkpb.hole_length == 0)
@@ -3294,8 +3687,11 @@ got_record:;
        return (XLogRecord *) buffer;
 
 next_record_is_invalid:;
-       close(readFile);
-       readFile = -1;
+       if (readFile >= 0)
+       {
+               close(readFile);
+               readFile = -1;
+       }
        nextRecord = NULL;
        return NULL;
 }
@@ -3610,7 +4006,7 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
 
        unlink(tmppath);
 
-       /* do not use XLOG_SYNC_BIT here --- want to fsync only at end of fill */
+       /* do not use get_sync_bit() here --- want to fsync only at end of fill */
        fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL,
                                           S_IRUSR | S_IWUSR);
        if (fd < 0)
@@ -3769,7 +4165,6 @@ WriteControlFile(void)
 {
        int                     fd;
        char            buffer[PG_CONTROL_SIZE];                /* need not be aligned */
-       char       *localeptr;
 
        /*
         * Initialize version and compatibility-check fields
@@ -3798,18 +4193,6 @@ WriteControlFile(void)
        ControlFile->float4ByVal = FLOAT4PASSBYVAL;
        ControlFile->float8ByVal = FLOAT8PASSBYVAL;
 
-       ControlFile->localeBuflen = LOCALE_NAME_BUFLEN;
-       localeptr = setlocale(LC_COLLATE, NULL);
-       if (!localeptr)
-               ereport(PANIC,
-                               (errmsg("invalid LC_COLLATE setting")));
-       StrNCpy(ControlFile->lc_collate, localeptr, LOCALE_NAME_BUFLEN);
-       localeptr = setlocale(LC_CTYPE, NULL);
-       if (!localeptr)
-               ereport(PANIC,
-                               (errmsg("invalid LC_CTYPE setting")));
-       StrNCpy(ControlFile->lc_ctype, localeptr, LOCALE_NAME_BUFLEN);
-
        /* Contents are protected with a CRC */
        INIT_CRC32(ControlFile->crc);
        COMP_CRC32(ControlFile->crc,
@@ -3922,15 +4305,9 @@ ReadControlFile(void)
                                (errmsg("incorrect checksum in control file")));
 
        /*
-        * Do compatibility checking immediately.  We do this here for 2 reasons:
-        *
-        * (1) if the database isn't compatible with the backend executable, we
-        * want to abort before we can possibly do any damage;
-        *
-        * (2) this code is executed in the postmaster, so the setlocale() will
-        * propagate to forked backends, which aren't going to read this file for
-        * themselves.  (These locale settings are considered critical
-        * compatibility items because they can affect sort order of indexes.)
+        * Do compatibility checking immediately.  If the database isn't
+        * compatible with the backend executable, we want to abort before we
+        * can possibly do any damage.
         */
        if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
                ereport(FATAL,
@@ -4048,34 +4425,6 @@ ReadControlFile(void)
                                                   " but the server was compiled without USE_FLOAT8_BYVAL."),
                                 errhint("It looks like you need to recompile or initdb.")));
 #endif
-
-       if (ControlFile->localeBuflen != LOCALE_NAME_BUFLEN)
-               ereport(FATAL,
-                               (errmsg("database files are incompatible with server"),
-                                errdetail("The database cluster was initialized with LOCALE_NAME_BUFLEN %d,"
-                                 " but the server was compiled with LOCALE_NAME_BUFLEN %d.",
-                                                  ControlFile->localeBuflen, LOCALE_NAME_BUFLEN),
-                                errhint("It looks like you need to recompile or initdb.")));
-       if (pg_perm_setlocale(LC_COLLATE, ControlFile->lc_collate) == NULL)
-               ereport(FATAL,
-                       (errmsg("database files are incompatible with operating system"),
-                        errdetail("The database cluster was initialized with LC_COLLATE \"%s\","
-                                          " which is not recognized by setlocale().",
-                                          ControlFile->lc_collate),
-                        errhint("It looks like you need to initdb or install locale support.")));
-       if (pg_perm_setlocale(LC_CTYPE, ControlFile->lc_ctype) == NULL)
-               ereport(FATAL,
-                       (errmsg("database files are incompatible with operating system"),
-               errdetail("The database cluster was initialized with LC_CTYPE \"%s\","
-                                 " which is not recognized by setlocale().",
-                                 ControlFile->lc_ctype),
-                        errhint("It looks like you need to initdb or install locale support.")));
-
-       /* Make the fixed locale settings visible as GUC variables, too */
-       SetConfigOption("lc_collate", ControlFile->lc_collate,
-                                       PGC_INTERNAL, PGC_S_OVERRIDE);
-       SetConfigOption("lc_ctype", ControlFile->lc_ctype,
-                                       PGC_INTERNAL, PGC_S_OVERRIDE);
 }
 
 void
@@ -4425,6 +4774,13 @@ readRecoveryCommandFile(void)
                                        (errmsg("restore_command = '%s'",
                                                        recoveryRestoreCommand)));
                }
+               else if (strcmp(tok1, "recovery_end_command") == 0)
+               {
+                       recoveryEndCommand = pstrdup(tok2);
+                       ereport(LOG,
+                                       (errmsg("recovery_end_command = '%s'",
+                                                       recoveryEndCommand)));
+               }
                else if (strcmp(tok1, "recovery_target_timeline") == 0)
                {
                        rtliGiven = true;
@@ -4488,31 +4844,13 @@ readRecoveryCommandFile(void)
                        /*
                         * does nothing if a recovery_target is not also set
                         */
-                       if (strcmp(tok2, "true") == 0)
-                               recoveryTargetInclusive = true;
-                       else
-                       {
-                               recoveryTargetInclusive = false;
-                               tok2 = "false";
-                       }
+                       if (!parse_bool(tok2, &recoveryTargetInclusive))
+                                 ereport(ERROR,
+                                                       (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                                         errmsg("parameter \"recovery_target_inclusive\" requires a Boolean value")));
                        ereport(LOG,
                                        (errmsg("recovery_target_inclusive = %s", tok2)));
                }
-               else if (strcmp(tok1, "log_restartpoints") == 0)
-               {
-                       /*
-                        * does nothing if a recovery_target is not also set
-                        */
-                       if (strcmp(tok2, "true") == 0)
-                               recoveryLogRestartpoints = true;
-                       else
-                       {
-                               recoveryLogRestartpoints = false;
-                               tok2 = "false";
-                       }
-                       ereport(LOG,
-                                       (errmsg("log_restartpoints = %s", tok2)));
-               }
                else
                        ereport(FATAL,
                                        (errmsg("unrecognized recovery parameter \"%s\"",
@@ -4629,10 +4967,22 @@ exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
                 * If we are establishing a new timeline, we have to copy data from
                 * the last WAL segment of the old timeline to create a starting WAL
                 * segment for the new timeline.
+                *
+                * Notify the archiver that the last WAL segment of the old timeline
+                * is ready to copy to archival storage. Otherwise, it is not archived
+                * for a while.
                 */
                if (endTLI != ThisTimeLineID)
+               {
                        XLogFileCopy(endLogId, endLogSeg,
                                                 endTLI, endLogId, endLogSeg);
+
+                       if (XLogArchivingActive())
+                       {
+                               XLogFileName(xlogpath, endTLI, endLogId, endLogSeg);
+                               XLogArchiveNotify(xlogpath);
+                       }
+               }
        }
 
        /*
@@ -4667,6 +5017,9 @@ exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
  *
  * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
  * *includeThis is set TRUE if we should apply this record before stopping.
+ *
+ * We also track the timestamp of the latest applied COMMIT/ABORT record
+ * in recoveryLastXTime, for logging purposes.
  * Also, some information is saved in recoveryStopXid et al for use in
  * annotating the new timeline's history file.
  */
@@ -4698,12 +5051,12 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis)
        else
                return false;
 
-       /* Remember the most recent COMMIT/ABORT time for logging purposes */
-       recoveryLastXTime = recordXtime;
-
        /* Do we have a PITR target at all? */
        if (!recoveryTarget)
+       {
+               recoveryLastXTime = recordXtime;
                return false;
+       }
 
        if (recoveryTargetExact)
        {
@@ -4767,7 +5120,12 @@ recoveryStopsHere(XLogRecord *record, bool *includeThis)
                                                                recoveryStopXid,
                                                                timestamptz_to_str(recoveryStopTime))));
                }
+
+               if (recoveryStopAfter)
+                       recoveryLastXTime = recordXtime;
        }
+       else
+               recoveryLastXTime = recordXtime;
 
        return stopsHere;
 }
@@ -4786,7 +5144,7 @@ StartupXLOG(void)
        XLogRecPtr      RecPtr,
                                LastRec,
                                checkPointLoc,
-                               minRecoveryLoc,
+                               backupStopLoc,
                                EndOfLog;
        uint32          endLogId;
        uint32          endLogSeg;
@@ -4794,6 +5152,8 @@ StartupXLOG(void)
        uint32          freespace;
        TransactionId oldestActiveXID;
 
+       XLogCtl->SharedRecoveryInProgress = true;
+
        /*
         * Read control file and check XLOG status looks valid.
         *
@@ -4840,6 +5200,13 @@ StartupXLOG(void)
 #endif
 
        /*
+        * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
+        * someone has performed a copy for PITR, archive_status may have been
+        * excluded and is re-created; a missing pg_xlog is a fatal error.
+        */
+       ValidateXLOGDirectoryStructure();
+
+       /*
         * Initialize on the assumption we want to recover to the same timeline
         * that's active according to pg_control.
         */
@@ -4866,7 +5233,7 @@ StartupXLOG(void)
                                                recoveryTargetTLI,
                                                ControlFile->checkPointCopy.ThisTimeLineID)));
 
-       if (read_backup_label(&checkPointLoc, &minRecoveryLoc))
+       if (read_backup_label(&checkPointLoc, &backupStopLoc))
        {
                /*
                 * When a backup_label file is present, we want to roll forward from
@@ -5004,11 +5371,23 @@ StartupXLOG(void)
                ControlFile->prevCheckPoint = ControlFile->checkPoint;
                ControlFile->checkPoint = checkPointLoc;
                ControlFile->checkPointCopy = checkPoint;
-               if (minRecoveryLoc.xlogid != 0 || minRecoveryLoc.xrecoff != 0)
-                       ControlFile->minRecoveryPoint = minRecoveryLoc;
+               if (backupStopLoc.xlogid != 0 || backupStopLoc.xrecoff != 0)
+               {
+                       if (XLByteLT(ControlFile->minRecoveryPoint, backupStopLoc))
+                               ControlFile->minRecoveryPoint = backupStopLoc;
+               }
                ControlFile->time = (pg_time_t) time(NULL);
+               /* No need to hold ControlFileLock yet, we aren't up far enough */
                UpdateControlFile();
 
+               /* update our local copy of minRecoveryPoint */
+               minRecoveryPoint = ControlFile->minRecoveryPoint;
+
+               /*
+                * Reset pgstat data, because it may be invalid after recovery.
+                */
+               pgstat_reset_all();
+
                /*
                 * If there was a backup label file, it's done its job and the info
                 * has now been propagated into pg_control.  We must get rid of the
@@ -5027,9 +5406,7 @@ StartupXLOG(void)
                                                                BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
                }
 
-               /* Start up the recovery environment */
-               XLogInitRelationCache();
-
+               /* Initialize resource managers */
                for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
                {
                        if (RmgrTable[rmid].rm_startup != NULL)
@@ -5055,12 +5432,41 @@ StartupXLOG(void)
                {
                        bool            recoveryContinue = true;
                        bool            recoveryApply = true;
+                       bool            reachedMinRecoveryPoint = false;
                        ErrorContextCallback errcontext;
+                       /* use volatile pointer to prevent code rearrangement */
+                       volatile XLogCtlData *xlogctl = XLogCtl;
+
+                       /* Update shared replayEndRecPtr */
+                       SpinLockAcquire(&xlogctl->info_lck);
+                       xlogctl->replayEndRecPtr = ReadRecPtr;
+                       SpinLockRelease(&xlogctl->info_lck);
 
                        InRedo = true;
-                       ereport(LOG,
-                                       (errmsg("redo starts at %X/%X",
-                                                       ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
+
+                       if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
+                               ereport(LOG,
+                                               (errmsg("redo starts at %X/%X",
+                                                               ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
+                       else
+                               ereport(LOG,
+                                               (errmsg("redo starts at %X/%X, consistency will be reached at %X/%X",
+                                               ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
+                                               minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff)));
+
+                       /*
+                        * Let postmaster know we've started redo now, so that it can
+                        * launch bgwriter to perform restartpoints.  We don't bother
+                        * during crash recovery as restartpoints can only be performed
+                        * during archive recovery.  And we'd like to keep crash recovery
+                        * simple, to avoid introducing bugs that could prevent you from
+                        * recovering after crash.
+                        *
+                        * After this point, we can no longer assume that we're the only
+                        * process in addition to postmaster!
+                        */
+                       if (InArchiveRecovery && IsUnderPostmaster)
+                               SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
 
                        /*
                         * main redo apply loop
@@ -5087,6 +5493,39 @@ StartupXLOG(void)
 #endif
 
                                /*
+                                * Check if we were requested to re-read config file.
+                                */
+                               if (got_SIGHUP)
+                               {
+                                       got_SIGHUP = false;
+                                       ProcessConfigFile(PGC_SIGHUP);
+                               }
+
+                               /*
+                                * Check if we were requested to exit without finishing
+                                * recovery.
+                                */
+                               if (shutdown_requested)
+                                       proc_exit(1);
+
+                               /*
+                                * Have we reached our safe starting point? If so, we can
+                                * tell postmaster that the database is consistent now.
+                                */
+                               if (!reachedMinRecoveryPoint && 
+                                        XLByteLE(minRecoveryPoint, EndRecPtr))
+                               {
+                                       reachedMinRecoveryPoint = true;
+                                       if (InArchiveRecovery)
+                                       {
+                                               ereport(LOG,
+                                                               (errmsg("consistent recovery state reached")));
+                                               if (IsUnderPostmaster)
+                                                       SendPostmasterSignal(PMSIGNAL_RECOVERY_CONSISTENT);
+                                       }
+                               }
+
+                               /*
                                 * Have we reached our recovery target?
                                 */
                                if (recoveryStopsHere(record, &recoveryApply))
@@ -5111,8 +5550,14 @@ StartupXLOG(void)
                                        TransactionIdAdvance(ShmemVariableCache->nextXid);
                                }
 
-                               if (record->xl_info & XLR_BKP_BLOCK_MASK)
-                                       RestoreBkpBlocks(record, EndRecPtr);
+                               /*
+                                * Update shared replayEndRecPtr before replaying this
+                                * record, so that XLogFlush will update minRecoveryPoint
+                                * correctly.
+                                */
+                               SpinLockAcquire(&xlogctl->info_lck);
+                               xlogctl->replayEndRecPtr = EndRecPtr;
+                               SpinLockRelease(&xlogctl->info_lck);
 
                                RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
 
@@ -5157,14 +5602,14 @@ StartupXLOG(void)
         * Complain if we did not roll forward far enough to render the backup
         * dump consistent.
         */
-       if (XLByteLT(EndOfLog, ControlFile->minRecoveryPoint))
+       if (InRecovery && XLByteLT(EndOfLog, minRecoveryPoint))
        {
                if (reachedStopPoint)   /* stopped because of stop request */
                        ereport(FATAL,
-                                       (errmsg("requested recovery stop point is before end time of backup dump")));
+                                       (errmsg("requested recovery stop point is before consistent recovery point")));
                else    /* ran off end of WAL */
                        ereport(FATAL,
-                                       (errmsg("WAL ends before end time of backup dump")));
+                                       (errmsg("WAL ends before consistent recovery point")));
        }
 
        /*
@@ -5259,6 +5704,12 @@ StartupXLOG(void)
        /* Pre-scan prepared transactions to find out the range of XIDs present */
        oldestActiveXID = PrescanPreparedTransactions();
 
+       /*
+        * Allow writing WAL for us, so that we can create a checkpoint record.
+        * But not yet for other backends!
+        */
+       LocalRecoveryInProgress = false;
+
        if (InRecovery)
        {
                int                     rmid;
@@ -5279,11 +5730,6 @@ StartupXLOG(void)
                XLogCheckInvalidPages();
 
                /*
-                * Reset pgstat data, because it may be invalid after recovery.
-                */
-               pgstat_reset_all();
-
-               /*
                 * Perform a checkpoint to update all our recovery activity to disk.
                 *
                 * Note that we write a shutdown checkpoint rather than an on-line
@@ -5294,10 +5740,8 @@ StartupXLOG(void)
                 */
                CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
 
-               /*
-                * Close down recovery environment
-                */
-               XLogCloseRelationCache();
+               if (recoveryEndCommand)
+                       ExecuteRecoveryEndCommand();
        }
 
        /*
@@ -5310,12 +5754,14 @@ StartupXLOG(void)
         */
        InRecovery = false;
 
+       LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
        ControlFile->state = DB_IN_PRODUCTION;
        ControlFile->time = (pg_time_t) time(NULL);
        UpdateControlFile();
+       LWLockRelease(ControlFileLock);
 
        /* start the archive_timeout timer running */
-       XLogCtl->Write.lastSegSwitchTime = ControlFile->time;
+       XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
 
        /* initialize shared-memory copy of latest checkpoint XID/epoch */
        XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
@@ -5350,6 +5796,45 @@ StartupXLOG(void)
                readRecordBuf = NULL;
                readRecordBufSize = 0;
        }
+
+       /*
+        * All done. Allow others to write WAL.
+        */
+       XLogCtl->SharedRecoveryInProgress = false;
+}
+
+/*
+ * Is the system still in recovery?
+ *
+ * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
+ * variables the first time we see that recovery is finished.
+ */
+bool
+RecoveryInProgress(void)
+{
+       /*
+        * Fast path: once the local flag is cleared we stop consulting shared
+        * state, since recovery can't be re-entered.  Until then the shared
+        * flag is re-read on every call.
+        */
+       if (!LocalRecoveryInProgress)
+               return false;
+       else
+       {
+               /* volatile pointer prevents code rearrangement; no lock is taken */
+               volatile XLogCtlData *xlogctl = XLogCtl;
+
+               LocalRecoveryInProgress = xlogctl->SharedRecoveryInProgress;
+
+               /*
+                * InitXLOGAccess() sets up ThisTimeLineID and RedoRecPtr; call it
+                * on the call that first observes the end of recovery.
+                */
+               if (!LocalRecoveryInProgress)
+                       InitXLOGAccess();
+
+               return LocalRecoveryInProgress;
+       }
 }
 
 /*
@@ -5481,6 +5966,8 @@ InitXLOGAccess(void)
 {
        /* ThisTimeLineID doesn't change so we need no lock to copy it */
        ThisTimeLineID = XLogCtl->ThisTimeLineID;
+       Assert(ThisTimeLineID != 0);
+
        /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
        (void) GetRedoRecPtr();
 }
@@ -5592,7 +6079,10 @@ ShutdownXLOG(int code, Datum arg)
        ereport(LOG,
                        (errmsg("shutting down")));
 
-       CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
+       if (RecoveryInProgress())
+               CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
+       else
+               CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
        ShutdownCLOG();
        ShutdownSUBTRANS();
        ShutdownMultiXact();
@@ -5605,9 +6095,20 @@ ShutdownXLOG(int code, Datum arg)
  * Log start of a checkpoint.
  */
 static void
-LogCheckpointStart(int flags)
+LogCheckpointStart(int flags, bool restartpoint)
 {
-       elog(LOG, "checkpoint starting:%s%s%s%s%s%s",
+       char *msg;
+
+       /*
+        * XXX: This is hopelessly untranslatable. We could call gettext_noop
+        * for the main message, but what about all the flags?
+        */
+       if (restartpoint)
+               msg = "restartpoint starting:%s%s%s%s%s%s";
+       else
+               msg = "checkpoint starting:%s%s%s%s%s%s";
+
+       elog(LOG, msg,
                 (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
                 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
                 (flags & CHECKPOINT_FORCE) ? " force" : "",
@@ -5620,7 +6121,7 @@ LogCheckpointStart(int flags)
  * Log end of a checkpoint.
  */
 static void
-LogCheckpointEnd(void)
+LogCheckpointEnd(bool restartpoint)
 {
        long            write_secs,
                                sync_secs,
@@ -5643,17 +6144,26 @@ LogCheckpointEnd(void)
                                                CheckpointStats.ckpt_sync_end_t,
                                                &sync_secs, &sync_usecs);
 
-       elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
-                "%d transaction log file(s) added, %d removed, %d recycled; "
-                "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
-                CheckpointStats.ckpt_bufs_written,
-                (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
-                CheckpointStats.ckpt_segs_added,
-                CheckpointStats.ckpt_segs_removed,
-                CheckpointStats.ckpt_segs_recycled,
-                write_secs, write_usecs / 1000,
-                sync_secs, sync_usecs / 1000,
-                total_secs, total_usecs / 1000);
+       if (restartpoint)
+               elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
+                        "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
+                        CheckpointStats.ckpt_bufs_written,
+                        (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
+                        write_secs, write_usecs / 1000,
+                        sync_secs, sync_usecs / 1000,
+                        total_secs, total_usecs / 1000);
+       else
+               elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
+                        "%d transaction log file(s) added, %d removed, %d recycled; "
+                        "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
+                        CheckpointStats.ckpt_bufs_written,
+                        (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
+                        CheckpointStats.ckpt_segs_added,
+                        CheckpointStats.ckpt_segs_removed,
+                        CheckpointStats.ckpt_segs_recycled,
+                        write_secs, write_usecs / 1000,
+                        sync_secs, sync_usecs / 1000,
+                        total_secs, total_usecs / 1000);
 }
 
 /*
@@ -5684,13 +6194,33 @@ CreateCheckPoint(int flags)
        TransactionId *inCommitXids;
        int                     nInCommit;
 
+       /* shouldn't happen */
+       if (RecoveryInProgress())
+               elog(ERROR, "can't create a checkpoint during recovery");
+
        /*
         * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
-        * (This is just pro forma, since in the present system structure there is
-        * only one process that is allowed to issue checkpoints at any given
-        * time.)
+        * During normal operation, bgwriter is the only process that creates
+        * checkpoints, but at the end of archive recovery, the bgwriter can be
+        * busy creating a restartpoint while the startup process tries to perform
+        * the startup checkpoint.
         */
-       LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+       if (!LWLockConditionalAcquire(CheckpointLock, LW_EXCLUSIVE))
+       {
+               Assert(InRecovery);
+
+               /*
+                * A restartpoint is in progress. Wait until it finishes. This can
+                * cause an extra restartpoint to be performed, but that's OK because
+                * we're just about to perform a checkpoint anyway. Flushing the
+                * buffers in this restartpoint can take some time, but that time is
+                * saved from the upcoming checkpoint so the net effect is zero.
+                */
+               ereport(DEBUG2, (errmsg("hurrying in-progress restartpoint")));
+               RequestCheckpoint(CHECKPOINT_IMMEDIATE | CHECKPOINT_WAIT);
+
+               LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+       }
 
        /*
         * Prepare to accumulate statistics.
@@ -5709,9 +6239,11 @@ CreateCheckPoint(int flags)
 
        if (shutdown)
        {
+               LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
                ControlFile->state = DB_SHUTDOWNING;
                ControlFile->time = (pg_time_t) time(NULL);
                UpdateControlFile();
+               LWLockRelease(ControlFileLock);
        }
 
        /*
@@ -5815,7 +6347,9 @@ CreateCheckPoint(int flags)
         * to log anything if we decided to skip the checkpoint.
         */
        if (log_checkpoints)
-               LogCheckpointStart(flags);
+               LogCheckpointStart(flags, false);
+
+       TRACE_POSTGRESQL_CHECKPOINT_START(flags);
 
        /*
         * Before flushing data, we must wait for any transactions that are
@@ -5980,7 +6514,13 @@ CreateCheckPoint(int flags)
 
        /* All real work is done, but log before releasing lock. */
        if (log_checkpoints)
-               LogCheckpointEnd();
+               LogCheckpointEnd(false);
+
+       TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
+                                                                        NBuffers,
+                                                                        CheckpointStats.ckpt_segs_added,
+                                                                        CheckpointStats.ckpt_segs_removed,
+                                                                        CheckpointStats.ckpt_segs_recycled);
 
        LWLockRelease(CheckpointLock);
 }
@@ -6003,32 +6543,17 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
 }
 
 /*
- * Set a recovery restart point if appropriate
- *
- * This is similar to CreateCheckPoint, but is used during WAL recovery
- * to establish a point from which recovery can roll forward without
- * replaying the entire recovery log.  This function is called each time
- * a checkpoint record is read from XLOG; it must determine whether a
- * restartpoint is needed or not.
+ * This is used during WAL recovery to establish a point from which recovery
+ * can roll forward without replaying the entire recovery log.  This function
+ * is called each time a checkpoint record is read from XLOG. The record is
+ * stored in shared memory, so that it can be used as a restartpoint later on.
  */
 static void
 RecoveryRestartPoint(const CheckPoint *checkPoint)
 {
-       int                     elapsed_secs;
        int                     rmid;
-
-       /*
-        * Do nothing if the elapsed time since the last restartpoint is less than
-        * half of checkpoint_timeout.  (We use a value less than
-        * checkpoint_timeout so that variations in the timing of checkpoints on
-        * the master, or speed of transmission of WAL segments to a slave, won't
-        * make the slave skip a restartpoint once it's synced with the master.)
-        * Checking true elapsed time keeps us from doing restartpoints too often
-        * while rapidly scanning large amounts of WAL.
-        */
-       elapsed_secs = (pg_time_t) time(NULL) - ControlFile->time;
-       if (elapsed_secs < CheckPointTimeout / 2)
-               return;
+       /* use volatile pointer to prevent code rearrangement */
+       volatile XLogCtlData *xlogctl = XLogCtl;
 
        /*
         * Is it safe to checkpoint?  We must ask each of the resource managers
@@ -6050,28 +6575,128 @@ RecoveryRestartPoint(const CheckPoint *checkPoint)
        }
 
        /*
-        * OK, force data out to disk
+        * Copy the checkpoint record to shared memory, so that bgwriter can
+        * use it the next time it wants to perform a restartpoint.
+        */
+       SpinLockAcquire(&xlogctl->info_lck);
+       XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
+       memcpy(&XLogCtl->lastCheckPoint, checkPoint, sizeof(CheckPoint));
+       SpinLockRelease(&xlogctl->info_lck);
+}
+
+/*
+ * This is similar to CreateCheckPoint, but is used during WAL recovery
+ * to establish a point from which recovery can roll forward without
+ * replaying the entire recovery log.
+ *
+ * Returns true if a new restartpoint was established. We can only establish
+ * a restartpoint if we have replayed a checkpoint record since last
+ * restartpoint.
+ */
+bool
+CreateRestartPoint(int flags)
+{
+       XLogRecPtr lastCheckPointRecPtr;
+       CheckPoint lastCheckPoint;
+       /* use volatile pointer to prevent code rearrangement */
+       volatile XLogCtlData *xlogctl = XLogCtl;
+
+       /*
+        * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
+        * happens at a time.
+        */
+       LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
+
+       /* Get a local copy of the last checkpoint record. */
+       SpinLockAcquire(&xlogctl->info_lck);
+       lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
+       memcpy(&lastCheckPoint, &XLogCtl->lastCheckPoint, sizeof(CheckPoint));
+       SpinLockRelease(&xlogctl->info_lck);
+
+       /*
+        * Check that we're still in recovery mode. It's OK if we exit recovery
+        * mode after this check; the restart point is valid anyway.
+        */
+       if (!RecoveryInProgress())
+       {
+               ereport(DEBUG2,
+                               (errmsg("skipping restartpoint, recovery has already ended")));
+               LWLockRelease(CheckpointLock);
+               return false;
+       }
+
+       /*
+        * If the last checkpoint record we've replayed is already our last
+        * restartpoint, we can't perform a new restart point. We still update
+        * minRecoveryPoint in that case, so that if this is a shutdown restart
+        * point, we won't start up earlier than before. That's not strictly
+        * necessary, but when we get hot standby capability, it would be rather
+        * weird if the database opened up for read-only connections at a
+        * point-in-time before the last shutdown. Such time travel is still
+        * possible in case of immediate shutdown, though.
+        *
+        * We don't explicitly advance minRecoveryPoint when we do create a
+        * restartpoint. It's assumed that flushing the buffers will do that
+        * as a side-effect.
         */
-       CheckPointGuts(checkPoint->redo, CHECKPOINT_IMMEDIATE);
+       if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
+               XLByteLE(lastCheckPoint.redo, ControlFile->checkPointCopy.redo))
+       {
+               XLogRecPtr InvalidXLogRecPtr = {0, 0};
+               ereport(DEBUG2,
+                               (errmsg("skipping restartpoint, already performed at %X/%X",
+                                               lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
+
+               UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
+               LWLockRelease(CheckpointLock);
+               return false;
+       }
+
+       if (log_checkpoints)
+       {
+               /*
+                * Prepare to accumulate statistics.
+                */
+               MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
+               CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
+
+               LogCheckpointStart(flags, true);
+       }
+
+       CheckPointGuts(lastCheckPoint.redo, flags);
 
        /*
-        * Update pg_control so that any subsequent crash will restart from this
-        * checkpoint.  Note: ReadRecPtr gives the XLOG address of the checkpoint
-        * record itself.
+        * Update pg_control, using current time
         */
+       LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
        ControlFile->prevCheckPoint = ControlFile->checkPoint;
-       ControlFile->checkPoint = ReadRecPtr;
-       ControlFile->checkPointCopy = *checkPoint;
+       ControlFile->checkPoint = lastCheckPointRecPtr;
+       ControlFile->checkPointCopy = lastCheckPoint;
        ControlFile->time = (pg_time_t) time(NULL);
        UpdateControlFile();
+       LWLockRelease(ControlFileLock);
 
-       ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
+       /*
+        * Currently, there is no need to truncate pg_subtrans during recovery.
+        * If we did do that, we will need to have called StartupSUBTRANS()
+        * already and then TruncateSUBTRANS() would go here.
+        */
+
+       /* All real work is done, but log before releasing lock. */
+       if (log_checkpoints)
+               LogCheckpointEnd(true);
+
+       ereport((log_checkpoints ? LOG : DEBUG2),
                        (errmsg("recovery restart point at %X/%X",
-                                       checkPoint->redo.xlogid, checkPoint->redo.xrecoff)));
+                                       lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
+
        if (recoveryLastXTime)
-               ereport((recoveryLogRestartpoints ? LOG : DEBUG2),
-                               (errmsg("last completed transaction was at log time %s",
-                                               timestamptz_to_str(recoveryLastXTime))));
+               ereport((log_checkpoints ? LOG : DEBUG2),
+                       (errmsg("last completed transaction was at log time %s",
+                                       timestamptz_to_str(recoveryLastXTime))));
+
+       LWLockRelease(CheckpointLock);
+       return true;
 }
 
 /*
@@ -6137,12 +6762,18 @@ RequestXLogSwitch(void)
 
 /*
  * XLOG resource manager's routines
+ *
+ * Definitions of info values are in include/catalog/pg_control.h, though
+ * not all record types are related to control file processing.
  */
 void
 xlog_redo(XLogRecPtr lsn, XLogRecord *record)
 {
        uint8           info = record->xl_info & ~XLR_INFO_MASK;
 
+       /* Backup blocks are not used in xlog records */
+       Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
+
        if (info == XLOG_NEXTOID)
        {
                Oid                     nextOid;
@@ -6180,9 +6811,9 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
                                                                 (int) checkPoint.ThisTimeLineID))
                                ereport(PANIC,
                                                (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
-                                                               checkPoint.ThisTimeLineID, ThisTimeLineID)));
-                       /* Following WAL records should be run with new TLI */
-                       ThisTimeLineID = checkPoint.ThisTimeLineID;
+                               checkPoint.ThisTimeLineID, ThisTimeLineID)));
+           /* Following WAL records should be run with new TLI */
+           ThisTimeLineID = checkPoint.ThisTimeLineID;
                }
 
                RecoveryRestartPoint(&checkPoint);
@@ -6288,54 +6919,53 @@ xlog_outrec(StringInfo buf, XLogRecord *record)
 
 
 /*
- * GUC support
+ * Return the (possible) sync flag used for opening a file, depending on the
+ * value of the GUC wal_sync_method.
  */
-const char *
-assign_xlog_sync_method(const char *method, bool doit, GucSource source)
+static int
+get_sync_bit(int method)
 {
-       int                     new_sync_method;
-       int                     new_sync_bit;
+       /* If fsync is disabled, never open in sync mode */
+       if (!enableFsync)
+               return 0;
 
-       if (pg_strcasecmp(method, "fsync") == 0)
-       {
-               new_sync_method = SYNC_METHOD_FSYNC;
-               new_sync_bit = 0;
-       }
-#ifdef HAVE_FSYNC_WRITETHROUGH
-       else if (pg_strcasecmp(method, "fsync_writethrough") == 0)
-       {
-               new_sync_method = SYNC_METHOD_FSYNC_WRITETHROUGH;
-               new_sync_bit = 0;
-       }
-#endif
-#ifdef HAVE_FDATASYNC
-       else if (pg_strcasecmp(method, "fdatasync") == 0)
+       switch (method)
        {
-               new_sync_method = SYNC_METHOD_FDATASYNC;
-               new_sync_bit = 0;
-       }
-#endif
+               /*
+                * enum values for all sync options are defined even if they are not
+                * supported on the current platform.  But if not, they are not
+                * included in the enum option array, and therefore will never be seen
+                * here.
+                */
+               case SYNC_METHOD_FSYNC:
+               case SYNC_METHOD_FSYNC_WRITETHROUGH:
+               case SYNC_METHOD_FDATASYNC:
+                       return 0;
 #ifdef OPEN_SYNC_FLAG
-       else if (pg_strcasecmp(method, "open_sync") == 0)
-       {
-               new_sync_method = SYNC_METHOD_OPEN;
-               new_sync_bit = OPEN_SYNC_FLAG;
-       }
+               case SYNC_METHOD_OPEN:
+                       return OPEN_SYNC_FLAG;
 #endif
 #ifdef OPEN_DATASYNC_FLAG
-       else if (pg_strcasecmp(method, "open_datasync") == 0)
-       {
-               new_sync_method = SYNC_METHOD_OPEN;
-               new_sync_bit = OPEN_DATASYNC_FLAG;
-       }
+               case SYNC_METHOD_OPEN_DSYNC:
+                       return OPEN_DATASYNC_FLAG;
 #endif
-       else
-               return NULL;
+               default:
+                       /* can't happen (unless we are out of sync with option array) */
+                       elog(ERROR, "unrecognized wal_sync_method: %d", method);
+                       return 0; /* silence warning */
+       }
+}
 
+/*
+ * GUC support
+ */
+bool
+assign_xlog_sync_method(int new_sync_method, bool doit, GucSource source)
+{
        if (!doit)
-               return method;
+               return true;
 
-       if (sync_method != new_sync_method || open_sync_bit != new_sync_bit)
+       if (sync_method != new_sync_method)
        {
                /*
                 * To ensure that no blocks escape unsynced, force an fsync on the
@@ -6350,14 +6980,12 @@ assign_xlog_sync_method(const char *method, bool doit, GucSource source)
                                                (errcode_for_file_access(),
                                                 errmsg("could not fsync log file %u, segment %u: %m",
                                                                openLogId, openLogSeg)));
-                       if (open_sync_bit != new_sync_bit)
+                       if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
                                XLogFileClose();
                }
-               sync_method = new_sync_method;
-               open_sync_bit = new_sync_bit;
        }
 
-       return method;
+       return true;
 }
 
 
@@ -6395,6 +7023,7 @@ issue_xlog_fsync(void)
                        break;
 #endif
                case SYNC_METHOD_OPEN:
+               case SYNC_METHOD_OPEN_DSYNC:
                        /* write synced it already */
                        break;
                default:
@@ -6417,6 +7046,7 @@ Datum
 pg_start_backup(PG_FUNCTION_ARGS)
 {
        text       *backupid = PG_GETARG_TEXT_P(0);
+       bool            fast = PG_GETARG_BOOL(1);
        char       *backupidstr;
        XLogRecPtr      checkpointloc;
        XLogRecPtr      startpoint;
@@ -6477,6 +7107,19 @@ pg_start_backup(PG_FUNCTION_ARGS)
        XLogCtl->Insert.forcePageWrites = true;
        LWLockRelease(WALInsertLock);
 
+       /*
+        * Force an XLOG file switch before the checkpoint, to ensure that the WAL
+        * segment the checkpoint is written to doesn't contain pages with old
+        * timeline IDs. That would otherwise happen if you called
+        * pg_start_backup() right after restoring from a PITR archive: the first
+        * WAL segment containing the startup checkpoint has pages in the
+        * beginning with the old timeline ID. That can cause trouble at recovery:
+        * we won't have a history file covering the old timeline if pg_xlog
+        * directory was not included in the base backup and the WAL archive was
+        * cleared too before starting the backup.
+        */
+       RequestXLogSwitch();
+
        /* Ensure we release forcePageWrites if fail below */
        PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) 0);
        {
@@ -6486,9 +7129,11 @@ pg_start_backup(PG_FUNCTION_ARGS)
                 * have different checkpoint positions and hence different history
                 * file names, even if nothing happened in between.
                 *
-                * We don't use CHECKPOINT_IMMEDIATE, hence this can take awhile.
+                * We use CHECKPOINT_IMMEDIATE only if requested by user (via
+                * passing fast = true).  Otherwise this can take awhile.
                 */
-               RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT);
+               RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
+                                                 (fast ? CHECKPOINT_IMMEDIATE : 0));
 
                /*
                 * Now we need to fetch the checkpoint record location, and also its
@@ -6589,6 +7234,8 @@ pg_stop_backup(PG_FUNCTION_ARGS)
        char            histfilepath[MAXPGPATH];
        char            startxlogfilename[MAXFNAMELEN];
        char            stopxlogfilename[MAXFNAMELEN];
+       char            lastxlogfilename[MAXFNAMELEN];
+       char            histfilename[MAXFNAMELEN];
        uint32          _logId;
        uint32          _logSeg;
        FILE       *lfp;
@@ -6603,6 +7250,12 @@ pg_stop_backup(PG_FUNCTION_ARGS)
                                (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
                                 (errmsg("must be superuser to run a backup"))));
 
+       if (!XLogArchivingActive())
+               ereport(ERROR,
+                               (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+                                errmsg("WAL archiving is not active"),
+                                errhint("archive_mode must be enabled at server start.")));
+
        /*
         * OK to clear forcePageWrites
         */
@@ -6701,25 +7354,27 @@ pg_stop_backup(PG_FUNCTION_ARGS)
        CleanupBackupHistory();
 
        /*
-        * Wait until the history file has been archived. We assume that the 
-        * alphabetic sorting property of the WAL files ensures the last WAL
-        * file is guaranteed archived by the time the history file is archived.
+        * Wait until both the last WAL file filled during backup and the history
+        * file have been archived.  We assume that the alphabetic sorting
+        * property of the WAL files ensures any earlier WAL files are safely
+        * archived as well.
         *
         * We wait forever, since archive_command is supposed to work and
-        * we assume the admin wanted his backup to work completely. If you 
-        * don't wish to wait, you can SET statement_timeout = xx;
-        *
-        * If the status file is missing, we assume that is because it was
-        * set to .ready before we slept, then while asleep it has been set
-        * to .done and then removed by a concurrent checkpoint.
+        * we assume the admin wanted his backup to work completely. If you
+        * don't wish to wait, you can set statement_timeout.
         */
-       BackupHistoryFileName(histfilepath, ThisTimeLineID, _logId, _logSeg,
+       XLByteToPrevSeg(stoppoint, _logId, _logSeg);
+       XLogFileName(lastxlogfilename, ThisTimeLineID, _logId, _logSeg);
+
+       XLByteToSeg(startpoint, _logId, _logSeg);
+       BackupHistoryFileName(histfilename, ThisTimeLineID, _logId, _logSeg,
                                                  startpoint.xrecoff % XLogSegSize);
 
        seconds_before_warning = 60;
        waits = 0;
 
-       while (!XLogArchiveCheckDone(histfilepath, false))
+       while (XLogArchiveIsBusy(lastxlogfilename) ||
+                  XLogArchiveIsBusy(histfilename))
        {
                CHECK_FOR_INTERRUPTS();
 
@@ -6728,8 +7383,9 @@ pg_stop_backup(PG_FUNCTION_ARGS)
                if (++waits >= seconds_before_warning)
                {
                        seconds_before_warning *= 2;     /* This wraps in >10 years... */
-                       elog(WARNING, "pg_stop_backup() waiting for archive to complete " 
-                                                       "(%d seconds delay)", waits);
+                       ereport(WARNING,
+                                       (errmsg("pg_stop_backup still waiting for archive to complete (%d seconds elapsed)",
+                                                       waits)));
                }
        }
 
@@ -7114,3 +7770,99 @@ CancelBackup(void)
        }
 }
 
+/* ------------------------------------------------------
+ *  Startup Process main entry point and signal handlers
+ * ------------------------------------------------------
+ */
+
+/*
+ * startupproc_quickdie() is the startup process's SIGQUIT handler; the
+ * postmaster sends SIGQUIT when some backend has bought the farm, so we
+ * need to stop what we're doing and exit.
+ *
+ * It applies BlockSig with PG_SETMASK (presumably to keep further signals
+ * from interrupting it) and then calls exit(2) directly.
+ */
+static void
+startupproc_quickdie(SIGNAL_ARGS)
+{
+       PG_SETMASK(&BlockSig);
+
+       /*
+        * DO NOT proc_exit() -- we're here because shared memory may be
+        * corrupted, so we don't want to try to clean up our transaction. Just
+        * nail the windows shut and get out of town.
+        *
+        * Note we do exit(2) not exit(0).  This is to force the postmaster into a
+        * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
+        * backend.  This is necessary precisely because we don't clean up our
+        * shared memory state.
+        */
+       exit(2);
+}
+
+
+/*
+ * SIGHUP handler: only sets got_SIGHUP.  The config file is not re-read
+ * here; the flag is presumably checked elsewhere so that the re-read happens
+ * at the next convenient time.
+ */
+static void
+StartupProcSigHupHandler(SIGNAL_ARGS)
+{
+       got_SIGHUP = true;
+}
+
+/*
+ * SIGTERM handler: if in_restore_command is set, exit right away with
+ * proc_exit(1); otherwise set shutdown_requested so that redo is aborted
+ * and the process exits later (the flag is presumably polled elsewhere).
+ */
+static void
+StartupProcShutdownHandler(SIGNAL_ARGS)
+{
+       if (in_restore_command)
+               proc_exit(1);
+       else
+               shutdown_requested = true;
+}
+
+/*
+ * Main entry point for startup process.
+ *
+ * Installs this process's signal handlers, unblocks signals, runs
+ * StartupXLOG() followed by BuildFlatFiles(false), and then exits with
+ * status 0 via proc_exit().
+ */
+void
+StartupProcessMain(void)
+{
+       /*
+        * If possible, make this process a group leader, so that the postmaster
+        * can signal any child processes too.
+        */
+#ifdef HAVE_SETSID
+       if (setsid() < 0)
+               elog(FATAL, "setsid() failed: %m");
+#endif
+
+       /*
+        * Properly accept or ignore signals the postmaster might send us
+        */
+       pqsignal(SIGHUP, StartupProcSigHupHandler);      /* reload config file */
+       pqsignal(SIGINT, SIG_IGN);                                      /* ignore query cancel */
+       pqsignal(SIGTERM, StartupProcShutdownHandler); /* request shutdown */
+       pqsignal(SIGQUIT, startupproc_quickdie);                /* hard crash time */
+       pqsignal(SIGALRM, SIG_IGN);
+       pqsignal(SIGPIPE, SIG_IGN);
+       pqsignal(SIGUSR1, SIG_IGN);
+       pqsignal(SIGUSR2, SIG_IGN);
+
+       /*
+        * Reset some signals that are accepted by postmaster but not here
+        */
+       pqsignal(SIGCHLD, SIG_DFL);
+       pqsignal(SIGTTIN, SIG_DFL);
+       pqsignal(SIGTTOU, SIG_DFL);
+       pqsignal(SIGCONT, SIG_DFL);
+       pqsignal(SIGWINCH, SIG_DFL);
+
+       /*
+        * Unblock signals (they were blocked when the postmaster forked us)
+        */
+       PG_SETMASK(&UnBlockSig);
+
+       StartupXLOG();  
+
+       BuildFlatFiles(false);
+
+       /*
+        * Exit normally. Exit code 0 tells postmaster that we completed
+        * recovery successfully.
+        */
+       proc_exit(0);
+}