src/backend/access/transam/xlog.c

   1 /*-------------------------------------------------------------------------
   2  *
   3  * xlog.c
   4  *              PostgreSQL transaction log manager
   5  *
   6  *
   7  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
   8  * Portions Copyright (c) 1994, Regents of the University of California
   9  *
  10  * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.430.2.4 2010/08/30 15:38:17 sriggs Exp $
  11  *
  12  *-------------------------------------------------------------------------
  13  */
  14
  15 #include "postgres.h"
  16
  17 #include <ctype.h>
  18 #include <signal.h>
  19 #include <time.h>
  20 #include <fcntl.h>
  21 #include <sys/stat.h>
  22 #include <sys/time.h>
  23 #include <sys/wait.h>
  24 #include <unistd.h>
  25
  26 #include "access/clog.h"
  27 #include "access/multixact.h"
  28 #include "access/subtrans.h"
  29 #include "access/transam.h"
  30 #include "access/tuptoaster.h"
  31 #include "access/twophase.h"
  32 #include "access/xact.h"
  33 #include "access/xlog_internal.h"
  34 #include "access/xlogutils.h"
  35 #include "catalog/catversion.h"
  36 #include "catalog/pg_control.h"
  37 #include "catalog/pg_database.h"
  38 #include "catalog/pg_type.h"
  39 #include "funcapi.h"
  40 #include "libpq/pqsignal.h"
  41 #include "miscadmin.h"
  42 #include "pgstat.h"
  43 #include "postmaster/bgwriter.h"
  44 #include "replication/walreceiver.h"
  45 #include "replication/walsender.h"
  46 #include "storage/bufmgr.h"
  47 #include "storage/fd.h"
  48 #include "storage/ipc.h"
  49 #include "storage/pmsignal.h"
  50 #include "storage/procarray.h"
  51 #include "storage/smgr.h"
  52 #include "storage/spin.h"
  53 #include "utils/builtins.h"
  54 #include "utils/guc.h"
  55 #include "utils/ps_status.h"
  56 #include "utils/relmapper.h"
  57 #include "pg_trace.h"
  58
  59
  60 /* File path names (all relative to $PGDATA) */
  61 #define BACKUP_LABEL_FILE               "backup_label"
  62 #define BACKUP_LABEL_OLD                "backup_label.old"
  63 #define RECOVERY_COMMAND_FILE   "recovery.conf"
  64 #define RECOVERY_COMMAND_DONE   "recovery.done"
  65
  66
  67 /* User-settable parameters */
  68 int                     CheckPointSegments = 3;       /* also determines XLOGfileslop, see below */
  69 int                     wal_keep_segments = 0;
  70 int                     XLOGbuffers = 8;
  71 int                     XLogArchiveTimeout = 0;
  72 bool            XLogArchiveMode = false;
  73 char       *XLogArchiveCommand = NULL;
  74 bool            EnableHotStandby = false;
  75 bool            fullPageWrites = true;        /* consulted by XLogInsert when deciding doPageWrites */
  76 bool            log_checkpoints = false;
  77 int                     sync_method = DEFAULT_SYNC_METHOD;    /* a SYNC_METHOD_* value; see sync_method_options[] */
  78 int                     wal_level = WAL_LEVEL_MINIMAL;        /* a WAL_LEVEL_* value; see wal_level_options[] */
  79
  80 #ifdef WAL_DEBUG      /* debugging switch exists only in WAL_DEBUG builds */
  81 bool            XLOG_DEBUG = false;
  82 #endif
  83
  84 /*
  85  * XLOGfileslop is the maximum number of preallocated future XLOG segments.
  86  * When we are done with an old XLOG segment file, we will recycle it as a
  87  * future XLOG segment as long as there aren't already XLOGfileslop future
  88  * segments; else we'll delete it.  This could be made a separate GUC
  89  * variable, but at present I think it's sufficient to hardwire it as
  90  * 2*CheckPointSegments+1.      Under normal conditions, a checkpoint will free
  91  * no more than 2*CheckPointSegments log segments, and we want to recycle all
  92  * of them; the +1 allows boundary cases to happen without wasting a
  93  * delete/create-segment cycle.
  94  */
  95 #define XLOGfileslop    (2*CheckPointSegments + 1)    /* a macro, so re-evaluated at each use against the current CheckPointSegments */
  96
  97 /*
  98  * GUC support
  99  */
 100 const struct config_enum_entry wal_level_options[] = {        /* name -> WAL_LEVEL_* table for the wal_level setting */
 101         {"minimal", WAL_LEVEL_MINIMAL, false},        /* matches the initial value of wal_level above */
 102         {"archive", WAL_LEVEL_ARCHIVE, false},
 103         {"hot_standby", WAL_LEVEL_HOT_STANDBY, false},
 104         {NULL, 0, false}      /* terminating entry (NULL name); third field's meaning is defined by struct config_enum_entry, not visible here */
 105 };
 106
 107 const struct config_enum_entry sync_method_options[] = {      /* name -> SYNC_METHOD_* table for the sync_method setting */
 108         {"fsync", SYNC_METHOD_FSYNC, false},  /* the only entry that is compiled in unconditionally */
 109 #ifdef HAVE_FSYNC_WRITETHROUGH        /* each entry below is offered only when its platform macro is defined */
 110         {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
 111 #endif
 112 #ifdef HAVE_FDATASYNC
 113         {"fdatasync", SYNC_METHOD_FDATASYNC, false},
 114 #endif
 115 #ifdef OPEN_SYNC_FLAG
 116         {"open_sync", SYNC_METHOD_OPEN, false},
 117 #endif
 118 #ifdef OPEN_DATASYNC_FLAG
 119         {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
 120 #endif
 121         {NULL, 0, false}      /* terminating entry (NULL name) */
 122 };
 123
 124 /*
 125  * Statistics for current checkpoint are collected in this global struct.
 126  * Because only the background writer or a stand-alone backend can perform
 127  * checkpoints, this will be unused in normal backends.
 128  */
 129 CheckpointStatsData CheckpointStats;
 130
 131 /*
 132  * ThisTimeLineID will be same in all backends --- it identifies current
 133  * WAL timeline for the database system.
 134  */
 135 TimeLineID      ThisTimeLineID = 0;
 136
 137 /*
 138  * Are we doing recovery from XLOG?
 139  *
 140  * This is only ever true in the startup process; it should be read as meaning
 141  * "this process is replaying WAL records", rather than "the system is in
 142  * recovery mode".  It should be examined primarily by functions that need
 143  * to act differently when called from a WAL redo function (e.g., to skip WAL
 144  * logging).  To check whether the system is in recovery regardless of which
 145  * process you're running in, use RecoveryInProgress() but only after shared
 146  * memory startup and lock initialization.
 147  */
 148 bool            InRecovery = false;
 149
 150 /* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
 151 HotStandbyState standbyState = STANDBY_DISABLED;
 152
 153 static XLogRecPtr LastRec;
 154
 155 /*
 156  * Local copy of SharedRecoveryInProgress variable. True actually means "not
 157  * known, need to check the shared state".
 158  */
 159 static bool LocalRecoveryInProgress = true;
 160
 161 /*
 162  * Local state for XLogInsertAllowed():
 163  *              1: unconditionally allowed to insert XLOG
 164  *              0: unconditionally not allowed to insert XLOG
 165  *              -1: must check RecoveryInProgress(); disallow until it is false
 166  * Most processes start with -1 and transition to 1 after seeing that recovery
 167  * is not in progress.  But we can also force the value for special cases.
 168  * The coding in XLogInsertAllowed() depends on the first two of these states
 169  * being numerically the same as bool true and false.
 170  */
 171 static int      LocalXLogInsertAllowed = -1;
 172
 173 /* Are we recovering using offline XLOG archives? */
 174 static bool InArchiveRecovery = false;
 175
 176 /* Was the last xlog file restored from archive, or local? */
 177 static bool restoredFromArchive = false;
 178
 179 /* options taken from recovery.conf for archive recovery */
 180 static char *recoveryRestoreCommand = NULL;
 181 static char *recoveryEndCommand = NULL;
 182 static char *archiveCleanupCommand = NULL;
 183 static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
 184 static bool recoveryTargetInclusive = true;
 185 static TransactionId recoveryTargetXid;
 186 static TimestampTz recoveryTargetTime;
 187
 188 /* options taken from recovery.conf for XLOG streaming */
 189 static bool StandbyMode = false;
 190 static char *PrimaryConnInfo = NULL;
 191 static char *TriggerFile = NULL;
 192
 193 /* if recoveryStopsHere returns true, it saves actual stop xid/time here */
 194 static TransactionId recoveryStopXid;
 195 static TimestampTz recoveryStopTime;
 196 static bool recoveryStopAfter;
 197
 198 /*
 199  * During normal operation, the only timeline we care about is ThisTimeLineID.
 200  * During recovery, however, things are more complicated.  To simplify life
 201  * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 202  * scan through the WAL history (that is, it is the line that was active when
 203  * the currently-scanned WAL record was generated).  We also need these
 204  * timeline values:
 205  *
 206  * recoveryTargetTLI: the desired timeline that we want to end in.
 207  *
 208  * expectedTLIs: an integer list of recoveryTargetTLI and the TLIs of
 209  * its known parents, newest first (so recoveryTargetTLI is always the
 210  * first list member).  Only these TLIs are expected to be seen in the WAL
 211  * segments we read, and indeed only these TLIs will be considered as
 212  * candidate WAL files to open at all.
 213  *
 214  * curFileTLI: the TLI appearing in the name of the current input WAL file.
 215  * (This is not necessarily the same as ThisTimeLineID, because we could
 216  * be scanning data that was copied from an ancestor timeline when the current
 217  * file was created.)  During a sequential scan we do not allow this value
 218  * to decrease.
 219  */
 220 static TimeLineID recoveryTargetTLI;
 221 static List *expectedTLIs;
 222 static TimeLineID curFileTLI;
 223
 224 /*
 225  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 226  * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 227  * end+1 of the last record, and is reset when we end a top-level transaction,
 228  * or start a new one; so it can be used to tell if the current transaction has
 229  * created any XLOG records.
 230  */
 231 static XLogRecPtr ProcLastRecPtr = {0, 0};
 232
 233 XLogRecPtr      XactLastRecEnd = {0, 0};
 234
 235 /*
 236  * RedoRecPtr is this backend's local copy of the REDO record pointer
 237  * (which is almost but not quite the same as a pointer to the most recent
 238  * CHECKPOINT record).  We update this from the shared-memory copy,
 239  * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 240  * hold the Insert lock).  See XLogInsert for details.  We are also allowed
 241  * to update from XLogCtl->Insert.RedoRecPtr if we hold the info_lck;
 242  * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
 243  * InitXLOGAccess.
 244  */
 245 static XLogRecPtr RedoRecPtr;
 246
 247 /*
 248  * RedoStartLSN points to the checkpoint's REDO location which is specified
 249  * in a backup label file, backup history file or control file. In standby
 250  * mode, XLOG streaming usually starts from the position where an invalid
 251  * record was found. But if we fail to read even the initial checkpoint
 252  * record, we use the REDO location instead of the checkpoint location as
 253  * the start position of XLOG streaming. Otherwise we would have to jump
 254  * backwards to the REDO location after reading the checkpoint record,
 255  * because the REDO record can precede the checkpoint record.
 256  */
 257 static XLogRecPtr RedoStartLSN = {0, 0};
 258
 259 /*----------
 260  * Shared-memory data structures for XLOG control
 261  *
 262  * LogwrtRqst indicates a byte position that we need to write and/or fsync
 263  * the log up to (all records before that point must be written or fsynced).
 264  * LogwrtResult indicates the byte positions we have already written/fsynced.
 265  * These structs are identical but are declared separately to indicate their
 266  * slightly different functions.
 267  *
 268  * We do a lot of pushups to minimize the amount of access to lockable
 269  * shared memory values.  There are actually three shared-memory copies of
 270  * LogwrtResult, plus one unshared copy in each backend.  Here's how it works:
 271  *              XLogCtl->LogwrtResult is protected by info_lck
 272  *              XLogCtl->Write.LogwrtResult is protected by WALWriteLock
 273  *              XLogCtl->Insert.LogwrtResult is protected by WALInsertLock
 274  * One must hold the associated lock to read or write any of these, but
 275  * of course no lock is needed to read/write the unshared LogwrtResult.
 276  *
 277  * XLogCtl->LogwrtResult and XLogCtl->Write.LogwrtResult are both "always
 278  * right", since both are updated by a write or flush operation before
 279  * it releases WALWriteLock.  The point of keeping XLogCtl->Write.LogwrtResult
 280  * is that it can be examined/modified by code that already holds WALWriteLock
 281  * without needing to grab info_lck as well.
 282  *
 283  * XLogCtl->Insert.LogwrtResult may lag behind the reality of the other two,
 284  * but is updated when convenient.      Again, it exists for the convenience of
 285  * code that is already holding WALInsertLock but not the other locks.
 286  *
 287  * The unshared LogwrtResult may lag behind any or all of these, and again
 288  * is updated when convenient.
 289  *
 290  * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 291  * (protected by info_lck), but we don't need to cache any copies of it.
 292  *
 293  * Note that this all works because the request and result positions can only
 294  * advance forward, never back up, and so we can easily determine which of two
 295  * values is "more up to date".
 296  *
 297  * info_lck is only held long enough to read/update the protected variables,
 298  * so it's a plain spinlock.  The other locks are held longer (potentially
 299  * over I/O operations), so we use LWLocks for them.  These locks are:
 300  *
 301  * WALInsertLock: must be held to insert a record into the WAL buffers.
 302  *
 303  * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 304  * XLogFlush).
 305  *
 306  * ControlFileLock: must be held to read/update control file or create
 307  * new log file.
 308  *
 309  * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
 310  * only one checkpointer at a time; currently, with all checkpoints done by
 311  * the bgwriter, this is just pro forma).
 312  *
 313  *----------
 314  */
 315
 316 typedef struct XLogwrtRqst    /* positions the log needs to be written/flushed up to; same layout as XLogwrtResult */
 317 {
 318         XLogRecPtr      Write;                  /* last byte + 1 to write out */
 319         XLogRecPtr      Flush;                  /* last byte + 1 to flush (fsync) */
 320 } XLogwrtRqst;
 321
 322 typedef struct XLogwrtResult  /* positions already written/flushed; several copies exist, see the discussion above */
 323 {
 324         XLogRecPtr      Write;                  /* last byte + 1 written out */
 325         XLogRecPtr      Flush;                  /* last byte + 1 flushed (fsynced) */
 326 } XLogwrtResult;
 327
 328 /*
 329  * Shared state data for XLogInsert.
 330  */
 331 typedef struct XLogCtlInsert  /* embedded in XLogCtlData as Insert, where it is protected by WALInsertLock */
 332 {
 333         XLogwrtResult LogwrtResult; /* a recent (possibly lagging) value of LogwrtResult */
 334         XLogRecPtr      PrevRecord;             /* start of previously-inserted record */
 335         int                     curridx;                /* current block index in cache (steps via NextBufIdx/PrevBufIdx) */
 336         XLogPageHeader currpage;        /* points to header of current block in cache */
 337         char       *currpos;            /* current insertion point in cache; INSERT_FREESPACE measures from here */
 338         XLogRecPtr      RedoRecPtr;             /* current redo point for insertions; backends cache it in RedoRecPtr */
 339         bool            forcePageWrites;        /* forcing full-page writes for PITR? (OR'ed with fullPageWrites in XLogInsert) */
 340 } XLogCtlInsert;
 341
 342 /*
 343  * Shared state data for XLogWrite/XLogFlush.
 344  */
 345 typedef struct XLogCtlWrite   /* embedded in XLogCtlData as Write, where it is protected by WALWriteLock */
 346 {
 347         XLogwrtResult LogwrtResult; /* current value of LogwrtResult ("always right", see above) */
 348         int                     curridx;                /* cache index of next block to write */
 349         pg_time_t       lastSegSwitchTime;              /* time of last xlog segment switch */
 350 } XLogCtlWrite;
 351
 352 /*
 353  * Total shared-memory state for XLOG.
 354  */
 355 typedef struct XLogCtlData    /* accessed through the file-level XLogCtl pointer */
 356 {
 357         /* Protected by WALInsertLock: */
 358         XLogCtlInsert Insert;
 359
 360         /* Protected by info_lck: */
 361         XLogwrtRqst LogwrtRqst;
 362         XLogwrtResult LogwrtResult;
 363         uint32          ckptXidEpoch;   /* nextXID & epoch of latest checkpoint (epoch part) */
 364         TransactionId ckptXid;        /* (xid part of the pair described on the previous line) */
 365         XLogRecPtr      asyncXactLSN; /* LSN of newest async commit/abort */
 366         uint32          lastRemovedLog; /* latest removed/recycled XLOG segment (log id part) */
 367         uint32          lastRemovedSeg;       /* (segment part of the pair described on the previous line) */
 368
 369         /* Protected by WALWriteLock: */
 370         XLogCtlWrite Write;
 371
 372         /*
 373          * These values do not change after startup, although the pointed-to pages
 374          * and xlblocks values certainly do.  Permission to read/write the pages
 375          * and xlblocks values depends on WALInsertLock and WALWriteLock.
 376          */
 377         char       *pages;                      /* buffers for unwritten XLOG pages */
 378         XLogRecPtr *xlblocks;           /* per buffer: pointer to 1st byte of the page + XLOG_BLCKSZ */
 379         int                     XLogCacheBlck;  /* highest allocated xlog buffer index (buffer indexes wrap after it) */
 380         TimeLineID      ThisTimeLineID;       /* NOTE(review): presumably the shared copy of the ThisTimeLineID global - confirm */
 381         TimeLineID      RecoveryTargetTLI;    /* NOTE(review): presumably the shared copy of recoveryTargetTLI - confirm */
 382
 383         /*
 384          * archiveCleanupCommand is read from recovery.conf but needs to be in
 385          * shared memory so that the bgwriter process can access it.
 386          */
 387         char            archiveCleanupCommand[MAXPGPATH];
 388
 389         /*
 390          * SharedRecoveryInProgress indicates if we're still in crash or archive
 391          * recovery.  Protected by info_lck.
 392          */
 393         bool            SharedRecoveryInProgress;
 394
 395         /*
 396          * During recovery, we keep a copy of the latest checkpoint record here.
 397          * Used by the background writer when it wants to create a restartpoint.
 398          *
 399          * Protected by info_lck.
 400          */
 401         XLogRecPtr      lastCheckPointRecPtr;
 402         CheckPoint      lastCheckPoint;
 403
 404         /* end+1 of the last record replayed (or being replayed) */
 405         XLogRecPtr      replayEndRecPtr;
 406         /* end+1 of the last record replayed (i.e. replay of it has finished) */
 407         XLogRecPtr      recoveryLastRecPtr;
 408         /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
 409         TimestampTz recoveryLastXTime;
 410
 411         slock_t         info_lck;               /* spinlock for the fields marked "Protected by info_lck" above */
 412 } XLogCtlData;
 413
 414 static XLogCtlData *XLogCtl = NULL;
 415
 416 /*
 417  * We maintain an image of pg_control in shared memory.
 418  */
 419 static ControlFileData *ControlFile = NULL;
 420
 421 /*
 422  * Macros for managing XLogInsert state.  In most cases, the calling routine
 423  * has local copies of &XLogCtl->Insert and/or XLogCtl->Insert.curridx,
 424  * so these are passed as parameters instead of being fetched via XLogCtl.
 425  */
 426
 427 /* Free space remaining in the current xlog page buffer: XLOG_BLCKSZ minus the offset of currpos within currpage. Insert is a pointer to XLogCtlInsert and is evaluated twice. */
 428 #define INSERT_FREESPACE(Insert)  \
 429         (XLOG_BLCKSZ - ((Insert)->currpos - (char *) (Insert)->currpage))
 430
 431 /* Construct XLogRecPtr value for current insertion point: take the page-end pointer stored in xlblocks[curridx] and back up by the free space left on the page. Assigns both fields of recptr; curridx is evaluated twice. */
 432 #define INSERT_RECPTR(recptr,Insert,curridx)  \
 433         ( \
 434           (recptr).xlogid = XLogCtl->xlblocks[curridx].xlogid, \
 435           (recptr).xrecoff = \
 436                 XLogCtl->xlblocks[curridx].xrecoff - INSERT_FREESPACE(Insert) \
 437         )
 438
 439 #define PrevBufIdx(idx)         \
 440                 (((idx) == 0) ? XLogCtl->XLogCacheBlck : ((idx) - 1))   /* previous buffer index, wrapping from 0 to XLogCacheBlck; idx is evaluated twice */
 441
 442 #define NextBufIdx(idx)         \
 443                 (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))   /* next buffer index, wrapping from XLogCacheBlck to 0; idx is evaluated twice */
 444
 445 /*
 446  * Private, possibly out-of-date copy of shared LogwrtResult.
 447  * See discussion above.
 448  */
 449 static XLogwrtResult LogwrtResult = {{0, 0}, {0, 0}};
 450
 451 /*
 452  * Codes indicating where we got a WAL file from during recovery, or where
 453  * to attempt to get one.  These are chosen so that they can be OR'd together
 454  * in a bitmask state variable.
 455  */
 456 #define XLOG_FROM_ARCHIVE               (1<<0)  /* Restored using restore_command */
 457 #define XLOG_FROM_PG_XLOG               (1<<1)  /* Existing file in pg_xlog */
 458 #define XLOG_FROM_STREAM                (1<<2)  /* Streamed from master */
 459
 460 /*
 461  * openLogFile is -1 or a kernel FD for an open log file segment.
 462  * When it's open, openLogOff is the current seek offset in the file.
 463  * openLogId/openLogSeg identify the segment.  These variables are only
 464  * used to write the XLOG, and so will normally refer to the active segment.
 465  */
 466 static int      openLogFile = -1;
 467 static uint32 openLogId = 0;
 468 static uint32 openLogSeg = 0;
 469 static uint32 openLogOff = 0;
 470
 471 /*
 472  * These variables are used similarly to the ones above, but for reading
 473  * the XLOG.  Note, however, that readOff generally represents the offset
 474  * of the page just read, not the seek position of the FD itself, which
 475  * will be just past that page. readLen indicates how much of the current
 476  * page has been read into readBuf, and readSource indicates where we got
 477  * the currently open file from.
 478  */
 479 static int      readFile = -1;
 480 static uint32 readId = 0;
 481 static uint32 readSeg = 0;
 482 static uint32 readOff = 0;
 483 static uint32 readLen = 0;
 484 static int      readSource = 0;         /* XLOG_FROM_* code */
 485
 486 /*
 487  * Keeps track of which sources we've tried to read the current WAL
 488  * record from and failed.
 489  */
 490 static int      failedSources = 0;      /* OR of XLOG_FROM_* codes */
 491
 492 /*
 493  * These variables track when we last obtained some WAL data to process,
 494  * and where we got it from.  (XLogReceiptSource is initially the same as
 495  * readSource, but readSource gets reset to zero when we don't have data
 496  * to process right now.)
 497  */
 498 static TimestampTz XLogReceiptTime = 0;
 499 static int      XLogReceiptSource = 0;          /* XLOG_FROM_* code */
 500
 501 /* Buffer for currently read page (XLOG_BLCKSZ bytes) */
 502 static char *readBuf = NULL;
 503
 504 /* Buffer for current ReadRecord result (expandable) */
 505 static char *readRecordBuf = NULL;
 506 static uint32 readRecordBufSize = 0;
 507
 508 /* State information for XLOG reading */
 509 static XLogRecPtr ReadRecPtr;   /* start of last record read */
 510 static XLogRecPtr EndRecPtr;    /* end+1 of last record read */
 511 static TimeLineID lastPageTLI = 0;
 512
 513 static XLogRecPtr minRecoveryPoint;             /* local copy of
 514                                                                                  * ControlFile->minRecoveryPoint */
 515 static bool updateMinRecoveryPoint = true;
 516 static bool reachedMinRecoveryPoint = false;
 517
 518 static bool InRedo = false;
 519
 520 /* Have we launched bgwriter during recovery? */
 521 static bool bgwriterLaunched = false;
 522
 523 /*
 524  * Information logged when we detect a change in one of the parameters
 525  * important for Hot Standby.
 526  */
 527 typedef struct xl_parameter_change    /* NOTE(review): presumably the body of the record written by XLogReportParameters - confirm */
 528 {
 529         int                     MaxConnections;       /* presumably the MaxConnections setting at the time of logging - TODO confirm */
 530         int                     max_prepared_xacts;   /* presumably the max_prepared_transactions setting - TODO confirm */
 531         int                     max_locks_per_xact;   /* presumably the max_locks_per_transaction setting - TODO confirm */
 532         int                     wal_level;    /* presumably a WAL_LEVEL_* value like the wal_level variable - TODO confirm */
 533 } xl_parameter_change;
 534
 535 /*
 536  * Flags set by interrupt handlers for later service in the redo loop.
 537  */
 538 static volatile sig_atomic_t got_SIGHUP = false;
 539 static volatile sig_atomic_t shutdown_requested = false;
 540
 541 /*
 542  * Flag set when executing a restore command, to tell SIGTERM signal handler
 543  * that it's safe to just proc_exit.
 544  */
 545 static volatile sig_atomic_t in_restore_command = false;
 546
 547
 548 static void XLogArchiveNotify(const char *xlog);
 549 static void XLogArchiveNotifySeg(uint32 log, uint32 seg);
 550 static bool XLogArchiveCheckDone(const char *xlog);
 551 static bool XLogArchiveIsBusy(const char *xlog);
 552 static void XLogArchiveCleanup(const char *xlog);
 553 static void readRecoveryCommandFile(void);
 554 static void exitArchiveRecovery(TimeLineID endTLI,
 555                                         uint32 endLogId, uint32 endLogSeg);
 556 static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
 557 static void SetLatestXTime(TimestampTz xtime);
 558 static TimestampTz GetLatestXTime(void);
 559 static void CheckRequiredParameterValues(void);
 560 static void XLogReportParameters(void);
 561 static void LocalSetXLogInsertAllowed(void);
 562 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
 563
 564 static bool XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
 565                                 XLogRecPtr *lsn, BkpBlock *bkpb);
 566 static bool AdvanceXLInsertBuffer(bool new_segment);
 567 static bool XLogCheckpointNeeded(uint32 logid, uint32 logseg);
 568 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch);
 569 static bool InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
 570                                            bool find_free, int *max_advance,
 571                                            bool use_lock);
 572 static int XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
 573                          int source, bool notexistOk);
 574 static int XLogFileReadAnyTLI(uint32 log, uint32 seg, int emode,
 575                                    int sources);
 576 static bool XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
 577                          bool randAccess);
 578 static int      emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
 579 static void XLogFileClose(void);
 580 static bool RestoreArchivedFile(char *path, const char *xlogfname,
 581                                         const char *recovername, off_t expectedSize);
 582 static void ExecuteRecoveryCommand(char *command, char *commandName,
 583                                            bool failOnerror);
 584 static void PreallocXlogFiles(XLogRecPtr endptr);
 585 static void RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr);
 586 static void UpdateLastRemovedPtr(char *filename);
 587 static void ValidateXLOGDirectoryStructure(void);
 588 static void CleanupBackupHistory(void);
 589 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
 590 static XLogRecord *ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt);
 591 static void CheckRecoveryConsistency(void);
 592 static bool ValidXLOGHeader(XLogPageHeader hdr, int emode);
 593 static XLogRecord *ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt);
 594 static List *readTimeLineHistory(TimeLineID targetTLI);
 595 static bool existsTimeLineHistory(TimeLineID probeTLI);
 596 static TimeLineID findNewestTimeLine(TimeLineID startTLI);
 597 static void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
 598                                          TimeLineID endTLI,
 599                                          uint32 endLogId, uint32 endLogSeg);
 600 static void WriteControlFile(void);
 601 static void ReadControlFile(void);
 602 static char *str_time(pg_time_t tnow);
 603 static bool CheckForStandbyTrigger(void);
 604
 605 #ifdef WAL_DEBUG
 606 static void xlog_outrec(StringInfo buf, XLogRecord *record);
 607 #endif
 608 static void pg_start_backup_callback(int code, Datum arg);
 609 static bool read_backup_label(XLogRecPtr *checkPointLoc);
 610 static void rm_redo_error_callback(void *arg);
 611 static int      get_sync_bit(int method);
 612
 613
 614 /*
 615  * Insert an XLOG record having the specified RMID and info bytes,
 616  * with the body of the record being the data chunk(s) described by
 617  * the rdata chain (see xlog.h for notes about rdata).
 618  *
 619  * Returns XLOG pointer to end of record (beginning of next record).
 620  * This can be used as LSN for data pages affected by the logged action.
 621  * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 622  * before the data page can be written out.  This implements the basic
 623  * WAL rule "write the log before the data".)
 624  *
 625  * NB: this routine feels free to scribble on the XLogRecData structs,
 626  * though not on the data they reference.  This is OK since the XLogRecData
 627  * structs are always just temporaries in the calling code.
 628  */
 629 XLogRecPtr
 630 XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 631 {
 632         XLogCtlInsert *Insert = &XLogCtl->Insert;
 633         XLogRecord *record;
 634         XLogContRecord *contrecord;
 635         XLogRecPtr      RecPtr;
 636         XLogRecPtr      WriteRqst;
 637         uint32          freespace;
 638         int                     curridx;
 639         XLogRecData *rdt;
 640         Buffer          dtbuf[XLR_MAX_BKP_BLOCKS];
 641         bool            dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
 642         BkpBlock        dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
 643         XLogRecPtr      dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
 644         XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];
 645         XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];
 646         XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];
 647         pg_crc32        rdata_crc;
 648         uint32          len,
 649                                 write_len;
 650         unsigned        i;
 651         bool            updrqst;
 652         bool            doPageWrites;
 653         bool            isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
 654
 655         /* cross-check on whether we should be here or not */
 656         if (!XLogInsertAllowed())
 657                 elog(ERROR, "cannot make new WAL entries during recovery");
 658
 659         /* info's high bits are reserved for use by me */
 660         if (info & XLR_INFO_MASK)
 661                 elog(PANIC, "invalid xlog info mask %02X", info);
 662
 663         TRACE_POSTGRESQL_XLOG_INSERT(rmid, info);
 664
 665         /*
 666          * In bootstrap mode, we don't actually log anything but XLOG resources;
 667          * return a phony record pointer.
 668          */
 669         if (IsBootstrapProcessingMode() && rmid != RM_XLOG_ID)
 670         {
 671                 RecPtr.xlogid = 0;
 672                 RecPtr.xrecoff = SizeOfXLogLongPHD;             /* start of 1st chkpt record */
 673                 return RecPtr;
 674         }
 675
 676         /*
 677          * Here we scan the rdata chain, determine which buffers must be backed
 678          * up, and compute the CRC values for the data.  Note that the record
 679          * header isn't added into the CRC initially since we don't know the final
 680          * length or info bits quite yet.  Thus, the CRC will represent the CRC of
 681          * the whole record in the order "rdata, then backup blocks, then record
 682          * header".
 683          *
 684          * We may have to loop back to here if a race condition is detected below.
 685          * We could prevent the race by doing all this work while holding the
 686          * insert lock, but it seems better to avoid doing CRC calculations while
 687          * holding the lock.  This means we have to be careful about modifying the
 688          * rdata chain until we know we aren't going to loop back again.  The only
 689          * change we allow ourselves to make earlier is to set rdt->data = NULL in
 690          * chain items we have decided we will have to back up the whole buffer
 691          * for.  This is OK because we will certainly decide the same thing again
 692          * for those items if we do it over; doing it here saves an extra pass
 693          * over the chain later.
 694          */
 695 begin:;
 696         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 697         {
 698                 dtbuf[i] = InvalidBuffer;
 699                 dtbuf_bkp[i] = false;
 700         }
 701
 702         /*
 703          * Decide if we need to do full-page writes in this XLOG record: true if
 704          * full_page_writes is on or we have a PITR request for it.  Since we
 705          * don't yet have the insert lock, forcePageWrites could change under us,
 706          * but we'll recheck it once we have the lock.
 707          */
 708         doPageWrites = fullPageWrites || Insert->forcePageWrites;
 709
 710         INIT_CRC32(rdata_crc);
 711         len = 0;
 712         for (rdt = rdata;;)
 713         {
 714                 if (rdt->buffer == InvalidBuffer)
 715                 {
 716                         /* Simple data, just include it */
 717                         len += rdt->len;
 718                         COMP_CRC32(rdata_crc, rdt->data, rdt->len);
 719                 }
 720                 else
 721                 {
 722                         /* Find info for buffer */
 723                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 724                         {
 725                                 if (rdt->buffer == dtbuf[i])
 726                                 {
 727                                         /* Buffer already referenced by earlier chain item */
 728                                         if (dtbuf_bkp[i])
 729                                                 rdt->data = NULL;
 730                                         else if (rdt->data)
 731                                         {
 732                                                 len += rdt->len;
 733                                                 COMP_CRC32(rdata_crc, rdt->data, rdt->len);
 734                                         }
 735                                         break;
 736                                 }
 737                                 if (dtbuf[i] == InvalidBuffer)
 738                                 {
 739                                         /* OK, put it in this slot */
 740                                         dtbuf[i] = rdt->buffer;
 741                                         if (XLogCheckBuffer(rdt, doPageWrites,
 742                                                                                 &(dtbuf_lsn[i]), &(dtbuf_xlg[i])))
 743                                         {
 744                                                 dtbuf_bkp[i] = true;
 745                                                 rdt->data = NULL;
 746                                         }
 747                                         else if (rdt->data)
 748                                         {
 749                                                 len += rdt->len;
 750                                                 COMP_CRC32(rdata_crc, rdt->data, rdt->len);
 751                                         }
 752                                         break;
 753                                 }
 754                         }
 755                         if (i >= XLR_MAX_BKP_BLOCKS)
 756                                 elog(PANIC, "can backup at most %d blocks per xlog record",
 757                                          XLR_MAX_BKP_BLOCKS);
 758                 }
 759                 /* Break out of loop when rdt points to last chain item */
 760                 if (rdt->next == NULL)
 761                         break;
 762                 rdt = rdt->next;
 763         }
 764
 765         /*
 766          * Now add the backup block headers and data into the CRC
 767          */
 768         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 769         {
 770                 if (dtbuf_bkp[i])
 771                 {
 772                         BkpBlock   *bkpb = &(dtbuf_xlg[i]);
 773                         char       *page;
 774
 775                         COMP_CRC32(rdata_crc,
 776                                            (char *) bkpb,
 777                                            sizeof(BkpBlock));
 778                         page = (char *) BufferGetBlock(dtbuf[i]);
 779                         if (bkpb->hole_length == 0)
 780                         {
 781                                 COMP_CRC32(rdata_crc,
 782                                                    page,
 783                                                    BLCKSZ);
 784                         }
 785                         else
 786                         {
 787                                 /* must skip the hole */
 788                                 COMP_CRC32(rdata_crc,
 789                                                    page,
 790                                                    bkpb->hole_offset);
 791                                 COMP_CRC32(rdata_crc,
 792                                                    page + (bkpb->hole_offset + bkpb->hole_length),
 793                                                    BLCKSZ - (bkpb->hole_offset + bkpb->hole_length));
 794                         }
 795                 }
 796         }
 797
 798         /*
 799          * NOTE: We disallow len == 0 because it provides a useful bit of extra
 800          * error checking in ReadRecord.  This means that all callers of
 801          * XLogInsert must supply at least some not-in-a-buffer data.  However, we
 802          * make an exception for XLOG SWITCH records because we don't want them to
 803          * ever cross a segment boundary.
 804          */
 805         if (len == 0 && !isLogSwitch)
 806                 elog(PANIC, "invalid xlog record length %u", len);
 807
 808         START_CRIT_SECTION();
 809
 810         /* Now wait to get insert lock */
 811         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
 812
 813         /*
 814          * Check to see if my RedoRecPtr is out of date.  If so, may have to go
 815          * back and recompute everything.  This can only happen just after a
 816          * checkpoint, so it's better to be slow in this case and fast otherwise.
 817          *
 818          * If we aren't doing full-page writes then RedoRecPtr doesn't actually
 819          * affect the contents of the XLOG record, so we'll update our local copy
 820          * but not force a recomputation.
 821          */
 822         if (!XLByteEQ(RedoRecPtr, Insert->RedoRecPtr))
 823         {
 824                 Assert(XLByteLT(RedoRecPtr, Insert->RedoRecPtr));
 825                 RedoRecPtr = Insert->RedoRecPtr;
 826
 827                 if (doPageWrites)
 828                 {
 829                         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 830                         {
 831                                 if (dtbuf[i] == InvalidBuffer)
 832                                         continue;
 833                                 if (dtbuf_bkp[i] == false &&
 834                                         XLByteLE(dtbuf_lsn[i], RedoRecPtr))
 835                                 {
 836                                         /*
 837                                          * Oops, this buffer now needs to be backed up, but we
 838                                          * didn't think so above.  Start over.
 839                                          */
 840                                         LWLockRelease(WALInsertLock);
 841                                         END_CRIT_SECTION();
 842                                         goto begin;
 843                                 }
 844                         }
 845                 }
 846         }
 847
 848         /*
 849          * Also check to see if forcePageWrites was just turned on; if we weren't
 850          * already doing full-page writes then go back and recompute. (If it was
 851          * just turned off, we could recompute the record without full pages, but
 852          * we choose not to bother.)
 853          */
 854         if (Insert->forcePageWrites && !doPageWrites)
 855         {
 856                 /* Oops, must redo it with full-page data */
 857                 LWLockRelease(WALInsertLock);
 858                 END_CRIT_SECTION();
 859                 goto begin;
 860         }
 861
 862         /*
 863          * Make additional rdata chain entries for the backup blocks, so that we
 864          * don't need to special-case them in the write loop.  Note that we have
 865          * now irrevocably changed the input rdata chain.  At the exit of this
 866          * loop, write_len includes the backup block data.
 867          *
 868          * Also set the appropriate info bits to show which buffers were backed
 869          * up. The i'th XLR_SET_BKP_BLOCK bit corresponds to the i'th distinct
 870          * buffer value (ignoring InvalidBuffer) appearing in the rdata chain.
 871          */
 872         write_len = len;
 873         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 874         {
 875                 BkpBlock   *bkpb;
 876                 char       *page;
 877
 878                 if (!dtbuf_bkp[i])
 879                         continue;
 880
 881                 info |= XLR_SET_BKP_BLOCK(i);
 882
 883                 bkpb = &(dtbuf_xlg[i]);
 884                 page = (char *) BufferGetBlock(dtbuf[i]);
 885
 886                 rdt->next = &(dtbuf_rdt1[i]);
 887                 rdt = rdt->next;
 888
 889                 rdt->data = (char *) bkpb;
 890                 rdt->len = sizeof(BkpBlock);
 891                 write_len += sizeof(BkpBlock);
 892
 893                 rdt->next = &(dtbuf_rdt2[i]);
 894                 rdt = rdt->next;
 895
 896                 if (bkpb->hole_length == 0)
 897                 {
 898                         rdt->data = page;
 899                         rdt->len = BLCKSZ;
 900                         write_len += BLCKSZ;
 901                         rdt->next = NULL;
 902                 }
 903                 else
 904                 {
 905                         /* must skip the hole */
 906                         rdt->data = page;
 907                         rdt->len = bkpb->hole_offset;
 908                         write_len += bkpb->hole_offset;
 909
 910                         rdt->next = &(dtbuf_rdt3[i]);
 911                         rdt = rdt->next;
 912
 913                         rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
 914                         rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
 915                         write_len += rdt->len;
 916                         rdt->next = NULL;
 917                 }
 918         }
 919
 920         /*
 921          * If we backed up any full blocks and online backup is not in progress,
 922          * mark the backup blocks as removable.  This allows the WAL archiver to
 923          * know whether it is safe to compress archived WAL data by transforming
 924          * full-block records into the non-full-block format.
 925          *
 926          * Note: we could just set the flag whenever !forcePageWrites, but
 927          * defining it like this leaves the info bit free for some potential other
 928          * use in records without any backup blocks.
 929          */
 930         if ((info & XLR_BKP_BLOCK_MASK) && !Insert->forcePageWrites)
 931                 info |= XLR_BKP_REMOVABLE;
 932
 933         /*
 934          * If there isn't enough space on the current XLOG page for a record
 935          * header, advance to the next page (leaving the unused space as zeroes).
 936          */
 937         updrqst = false;
 938         freespace = INSERT_FREESPACE(Insert);
 939         if (freespace < SizeOfXLogRecord)
 940         {
 941                 updrqst = AdvanceXLInsertBuffer(false);
 942                 freespace = INSERT_FREESPACE(Insert);
 943         }
 944
 945         /* Compute record's XLOG location */
 946         curridx = Insert->curridx;
 947         INSERT_RECPTR(RecPtr, Insert, curridx);
 948
 949         /*
 950          * If the record is an XLOG_SWITCH, and we are exactly at the start of a
 951          * segment, we need not insert it (and don't want to because we'd like
 952          * consecutive switch requests to be no-ops).  Instead, make sure
 953          * everything is written and flushed through the end of the prior segment,
 954          * and return the prior segment's end address.
 955          */
 956         if (isLogSwitch &&
 957                 (RecPtr.xrecoff % XLogSegSize) == SizeOfXLogLongPHD)
 958         {
 959                 /* We can release insert lock immediately */
 960                 LWLockRelease(WALInsertLock);
 961
 962                 RecPtr.xrecoff -= SizeOfXLogLongPHD;
 963                 if (RecPtr.xrecoff == 0)
 964                 {
 965                         /* crossing a logid boundary */
 966                         RecPtr.xlogid -= 1;
 967                         RecPtr.xrecoff = XLogFileSize;
 968                 }
 969
 970                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
 971                 LogwrtResult = XLogCtl->Write.LogwrtResult;
 972                 if (!XLByteLE(RecPtr, LogwrtResult.Flush))
 973                 {
 974                         XLogwrtRqst FlushRqst;
 975
 976                         FlushRqst.Write = RecPtr;
 977                         FlushRqst.Flush = RecPtr;
 978                         XLogWrite(FlushRqst, false, false);
 979                 }
 980                 LWLockRelease(WALWriteLock);
 981
 982                 END_CRIT_SECTION();
 983
 984                 return RecPtr;
 985         }
 986
 987         /* Insert record header */
 988
 989         record = (XLogRecord *) Insert->currpos;
 990         record->xl_prev = Insert->PrevRecord;
 991         record->xl_xid = GetCurrentTransactionIdIfAny();
 992         record->xl_tot_len = SizeOfXLogRecord + write_len;
 993         record->xl_len = len;           /* doesn't include backup blocks */
 994         record->xl_info = info;
 995         record->xl_rmid = rmid;
 996
 997         /* Now we can finish computing the record's CRC */
 998         COMP_CRC32(rdata_crc, (char *) record + sizeof(pg_crc32),
 999                            SizeOfXLogRecord - sizeof(pg_crc32));
1000         FIN_CRC32(rdata_crc);
1001         record->xl_crc = rdata_crc;
1002
1003 #ifdef WAL_DEBUG
1004         if (XLOG_DEBUG)
1005         {
1006                 StringInfoData buf;
1007
1008                 initStringInfo(&buf);
1009                 appendStringInfo(&buf, "INSERT @ %X/%X: ",
1010                                                  RecPtr.xlogid, RecPtr.xrecoff);
1011                 xlog_outrec(&buf, record);
1012                 if (rdata->data != NULL)
1013                 {
1014                         appendStringInfo(&buf, " - ");
1015                         RmgrTable[record->xl_rmid].rm_desc(&buf, record->xl_info, rdata->data);
1016                 }
1017                 elog(LOG, "%s", buf.data);
1018                 pfree(buf.data);
1019         }
1020 #endif
1021
1022         /* Record begin of record in appropriate places */
1023         ProcLastRecPtr = RecPtr;
1024         Insert->PrevRecord = RecPtr;
1025
1026         Insert->currpos += SizeOfXLogRecord;
1027         freespace -= SizeOfXLogRecord;
1028
1029         /*
1030          * Append the data, including backup blocks if any
1031          */
1032         while (write_len)
1033         {
1034                 while (rdata->data == NULL)
1035                         rdata = rdata->next;
1036
1037                 if (freespace > 0)
1038                 {
1039                         if (rdata->len > freespace)
1040                         {
1041                                 memcpy(Insert->currpos, rdata->data, freespace);
1042                                 rdata->data += freespace;
1043                                 rdata->len -= freespace;
1044                                 write_len -= freespace;
1045                         }
1046                         else
1047                         {
1048                                 memcpy(Insert->currpos, rdata->data, rdata->len);
1049                                 freespace -= rdata->len;
1050                                 write_len -= rdata->len;
1051                                 Insert->currpos += rdata->len;
1052                                 rdata = rdata->next;
1053                                 continue;
1054                         }
1055                 }
1056
1057                 /* Use next buffer */
1058                 updrqst = AdvanceXLInsertBuffer(false);
1059                 curridx = Insert->curridx;
1060                 /* Insert cont-record header */
1061                 Insert->currpage->xlp_info |= XLP_FIRST_IS_CONTRECORD;
1062                 contrecord = (XLogContRecord *) Insert->currpos;
1063                 contrecord->xl_rem_len = write_len;
1064                 Insert->currpos += SizeOfXLogContRecord;
1065                 freespace = INSERT_FREESPACE(Insert);
1066         }
1067
1068         /* Ensure next record will be properly aligned */
1069         Insert->currpos = (char *) Insert->currpage +
1070                 MAXALIGN(Insert->currpos - (char *) Insert->currpage);
1071         freespace = INSERT_FREESPACE(Insert);
1072
1073         /*
1074          * The recptr I return is the beginning of the *next* record. This will be
1075          * stored as LSN for changed data pages...
1076          */
1077         INSERT_RECPTR(RecPtr, Insert, curridx);
1078
1079         /*
1080          * If the record is an XLOG_SWITCH, we must now write and flush all the
1081          * existing data, and then forcibly advance to the start of the next
1082          * segment.  It's not good to do this I/O while holding the insert lock,
1083          * but there seems too much risk of confusion if we try to release the
1084          * lock sooner.  Fortunately xlog switch needn't be a high-performance
1085          * operation anyway...
1086          */
1087         if (isLogSwitch)
1088         {
1089                 XLogCtlWrite *Write = &XLogCtl->Write;
1090                 XLogwrtRqst FlushRqst;
1091                 XLogRecPtr      OldSegEnd;
1092
1093                 TRACE_POSTGRESQL_XLOG_SWITCH();
1094
1095                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1096
1097                 /*
1098                  * Flush through the end of the page containing XLOG_SWITCH, and
1099                  * perform end-of-segment actions (eg, notifying archiver).
1100                  */
1101                 WriteRqst = XLogCtl->xlblocks[curridx];
1102                 FlushRqst.Write = WriteRqst;
1103                 FlushRqst.Flush = WriteRqst;
1104                 XLogWrite(FlushRqst, false, true);
1105
1106                 /* Set up the next buffer as first page of next segment */
1107                 /* Note: AdvanceXLInsertBuffer cannot need to do I/O here */
1108                 (void) AdvanceXLInsertBuffer(true);
1109
1110                 /* There should be no unwritten data */
1111                 curridx = Insert->curridx;
1112                 Assert(curridx == Write->curridx);
1113
1114                 /* Compute end address of old segment */
1115                 OldSegEnd = XLogCtl->xlblocks[curridx];
1116                 OldSegEnd.xrecoff -= XLOG_BLCKSZ;
1117                 if (OldSegEnd.xrecoff == 0)
1118                 {
1119                         /* crossing a logid boundary */
1120                         OldSegEnd.xlogid -= 1;
1121                         OldSegEnd.xrecoff = XLogFileSize;
1122                 }
1123
1124                 /* Make it look like we've written and synced all of old segment */
1125                 LogwrtResult.Write = OldSegEnd;
1126                 LogwrtResult.Flush = OldSegEnd;
1127
1128                 /*
1129                  * Update shared-memory status --- this code should match XLogWrite
1130                  */
1131                 {
1132                         /* use volatile pointer to prevent code rearrangement */
1133                         volatile XLogCtlData *xlogctl = XLogCtl;
1134
1135                         SpinLockAcquire(&xlogctl->info_lck);
1136                         xlogctl->LogwrtResult = LogwrtResult;
1137                         if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
1138                                 xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
1139                         if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
1140                                 xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
1141                         SpinLockRelease(&xlogctl->info_lck);
1142                 }
1143
1144                 Write->LogwrtResult = LogwrtResult;
1145
1146                 LWLockRelease(WALWriteLock);
1147
1148                 updrqst = false;                /* done already */
1149         }
1150         else
1151         {
1152                 /* normal case, ie not xlog switch */
1153
1154                 /* Need to update shared LogwrtRqst if some block was filled up */
1155                 if (freespace < SizeOfXLogRecord)
1156                 {
1157                         /* curridx is filled and available for writing out */
1158                         updrqst = true;
1159                 }
1160                 else
1161                 {
1162                         /* if updrqst already set, write through end of previous buf */
1163                         curridx = PrevBufIdx(curridx);
1164                 }
1165                 WriteRqst = XLogCtl->xlblocks[curridx];
1166         }
1167
1168         LWLockRelease(WALInsertLock);
1169
1170         if (updrqst)
1171         {
1172                 /* use volatile pointer to prevent code rearrangement */
1173                 volatile XLogCtlData *xlogctl = XLogCtl;
1174
1175                 SpinLockAcquire(&xlogctl->info_lck);
1176                 /* advance global request to include new block(s) */
1177                 if (XLByteLT(xlogctl->LogwrtRqst.Write, WriteRqst))
1178                         xlogctl->LogwrtRqst.Write = WriteRqst;
1179                 /* update local result copy while I have the chance */
1180                 LogwrtResult = xlogctl->LogwrtResult;
1181                 SpinLockRelease(&xlogctl->info_lck);
1182         }
1183
1184         XactLastRecEnd = RecPtr;
1185
1186         END_CRIT_SECTION();
1187
1188         return RecPtr;
1189 }
1190
1191 /*
1192  * Determine whether the buffer referenced by an XLogRecData item has to
1193  * be backed up, and if so fill a BkpBlock struct for it.  In any case
1194  * save the buffer's LSN at *lsn.
1195  */
1196 static bool
1197 XLogCheckBuffer(XLogRecData *rdata, bool doPageWrites,
1198                                 XLogRecPtr *lsn, BkpBlock *bkpb)
1199 {
1200         Page            page;
1201
1202         page = BufferGetPage(rdata->buffer);
1203
1204         /*
1205          * XXX We assume page LSN is first data on *every* page that can be passed
1206          * to XLogInsert, whether it otherwise has the standard page layout or
1207          * not.
1208          */
1209         *lsn = PageGetLSN(page);
1210
1211         if (doPageWrites &&
1212                 XLByteLE(PageGetLSN(page), RedoRecPtr))
1213         {
1214                 /*
1215                  * The page needs to be backed up, so set up *bkpb
1216                  */
1217                 BufferGetTag(rdata->buffer, &bkpb->node, &bkpb->fork, &bkpb->block);
1218
1219                 if (rdata->buffer_std)
1220                 {
1221                         /* Assume we can omit data between pd_lower and pd_upper */
1222                         uint16          lower = ((PageHeader) page)->pd_lower;
1223                         uint16          upper = ((PageHeader) page)->pd_upper;
1224
1225                         if (lower >= SizeOfPageHeaderData &&
1226                                 upper > lower &&
1227                                 upper <= BLCKSZ)
1228                         {
1229                                 bkpb->hole_offset = lower;
1230                                 bkpb->hole_length = upper - lower;
1231                         }
1232                         else
1233                         {
1234                                 /* No "hole" to compress out */
1235                                 bkpb->hole_offset = 0;
1236                                 bkpb->hole_length = 0;
1237                         }
1238                 }
1239                 else
1240                 {
1241                         /* Not a standard page header, don't try to eliminate "hole" */
1242                         bkpb->hole_offset = 0;
1243                         bkpb->hole_length = 0;
1244                 }
1245
1246                 return true;                    /* buffer requires backup */
1247         }
1248
1249         return false;                           /* buffer does not need to be backed up */
1250 }
1251
1252 /*
1253  * XLogArchiveNotify
1254  *
1255  * Create an archive notification file
1256  *
1257  * The name of the notification file is the message that will be picked up
1258  * by the archiver, e.g. we write 0000000100000001000000C6.ready
1259  * and the archiver then knows to archive XLOGDIR/0000000100000001000000C6,
1260  * then when complete, rename it to 0000000100000001000000C6.done
1261  */
1262 static void
1263 XLogArchiveNotify(const char *xlog)
1264 {
1265         char            archiveStatusPath[MAXPGPATH];
1266         FILE       *fd;
1267
1268         /* insert an otherwise empty file called <XLOG>.ready */
1269         StatusFilePath(archiveStatusPath, xlog, ".ready");
1270         fd = AllocateFile(archiveStatusPath, "w");
1271         if (fd == NULL)
1272         {
1273                 ereport(LOG,
1274                                 (errcode_for_file_access(),
1275                                  errmsg("could not create archive status file \"%s\": %m",
1276                                                 archiveStatusPath)));
1277                 return;
1278         }
1279         if (FreeFile(fd))
1280         {
1281                 ereport(LOG,
1282                                 (errcode_for_file_access(),
1283                                  errmsg("could not write archive status file \"%s\": %m",
1284                                                 archiveStatusPath)));
1285                 return;
1286         }
1287
1288         /* Notify archiver that it's got something to do */
1289         if (IsUnderPostmaster)
1290                 SendPostmasterSignal(PMSIGNAL_WAKEN_ARCHIVER);
1291 }
1292
1293 /*
1294  * Convenience routine to notify using log/seg representation of filename
1295  */
1296 static void
1297 XLogArchiveNotifySeg(uint32 log, uint32 seg)
1298 {
1299         char            xlog[MAXFNAMELEN];
1300
1301         XLogFileName(xlog, ThisTimeLineID, log, seg);
1302         XLogArchiveNotify(xlog);
1303 }
1304
1305 /*
1306  * XLogArchiveCheckDone
1307  *
1308  * This is called when we are ready to delete or recycle an old XLOG segment
1309  * file or backup history file.  If it is okay to delete it then return true.
1310  * If it is not time to delete it, make sure a .ready file exists, and return
1311  * false.
1312  *
1313  * If <XLOG>.done exists, then return true; else if <XLOG>.ready exists,
1314  * then return false; else create <XLOG>.ready and return false.
1315  *
1316  * The reason we do things this way is so that if the original attempt to
1317  * create <XLOG>.ready fails, we'll retry during subsequent checkpoints.
1318  */
1319 static bool
1320 XLogArchiveCheckDone(const char *xlog)
1321 {
1322         char            archiveStatusPath[MAXPGPATH];
1323         struct stat stat_buf;
1324
1325         /* Always deletable if archiving is off */
1326         if (!XLogArchivingActive())
1327                 return true;
1328
1329         /* First check for .done --- this means archiver is done with it */
1330         StatusFilePath(archiveStatusPath, xlog, ".done");
1331         if (stat(archiveStatusPath, &stat_buf) == 0)
1332                 return true;
1333
1334         /* check for .ready --- this means archiver is still busy with it */
1335         StatusFilePath(archiveStatusPath, xlog, ".ready");
1336         if (stat(archiveStatusPath, &stat_buf) == 0)
1337                 return false;
1338
1339         /* Race condition --- maybe archiver just finished, so recheck */
1340         StatusFilePath(archiveStatusPath, xlog, ".done");
1341         if (stat(archiveStatusPath, &stat_buf) == 0)
1342                 return true;
1343
1344         /* Retry creation of the .ready file */
1345         XLogArchiveNotify(xlog);
1346         return false;
1347 }
1348
1349 /*
1350  * XLogArchiveIsBusy
1351  *
1352  * Check to see if an XLOG segment file is still unarchived.
1353  * This is almost but not quite the inverse of XLogArchiveCheckDone: in
1354  * the first place we aren't chartered to recreate the .ready file, and
1355  * in the second place we should consider that if the file is already gone
1356  * then it's not busy.  (This check is needed to handle the race condition
1357  * that a checkpoint already deleted the no-longer-needed file.)
1358  */
1359 static bool
1360 XLogArchiveIsBusy(const char *xlog)
1361 {
1362         char            archiveStatusPath[MAXPGPATH];
1363         struct stat stat_buf;
1364
1365         /* First check for .done --- this means archiver is done with it */
1366         StatusFilePath(archiveStatusPath, xlog, ".done");
1367         if (stat(archiveStatusPath, &stat_buf) == 0)
1368                 return false;
1369
1370         /* check for .ready --- this means archiver is still busy with it */
1371         StatusFilePath(archiveStatusPath, xlog, ".ready");
1372         if (stat(archiveStatusPath, &stat_buf) == 0)
1373                 return true;
1374
1375         /* Race condition --- maybe archiver just finished, so recheck */
1376         StatusFilePath(archiveStatusPath, xlog, ".done");
1377         if (stat(archiveStatusPath, &stat_buf) == 0)
1378                 return false;
1379
1380         /*
1381          * Check to see if the WAL file has been removed by checkpoint, which
1382          * implies it has already been archived, and explains why we can't see a
1383          * status file for it.
1384          */
1385         snprintf(archiveStatusPath, MAXPGPATH, XLOGDIR "/%s", xlog);
1386         if (stat(archiveStatusPath, &stat_buf) != 0 &&
1387                 errno == ENOENT)
1388                 return false;
1389
1390         return true;
1391 }
1392
1393 /*
1394  * XLogArchiveCleanup
1395  *
1396  * Cleanup archive notification file(s) for a particular xlog segment
1397  */
1398 static void
1399 XLogArchiveCleanup(const char *xlog)
1400 {
1401         char            archiveStatusPath[MAXPGPATH];
1402
1403         /* Remove the .done file */
1404         StatusFilePath(archiveStatusPath, xlog, ".done");
1405         unlink(archiveStatusPath);
1406         /* should we complain about failure? */
1407
1408         /* Remove the .ready file if present --- normally it shouldn't be */
1409         StatusFilePath(archiveStatusPath, xlog, ".ready");
1410         unlink(archiveStatusPath);
1411         /* should we complain about failure? */
1412 }
1413
1414 /*
1415  * Advance the Insert state to the next buffer page, writing out the next
1416  * buffer if it still contains unwritten data.
1417  *
1418  * If new_segment is TRUE then we set up the next buffer page as the first
1419  * page of the next xlog segment file, possibly but not usually the next
1420  * consecutive file page.
1421  *
1422  * The global LogwrtRqst.Write pointer needs to be advanced to include the
1423  * just-filled page.  If we can do this for free (without an extra lock),
1424  * we do so here.  Otherwise the caller must do it.  We return TRUE if the
1425  * request update still needs to be done, FALSE if we did it internally.
1426  *
1427  * Must be called with WALInsertLock held.
1428  */
1429 static bool
1430 AdvanceXLInsertBuffer(bool new_segment)
1431 {
1432         XLogCtlInsert *Insert = &XLogCtl->Insert;
1433         XLogCtlWrite *Write = &XLogCtl->Write;
1434         int                     nextidx = NextBufIdx(Insert->curridx);
1435         bool            update_needed = true;
1436         XLogRecPtr      OldPageRqstPtr;
1437         XLogwrtRqst WriteRqst;
1438         XLogRecPtr      NewPageEndPtr;
1439         XLogPageHeader NewPage;
1440
1441         /* Use Insert->LogwrtResult copy if it's more fresh */
1442         if (XLByteLT(LogwrtResult.Write, Insert->LogwrtResult.Write))
1443                 LogwrtResult = Insert->LogwrtResult;
1444
1445         /*
1446          * Get ending-offset of the buffer page we need to replace (this may be
1447          * zero if the buffer hasn't been used yet).  Fall through if it's already
1448          * written out.
1449          */
1450         OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
1451         if (!XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1452         {
1453                 /* nope, got work to do... */
1454                 XLogRecPtr      FinishedPageRqstPtr;
1455
1456                 FinishedPageRqstPtr = XLogCtl->xlblocks[Insert->curridx];
1457
1458                 /* Before waiting, get info_lck and update LogwrtResult */
1459                 {
1460                         /* use volatile pointer to prevent code rearrangement */
1461                         volatile XLogCtlData *xlogctl = XLogCtl;
1462
1463                         SpinLockAcquire(&xlogctl->info_lck);
1464                         if (XLByteLT(xlogctl->LogwrtRqst.Write, FinishedPageRqstPtr))
1465                                 xlogctl->LogwrtRqst.Write = FinishedPageRqstPtr;
1466                         LogwrtResult = xlogctl->LogwrtResult;
1467                         SpinLockRelease(&xlogctl->info_lck);
1468                 }
1469
1470                 update_needed = false;  /* Did the shared-request update */
1471
1472                 if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1473                 {
1474                         /* OK, someone wrote it already */
1475                         Insert->LogwrtResult = LogwrtResult;
1476                 }
1477                 else
1478                 {
1479                         /* Must acquire write lock */
1480                         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
1481                         LogwrtResult = Write->LogwrtResult;
1482                         if (XLByteLE(OldPageRqstPtr, LogwrtResult.Write))
1483                         {
1484                                 /* OK, someone wrote it already */
1485                                 LWLockRelease(WALWriteLock);
1486                                 Insert->LogwrtResult = LogwrtResult;
1487                         }
1488                         else
1489                         {
1490                                 /*
1491                                  * Have to write buffers while holding insert lock. This is
1492                                  * not good, so only write as much as we absolutely must.
1493                                  */
1494                                 TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
1495                                 WriteRqst.Write = OldPageRqstPtr;
1496                                 WriteRqst.Flush.xlogid = 0;
1497                                 WriteRqst.Flush.xrecoff = 0;
1498                                 XLogWrite(WriteRqst, false, false);
1499                                 LWLockRelease(WALWriteLock);
1500                                 Insert->LogwrtResult = LogwrtResult;
1501                                 TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
1502                         }
1503                 }
1504         }
1505
1506         /*
1507          * Now the next buffer slot is free and we can set it up to be the next
1508          * output page.
1509          */
1510         NewPageEndPtr = XLogCtl->xlblocks[Insert->curridx];
1511
1512         if (new_segment)
1513         {
1514                 /* force it to a segment start point */
1515                 NewPageEndPtr.xrecoff += XLogSegSize - 1;
1516                 NewPageEndPtr.xrecoff -= NewPageEndPtr.xrecoff % XLogSegSize;
1517         }
1518
1519         if (NewPageEndPtr.xrecoff >= XLogFileSize)
1520         {
1521                 /* crossing a logid boundary */
1522                 NewPageEndPtr.xlogid += 1;
1523                 NewPageEndPtr.xrecoff = XLOG_BLCKSZ;
1524         }
1525         else
1526                 NewPageEndPtr.xrecoff += XLOG_BLCKSZ;
1527         XLogCtl->xlblocks[nextidx] = NewPageEndPtr;
1528         NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
1529
1530         Insert->curridx = nextidx;
1531         Insert->currpage = NewPage;
1532
1533         Insert->currpos = ((char *) NewPage) +SizeOfXLogShortPHD;
1534
1535         /*
1536          * Be sure to re-zero the buffer so that bytes beyond what we've written
1537          * will look like zeroes and not valid XLOG records...
1538          */
1539         MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
1540
1541         /*
1542          * Fill the new page's header
1543          */
1544         NewPage   ->xlp_magic = XLOG_PAGE_MAGIC;
1545
1546         /* NewPage->xlp_info = 0; */    /* done by memset */
1547         NewPage   ->xlp_tli = ThisTimeLineID;
1548         NewPage   ->xlp_pageaddr.xlogid = NewPageEndPtr.xlogid;
1549         NewPage   ->xlp_pageaddr.xrecoff = NewPageEndPtr.xrecoff - XLOG_BLCKSZ;
1550
1551         /*
1552          * If first page of an XLOG segment file, make it a long header.
1553          */
1554         if ((NewPage->xlp_pageaddr.xrecoff % XLogSegSize) == 0)
1555         {
1556                 XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
1557
1558                 NewLongPage->xlp_sysid = ControlFile->system_identifier;
1559                 NewLongPage->xlp_seg_size = XLogSegSize;
1560                 NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
1561                 NewPage   ->xlp_info |= XLP_LONG_HEADER;
1562
1563                 Insert->currpos = ((char *) NewPage) +SizeOfXLogLongPHD;
1564         }
1565
1566         return update_needed;
1567 }
1568
1569 /*
1570  * Check whether we've consumed enough xlog space that a checkpoint is needed.
1571  *
1572  * logid/logseg indicate a log file that has just been filled up (or read
1573  * during recovery). We measure the distance from RedoRecPtr to logid/logseg
1574  * and see if that exceeds CheckPointSegments.
1575  *
1576  * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
1577  */
1578 static bool
1579 XLogCheckpointNeeded(uint32 logid, uint32 logseg)
1580 {
1581         /*
1582          * A straight computation of segment number could overflow 32 bits. Rather
1583          * than assuming we have working 64-bit arithmetic, we compare the
1584          * highest-order bits separately, and force a checkpoint immediately when
1585          * they change.
1586          */
1587         uint32          old_segno,
1588                                 new_segno;
1589         uint32          old_highbits,
1590                                 new_highbits;
1591
1592         old_segno = (RedoRecPtr.xlogid % XLogSegSize) * XLogSegsPerFile +
1593                 (RedoRecPtr.xrecoff / XLogSegSize);
1594         old_highbits = RedoRecPtr.xlogid / XLogSegSize;
1595         new_segno = (logid % XLogSegSize) * XLogSegsPerFile + logseg;
1596         new_highbits = logid / XLogSegSize;
1597         if (new_highbits != old_highbits ||
1598                 new_segno >= old_segno + (uint32) (CheckPointSegments - 1))
1599                 return true;
1600         return false;
1601 }
1602
1603 /*
1604  * Write and/or fsync the log at least as far as WriteRqst indicates.
1605  *
1606  * If flexible == TRUE, we don't have to write as far as WriteRqst, but
1607  * may stop at any convenient boundary (such as a cache or logfile boundary).
1608  * This option allows us to avoid uselessly issuing multiple writes when a
1609  * single one would do.
1610  *
1611  * If xlog_switch == TRUE, we are intending an xlog segment switch, so
1612  * perform end-of-segment actions after writing the last page, even if
1613  * it's not physically the end of its segment.  (NB: this will work properly
1614  * only if caller specifies WriteRqst == page-end and flexible == false,
1615  * and there is some data to write.)
1616  *
1617  * Must be called with WALWriteLock held.
1618  */
1619 static void
1620 XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
1621 {
1622         XLogCtlWrite *Write = &XLogCtl->Write;
1623         bool            ispartialpage;
1624         bool            last_iteration;
1625         bool            finishing_seg;
1626         bool            use_existent;
1627         int                     curridx;
1628         int                     npages;
1629         int                     startidx;
1630         uint32          startoffset;
1631
1632         /* We should always be inside a critical section here */
1633         Assert(CritSectionCount > 0);
1634
1635         /*
1636          * Update local LogwrtResult (caller probably did this already, but...)
1637          */
1638         LogwrtResult = Write->LogwrtResult;
1639
1640         /*
1641          * Since successive pages in the xlog cache are consecutively allocated,
1642          * we can usually gather multiple pages together and issue just one
1643          * write() call.  npages is the number of pages we have determined can be
1644          * written together; startidx is the cache block index of the first one,
1645          * and startoffset is the file offset at which it should go. The latter
1646          * two variables are only valid when npages > 0, but we must initialize
1647          * all of them to keep the compiler quiet.
1648          */
1649         npages = 0;
1650         startidx = 0;
1651         startoffset = 0;
1652
1653         /*
1654          * Within the loop, curridx is the cache block index of the page to
1655          * consider writing.  We advance Write->curridx only after successfully
1656          * writing pages.  (Right now, this refinement is useless since we are
1657          * going to PANIC if any error occurs anyway; but someday it may come in
1658          * useful.)
1659          */
1660         curridx = Write->curridx;
1661
1662         while (XLByteLT(LogwrtResult.Write, WriteRqst.Write))
1663         {
1664                 /*
1665                  * Make sure we're not ahead of the insert process.  This could happen
1666                  * if we're passed a bogus WriteRqst.Write that is past the end of the
1667                  * last page that's been initialized by AdvanceXLInsertBuffer.
1668                  */
1669                 if (!XLByteLT(LogwrtResult.Write, XLogCtl->xlblocks[curridx]))
1670                         elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
1671                                  LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
1672                                  XLogCtl->xlblocks[curridx].xlogid,
1673                                  XLogCtl->xlblocks[curridx].xrecoff);
1674
1675                 /* Advance LogwrtResult.Write to end of current buffer page */
1676                 LogwrtResult.Write = XLogCtl->xlblocks[curridx];
1677                 ispartialpage = XLByteLT(WriteRqst.Write, LogwrtResult.Write);
1678
1679                 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1680                 {
1681                         /*
1682                          * Switch to new logfile segment.  We cannot have any pending
1683                          * pages here (since we dump what we have at segment end).
1684                          */
1685                         Assert(npages == 0);
1686                         if (openLogFile >= 0)
1687                                 XLogFileClose();
1688                         XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1689
1690                         /* create/use new log file */
1691                         use_existent = true;
1692                         openLogFile = XLogFileInit(openLogId, openLogSeg,
1693                                                                            &use_existent, true);
1694                         openLogOff = 0;
1695                 }
1696
1697                 /* Make sure we have the current logfile open */
1698                 if (openLogFile < 0)
1699                 {
1700                         XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1701                         openLogFile = XLogFileOpen(openLogId, openLogSeg);
1702                         openLogOff = 0;
1703                 }
1704
1705                 /* Add current page to the set of pending pages-to-dump */
1706                 if (npages == 0)
1707                 {
1708                         /* first of group */
1709                         startidx = curridx;
1710                         startoffset = (LogwrtResult.Write.xrecoff - XLOG_BLCKSZ) % XLogSegSize;
1711                 }
1712                 npages++;
1713
1714                 /*
1715                  * Dump the set if this will be the last loop iteration, or if we are
1716                  * at the last page of the cache area (since the next page won't be
1717                  * contiguous in memory), or if we are at the end of the logfile
1718                  * segment.
1719                  */
1720                 last_iteration = !XLByteLT(LogwrtResult.Write, WriteRqst.Write);
1721
1722                 finishing_seg = !ispartialpage &&
1723                         (startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;
1724
1725                 if (last_iteration ||
1726                         curridx == XLogCtl->XLogCacheBlck ||
1727                         finishing_seg)
1728                 {
1729                         char       *from;
1730                         Size            nbytes;
1731
1732                         /* Need to seek in the file? */
1733                         if (openLogOff != startoffset)
1734                         {
1735                                 if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
1736                                         ereport(PANIC,
1737                                                         (errcode_for_file_access(),
1738                                                          errmsg("could not seek in log file %u, "
1739                                                                         "segment %u to offset %u: %m",
1740                                                                         openLogId, openLogSeg, startoffset)));
1741                                 openLogOff = startoffset;
1742                         }
1743
1744                         /* OK to write the page(s) */
1745                         from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
1746                         nbytes = npages * (Size) XLOG_BLCKSZ;
1747                         errno = 0;
1748                         if (write(openLogFile, from, nbytes) != nbytes)
1749                         {
1750                                 /* if write didn't set errno, assume no disk space */
1751                                 if (errno == 0)
1752                                         errno = ENOSPC;
1753                                 ereport(PANIC,
1754                                                 (errcode_for_file_access(),
1755                                                  errmsg("could not write to log file %u, segment %u "
1756                                                                 "at offset %u, length %lu: %m",
1757                                                                 openLogId, openLogSeg,
1758                                                                 openLogOff, (unsigned long) nbytes)));
1759                         }
1760
1761                         /* Update state for write */
1762                         openLogOff += nbytes;
1763                         Write->curridx = ispartialpage ? curridx : NextBufIdx(curridx);
1764                         npages = 0;
1765
1766                         /*
1767                          * If we just wrote the whole last page of a logfile segment,
1768                          * fsync the segment immediately.  This avoids having to go back
1769                          * and re-open prior segments when an fsync request comes along
1770                          * later. Doing it here ensures that one and only one backend will
1771                          * perform this fsync.
1772                          *
1773                          * We also do this if this is the last page written for an xlog
1774                          * switch.
1775                          *
1776                          * This is also the right place to notify the Archiver that the
1777                          * segment is ready to copy to archival storage, and to update the
1778                          * timer for archive_timeout, and to signal for a checkpoint if
1779                          * too many logfile segments have been used since the last
1780                          * checkpoint.
1781                          */
1782                         if (finishing_seg || (xlog_switch && last_iteration))
1783                         {
1784                                 issue_xlog_fsync(openLogFile, openLogId, openLogSeg);
1785                                 LogwrtResult.Flush = LogwrtResult.Write;                /* end of page */
1786
1787                                 if (XLogArchivingActive())
1788                                         XLogArchiveNotifySeg(openLogId, openLogSeg);
1789
1790                                 Write->lastSegSwitchTime = (pg_time_t) time(NULL);
1791
1792                                 /*
1793                                  * Signal bgwriter to start a checkpoint if we've consumed too
1794                                  * much xlog since the last one.  For speed, we first check
1795                                  * using the local copy of RedoRecPtr, which might be out of
1796                                  * date; if it looks like a checkpoint is needed, forcibly
1797                                  * update RedoRecPtr and recheck.
1798                                  */
1799                                 if (IsUnderPostmaster &&
1800                                         XLogCheckpointNeeded(openLogId, openLogSeg))
1801                                 {
1802                                         (void) GetRedoRecPtr();
1803                                         if (XLogCheckpointNeeded(openLogId, openLogSeg))
1804                                                 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
1805                                 }
1806                         }
1807                 }
1808
1809                 if (ispartialpage)
1810                 {
1811                         /* Only asked to write a partial page */
1812                         LogwrtResult.Write = WriteRqst.Write;
1813                         break;
1814                 }
1815                 curridx = NextBufIdx(curridx);
1816
1817                 /* If flexible, break out of loop as soon as we wrote something */
1818                 if (flexible && npages == 0)
1819                         break;
1820         }
1821
1822         Assert(npages == 0);
1823         Assert(curridx == Write->curridx);
1824
1825         /*
1826          * If asked to flush, do so
1827          */
1828         if (XLByteLT(LogwrtResult.Flush, WriteRqst.Flush) &&
1829                 XLByteLT(LogwrtResult.Flush, LogwrtResult.Write))
1830         {
1831                 /*
1832                  * Could get here without iterating above loop, in which case we might
1833                  * have no open file or the wrong one.  However, we do not need to
1834                  * fsync more than one file.
1835                  */
1836                 if (sync_method != SYNC_METHOD_OPEN &&
1837                         sync_method != SYNC_METHOD_OPEN_DSYNC)
1838                 {
1839                         if (openLogFile >= 0 &&
1840                                 !XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
1841                                 XLogFileClose();
1842                         if (openLogFile < 0)
1843                         {
1844                                 XLByteToPrevSeg(LogwrtResult.Write, openLogId, openLogSeg);
1845                                 openLogFile = XLogFileOpen(openLogId, openLogSeg);
1846                                 openLogOff = 0;
1847                         }
1848                         issue_xlog_fsync(openLogFile, openLogId, openLogSeg);
1849                 }
1850                 LogwrtResult.Flush = LogwrtResult.Write;
1851         }
1852
1853         /*
1854          * Update shared-memory status
1855          *
1856          * We make sure that the shared 'request' values do not fall behind the
1857          * 'result' values.  This is not absolutely essential, but it saves some
1858          * code in a couple of places.
1859          */
1860         {
1861                 /* use volatile pointer to prevent code rearrangement */
1862                 volatile XLogCtlData *xlogctl = XLogCtl;
1863
1864                 SpinLockAcquire(&xlogctl->info_lck);
1865                 xlogctl->LogwrtResult = LogwrtResult;
1866                 if (XLByteLT(xlogctl->LogwrtRqst.Write, LogwrtResult.Write))
1867                         xlogctl->LogwrtRqst.Write = LogwrtResult.Write;
1868                 if (XLByteLT(xlogctl->LogwrtRqst.Flush, LogwrtResult.Flush))
1869                         xlogctl->LogwrtRqst.Flush = LogwrtResult.Flush;
1870                 SpinLockRelease(&xlogctl->info_lck);
1871         }
1872
1873         Write->LogwrtResult = LogwrtResult;
1874 }
1875
1876 /*
1877  * Record the LSN for an asynchronous transaction commit/abort.
1878  * (This should not be called for for synchronous commits.)
1879  */
1880 void
1881 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
1882 {
1883         /* use volatile pointer to prevent code rearrangement */
1884         volatile XLogCtlData *xlogctl = XLogCtl;
1885
1886         SpinLockAcquire(&xlogctl->info_lck);
1887         if (XLByteLT(xlogctl->asyncXactLSN, asyncXactLSN))
1888                 xlogctl->asyncXactLSN = asyncXactLSN;
1889         SpinLockRelease(&xlogctl->info_lck);
1890 }
1891
1892 /*
1893  * Advance minRecoveryPoint in control file.
1894  *
1895  * If we crash during recovery, we must reach this point again before the
1896  * database is consistent.
1897  *
1898  * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
1899  * is only updated if it's not already greater than or equal to 'lsn'.
1900  */
1901 static void
1902 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
1903 {
1904         /* Quick check using our local copy of the variable */
1905         if (!updateMinRecoveryPoint || (!force && XLByteLE(lsn, minRecoveryPoint)))
1906                 return;
1907
1908         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
1909
1910         /* update local copy */
1911         minRecoveryPoint = ControlFile->minRecoveryPoint;
1912
1913         /*
1914          * An invalid minRecoveryPoint means that we need to recover all the WAL,
1915          * i.e., we're doing crash recovery.  We never modify the control file's
1916          * value in that case, so we can short-circuit future checks here too.
1917          */
1918         if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
1919                 updateMinRecoveryPoint = false;
1920         else if (force || XLByteLT(minRecoveryPoint, lsn))
1921         {
1922                 /* use volatile pointer to prevent code rearrangement */
1923                 volatile XLogCtlData *xlogctl = XLogCtl;
1924                 XLogRecPtr      newMinRecoveryPoint;
1925
1926                 /*
1927                  * To avoid having to update the control file too often, we update it
1928                  * all the way to the last record being replayed, even though 'lsn'
1929                  * would suffice for correctness.  This also allows the 'force' case
1930                  * to not need a valid 'lsn' value.
1931                  *
1932                  * Another important reason for doing it this way is that the passed
1933                  * 'lsn' value could be bogus, i.e., past the end of available WAL, if
1934                  * the caller got it from a corrupted heap page.  Accepting such a
1935                  * value as the min recovery point would prevent us from coming up at
1936                  * all.  Instead, we just log a warning and continue with recovery.
1937                  * (See also the comments about corrupt LSNs in XLogFlush.)
1938                  */
1939                 SpinLockAcquire(&xlogctl->info_lck);
1940                 newMinRecoveryPoint = xlogctl->replayEndRecPtr;
1941                 SpinLockRelease(&xlogctl->info_lck);
1942
1943                 if (!force && XLByteLT(newMinRecoveryPoint, lsn))
1944                         elog(WARNING,
1945                            "xlog min recovery request %X/%X is past current point %X/%X",
1946                                  lsn.xlogid, lsn.xrecoff,
1947                                  newMinRecoveryPoint.xlogid, newMinRecoveryPoint.xrecoff);
1948
1949                 /* update control file */
1950                 if (XLByteLT(ControlFile->minRecoveryPoint, newMinRecoveryPoint))
1951                 {
1952                         ControlFile->minRecoveryPoint = newMinRecoveryPoint;
1953                         UpdateControlFile();
1954                         minRecoveryPoint = newMinRecoveryPoint;
1955
1956                         ereport(DEBUG2,
1957                                         (errmsg("updated min recovery point to %X/%X",
1958                                                 minRecoveryPoint.xlogid, minRecoveryPoint.xrecoff)));
1959                 }
1960         }
1961         LWLockRelease(ControlFileLock);
1962 }
1963
1964 /*
1965  * Ensure that all XLOG data through the given position is flushed to disk.
1966  *
1967  * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
1968  * already held, and we try to avoid acquiring it if possible.
1969  */
1970 void
1971 XLogFlush(XLogRecPtr record)
1972 {
1973         XLogRecPtr      WriteRqstPtr;
1974         XLogwrtRqst WriteRqst;
1975
1976         /*
1977          * During REDO, we are reading not writing WAL.  Therefore, instead of
1978          * trying to flush the WAL, we should update minRecoveryPoint instead. We
1979          * test XLogInsertAllowed(), not InRecovery, because we need the bgwriter
1980          * to act this way too, and because when the bgwriter tries to write the
1981          * end-of-recovery checkpoint, it should indeed flush.
1982          */
1983         if (!XLogInsertAllowed())
1984         {
1985                 UpdateMinRecoveryPoint(record, false);
1986                 return;
1987         }
1988
1989         /* Quick exit if already known flushed */
1990         if (XLByteLE(record, LogwrtResult.Flush))
1991                 return;
1992
1993 #ifdef WAL_DEBUG
1994         if (XLOG_DEBUG)
1995                 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
1996                          record.xlogid, record.xrecoff,
1997                          LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
1998                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
1999 #endif
2000
2001         START_CRIT_SECTION();
2002
2003         /*
2004          * Since fsync is usually a horribly expensive operation, we try to
2005          * piggyback as much data as we can on each fsync: if we see any more data
2006          * entered into the xlog buffer, we'll write and fsync that too, so that
2007          * the final value of LogwrtResult.Flush is as large as possible. This
2008          * gives us some chance of avoiding another fsync immediately after.
2009          */
2010
2011         /* initialize to given target; may increase below */
2012         WriteRqstPtr = record;
2013
2014         /* read LogwrtResult and update local state */
2015         {
2016                 /* use volatile pointer to prevent code rearrangement */
2017                 volatile XLogCtlData *xlogctl = XLogCtl;
2018
2019                 SpinLockAcquire(&xlogctl->info_lck);
2020                 if (XLByteLT(WriteRqstPtr, xlogctl->LogwrtRqst.Write))
2021                         WriteRqstPtr = xlogctl->LogwrtRqst.Write;
2022                 LogwrtResult = xlogctl->LogwrtResult;
2023                 SpinLockRelease(&xlogctl->info_lck);
2024         }
2025
2026         /* done already? */
2027         if (!XLByteLE(record, LogwrtResult.Flush))
2028         {
2029                 /* now wait for the write lock */
2030                 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2031                 LogwrtResult = XLogCtl->Write.LogwrtResult;
2032                 if (!XLByteLE(record, LogwrtResult.Flush))
2033                 {
2034                         /* try to write/flush later additions to XLOG as well */
2035                         if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
2036                         {
2037                                 XLogCtlInsert *Insert = &XLogCtl->Insert;
2038                                 uint32          freespace = INSERT_FREESPACE(Insert);
2039
2040                                 if (freespace < SizeOfXLogRecord)               /* buffer is full */
2041                                         WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
2042                                 else
2043                                 {
2044                                         WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
2045                                         WriteRqstPtr.xrecoff -= freespace;
2046                                 }
2047                                 LWLockRelease(WALInsertLock);
2048                                 WriteRqst.Write = WriteRqstPtr;
2049                                 WriteRqst.Flush = WriteRqstPtr;
2050                         }
2051                         else
2052                         {
2053                                 WriteRqst.Write = WriteRqstPtr;
2054                                 WriteRqst.Flush = record;
2055                         }
2056                         XLogWrite(WriteRqst, false, false);
2057                 }
2058                 LWLockRelease(WALWriteLock);
2059         }
2060
2061         END_CRIT_SECTION();
2062
2063         /*
2064          * If we still haven't flushed to the request point then we have a
2065          * problem; most likely, the requested flush point is past end of XLOG.
2066          * This has been seen to occur when a disk page has a corrupted LSN.
2067          *
2068          * Formerly we treated this as a PANIC condition, but that hurts the
2069          * system's robustness rather than helping it: we do not want to take down
2070          * the whole system due to corruption on one data page.  In particular, if
2071          * the bad page is encountered again during recovery then we would be
2072          * unable to restart the database at all!  (This scenario actually
2073          * happened in the field several times with 7.1 releases.)      As of 8.4, bad
2074          * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
2075          * the only time we can reach here during recovery is while flushing the
2076          * end-of-recovery checkpoint record, and we don't expect that to have a
2077          * bad LSN.
2078          *
2079          * Note that for calls from xact.c, the ERROR will be promoted to PANIC
2080          * since xact.c calls this routine inside a critical section.  However,
2081          * calls from bufmgr.c are not within critical sections and so we will not
2082          * force a restart for a bad LSN on a data page.
2083          */
2084         if (XLByteLT(LogwrtResult.Flush, record))
2085                 elog(ERROR,
2086                 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
2087                          record.xlogid, record.xrecoff,
2088                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
2089 }
2090
2091 /*
2092  * Flush xlog, but without specifying exactly where to flush to.
2093  *
2094  * We normally flush only completed blocks; but if there is nothing to do on
2095  * that basis, we check for unflushed async commits in the current incomplete
2096  * block, and flush through the latest one of those.  Thus, if async commits
2097  * are not being used, we will flush complete blocks only.      We can guarantee
2098  * that async commits reach disk after at most three cycles; normally only
2099  * one or two.  (We allow XLogWrite to write "flexibly", meaning it can stop
2100  * at the end of the buffer ring; this makes a difference only with very high
2101  * load or long wal_writer_delay, but imposes one extra cycle for the worst
2102  * case for async commits.)
2103  *
2104  * This routine is invoked periodically by the background walwriter process.
2105  */
2106 void
2107 XLogBackgroundFlush(void)
2108 {
2109         XLogRecPtr      WriteRqstPtr;
2110         bool            flexible = true;
2111
2112         /* XLOG doesn't need flushing during recovery */
2113         if (RecoveryInProgress())
2114                 return;
2115
2116         /* read LogwrtResult and update local state */
2117         {
2118                 /* use volatile pointer to prevent code rearrangement */
2119                 volatile XLogCtlData *xlogctl = XLogCtl;
2120
2121                 SpinLockAcquire(&xlogctl->info_lck);
2122                 LogwrtResult = xlogctl->LogwrtResult;
2123                 WriteRqstPtr = xlogctl->LogwrtRqst.Write;
2124                 SpinLockRelease(&xlogctl->info_lck);
2125         }
2126
2127         /* back off to last completed page boundary */
2128         WriteRqstPtr.xrecoff -= WriteRqstPtr.xrecoff % XLOG_BLCKSZ;
2129
2130         /* if we have already flushed that far, consider async commit records */
2131         if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
2132         {
2133                 /* use volatile pointer to prevent code rearrangement */
2134                 volatile XLogCtlData *xlogctl = XLogCtl;
2135
2136                 SpinLockAcquire(&xlogctl->info_lck);
2137                 WriteRqstPtr = xlogctl->asyncXactLSN;
2138                 SpinLockRelease(&xlogctl->info_lck);
2139                 flexible = false;               /* ensure it all gets written */
2140         }
2141
2142         /*
2143          * If already known flushed, we're done. Just need to check if we are
2144          * holding an open file handle to a logfile that's no longer in use,
2145          * preventing the file from being deleted.
2146          */
2147         if (XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
2148         {
2149                 if (openLogFile >= 0)
2150                 {
2151                         if (!XLByteInPrevSeg(LogwrtResult.Write, openLogId, openLogSeg))
2152                         {
2153                                 XLogFileClose();
2154                         }
2155                 }
2156                 return;
2157         }
2158
2159 #ifdef WAL_DEBUG
2160         if (XLOG_DEBUG)
2161                 elog(LOG, "xlog bg flush request %X/%X; write %X/%X; flush %X/%X",
2162                          WriteRqstPtr.xlogid, WriteRqstPtr.xrecoff,
2163                          LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff,
2164                          LogwrtResult.Flush.xlogid, LogwrtResult.Flush.xrecoff);
2165 #endif
2166
2167         START_CRIT_SECTION();
2168
2169         /* now wait for the write lock */
2170         LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2171         LogwrtResult = XLogCtl->Write.LogwrtResult;
2172         if (!XLByteLE(WriteRqstPtr, LogwrtResult.Flush))
2173         {
2174                 XLogwrtRqst WriteRqst;
2175
2176                 WriteRqst.Write = WriteRqstPtr;
2177                 WriteRqst.Flush = WriteRqstPtr;
2178                 XLogWrite(WriteRqst, flexible, false);
2179         }
2180         LWLockRelease(WALWriteLock);
2181
2182         END_CRIT_SECTION();
2183 }
2184
2185 /*
2186  * Test whether XLOG data has been flushed up to (at least) the given position.
2187  *
2188  * Returns true if a flush is still needed.  (It may be that someone else
2189  * is already in process of flushing that far, however.)
2190  */
2191 bool
2192 XLogNeedsFlush(XLogRecPtr record)
2193 {
2194         /*
2195          * During recovery, we don't flush WAL but update minRecoveryPoint
2196          * instead. So "needs flush" is taken to mean whether minRecoveryPoint
2197          * would need to be updated.
2198          */
2199         if (RecoveryInProgress())
2200         {
2201                 /* Quick exit if already known updated */
2202                 if (XLByteLE(record, minRecoveryPoint) || !updateMinRecoveryPoint)
2203                         return false;
2204
2205                 /*
2206                  * Update local copy of minRecoveryPoint. But if the lock is busy,
2207                  * just return a conservative guess.
2208                  */
2209                 if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
2210                         return true;
2211                 minRecoveryPoint = ControlFile->minRecoveryPoint;
2212                 LWLockRelease(ControlFileLock);
2213
2214                 /*
2215                  * An invalid minRecoveryPoint means that we need to recover all the
2216                  * WAL, i.e., we're doing crash recovery.  We never modify the control
2217                  * file's value in that case, so we can short-circuit future checks
2218                  * here too.
2219                  */
2220                 if (minRecoveryPoint.xlogid == 0 && minRecoveryPoint.xrecoff == 0)
2221                         updateMinRecoveryPoint = false;
2222
2223                 /* check again */
2224                 if (XLByteLE(record, minRecoveryPoint) || !updateMinRecoveryPoint)
2225                         return false;
2226                 else
2227                         return true;
2228         }
2229
2230         /* Quick exit if already known flushed */
2231         if (XLByteLE(record, LogwrtResult.Flush))
2232                 return false;
2233
2234         /* read LogwrtResult and update local state */
2235         {
2236                 /* use volatile pointer to prevent code rearrangement */
2237                 volatile XLogCtlData *xlogctl = XLogCtl;
2238
2239                 SpinLockAcquire(&xlogctl->info_lck);
2240                 LogwrtResult = xlogctl->LogwrtResult;
2241                 SpinLockRelease(&xlogctl->info_lck);
2242         }
2243
2244         /* check again */
2245         if (XLByteLE(record, LogwrtResult.Flush))
2246                 return false;
2247
2248         return true;
2249 }
2250
2251 /*
2252  * Create a new XLOG file segment, or open a pre-existing one.
2253  *
2254  * log, seg: identify segment to be created/opened.
2255  *
2256  * *use_existent: if TRUE, OK to use a pre-existing file (else, any
2257  * pre-existing file will be deleted).  On return, TRUE if a pre-existing
2258  * file was used.
2259  *
2260  * use_lock: if TRUE, acquire ControlFileLock while moving file into
2261  * place.  This should be TRUE except during bootstrap log creation.  The
2262  * caller must *not* hold the lock at call.
2263  *
2264  * Returns FD of opened file.
2265  *
2266  * Note: errors here are ERROR not PANIC because we might or might not be
2267  * inside a critical section (eg, during checkpoint there is no reason to
2268  * take down the system on failure).  They will promote to PANIC if we are
2269  * in a critical section.
2270  */
2271 int
2272 XLogFileInit(uint32 log, uint32 seg,
2273                          bool *use_existent, bool use_lock)
2274 {
2275         char            path[MAXPGPATH];
2276         char            tmppath[MAXPGPATH];
2277         char       *zbuffer;
2278         uint32          installed_log;
2279         uint32          installed_seg;
2280         int                     max_advance;
2281         int                     fd;
2282         int                     nbytes;
2283
2284         XLogFilePath(path, ThisTimeLineID, log, seg);
2285
2286         /*
2287          * Try to use existent file (checkpoint maker may have created it already)
2288          */
2289         if (*use_existent)
2290         {
2291                 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2292                                                    S_IRUSR | S_IWUSR);
2293                 if (fd < 0)
2294                 {
2295                         if (errno != ENOENT)
2296                                 ereport(ERROR,
2297                                                 (errcode_for_file_access(),
2298                                                  errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2299                                                                 path, log, seg)));
2300                 }
2301                 else
2302                         return fd;
2303         }
2304
2305         /*
2306          * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
2307          * another process is doing the same thing.  If so, we will end up
2308          * pre-creating an extra log segment.  That seems OK, and better than
2309          * holding the lock throughout this lengthy process.
2310          */
2311         elog(DEBUG2, "creating and filling new WAL file");
2312
2313         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2314
2315         unlink(tmppath);
2316
2317         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
2318         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
2319                                            S_IRUSR | S_IWUSR);
2320         if (fd < 0)
2321                 ereport(ERROR,
2322                                 (errcode_for_file_access(),
2323                                  errmsg("could not create file \"%s\": %m", tmppath)));
2324
2325         /*
2326          * Zero-fill the file.  We have to do this the hard way to ensure that all
2327          * the file space has really been allocated --- on platforms that allow
2328          * "holes" in files, just seeking to the end doesn't allocate intermediate
2329          * space.  This way, we know that we have all the space and (after the
2330          * fsync below) that all the indirect blocks are down on disk.  Therefore,
2331          * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
2332          * log file.
2333          *
2334          * Note: palloc zbuffer, instead of just using a local char array, to
2335          * ensure it is reasonably well-aligned; this may save a few cycles
2336          * transferring data to the kernel.
2337          */
2338         zbuffer = (char *) palloc0(XLOG_BLCKSZ);
2339         for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
2340         {
2341                 errno = 0;
2342                 if ((int) write(fd, zbuffer, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
2343                 {
2344                         int                     save_errno = errno;
2345
2346                         /*
2347                          * If we fail to make the file, delete it to release disk space
2348                          */
2349                         unlink(tmppath);
2350                         /* if write didn't set errno, assume problem is no disk space */
2351                         errno = save_errno ? save_errno : ENOSPC;
2352
2353                         ereport(ERROR,
2354                                         (errcode_for_file_access(),
2355                                          errmsg("could not write to file \"%s\": %m", tmppath)));
2356                 }
2357         }
2358         pfree(zbuffer);
2359
2360         if (pg_fsync(fd) != 0)
2361                 ereport(ERROR,
2362                                 (errcode_for_file_access(),
2363                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
2364
2365         if (close(fd))
2366                 ereport(ERROR,
2367                                 (errcode_for_file_access(),
2368                                  errmsg("could not close file \"%s\": %m", tmppath)));
2369
2370         /*
2371          * Now move the segment into place with its final name.
2372          *
2373          * If caller didn't want to use a pre-existing file, get rid of any
2374          * pre-existing file.  Otherwise, cope with possibility that someone else
2375          * has created the file while we were filling ours: if so, use ours to
2376          * pre-create a future log segment.
2377          */
2378         installed_log = log;
2379         installed_seg = seg;
2380         max_advance = XLOGfileslop;
2381         if (!InstallXLogFileSegment(&installed_log, &installed_seg, tmppath,
2382                                                                 *use_existent, &max_advance,
2383                                                                 use_lock))
2384         {
2385                 /*
2386                  * No need for any more future segments, or InstallXLogFileSegment()
2387                  * failed to rename the file into place. If the rename failed, opening
2388                  * the file below will fail.
2389                  */
2390                 unlink(tmppath);
2391         }
2392
2393         /* Set flag to tell caller there was no existent file */
2394         *use_existent = false;
2395
2396         /* Now open original target segment (might not be file I just made) */
2397         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2398                                            S_IRUSR | S_IWUSR);
2399         if (fd < 0)
2400                 ereport(ERROR,
2401                                 (errcode_for_file_access(),
2402                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2403                                   path, log, seg)));
2404
2405         elog(DEBUG2, "done creating and filling new WAL file");
2406
2407         return fd;
2408 }
2409
2410 /*
2411  * Create a new XLOG file segment by copying a pre-existing one.
2412  *
2413  * log, seg: identify segment to be created.
2414  *
2415  * srcTLI, srclog, srcseg: identify segment to be copied (could be from
2416  *              a different timeline)
2417  *
2418  * Currently this is only used during recovery, and so there are no locking
2419  * considerations.      But we should be just as tense as XLogFileInit to avoid
2420  * emplacing a bogus file.
2421  */
2422 static void
2423 XLogFileCopy(uint32 log, uint32 seg,
2424                          TimeLineID srcTLI, uint32 srclog, uint32 srcseg)
2425 {
2426         char            path[MAXPGPATH];
2427         char            tmppath[MAXPGPATH];
2428         char            buffer[XLOG_BLCKSZ];
2429         int                     srcfd;
2430         int                     fd;
2431         int                     nbytes;
2432
2433         /*
2434          * Open the source file
2435          */
2436         XLogFilePath(path, srcTLI, srclog, srcseg);
2437         srcfd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
2438         if (srcfd < 0)
2439                 ereport(ERROR,
2440                                 (errcode_for_file_access(),
2441                                  errmsg("could not open file \"%s\": %m", path)));
2442
2443         /*
2444          * Copy into a temp file name.
2445          */
2446         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
2447
2448         unlink(tmppath);
2449
2450         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
2451         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
2452                                            S_IRUSR | S_IWUSR);
2453         if (fd < 0)
2454                 ereport(ERROR,
2455                                 (errcode_for_file_access(),
2456                                  errmsg("could not create file \"%s\": %m", tmppath)));
2457
2458         /*
2459          * Do the data copying.
2460          */
2461         for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
2462         {
2463                 errno = 0;
2464                 if ((int) read(srcfd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
2465                 {
2466                         if (errno != 0)
2467                                 ereport(ERROR,
2468                                                 (errcode_for_file_access(),
2469                                                  errmsg("could not read file \"%s\": %m", path)));
2470                         else
2471                                 ereport(ERROR,
2472                                                 (errmsg("not enough data in file \"%s\"", path)));
2473                 }
2474                 errno = 0;
2475                 if ((int) write(fd, buffer, sizeof(buffer)) != (int) sizeof(buffer))
2476                 {
2477                         int                     save_errno = errno;
2478
2479                         /*
2480                          * If we fail to make the file, delete it to release disk space
2481                          */
2482                         unlink(tmppath);
2483                         /* if write didn't set errno, assume problem is no disk space */
2484                         errno = save_errno ? save_errno : ENOSPC;
2485
2486                         ereport(ERROR,
2487                                         (errcode_for_file_access(),
2488                                          errmsg("could not write to file \"%s\": %m", tmppath)));
2489                 }
2490         }
2491
2492         if (pg_fsync(fd) != 0)
2493                 ereport(ERROR,
2494                                 (errcode_for_file_access(),
2495                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
2496
2497         if (close(fd))
2498                 ereport(ERROR,
2499                                 (errcode_for_file_access(),
2500                                  errmsg("could not close file \"%s\": %m", tmppath)));
2501
2502         close(srcfd);
2503
2504         /*
2505          * Now move the segment into place with its final name.
2506          */
2507         if (!InstallXLogFileSegment(&log, &seg, tmppath, false, NULL, false))
2508                 elog(ERROR, "InstallXLogFileSegment should not have failed");
2509 }
2510
2511 /*
2512  * Install a new XLOG segment file as a current or future log segment.
2513  *
2514  * This is used both to install a newly-created segment (which has a temp
2515  * filename while it's being created) and to recycle an old segment.
2516  *
2517  * *log, *seg: identify segment to install as (or first possible target).
2518  * When find_free is TRUE, these are modified on return to indicate the
2519  * actual installation location or last segment searched.
2520  *
2521  * tmppath: initial name of file to install.  It will be renamed into place.
2522  *
2523  * find_free: if TRUE, install the new segment at the first empty log/seg
2524  * number at or after the passed numbers.  If FALSE, install the new segment
2525  * exactly where specified, deleting any existing segment file there.
2526  *
2527  * *max_advance: maximum number of log/seg slots to advance past the starting
2528  * point.  Fail if no free slot is found in this range.  On return, reduced
2529  * by the number of slots skipped over.  (Irrelevant, and may be NULL,
2530  * when find_free is FALSE.)
2531  *
2532  * use_lock: if TRUE, acquire ControlFileLock while moving file into
2533  * place.  This should be TRUE except during bootstrap log creation.  The
2534  * caller must *not* hold the lock at call.
2535  *
2536  * Returns TRUE if the file was installed successfully.  FALSE indicates that
2537  * max_advance limit was exceeded, or an error occurred while renaming the
2538  * file into place.
2539  */
2540 static bool
2541 InstallXLogFileSegment(uint32 *log, uint32 *seg, char *tmppath,
2542                                            bool find_free, int *max_advance,
2543                                            bool use_lock)
2544 {
2545         char            path[MAXPGPATH];
2546         struct stat stat_buf;
2547
2548         XLogFilePath(path, ThisTimeLineID, *log, *seg);
2549
2550         /*
2551          * We want to be sure that only one process does this at a time.
2552          */
2553         if (use_lock)
2554                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2555
2556         if (!find_free)
2557         {
2558                 /* Force installation: get rid of any pre-existing segment file */
2559                 unlink(path);
2560         }
2561         else
2562         {
2563                 /* Find a free slot to put it in */
2564                 while (stat(path, &stat_buf) == 0)
2565                 {
2566                         if (*max_advance <= 0)
2567                         {
2568                                 /* Failed to find a free slot within specified range */
2569                                 if (use_lock)
2570                                         LWLockRelease(ControlFileLock);
2571                                 return false;
2572                         }
2573                         NextLogSeg(*log, *seg);
2574                         (*max_advance)--;
2575                         XLogFilePath(path, ThisTimeLineID, *log, *seg);
2576                 }
2577         }
2578
2579         /*
2580          * Prefer link() to rename() here just to be really sure that we don't
2581          * overwrite an existing logfile.  However, there shouldn't be one, so
2582          * rename() is an acceptable substitute except for the truly paranoid.
2583          */
2584 #if HAVE_WORKING_LINK
2585         if (link(tmppath, path) < 0)
2586         {
2587                 if (use_lock)
2588                         LWLockRelease(ControlFileLock);
2589                 ereport(LOG,
2590                                 (errcode_for_file_access(),
2591                                  errmsg("could not link file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
2592                                                 tmppath, path, *log, *seg)));
2593                 return false;
2594         }
2595         unlink(tmppath);
2596 #else
2597         if (rename(tmppath, path) < 0)
2598         {
2599                 if (use_lock)
2600                         LWLockRelease(ControlFileLock);
2601                 ereport(LOG,
2602                                 (errcode_for_file_access(),
2603                                  errmsg("could not rename file \"%s\" to \"%s\" (initialization of log file %u, segment %u): %m",
2604                                                 tmppath, path, *log, *seg)));
2605                 return false;
2606         }
2607 #endif
2608
2609         if (use_lock)
2610                 LWLockRelease(ControlFileLock);
2611
2612         return true;
2613 }
2614
2615 /*
2616  * Open a pre-existing logfile segment for writing.
2617  */
2618 int
2619 XLogFileOpen(uint32 log, uint32 seg)
2620 {
2621         char            path[MAXPGPATH];
2622         int                     fd;
2623
2624         XLogFilePath(path, ThisTimeLineID, log, seg);
2625
2626         fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
2627                                            S_IRUSR | S_IWUSR);
2628         if (fd < 0)
2629                 ereport(PANIC,
2630                                 (errcode_for_file_access(),
2631                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2632                                   path, log, seg)));
2633
2634         return fd;
2635 }
2636
2637 /*
2638  * Open a logfile segment for reading (during recovery).
2639  *
2640  * If source = XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
2641  * Otherwise, it's assumed to be already available in pg_xlog.
2642  */
2643 static int
2644 XLogFileRead(uint32 log, uint32 seg, int emode, TimeLineID tli,
2645                          int source, bool notfoundOk)
2646 {
2647         char            xlogfname[MAXFNAMELEN];
2648         char            activitymsg[MAXFNAMELEN + 16];
2649         char            path[MAXPGPATH];
2650         int                     fd;
2651
2652         XLogFileName(xlogfname, tli, log, seg);
2653
2654         switch (source)
2655         {
2656                 case XLOG_FROM_ARCHIVE:
2657                         /* Report recovery progress in PS display */
2658                         snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
2659                                          xlogfname);
2660                         set_ps_display(activitymsg, false);
2661
2662                         restoredFromArchive = RestoreArchivedFile(path, xlogfname,
2663                                                                                                           "RECOVERYXLOG",
2664                                                                                                           XLogSegSize);
2665                         if (!restoredFromArchive)
2666                                 return -1;
2667                         break;
2668
2669                 case XLOG_FROM_PG_XLOG:
2670                 case XLOG_FROM_STREAM:
2671                         XLogFilePath(path, tli, log, seg);
2672                         restoredFromArchive = false;
2673                         break;
2674
2675                 default:
2676                         elog(ERROR, "invalid XLogFileRead source %d", source);
2677         }
2678
2679         fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
2680         if (fd >= 0)
2681         {
2682                 /* Success! */
2683                 curFileTLI = tli;
2684
2685                 /* Report recovery progress in PS display */
2686                 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
2687                                  xlogfname);
2688                 set_ps_display(activitymsg, false);
2689
2690                 /* Track source of data in assorted state variables */
2691                 readSource = source;
2692                 XLogReceiptSource = source;
2693                 /* In FROM_STREAM case, caller tracks receipt time, not me */
2694                 if (source != XLOG_FROM_STREAM)
2695                         XLogReceiptTime = GetCurrentTimestamp();
2696
2697                 return fd;
2698         }
2699         if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
2700                 ereport(PANIC,
2701                                 (errcode_for_file_access(),
2702                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2703                                   path, log, seg)));
2704         return -1;
2705 }
2706
2707 /*
2708  * Open a logfile segment for reading (during recovery).
2709  *
2710  * This version searches for the segment with any TLI listed in expectedTLIs.
2711  */
2712 static int
2713 XLogFileReadAnyTLI(uint32 log, uint32 seg, int emode, int sources)
2714 {
2715         char            path[MAXPGPATH];
2716         ListCell   *cell;
2717         int                     fd;
2718
2719         /*
2720          * Loop looking for a suitable timeline ID: we might need to read any of
2721          * the timelines listed in expectedTLIs.
2722          *
2723          * We expect curFileTLI on entry to be the TLI of the preceding file in
2724          * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
2725          * to go backwards; this prevents us from picking up the wrong file when a
2726          * parent timeline extends to higher segment numbers than the child we
2727          * want to read.
2728          */
2729         foreach(cell, expectedTLIs)
2730         {
2731                 TimeLineID      tli = (TimeLineID) lfirst_int(cell);
2732
2733                 if (tli < curFileTLI)
2734                         break;                          /* don't bother looking at too-old TLIs */
2735
2736                 if (sources & XLOG_FROM_ARCHIVE)
2737                 {
2738                         fd = XLogFileRead(log, seg, emode, tli, XLOG_FROM_ARCHIVE, true);
2739                         if (fd != -1)
2740                         {
2741                                 elog(DEBUG1, "got WAL segment from archive");
2742                                 return fd;
2743                         }
2744                 }
2745
2746                 if (sources & XLOG_FROM_PG_XLOG)
2747                 {
2748                         fd = XLogFileRead(log, seg, emode, tli, XLOG_FROM_PG_XLOG, true);
2749                         if (fd != -1)
2750                                 return fd;
2751                 }
2752         }
2753
2754         /* Couldn't find it.  For simplicity, complain about front timeline */
2755         XLogFilePath(path, recoveryTargetTLI, log, seg);
2756         errno = ENOENT;
2757         ereport(emode,
2758                         (errcode_for_file_access(),
2759                    errmsg("could not open file \"%s\" (log file %u, segment %u): %m",
2760                                   path, log, seg)));
2761         return -1;
2762 }
2763
2764 /*
2765  * Close the current logfile segment for writing.
2766  */
2767 static void
2768 XLogFileClose(void)
2769 {
2770         Assert(openLogFile >= 0);
2771
2772         /*
2773          * WAL segment files will not be re-read in normal operation, so we advise
2774          * the OS to release any cached pages.  But do not do so if WAL archiving
2775          * or streaming is active, because archiver and walsender process could
2776          * use the cache to read the WAL segment.
2777          */
2778 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
2779         if (!XLogIsNeeded())
2780                 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
2781 #endif
2782
2783         if (close(openLogFile))
2784                 ereport(PANIC,
2785                                 (errcode_for_file_access(),
2786                                  errmsg("could not close log file %u, segment %u: %m",
2787                                                 openLogId, openLogSeg)));
2788         openLogFile = -1;
2789 }
2790
2791 /*
2792  * Attempt to retrieve the specified file from off-line archival storage.
2793  * If successful, fill "path" with its complete path (note that this will be
2794  * a temp file name that doesn't follow the normal naming convention), and
2795  * return TRUE.
2796  *
2797  * If not successful, fill "path" with the name of the normal on-line file
2798  * (which may or may not actually exist, but we'll try to use it), and return
2799  * FALSE.
2800  *
2801  * For fixed-size files, the caller may pass the expected size as an
2802  * additional crosscheck on successful recovery.  If the file size is not
2803  * known, set expectedSize = 0.
2804  */
2805 static bool
2806 RestoreArchivedFile(char *path, const char *xlogfname,
2807                                         const char *recovername, off_t expectedSize)
2808 {
2809         char            xlogpath[MAXPGPATH];
2810         char            xlogRestoreCmd[MAXPGPATH];
2811         char            lastRestartPointFname[MAXPGPATH];
2812         char       *dp;
2813         char       *endp;
2814         const char *sp;
2815         int                     rc;
2816         bool            signaled;
2817         struct stat stat_buf;
2818         uint32          restartLog;
2819         uint32          restartSeg;
2820
2821         /* In standby mode, restore_command might not be supplied */
2822         if (recoveryRestoreCommand == NULL)
2823                 goto not_available;
2824
2825         /*
2826          * When doing archive recovery, we always prefer an archived log file even
2827          * if a file of the same name exists in XLOGDIR.  The reason is that the
2828          * file in XLOGDIR could be an old, un-filled or partly-filled version
2829          * that was copied and restored as part of backing up $PGDATA.
2830          *
2831          * We could try to optimize this slightly by checking the local copy
2832          * lastchange timestamp against the archived copy, but we have no API to
2833          * do this, nor can we guarantee that the lastchange timestamp was
2834          * preserved correctly when we copied to archive. Our aim is robustness,
2835          * so we elect not to do this.
2836          *
2837          * If we cannot obtain the log file from the archive, however, we will try
2838          * to use the XLOGDIR file if it exists.  This is so that we can make use
2839          * of log segments that weren't yet transferred to the archive.
2840          *
2841          * Notice that we don't actually overwrite any files when we copy back
2842          * from archive because the recoveryRestoreCommand may inadvertently
2843          * restore inappropriate xlogs, or they may be corrupt, so we may wish to
2844          * fallback to the segments remaining in current XLOGDIR later. The
2845          * copy-from-archive filename is always the same, ensuring that we don't
2846          * run out of disk space on long recoveries.
2847          */
2848         snprintf(xlogpath, MAXPGPATH, XLOGDIR "/%s", recovername);
2849
2850         /*
2851          * Make sure there is no existing file named recovername.
2852          */
2853         if (stat(xlogpath, &stat_buf) != 0)
2854         {
2855                 if (errno != ENOENT)
2856                         ereport(FATAL,
2857                                         (errcode_for_file_access(),
2858                                          errmsg("could not stat file \"%s\": %m",
2859                                                         xlogpath)));
2860         }
2861         else
2862         {
2863                 if (unlink(xlogpath) != 0)
2864                         ereport(FATAL,
2865                                         (errcode_for_file_access(),
2866                                          errmsg("could not remove file \"%s\": %m",
2867                                                         xlogpath)));
2868         }
2869
2870         /*
2871          * Calculate the archive file cutoff point for use during log shipping
2872          * replication. All files earlier than this point can be deleted from the
2873          * archive, though there is no requirement to do so.
2874          *
2875          * We initialise this with the filename of an InvalidXLogRecPtr, which
2876          * will prevent the deletion of any WAL files from the archive because of
2877          * the alphabetic sorting property of WAL filenames.
2878          *
2879          * Once we have successfully located the redo pointer of the checkpoint
2880          * from which we start recovery we never request a file prior to the redo
2881          * pointer of the last restartpoint. When redo begins we know that we have
2882          * successfully located it, so there is no need for additional status
2883          * flags to signify the point when we can begin deleting WAL files from
2884          * the archive.
2885          */
2886         if (InRedo)
2887         {
2888                 XLByteToSeg(ControlFile->checkPointCopy.redo,
2889                                         restartLog, restartSeg);
2890                 XLogFileName(lastRestartPointFname,
2891                                          ControlFile->checkPointCopy.ThisTimeLineID,
2892                                          restartLog, restartSeg);
2893                 /* we shouldn't need anything earlier than last restart point */
2894                 Assert(strcmp(lastRestartPointFname, xlogfname) <= 0);
2895         }
2896         else
2897                 XLogFileName(lastRestartPointFname, 0, 0, 0);
2898
2899         /*
2900          * construct the command to be executed
2901          */
2902         dp = xlogRestoreCmd;
2903         endp = xlogRestoreCmd + MAXPGPATH - 1;
2904         *endp = '\0';
2905
2906         for (sp = recoveryRestoreCommand; *sp; sp++)
2907         {
2908                 if (*sp == '%')
2909                 {
2910                         switch (sp[1])
2911                         {
2912                                 case 'p':
2913                                         /* %p: relative path of target file */
2914                                         sp++;
2915                                         StrNCpy(dp, xlogpath, endp - dp);
2916                                         make_native_path(dp);
2917                                         dp += strlen(dp);
2918                                         break;
2919                                 case 'f':
2920                                         /* %f: filename of desired file */
2921                                         sp++;
2922                                         StrNCpy(dp, xlogfname, endp - dp);
2923                                         dp += strlen(dp);
2924                                         break;
2925                                 case 'r':
2926                                         /* %r: filename of last restartpoint */
2927                                         sp++;
2928                                         StrNCpy(dp, lastRestartPointFname, endp - dp);
2929                                         dp += strlen(dp);
2930                                         break;
2931                                 case '%':
2932                                         /* convert %% to a single % */
2933                                         sp++;
2934                                         if (dp < endp)
2935                                                 *dp++ = *sp;
2936                                         break;
2937                                 default:
2938                                         /* otherwise treat the % as not special */
2939                                         if (dp < endp)
2940                                                 *dp++ = *sp;
2941                                         break;
2942                         }
2943                 }
2944                 else
2945                 {
2946                         if (dp < endp)
2947                                 *dp++ = *sp;
2948                 }
2949         }
2950         *dp = '\0';
2951
2952         ereport(DEBUG3,
2953                         (errmsg_internal("executing restore command \"%s\"",
2954                                                          xlogRestoreCmd)));
2955
2956         /*
2957          * Set in_restore_command to tell the signal handler that we should exit
2958          * right away on SIGTERM. We know that we're at a safe point to do that.
2959          * Check if we had already received the signal, so that we don't miss a
2960          * shutdown request received just before this.
2961          */
2962         in_restore_command = true;
2963         if (shutdown_requested)
2964                 proc_exit(1);
2965
2966         /*
2967          * Copy xlog from archival storage to XLOGDIR
2968          */
2969         rc = system(xlogRestoreCmd);
2970
2971         in_restore_command = false;
2972
2973         if (rc == 0)
2974         {
2975                 /*
2976                  * command apparently succeeded, but let's make sure the file is
2977                  * really there now and has the correct size.
2978                  */
2979                 if (stat(xlogpath, &stat_buf) == 0)
2980                 {
2981                         if (expectedSize > 0 && stat_buf.st_size != expectedSize)
2982                         {
2983                                 int                     elevel;
2984
2985                                 /*
2986                                  * If we find a partial file in standby mode, we assume it's
2987                                  * because it's just being copied to the archive, and keep
2988                                  * trying.
2989                                  *
2990                                  * Otherwise treat a wrong-sized file as FATAL to ensure the
2991                                  * DBA would notice it, but is that too strong? We could try
2992                                  * to plow ahead with a local copy of the file ... but the
2993                                  * problem is that there probably isn't one, and we'd
2994                                  * incorrectly conclude we've reached the end of WAL and we're
2995                                  * done recovering ...
2996                                  */
2997                                 if (StandbyMode && stat_buf.st_size < expectedSize)
2998                                         elevel = DEBUG1;
2999                                 else
3000                                         elevel = FATAL;
3001                                 ereport(elevel,
3002                                                 (errmsg("archive file \"%s\" has wrong size: %lu instead of %lu",
3003                                                                 xlogfname,
3004                                                                 (unsigned long) stat_buf.st_size,
3005                                                                 (unsigned long) expectedSize)));
3006                                 return false;
3007                         }
3008                         else
3009                         {
3010                                 ereport(LOG,
3011                                                 (errmsg("restored log file \"%s\" from archive",
3012                                                                 xlogfname)));
3013                                 strcpy(path, xlogpath);
3014                                 return true;
3015                         }
3016                 }
3017                 else
3018                 {
3019                         /* stat failed */
3020                         if (errno != ENOENT)
3021                                 ereport(FATAL,
3022                                                 (errcode_for_file_access(),
3023                                                  errmsg("could not stat file \"%s\": %m",
3024                                                                 xlogpath)));
3025                 }
3026         }
3027
3028         /*
3029          * Remember, we rollforward UNTIL the restore fails so failure here is
3030          * just part of the process... that makes it difficult to determine
3031          * whether the restore failed because there isn't an archive to restore,
3032          * or because the administrator has specified the restore program
3033          * incorrectly.  We have to assume the former.
3034          *
3035          * However, if the failure was due to any sort of signal, it's best to
3036          * punt and abort recovery.  (If we "return false" here, upper levels will
3037          * assume that recovery is complete and start up the database!) It's
3038          * essential to abort on child SIGINT and SIGQUIT, because per spec
3039          * system() ignores SIGINT and SIGQUIT while waiting; if we see one of
3040          * those it's a good bet we should have gotten it too.
3041          *
3042          * On SIGTERM, assume we have received a fast shutdown request, and exit
3043          * cleanly. It's pure chance whether we receive the SIGTERM first, or the
3044          * child process. If we receive it first, the signal handler will call
3045          * proc_exit, otherwise we do it here. If we or the child process received
3046          * SIGTERM for any other reason than a fast shutdown request, postmaster
3047          * will perform an immediate shutdown when it sees us exiting
3048          * unexpectedly.
3049          *
3050          * Per the Single Unix Spec, shells report exit status > 128 when a called
3051          * command died on a signal.  Also, 126 and 127 are used to report
3052          * problems such as an unfindable command; treat those as fatal errors
3053          * too.
3054          */
3055         if (WIFSIGNALED(rc) && WTERMSIG(rc) == SIGTERM)
3056                 proc_exit(1);
3057
3058         signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
3059
3060         ereport(signaled ? FATAL : DEBUG2,
3061                 (errmsg("could not restore file \"%s\" from archive: return code %d",
3062                                 xlogfname, rc)));
3063
3064 not_available:
3065
3066         /*
3067          * if an archived file is not available, there might still be a version of
3068          * this file in XLOGDIR, so return that as the filename to open.
3069          *
3070          * In many recovery scenarios we expect this to fail also, but if so that
3071          * just means we've reached the end of WAL.
3072          */
3073         snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
3074         return false;
3075 }
3076
3077 /*
3078  * Attempt to execute an external shell command during recovery.
3079  *
3080  * 'command' is the shell command to be executed, 'commandName' is a
3081  * human-readable name describing the command emitted in the logs. If
3082  * 'failOnSignal' is true and the command is killed by a signal, a FATAL
3083  * error is thrown. Otherwise a WARNING is emitted.
3084  *
3085  * This is currently used for recovery_end_command and archive_cleanup_command.
3086  */
3087 static void
3088 ExecuteRecoveryCommand(char *command, char *commandName, bool failOnSignal)
3089 {
3090         char            xlogRecoveryCmd[MAXPGPATH];
3091         char            lastRestartPointFname[MAXPGPATH];
3092         char       *dp;
3093         char       *endp;
3094         const char *sp;
3095         int                     rc;
3096         bool            signaled;
3097         uint32          restartLog;
3098         uint32          restartSeg;
3099
3100         Assert(command && commandName);
3101
3102         /*
3103          * Calculate the archive file cutoff point for use during log shipping
3104          * replication. All files earlier than this point can be deleted from the
3105          * archive, though there is no requirement to do so.
3106          */
3107         LWLockAcquire(ControlFileLock, LW_SHARED);
3108         XLByteToSeg(ControlFile->checkPointCopy.redo,
3109                                 restartLog, restartSeg);
3110         XLogFileName(lastRestartPointFname,
3111                                  ControlFile->checkPointCopy.ThisTimeLineID,
3112                                  restartLog, restartSeg);
3113         LWLockRelease(ControlFileLock);
3114
3115         /*
3116          * construct the command to be executed
3117          */
3118         dp = xlogRecoveryCmd;
3119         endp = xlogRecoveryCmd + MAXPGPATH - 1;
3120         *endp = '\0';
3121
3122         for (sp = command; *sp; sp++)
3123         {
3124                 if (*sp == '%')
3125                 {
3126                         switch (sp[1])
3127                         {
3128                                 case 'r':
3129                                         /* %r: filename of last restartpoint */
3130                                         sp++;
3131                                         StrNCpy(dp, lastRestartPointFname, endp - dp);
3132                                         dp += strlen(dp);
3133                                         break;
3134                                 case '%':
3135                                         /* convert %% to a single % */
3136                                         sp++;
3137                                         if (dp < endp)
3138                                                 *dp++ = *sp;
3139                                         break;
3140                                 default:
3141                                         /* otherwise treat the % as not special */
3142                                         if (dp < endp)
3143                                                 *dp++ = *sp;
3144                                         break;
3145                         }
3146                 }
3147                 else
3148                 {
3149                         if (dp < endp)
3150                                 *dp++ = *sp;
3151                 }
3152         }
3153         *dp = '\0';
3154
3155         ereport(DEBUG3,
3156                         (errmsg_internal("executing %s \"%s\"", commandName, command)));
3157
3158         /*
3159          * execute the constructed command
3160          */
3161         rc = system(xlogRecoveryCmd);
3162         if (rc != 0)
3163         {
3164                 /*
3165                  * If the failure was due to any sort of signal, it's best to punt and
3166                  * abort recovery. See also detailed comments on signals in
3167                  * RestoreArchivedFile().
3168                  */
3169                 signaled = WIFSIGNALED(rc) || WEXITSTATUS(rc) > 125;
3170
3171                 /*
3172                  * translator: First %s represents a recovery.conf parameter name like
3173                  * "recovery_end_command", and the 2nd is the value of that parameter.
3174                  */
3175                 ereport((signaled && failOnSignal) ? FATAL : WARNING,
3176                                 (errmsg("%s \"%s\": return code %d", commandName,
3177                                                 command, rc)));
3178         }
3179 }
3180
3181 /*
3182  * Preallocate log files beyond the specified log endpoint.
3183  *
3184  * XXX this is currently extremely conservative, since it forces only one
3185  * future log segment to exist, and even that only if we are 75% done with
3186  * the current one.  This is only appropriate for very low-WAL-volume systems.
3187  * High-volume systems will be OK once they've built up a sufficient set of
3188  * recycled log segments, but the startup transient is likely to include
3189  * a lot of segment creations by foreground processes, which is not so good.
3190  */
3191 static void
3192 PreallocXlogFiles(XLogRecPtr endptr)
3193 {
3194         uint32          _logId;
3195         uint32          _logSeg;
3196         int                     lf;
3197         bool            use_existent;
3198
3199         XLByteToPrevSeg(endptr, _logId, _logSeg);
3200         if ((endptr.xrecoff - 1) % XLogSegSize >=
3201                 (uint32) (0.75 * XLogSegSize))
3202         {
3203                 NextLogSeg(_logId, _logSeg);
3204                 use_existent = true;
3205                 lf = XLogFileInit(_logId, _logSeg, &use_existent, true);
3206                 close(lf);
3207                 if (!use_existent)
3208                         CheckpointStats.ckpt_segs_added++;
3209         }
3210 }
3211
3212 /*
3213  * Get the log/seg of the latest removed or recycled WAL segment.
3214  * Returns 0/0 if no WAL segments have been removed since startup.
3215  */
3216 void
3217 XLogGetLastRemoved(uint32 *log, uint32 *seg)
3218 {
3219         /* use volatile pointer to prevent code rearrangement */
3220         volatile XLogCtlData *xlogctl = XLogCtl;
3221
3222         SpinLockAcquire(&xlogctl->info_lck);
3223         *log = xlogctl->lastRemovedLog;
3224         *seg = xlogctl->lastRemovedSeg;
3225         SpinLockRelease(&xlogctl->info_lck);
3226 }
3227
3228 /*
3229  * Update the last removed log/seg pointer in shared memory, to reflect
3230  * that the given XLOG file has been removed.
3231  */
3232 static void
3233 UpdateLastRemovedPtr(char *filename)
3234 {
3235         /* use volatile pointer to prevent code rearrangement */
3236         volatile XLogCtlData *xlogctl = XLogCtl;
3237         uint32          tli,
3238                                 log,
3239                                 seg;
3240
3241         XLogFromFileName(filename, &tli, &log, &seg);
3242
3243         SpinLockAcquire(&xlogctl->info_lck);
3244         if (log > xlogctl->lastRemovedLog ||
3245                 (log == xlogctl->lastRemovedLog && seg > xlogctl->lastRemovedSeg))
3246         {
3247                 xlogctl->lastRemovedLog = log;
3248                 xlogctl->lastRemovedSeg = seg;
3249         }
3250         SpinLockRelease(&xlogctl->info_lck);
3251 }
3252
3253 /*
3254  * Recycle or remove all log files older or equal to passed log/seg#
3255  *
3256  * endptr is current (or recent) end of xlog; this is used to determine
3257  * whether we want to recycle rather than delete no-longer-wanted log files.
3258  */
3259 static void
3260 RemoveOldXlogFiles(uint32 log, uint32 seg, XLogRecPtr endptr)
3261 {
3262         uint32          endlogId;
3263         uint32          endlogSeg;
3264         int                     max_advance;
3265         DIR                *xldir;
3266         struct dirent *xlde;
3267         char            lastoff[MAXFNAMELEN];
3268         char            path[MAXPGPATH];
3269
3270 #ifdef WIN32
3271         char            newpath[MAXPGPATH];
3272 #endif
3273         struct stat statbuf;
3274
3275         /*
3276          * Initialize info about where to try to recycle to.  We allow recycling
3277          * segments up to XLOGfileslop segments beyond the current XLOG location.
3278          */
3279         XLByteToPrevSeg(endptr, endlogId, endlogSeg);
3280         max_advance = XLOGfileslop;
3281
3282         xldir = AllocateDir(XLOGDIR);
3283         if (xldir == NULL)
3284                 ereport(ERROR,
3285                                 (errcode_for_file_access(),
3286                                  errmsg("could not open transaction log directory \"%s\": %m",
3287                                                 XLOGDIR)));
3288
3289         XLogFileName(lastoff, ThisTimeLineID, log, seg);
3290
3291         elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
3292                  lastoff);
3293
3294         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3295         {
3296                 /*
3297                  * We ignore the timeline part of the XLOG segment identifiers in
3298                  * deciding whether a segment is still needed.  This ensures that we
3299                  * won't prematurely remove a segment from a parent timeline. We could
3300                  * probably be a little more proactive about removing segments of
3301                  * non-parent timelines, but that would be a whole lot more
3302                  * complicated.
3303                  *
3304                  * We use the alphanumeric sorting property of the filenames to decide
3305                  * which ones are earlier than the lastoff segment.
3306                  */
3307                 if (strlen(xlde->d_name) == 24 &&
3308                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
3309                         strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
3310                 {
3311                         /*
3312                          * Normally we don't delete old XLOG files during recovery to
3313                          * avoid accidentally deleting a file that looks stale due to a
3314                          * bug or hardware issue, but in fact contains important data.
3315                          * During streaming recovery, however, we will eventually fill the
3316                          * disk if we never clean up, so we have to. That's not an issue
3317                          * with file-based archive recovery because in that case we
3318                          * restore one XLOG file at a time, on-demand, and with a
3319                          * different filename that can't be confused with regular XLOG
3320                          * files.
3321                          */
3322                         if (WalRcvInProgress() || XLogArchiveCheckDone(xlde->d_name))
3323                         {
3324                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3325
3326                                 /* Update the last removed location in shared memory first */
3327                                 UpdateLastRemovedPtr(xlde->d_name);
3328
3329                                 /*
3330                                  * Before deleting the file, see if it can be recycled as a
3331                                  * future log segment. Only recycle normal files, pg_standby
3332                                  * for example can create symbolic links pointing to a
3333                                  * separate archive directory.
3334                                  */
3335                                 if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
3336                                         InstallXLogFileSegment(&endlogId, &endlogSeg, path,
3337                                                                                    true, &max_advance, true))
3338                                 {
3339                                         ereport(DEBUG2,
3340                                                         (errmsg("recycled transaction log file \"%s\"",
3341                                                                         xlde->d_name)));
3342                                         CheckpointStats.ckpt_segs_recycled++;
3343                                         /* Needn't recheck that slot on future iterations */
3344                                         if (max_advance > 0)
3345                                         {
3346                                                 NextLogSeg(endlogId, endlogSeg);
3347                                                 max_advance--;
3348                                         }
3349                                 }
3350                                 else
3351                                 {
3352                                         /* No need for any more future segments... */
3353                                         int                     rc;
3354
3355                                         ereport(DEBUG2,
3356                                                         (errmsg("removing transaction log file \"%s\"",
3357                                                                         xlde->d_name)));
3358
3359 #ifdef WIN32
3360
3361                                         /*
3362                                          * On Windows, if another process (e.g another backend)
3363                                          * holds the file open in FILE_SHARE_DELETE mode, unlink
3364                                          * will succeed, but the file will still show up in
3365                                          * directory listing until the last handle is closed. To
3366                                          * avoid confusing the lingering deleted file for a live
3367                                          * WAL file that needs to be archived, rename it before
3368                                          * deleting it.
3369                                          *
3370                                          * If another process holds the file open without
3371                                          * FILE_SHARE_DELETE flag, rename will fail. We'll try
3372                                          * again at the next checkpoint.
3373                                          */
3374                                         snprintf(newpath, MAXPGPATH, "%s.deleted", path);
3375                                         if (rename(path, newpath) != 0)
3376                                         {
3377                                                 ereport(LOG,
3378                                                                 (errcode_for_file_access(),
3379                                                                  errmsg("could not rename old transaction log file \"%s\": %m",
3380                                                                                 path)));
3381                                                 continue;
3382                                         }
3383                                         rc = unlink(newpath);
3384 #else
3385                                         rc = unlink(path);
3386 #endif
3387                                         if (rc != 0)
3388                                         {
3389                                                 ereport(LOG,
3390                                                                 (errcode_for_file_access(),
3391                                                                  errmsg("could not remove old transaction log file \"%s\": %m",
3392                                                                                 path)));
3393                                                 continue;
3394                                         }
3395                                         CheckpointStats.ckpt_segs_removed++;
3396                                 }
3397
3398                                 XLogArchiveCleanup(xlde->d_name);
3399                         }
3400                 }
3401         }
3402
3403         FreeDir(xldir);
3404 }
3405
3406 /*
3407  * Verify whether pg_xlog and pg_xlog/archive_status exist.
3408  * If the latter does not exist, recreate it.
3409  *
3410  * It is not the goal of this function to verify the contents of these
3411  * directories, but to help in cases where someone has performed a cluster
3412  * copy for PITR purposes but omitted pg_xlog from the copy.
3413  *
3414  * We could also recreate pg_xlog if it doesn't exist, but a deliberate
3415  * policy decision was made not to.  It is fairly common for pg_xlog to be
3416  * a symlink, and if that was the DBA's intent then automatically making a
3417  * plain directory would result in degraded performance with no notice.
3418  */
3419 static void
3420 ValidateXLOGDirectoryStructure(void)
3421 {
3422         char            path[MAXPGPATH];
3423         struct stat stat_buf;
3424
3425         /* Check for pg_xlog; if it doesn't exist, error out */
3426         if (stat(XLOGDIR, &stat_buf) != 0 ||
3427                 !S_ISDIR(stat_buf.st_mode))
3428                 ereport(FATAL,
3429                                 (errmsg("required WAL directory \"%s\" does not exist",
3430                                                 XLOGDIR)));
3431
3432         /* Check for archive_status */
3433         snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
3434         if (stat(path, &stat_buf) == 0)
3435         {
3436                 /* Check for weird cases where it exists but isn't a directory */
3437                 if (!S_ISDIR(stat_buf.st_mode))
3438                         ereport(FATAL,
3439                                         (errmsg("required WAL directory \"%s\" does not exist",
3440                                                         path)));
3441         }
3442         else
3443         {
3444                 ereport(LOG,
3445                                 (errmsg("creating missing WAL directory \"%s\"", path)));
3446                 if (mkdir(path, 0700) < 0)
3447                         ereport(FATAL,
3448                                         (errmsg("could not create missing directory \"%s\": %m",
3449                                                         path)));
3450         }
3451 }
3452
3453 /*
3454  * Remove previous backup history files.  This also retries creation of
3455  * .ready files for any backup history files for which XLogArchiveNotify
3456  * failed earlier.
3457  */
3458 static void
3459 CleanupBackupHistory(void)
3460 {
3461         DIR                *xldir;
3462         struct dirent *xlde;
3463         char            path[MAXPGPATH];
3464
3465         xldir = AllocateDir(XLOGDIR);
3466         if (xldir == NULL)
3467                 ereport(ERROR,
3468                                 (errcode_for_file_access(),
3469                                  errmsg("could not open transaction log directory \"%s\": %m",
3470                                                 XLOGDIR)));
3471
3472         while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3473         {
3474                 if (strlen(xlde->d_name) > 24 &&
3475                         strspn(xlde->d_name, "0123456789ABCDEF") == 24 &&
3476                         strcmp(xlde->d_name + strlen(xlde->d_name) - strlen(".backup"),
3477                                    ".backup") == 0)
3478                 {
3479                         if (XLogArchiveCheckDone(xlde->d_name))
3480                         {
3481                                 ereport(DEBUG2,
3482                                 (errmsg("removing transaction log backup history file \"%s\"",
3483                                                 xlde->d_name)));
3484                                 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3485                                 unlink(path);
3486                                 XLogArchiveCleanup(xlde->d_name);
3487                         }
3488                 }
3489         }
3490
3491         FreeDir(xldir);
3492 }
3493
3494 /*
3495  * Restore the backup blocks present in an XLOG record, if any.
3496  *
3497  * We assume all of the record has been read into memory at *record.
3498  *
3499  * Note: when a backup block is available in XLOG, we restore it
3500  * unconditionally, even if the page in the database appears newer.
3501  * This is to protect ourselves against database pages that were partially
3502  * or incorrectly written during a crash.  We assume that the XLOG data
3503  * must be good because it has passed a CRC check, while the database
3504  * page might not be.  This will force us to replay all subsequent
3505  * modifications of the page that appear in XLOG, rather than possibly
3506  * ignoring them as already applied, but that's not a huge drawback.
3507  *
3508  * If 'cleanup' is true, a cleanup lock is used when restoring blocks.
3509  * Otherwise, a normal exclusive lock is used.  During crash recovery, that's
3510  * just pro forma because there can't be any regular backends in the system,
3511  * but in hot standby mode the distinction is important. The 'cleanup'
3512  * argument applies to all backup blocks in the WAL record, that suffices for
3513  * now.
3514  */
3515 void
3516 RestoreBkpBlocks(XLogRecPtr lsn, XLogRecord *record, bool cleanup)
3517 {
3518         Buffer          buffer;
3519         Page            page;
3520         BkpBlock        bkpb;
3521         char       *blk;
3522         int                     i;
3523
3524         if (!(record->xl_info & XLR_BKP_BLOCK_MASK))
3525                 return;
3526
3527         blk = (char *) XLogRecGetData(record) + record->xl_len;
3528         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
3529         {
3530                 if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
3531                         continue;
3532
3533                 memcpy(&bkpb, blk, sizeof(BkpBlock));
3534                 blk += sizeof(BkpBlock);
3535
3536                 buffer = XLogReadBufferExtended(bkpb.node, bkpb.fork, bkpb.block,
3537                                                                                 RBM_ZERO);
3538                 Assert(BufferIsValid(buffer));
3539                 if (cleanup)
3540                         LockBufferForCleanup(buffer);
3541                 else
3542                         LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
3543
3544                 page = (Page) BufferGetPage(buffer);
3545
3546                 if (bkpb.hole_length == 0)
3547                 {
3548                         memcpy((char *) page, blk, BLCKSZ);
3549                 }
3550                 else
3551                 {
3552                         /* must zero-fill the hole */
3553                         MemSet((char *) page, 0, BLCKSZ);
3554                         memcpy((char *) page, blk, bkpb.hole_offset);
3555                         memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
3556                                    blk + bkpb.hole_offset,
3557                                    BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
3558                 }
3559
3560                 PageSetLSN(page, lsn);
3561                 PageSetTLI(page, ThisTimeLineID);
3562                 MarkBufferDirty(buffer);
3563                 UnlockReleaseBuffer(buffer);
3564
3565                 blk += BLCKSZ - bkpb.hole_length;
3566         }
3567 }
3568
3569 /*
3570  * CRC-check an XLOG record.  We do not believe the contents of an XLOG
3571  * record (other than to the minimal extent of computing the amount of
3572  * data to read in) until we've checked the CRCs.
3573  *
3574  * We assume all of the record has been read into memory at *record.
3575  */
3576 static bool
3577 RecordIsValid(XLogRecord *record, XLogRecPtr recptr, int emode)
3578 {
3579         pg_crc32        crc;
3580         int                     i;
3581         uint32          len = record->xl_len;
3582         BkpBlock        bkpb;
3583         char       *blk;
3584
3585         /* First the rmgr data */
3586         INIT_CRC32(crc);
3587         COMP_CRC32(crc, XLogRecGetData(record), len);
3588
3589         /* Add in the backup blocks, if any */
3590         blk = (char *) XLogRecGetData(record) + len;
3591         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
3592         {
3593                 uint32          blen;
3594
3595                 if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
3596                         continue;
3597
3598                 memcpy(&bkpb, blk, sizeof(BkpBlock));
3599                 if (bkpb.hole_offset + bkpb.hole_length > BLCKSZ)
3600                 {
3601                         ereport(emode_for_corrupt_record(emode, recptr),
3602                                         (errmsg("incorrect hole size in record at %X/%X",
3603                                                         recptr.xlogid, recptr.xrecoff)));
3604                         return false;
3605                 }
3606                 blen = sizeof(BkpBlock) + BLCKSZ - bkpb.hole_length;
3607                 COMP_CRC32(crc, blk, blen);
3608                 blk += blen;
3609         }
3610
3611         /* Check that xl_tot_len agrees with our calculation */
3612         if (blk != (char *) record + record->xl_tot_len)
3613         {
3614                 ereport(emode_for_corrupt_record(emode, recptr),
3615                                 (errmsg("incorrect total length in record at %X/%X",
3616                                                 recptr.xlogid, recptr.xrecoff)));
3617                 return false;
3618         }
3619
3620         /* Finally include the record header */
3621         COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
3622                            SizeOfXLogRecord - sizeof(pg_crc32));
3623         FIN_CRC32(crc);
3624
3625         if (!EQ_CRC32(record->xl_crc, crc))
3626         {
3627                 ereport(emode_for_corrupt_record(emode, recptr),
3628                 (errmsg("incorrect resource manager data checksum in record at %X/%X",
3629                                 recptr.xlogid, recptr.xrecoff)));
3630                 return false;
3631         }
3632
3633         return true;
3634 }
3635
3636 /*
3637  * Attempt to read an XLOG record.
3638  *
3639  * If RecPtr is not NULL, try to read a record at that position.  Otherwise
3640  * try to read a record just after the last one previously read.
3641  *
3642  * If no valid record is available, returns NULL, or fails if emode is PANIC.
3643  * (emode must be either PANIC, LOG)
3644  *
3645  * The record is copied into readRecordBuf, so that on successful return,
3646  * the returned record pointer always points there.
3647  */
3648 static XLogRecord *
3649 ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt)
3650 {
3651         XLogRecord *record;
3652         char       *buffer;
3653         XLogRecPtr      tmpRecPtr = EndRecPtr;
3654         bool            randAccess = false;
3655         uint32          len,
3656                                 total_len;
3657         uint32          targetRecOff;
3658         uint32          pageHeaderSize;
3659
3660         if (readBuf == NULL)
3661         {
3662                 /*
3663                  * First time through, permanently allocate readBuf.  We do it this
3664                  * way, rather than just making a static array, for two reasons: (1)
3665                  * no need to waste the storage in most instantiations of the backend;
3666                  * (2) a static char array isn't guaranteed to have any particular
3667                  * alignment, whereas malloc() will provide MAXALIGN'd storage.
3668                  */
3669                 readBuf = (char *) malloc(XLOG_BLCKSZ);
3670                 Assert(readBuf != NULL);
3671         }
3672
3673         if (RecPtr == NULL)
3674         {
3675                 RecPtr = &tmpRecPtr;
3676
3677                 /*
3678                  * Align recptr to next page if no more records can fit on the current
3679                  * page.
3680                  */
3681                 if (XLOG_BLCKSZ - (RecPtr->xrecoff % XLOG_BLCKSZ) < SizeOfXLogRecord)
3682                 {
3683                         NextLogPage(tmpRecPtr);
3684                         /* We will account for page header size below */
3685                 }
3686
3687                 if (tmpRecPtr.xrecoff >= XLogFileSize)
3688                 {
3689                         (tmpRecPtr.xlogid)++;
3690                         tmpRecPtr.xrecoff = 0;
3691                 }
3692         }
3693         else
3694         {
3695                 if (!XRecOffIsValid(RecPtr->xrecoff))
3696                         ereport(PANIC,
3697                                         (errmsg("invalid record offset at %X/%X",
3698                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3699
3700                 /*
3701                  * Since we are going to a random position in WAL, forget any prior
3702                  * state about what timeline we were in, and allow it to be any
3703                  * timeline in expectedTLIs.  We also set a flag to allow curFileTLI
3704                  * to go backwards (but we can't reset that variable right here, since
3705                  * we might not change files at all).
3706                  */
3707                 lastPageTLI = 0;                /* see comment in ValidXLOGHeader */
3708                 randAccess = true;              /* allow curFileTLI to go backwards too */
3709         }
3710
3711         /* This is the first try to read this page. */
3712         failedSources = 0;
3713 retry:
3714         /* Read the page containing the record */
3715         if (!XLogPageRead(RecPtr, emode, fetching_ckpt, randAccess))
3716                 return NULL;
3717
3718         pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
3719         targetRecOff = RecPtr->xrecoff % XLOG_BLCKSZ;
3720         if (targetRecOff == 0)
3721         {
3722                 /*
3723                  * Can only get here in the continuing-from-prev-page case, because
3724                  * XRecOffIsValid eliminated the zero-page-offset case otherwise. Need
3725                  * to skip over the new page's header.
3726                  */
3727                 tmpRecPtr.xrecoff += pageHeaderSize;
3728                 targetRecOff = pageHeaderSize;
3729         }
3730         else if (targetRecOff < pageHeaderSize)
3731         {
3732                 ereport(emode_for_corrupt_record(emode, *RecPtr),
3733                                 (errmsg("invalid record offset at %X/%X",
3734                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3735                 goto next_record_is_invalid;
3736         }
3737         if ((((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) &&
3738                 targetRecOff == pageHeaderSize)
3739         {
3740                 ereport(emode_for_corrupt_record(emode, *RecPtr),
3741                                 (errmsg("contrecord is requested by %X/%X",
3742                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3743                 goto next_record_is_invalid;
3744         }
3745         record = (XLogRecord *) ((char *) readBuf + RecPtr->xrecoff % XLOG_BLCKSZ);
3746
3747         /*
3748          * xl_len == 0 is bad data for everything except XLOG SWITCH, where it is
3749          * required.
3750          */
3751         if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
3752         {
3753                 if (record->xl_len != 0)
3754                 {
3755                         ereport(emode_for_corrupt_record(emode, *RecPtr),
3756                                         (errmsg("invalid xlog switch record at %X/%X",
3757                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3758                         goto next_record_is_invalid;
3759                 }
3760         }
3761         else if (record->xl_len == 0)
3762         {
3763                 ereport(emode_for_corrupt_record(emode, *RecPtr),
3764                                 (errmsg("record with zero length at %X/%X",
3765                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3766                 goto next_record_is_invalid;
3767         }
3768         if (record->xl_tot_len < SizeOfXLogRecord + record->xl_len ||
3769                 record->xl_tot_len > SizeOfXLogRecord + record->xl_len +
3770                 XLR_MAX_BKP_BLOCKS * (sizeof(BkpBlock) + BLCKSZ))
3771         {
3772                 ereport(emode_for_corrupt_record(emode, *RecPtr),
3773                                 (errmsg("invalid record length at %X/%X",
3774                                                 RecPtr->xlogid, RecPtr->xrecoff)));
3775                 goto next_record_is_invalid;
3776         }
3777         if (record->xl_rmid > RM_MAX_ID)
3778         {
3779                 ereport(emode_for_corrupt_record(emode, *RecPtr),
3780                                 (errmsg("invalid resource manager ID %u at %X/%X",
3781                                                 record->xl_rmid, RecPtr->xlogid, RecPtr->xrecoff)));
3782                 goto next_record_is_invalid;
3783         }
3784         if (randAccess)
3785         {
3786                 /*
3787                  * We can't exactly verify the prev-link, but surely it should be less
3788                  * than the record's own address.
3789                  */
3790                 if (!XLByteLT(record->xl_prev, *RecPtr))
3791                 {
3792                         ereport(emode_for_corrupt_record(emode, *RecPtr),
3793                                         (errmsg("record with incorrect prev-link %X/%X at %X/%X",
3794                                                         record->xl_prev.xlogid, record->xl_prev.xrecoff,
3795                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3796                         goto next_record_is_invalid;
3797                 }
3798         }
3799         else
3800         {
3801                 /*
3802                  * Record's prev-link should exactly match our previous location. This
3803                  * check guards against torn WAL pages where a stale but valid-looking
3804                  * WAL record starts on a sector boundary.
3805                  */
3806                 if (!XLByteEQ(record->xl_prev, ReadRecPtr))
3807                 {
3808                         ereport(emode_for_corrupt_record(emode, *RecPtr),
3809                                         (errmsg("record with incorrect prev-link %X/%X at %X/%X",
3810                                                         record->xl_prev.xlogid, record->xl_prev.xrecoff,
3811                                                         RecPtr->xlogid, RecPtr->xrecoff)));
3812                         goto next_record_is_invalid;
3813                 }
3814         }
3815
3816         /*
3817          * Allocate or enlarge readRecordBuf as needed.  To avoid useless small
3818          * increases, round its size to a multiple of XLOG_BLCKSZ, and make sure
3819          * it's at least 4*Max(BLCKSZ, XLOG_BLCKSZ) to start with.  (That is
3820          * enough for all "normal" records, but very large commit or abort records
3821          * might need more space.)
3822          */
3823         total_len = record->xl_tot_len;
3824         if (total_len > readRecordBufSize)
3825         {
3826                 uint32          newSize = total_len;
3827
3828                 newSize += XLOG_BLCKSZ - (newSize % XLOG_BLCKSZ);
3829                 newSize = Max(newSize, 4 * Max(BLCKSZ, XLOG_BLCKSZ));
3830                 if (readRecordBuf)
3831                         free(readRecordBuf);
3832                 readRecordBuf = (char *) malloc(newSize);
3833                 if (!readRecordBuf)
3834                 {
3835                         readRecordBufSize = 0;
3836                         /* We treat this as a "bogus data" condition */
3837                         ereport(emode_for_corrupt_record(emode, *RecPtr),
3838                                         (errmsg("record length %u at %X/%X too long",
3839                                                         total_len, RecPtr->xlogid, RecPtr->xrecoff)));
3840                         goto next_record_is_invalid;
3841                 }
3842                 readRecordBufSize = newSize;
3843         }
3844
3845         buffer = readRecordBuf;
3846         len = XLOG_BLCKSZ - RecPtr->xrecoff % XLOG_BLCKSZ;
3847         if (total_len > len)
3848         {
3849                 /* Need to reassemble record */
3850                 XLogContRecord *contrecord;
3851                 XLogRecPtr      pagelsn;
3852                 uint32          gotlen = len;
3853
3854                 /* Initialize pagelsn to the beginning of the page this record is on */
3855                 pagelsn = *RecPtr;
3856                 pagelsn.xrecoff = (pagelsn.xrecoff / XLOG_BLCKSZ) * XLOG_BLCKSZ;
3857
3858                 memcpy(buffer, record, len);
3859                 record = (XLogRecord *) buffer;
3860                 buffer += len;
3861                 for (;;)
3862                 {
3863                         /* Calculate pointer to beginning of next page */
3864                         pagelsn.xrecoff += XLOG_BLCKSZ;
3865                         if (pagelsn.xrecoff >= XLogFileSize)
3866                         {
3867                                 (pagelsn.xlogid)++;
3868                                 pagelsn.xrecoff = 0;
3869                         }
3870                         /* Wait for the next page to become available */
3871                         if (!XLogPageRead(&pagelsn, emode, false, false))
3872                                 return NULL;
3873
3874                         /* Check that the continuation record looks valid */
3875                         if (!(((XLogPageHeader) readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD))
3876                         {
3877                                 ereport(emode_for_corrupt_record(emode, *RecPtr),
3878                                                 (errmsg("there is no contrecord flag in log file %u, segment %u, offset %u",
3879                                                                 readId, readSeg, readOff)));
3880                                 goto next_record_is_invalid;
3881                         }
3882                         pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
3883                         contrecord = (XLogContRecord *) ((char *) readBuf + pageHeaderSize);
3884                         if (contrecord->xl_rem_len == 0 ||
3885                                 total_len != (contrecord->xl_rem_len + gotlen))
3886                         {
3887                                 ereport(emode_for_corrupt_record(emode, *RecPtr),
3888                                                 (errmsg("invalid contrecord length %u in log file %u, segment %u, offset %u",
3889                                                                 contrecord->xl_rem_len,
3890                                                                 readId, readSeg, readOff)));
3891                                 goto next_record_is_invalid;
3892                         }
3893                         len = XLOG_BLCKSZ - pageHeaderSize - SizeOfXLogContRecord;
3894                         if (contrecord->xl_rem_len > len)
3895                         {
3896                                 memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord, len);
3897                                 gotlen += len;
3898                                 buffer += len;
3899                                 continue;
3900                         }
3901                         memcpy(buffer, (char *) contrecord + SizeOfXLogContRecord,
3902                                    contrecord->xl_rem_len);
3903                         break;
3904                 }
3905                 if (!RecordIsValid(record, *RecPtr, emode))
3906                         goto next_record_is_invalid;
3907                 pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
3908                 EndRecPtr.xlogid = readId;
3909                 EndRecPtr.xrecoff = readSeg * XLogSegSize + readOff +
3910                         pageHeaderSize +
3911                         MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len);
3912
3913                 ReadRecPtr = *RecPtr;
3914                 /* needn't worry about XLOG SWITCH, it can't cross page boundaries */
3915                 return record;
3916         }
3917
3918         /* Record does not cross a page boundary */
3919         if (!RecordIsValid(record, *RecPtr, emode))
3920                 goto next_record_is_invalid;
3921         EndRecPtr.xlogid = RecPtr->xlogid;
3922         EndRecPtr.xrecoff = RecPtr->xrecoff + MAXALIGN(total_len);
3923
3924         ReadRecPtr = *RecPtr;
3925         memcpy(buffer, record, total_len);
3926
3927         /*
3928          * Special processing if it's an XLOG SWITCH record
3929          */
3930         if (record->xl_rmid == RM_XLOG_ID && record->xl_info == XLOG_SWITCH)
3931         {
3932                 /* Pretend it extends to end of segment */
3933                 EndRecPtr.xrecoff += XLogSegSize - 1;
3934                 EndRecPtr.xrecoff -= EndRecPtr.xrecoff % XLogSegSize;
3935
3936                 /*
3937                  * Pretend that readBuf contains the last page of the segment. This is
3938                  * just to avoid Assert failure in StartupXLOG if XLOG ends with this
3939                  * segment.
3940                  */
3941                 readOff = XLogSegSize - XLOG_BLCKSZ;
3942         }
3943         return (XLogRecord *) buffer;
3944
3945 next_record_is_invalid:
3946         failedSources |= readSource;
3947
3948         if (readFile >= 0)
3949         {
3950                 close(readFile);
3951                 readFile = -1;
3952         }
3953
3954         /* In standby-mode, keep trying */
3955         if (StandbyMode)
3956                 goto retry;
3957         else
3958                 return NULL;
3959 }
3960
3961 /*
3962  * Check whether the xlog header of a page just read in looks valid.
3963  *
3964  * This is just a convenience subroutine to avoid duplicated code in
3965  * ReadRecord.  It's not intended for use from anywhere else.
3966  */
3967 static bool
3968 ValidXLOGHeader(XLogPageHeader hdr, int emode)
3969 {
3970         XLogRecPtr      recaddr;
3971
3972         recaddr.xlogid = readId;
3973         recaddr.xrecoff = readSeg * XLogSegSize + readOff;
3974
3975         if (hdr->xlp_magic != XLOG_PAGE_MAGIC)
3976         {
3977                 ereport(emode_for_corrupt_record(emode, recaddr),
3978                                 (errmsg("invalid magic number %04X in log file %u, segment %u, offset %u",
3979                                                 hdr->xlp_magic, readId, readSeg, readOff)));
3980                 return false;
3981         }
3982         if ((hdr->xlp_info & ~XLP_ALL_FLAGS) != 0)
3983         {
3984                 ereport(emode_for_corrupt_record(emode, recaddr),
3985                                 (errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
3986                                                 hdr->xlp_info, readId, readSeg, readOff)));
3987                 return false;
3988         }
3989         if (hdr->xlp_info & XLP_LONG_HEADER)
3990         {
3991                 XLogLongPageHeader longhdr = (XLogLongPageHeader) hdr;
3992
3993                 if (longhdr->xlp_sysid != ControlFile->system_identifier)
3994                 {
3995                         char            fhdrident_str[32];
3996                         char            sysident_str[32];
3997
3998                         /*
3999                          * Format sysids separately to keep platform-dependent format code
4000                          * out of the translatable message string.
4001                          */
4002                         snprintf(fhdrident_str, sizeof(fhdrident_str), UINT64_FORMAT,
4003                                          longhdr->xlp_sysid);
4004                         snprintf(sysident_str, sizeof(sysident_str), UINT64_FORMAT,
4005                                          ControlFile->system_identifier);
4006                         ereport(emode_for_corrupt_record(emode, recaddr),
4007                                         (errmsg("WAL file is from different database system"),
4008                                          errdetail("WAL file database system identifier is %s, pg_control database system identifier is %s.",
4009                                                            fhdrident_str, sysident_str)));
4010                         return false;
4011                 }
4012                 if (longhdr->xlp_seg_size != XLogSegSize)
4013                 {
4014                         ereport(emode_for_corrupt_record(emode, recaddr),
4015                                         (errmsg("WAL file is from different database system"),
4016                                          errdetail("Incorrect XLOG_SEG_SIZE in page header.")));
4017                         return false;
4018                 }
4019                 if (longhdr->xlp_xlog_blcksz != XLOG_BLCKSZ)
4020                 {
4021                         ereport(emode_for_corrupt_record(emode, recaddr),
4022                                         (errmsg("WAL file is from different database system"),
4023                                          errdetail("Incorrect XLOG_BLCKSZ in page header.")));
4024                         return false;
4025                 }
4026         }
4027         else if (readOff == 0)
4028         {
4029                 /* hmm, first page of file doesn't have a long header? */
4030                 ereport(emode_for_corrupt_record(emode, recaddr),
4031                                 (errmsg("invalid info bits %04X in log file %u, segment %u, offset %u",
4032                                                 hdr->xlp_info, readId, readSeg, readOff)));
4033                 return false;
4034         }
4035
4036         if (!XLByteEQ(hdr->xlp_pageaddr, recaddr))
4037         {
4038                 ereport(emode_for_corrupt_record(emode, recaddr),
4039                                 (errmsg("unexpected pageaddr %X/%X in log file %u, segment %u, offset %u",
4040                                                 hdr->xlp_pageaddr.xlogid, hdr->xlp_pageaddr.xrecoff,
4041                                                 readId, readSeg, readOff)));
4042                 return false;
4043         }
4044
4045         /*
4046          * Check page TLI is one of the expected values.
4047          */
4048         if (!list_member_int(expectedTLIs, (int) hdr->xlp_tli))
4049         {
4050                 ereport(emode_for_corrupt_record(emode, recaddr),
4051                                 (errmsg("unexpected timeline ID %u in log file %u, segment %u, offset %u",
4052                                                 hdr->xlp_tli,
4053                                                 readId, readSeg, readOff)));
4054                 return false;
4055         }
4056
4057         /*
4058          * Since child timelines are always assigned a TLI greater than their
4059          * immediate parent's TLI, we should never see TLI go backwards across
4060          * successive pages of a consistent WAL sequence.
4061          *
4062          * Of course this check should only be applied when advancing sequentially
4063          * across pages; therefore ReadRecord resets lastPageTLI to zero when
4064          * going to a random page.
4065          */
4066         if (hdr->xlp_tli < lastPageTLI)
4067         {
4068                 ereport(emode_for_corrupt_record(emode, recaddr),
4069                                 (errmsg("out-of-sequence timeline ID %u (after %u) in log file %u, segment %u, offset %u",
4070                                                 hdr->xlp_tli, lastPageTLI,
4071                                                 readId, readSeg, readOff)));
4072                 return false;
4073         }
4074         lastPageTLI = hdr->xlp_tli;
4075         return true;
4076 }
4077
4078 /*
4079  * Try to read a timeline's history file.
4080  *
4081  * If successful, return the list of component TLIs (the given TLI followed by
4082  * its ancestor TLIs).  If we can't find the history file, assume that the
4083  * timeline has no parents, and return a list of just the specified timeline
4084  * ID.
4085  */
4086 static List *
4087 readTimeLineHistory(TimeLineID targetTLI)
4088 {
4089         List       *result;
4090         char            path[MAXPGPATH];
4091         char            histfname[MAXFNAMELEN];
4092         char            fline[MAXPGPATH];
4093         FILE       *fd;
4094
4095         /* Timeline 1 does not have a history file, so no need to check */
4096         if (targetTLI == 1)
4097                 return list_make1_int((int) targetTLI);
4098
4099         if (InArchiveRecovery)
4100         {
4101                 TLHistoryFileName(histfname, targetTLI);
4102                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
4103         }
4104         else
4105                 TLHistoryFilePath(path, targetTLI);
4106
4107         fd = AllocateFile(path, "r");
4108         if (fd == NULL)
4109         {
4110                 if (errno != ENOENT)
4111                         ereport(FATAL,
4112                                         (errcode_for_file_access(),
4113                                          errmsg("could not open file \"%s\": %m", path)));
4114                 /* Not there, so assume no parents */
4115                 return list_make1_int((int) targetTLI);
4116         }
4117
4118         result = NIL;
4119
4120         /*
4121          * Parse the file...
4122          */
4123         while (fgets(fline, sizeof(fline), fd) != NULL)
4124         {
4125                 /* skip leading whitespace and check for # comment */
4126                 char       *ptr;
4127                 char       *endptr;
4128                 TimeLineID      tli;
4129
4130                 for (ptr = fline; *ptr; ptr++)
4131                 {
4132                         if (!isspace((unsigned char) *ptr))
4133                                 break;
4134                 }
4135                 if (*ptr == '\0' || *ptr == '#')
4136                         continue;
4137
4138                 /* expect a numeric timeline ID as first field of line */
4139                 tli = (TimeLineID) strtoul(ptr, &endptr, 0);
4140                 if (endptr == ptr)
4141                         ereport(FATAL,
4142                                         (errmsg("syntax error in history file: %s", fline),
4143                                          errhint("Expected a numeric timeline ID.")));
4144
4145                 if (result &&
4146                         tli <= (TimeLineID) linitial_int(result))
4147                         ereport(FATAL,
4148                                         (errmsg("invalid data in history file: %s", fline),
4149                                    errhint("Timeline IDs must be in increasing sequence.")));
4150
4151                 /* Build list with newest item first */
4152                 result = lcons_int((int) tli, result);
4153
4154                 /* we ignore the remainder of each line */
4155         }
4156
4157         FreeFile(fd);
4158
4159         if (result &&
4160                 targetTLI <= (TimeLineID) linitial_int(result))
4161                 ereport(FATAL,
4162                                 (errmsg("invalid data in history file \"%s\"", path),
4163                         errhint("Timeline IDs must be less than child timeline's ID.")));
4164
4165         result = lcons_int((int) targetTLI, result);
4166
4167         ereport(DEBUG3,
4168                         (errmsg_internal("history of timeline %u is %s",
4169                                                          targetTLI, nodeToString(result))));
4170
4171         return result;
4172 }
4173
4174 /*
4175  * Probe whether a timeline history file exists for the given timeline ID
4176  */
4177 static bool
4178 existsTimeLineHistory(TimeLineID probeTLI)
4179 {
4180         char            path[MAXPGPATH];
4181         char            histfname[MAXFNAMELEN];
4182         FILE       *fd;
4183
4184         /* Timeline 1 does not have a history file, so no need to check */
4185         if (probeTLI == 1)
4186                 return false;
4187
4188         if (InArchiveRecovery)
4189         {
4190                 TLHistoryFileName(histfname, probeTLI);
4191                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
4192         }
4193         else
4194                 TLHistoryFilePath(path, probeTLI);
4195
4196         fd = AllocateFile(path, "r");
4197         if (fd != NULL)
4198         {
4199                 FreeFile(fd);
4200                 return true;
4201         }
4202         else
4203         {
4204                 if (errno != ENOENT)
4205                         ereport(FATAL,
4206                                         (errcode_for_file_access(),
4207                                          errmsg("could not open file \"%s\": %m", path)));
4208                 return false;
4209         }
4210 }
4211
4212 /*
4213  * Find the newest existing timeline, assuming that startTLI exists.
4214  *
4215  * Note: while this is somewhat heuristic, it does positively guarantee
4216  * that (result + 1) is not a known timeline, and therefore it should
4217  * be safe to assign that ID to a new timeline.
4218  */
4219 static TimeLineID
4220 findNewestTimeLine(TimeLineID startTLI)
4221 {
4222         TimeLineID      newestTLI;
4223         TimeLineID      probeTLI;
4224
4225         /*
4226          * The algorithm is just to probe for the existence of timeline history
4227          * files.  XXX is it useful to allow gaps in the sequence?
4228          */
4229         newestTLI = startTLI;
4230
4231         for (probeTLI = startTLI + 1;; probeTLI++)
4232         {
4233                 if (existsTimeLineHistory(probeTLI))
4234                 {
4235                         newestTLI = probeTLI;           /* probeTLI exists */
4236                 }
4237                 else
4238                 {
4239                         /* doesn't exist, assume we're done */
4240                         break;
4241                 }
4242         }
4243
4244         return newestTLI;
4245 }
4246
4247 /*
4248  * Create a new timeline history file.
4249  *
4250  *      newTLI: ID of the new timeline
4251  *      parentTLI: ID of its immediate parent
4252  *      endTLI et al: ID of the last used WAL file, for annotation purposes
4253  *
4254  * Currently this is only used during recovery, and so there are no locking
4255  * considerations.      But we should be just as tense as XLogFileInit to avoid
4256  * emplacing a bogus file.
4257  */
4258 static void
4259 writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
4260                                          TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
4261 {
4262         char            path[MAXPGPATH];
4263         char            tmppath[MAXPGPATH];
4264         char            histfname[MAXFNAMELEN];
4265         char            xlogfname[MAXFNAMELEN];
4266         char            buffer[BLCKSZ];
4267         int                     srcfd;
4268         int                     fd;
4269         int                     nbytes;
4270
4271         Assert(newTLI > parentTLI); /* else bad selection of newTLI */
4272
4273         /*
4274          * Write into a temp file name.
4275          */
4276         snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
4277
4278         unlink(tmppath);
4279
4280         /* do not use get_sync_bit() here --- want to fsync only at end of fill */
4281         fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL,
4282                                            S_IRUSR | S_IWUSR);
4283         if (fd < 0)
4284                 ereport(ERROR,
4285                                 (errcode_for_file_access(),
4286                                  errmsg("could not create file \"%s\": %m", tmppath)));
4287
4288         /*
4289          * If a history file exists for the parent, copy it verbatim
4290          */
4291         if (InArchiveRecovery)
4292         {
4293                 TLHistoryFileName(histfname, parentTLI);
4294                 RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0);
4295         }
4296         else
4297                 TLHistoryFilePath(path, parentTLI);
4298
4299         srcfd = BasicOpenFile(path, O_RDONLY, 0);
4300         if (srcfd < 0)
4301         {
4302                 if (errno != ENOENT)
4303                         ereport(ERROR,
4304                                         (errcode_for_file_access(),
4305                                          errmsg("could not open file \"%s\": %m", path)));
4306                 /* Not there, so assume parent has no parents */
4307         }
4308         else
4309         {
4310                 for (;;)
4311                 {
4312                         errno = 0;
4313                         nbytes = (int) read(srcfd, buffer, sizeof(buffer));
4314                         if (nbytes < 0 || errno != 0)
4315                                 ereport(ERROR,
4316                                                 (errcode_for_file_access(),
4317                                                  errmsg("could not read file \"%s\": %m", path)));
4318                         if (nbytes == 0)
4319                                 break;
4320                         errno = 0;
4321                         if ((int) write(fd, buffer, nbytes) != nbytes)
4322                         {
4323                                 int                     save_errno = errno;
4324
4325                                 /*
4326                                  * If we fail to make the file, delete it to release disk
4327                                  * space
4328                                  */
4329                                 unlink(tmppath);
4330
4331                                 /*
4332                                  * if write didn't set errno, assume problem is no disk space
4333                                  */
4334                                 errno = save_errno ? save_errno : ENOSPC;
4335
4336                                 ereport(ERROR,
4337                                                 (errcode_for_file_access(),
4338                                          errmsg("could not write to file \"%s\": %m", tmppath)));
4339                         }
4340                 }
4341                 close(srcfd);
4342         }
4343
4344         /*
4345          * Append one line with the details of this timeline split.
4346          *
4347          * If we did have a parent file, insert an extra newline just in case the
4348          * parent file failed to end with one.
4349          */
4350         XLogFileName(xlogfname, endTLI, endLogId, endLogSeg);
4351
4352         /*
4353          * Write comment to history file to explain why and where timeline
4354          * changed. Comment varies according to the recovery target used.
4355          */
4356         if (recoveryTarget == RECOVERY_TARGET_XID)
4357                 snprintf(buffer, sizeof(buffer),
4358                                  "%s%u\t%s\t%s transaction %u\n",
4359                                  (srcfd < 0) ? "" : "\n",
4360                                  parentTLI,
4361                                  xlogfname,
4362                                  recoveryStopAfter ? "after" : "before",
4363                                  recoveryStopXid);
4364         else if (recoveryTarget == RECOVERY_TARGET_TIME)
4365                 snprintf(buffer, sizeof(buffer),
4366                                  "%s%u\t%s\t%s %s\n",
4367                                  (srcfd < 0) ? "" : "\n",
4368                                  parentTLI,
4369                                  xlogfname,
4370                                  recoveryStopAfter ? "after" : "before",
4371                                  timestamptz_to_str(recoveryStopTime));
4372         else
4373                 snprintf(buffer, sizeof(buffer),
4374                                  "%s%u\t%s\tno recovery target specified\n",
4375                                  (srcfd < 0) ? "" : "\n",
4376                                  parentTLI,
4377                                  xlogfname);
4378
4379         nbytes = strlen(buffer);
4380         errno = 0;
4381         if ((int) write(fd, buffer, nbytes) != nbytes)
4382         {
4383                 int                     save_errno = errno;
4384
4385                 /*
4386                  * If we fail to make the file, delete it to release disk space
4387                  */
4388                 unlink(tmppath);
4389                 /* if write didn't set errno, assume problem is no disk space */
4390                 errno = save_errno ? save_errno : ENOSPC;
4391
4392                 ereport(ERROR,
4393                                 (errcode_for_file_access(),
4394                                  errmsg("could not write to file \"%s\": %m", tmppath)));
4395         }
4396
4397         if (pg_fsync(fd) != 0)
4398                 ereport(ERROR,
4399                                 (errcode_for_file_access(),
4400                                  errmsg("could not fsync file \"%s\": %m", tmppath)));
4401
4402         if (close(fd))
4403                 ereport(ERROR,
4404                                 (errcode_for_file_access(),
4405                                  errmsg("could not close file \"%s\": %m", tmppath)));
4406
4407
4408         /*
4409          * Now move the completed history file into place with its final name.
4410          */
4411         TLHistoryFilePath(path, newTLI);
4412
4413         /*
4414          * Prefer link() to rename() here just to be really sure that we don't
4415          * overwrite an existing logfile.  However, there shouldn't be one, so
4416          * rename() is an acceptable substitute except for the truly paranoid.
4417          */
4418 #if HAVE_WORKING_LINK
4419         if (link(tmppath, path) < 0)
4420                 ereport(ERROR,
4421                                 (errcode_for_file_access(),
4422                                  errmsg("could not link file \"%s\" to \"%s\": %m",
4423                                                 tmppath, path)));
4424         unlink(tmppath);
4425 #else
4426         if (rename(tmppath, path) < 0)
4427                 ereport(ERROR,
4428                                 (errcode_for_file_access(),
4429                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
4430                                                 tmppath, path)));
4431 #endif
4432
4433         /* The history file can be archived immediately. */
4434         TLHistoryFileName(histfname, newTLI);
4435         XLogArchiveNotify(histfname);
4436 }
4437
4438 /*
4439  * I/O routines for pg_control
4440  *
4441  * *ControlFile is a buffer in shared memory that holds an image of the
4442  * contents of pg_control.      WriteControlFile() initializes pg_control
4443  * given a preloaded buffer, ReadControlFile() loads the buffer from
4444  * the pg_control file (during postmaster or standalone-backend startup),
4445  * and UpdateControlFile() rewrites pg_control after we modify xlog state.
4446  *
4447  * For simplicity, WriteControlFile() initializes the fields of pg_control
4448  * that are related to checking backend/database compatibility, and
4449  * ReadControlFile() verifies they are correct.  We could split out the
4450  * I/O and compatibility-check functions, but there seems no need currently.
4451  */
4452 static void
4453 WriteControlFile(void)
4454 {
4455         int                     fd;
4456         char            buffer[PG_CONTROL_SIZE];                /* need not be aligned */
4457
4458         /*
4459          * Initialize version and compatibility-check fields
4460          */
4461         ControlFile->pg_control_version = PG_CONTROL_VERSION;
4462         ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4463
4464         ControlFile->maxAlign = MAXIMUM_ALIGNOF;
4465         ControlFile->floatFormat = FLOATFORMAT_VALUE;
4466
4467         ControlFile->blcksz = BLCKSZ;
4468         ControlFile->relseg_size = RELSEG_SIZE;
4469         ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4470         ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
4471
4472         ControlFile->nameDataLen = NAMEDATALEN;
4473         ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4474
4475         ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
4476
4477 #ifdef HAVE_INT64_TIMESTAMP
4478         ControlFile->enableIntTimes = true;
4479 #else
4480         ControlFile->enableIntTimes = false;
4481 #endif
4482         ControlFile->float4ByVal = FLOAT4PASSBYVAL;
4483         ControlFile->float8ByVal = FLOAT8PASSBYVAL;
4484
4485         /* Contents are protected with a CRC */
4486         INIT_CRC32(ControlFile->crc);
4487         COMP_CRC32(ControlFile->crc,
4488                            (char *) ControlFile,
4489                            offsetof(ControlFileData, crc));
4490         FIN_CRC32(ControlFile->crc);
4491
4492         /*
4493          * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
4494          * excess over sizeof(ControlFileData).  This reduces the odds of
4495          * premature-EOF errors when reading pg_control.  We'll still fail when we
4496          * check the contents of the file, but hopefully with a more specific
4497          * error than "couldn't read pg_control".
4498          */
4499         if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
4500                 elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");
4501
4502         memset(buffer, 0, PG_CONTROL_SIZE);
4503         memcpy(buffer, ControlFile, sizeof(ControlFileData));
4504
4505         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4506                                            O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
4507                                            S_IRUSR | S_IWUSR);
4508         if (fd < 0)
4509                 ereport(PANIC,
4510                                 (errcode_for_file_access(),
4511                                  errmsg("could not create control file \"%s\": %m",
4512                                                 XLOG_CONTROL_FILE)));
4513
4514         errno = 0;
4515         if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
4516         {
4517                 /* if write didn't set errno, assume problem is no disk space */
4518                 if (errno == 0)
4519                         errno = ENOSPC;
4520                 ereport(PANIC,
4521                                 (errcode_for_file_access(),
4522                                  errmsg("could not write to control file: %m")));
4523         }
4524
4525         if (pg_fsync(fd) != 0)
4526                 ereport(PANIC,
4527                                 (errcode_for_file_access(),
4528                                  errmsg("could not fsync control file: %m")));
4529
4530         if (close(fd))
4531                 ereport(PANIC,
4532                                 (errcode_for_file_access(),
4533                                  errmsg("could not close control file: %m")));
4534 }
4535
4536 static void
4537 ReadControlFile(void)
4538 {
4539         pg_crc32        crc;
4540         int                     fd;
4541
4542         /*
4543          * Read data...
4544          */
4545         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4546                                            O_RDWR | PG_BINARY,
4547                                            S_IRUSR | S_IWUSR);
4548         if (fd < 0)
4549                 ereport(PANIC,
4550                                 (errcode_for_file_access(),
4551                                  errmsg("could not open control file \"%s\": %m",
4552                                                 XLOG_CONTROL_FILE)));
4553
4554         if (read(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4555                 ereport(PANIC,
4556                                 (errcode_for_file_access(),
4557                                  errmsg("could not read from control file: %m")));
4558
4559         close(fd);
4560
4561         /*
4562          * Check for expected pg_control format version.  If this is wrong, the
4563          * CRC check will likely fail because we'll be checking the wrong number
4564          * of bytes.  Complaining about wrong version will probably be more
4565          * enlightening than complaining about wrong CRC.
4566          */
4567
4568         if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
4569                 ereport(FATAL,
4570                                 (errmsg("database files are incompatible with server"),
4571                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4572                  " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4573                         ControlFile->pg_control_version, ControlFile->pg_control_version,
4574                                                    PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4575                                  errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));
4576
4577         if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4578                 ereport(FATAL,
4579                                 (errmsg("database files are incompatible with server"),
4580                                  errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4581                                   " but the server was compiled with PG_CONTROL_VERSION %d.",
4582                                                 ControlFile->pg_control_version, PG_CONTROL_VERSION),
4583                                  errhint("It looks like you need to initdb.")));
4584
4585         /* Now check the CRC. */
4586         INIT_CRC32(crc);
4587         COMP_CRC32(crc,
4588                            (char *) ControlFile,
4589                            offsetof(ControlFileData, crc));
4590         FIN_CRC32(crc);
4591
4592         if (!EQ_CRC32(crc, ControlFile->crc))
4593                 ereport(FATAL,
4594                                 (errmsg("incorrect checksum in control file")));
4595
4596         /*
4597          * Do compatibility checking immediately.  If the database isn't
4598          * compatible with the backend executable, we want to abort before we can
4599          * possibly do any damage.
4600          */
4601         if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4602                 ereport(FATAL,
4603                                 (errmsg("database files are incompatible with server"),
4604                                  errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
4605                                   " but the server was compiled with CATALOG_VERSION_NO %d.",
4606                                                 ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4607                                  errhint("It looks like you need to initdb.")));
4608         if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4609                 ereport(FATAL,
4610                                 (errmsg("database files are incompatible with server"),
4611                    errdetail("The database cluster was initialized with MAXALIGN %d,"
4612                                          " but the server was compiled with MAXALIGN %d.",
4613                                          ControlFile->maxAlign, MAXIMUM_ALIGNOF),
4614                                  errhint("It looks like you need to initdb.")));
4615         if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4616                 ereport(FATAL,
4617                                 (errmsg("database files are incompatible with server"),
4618                                  errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4619                                  errhint("It looks like you need to initdb.")));
4620         if (ControlFile->blcksz != BLCKSZ)
4621                 ereport(FATAL,
4622                                 (errmsg("database files are incompatible with server"),
4623                          errdetail("The database cluster was initialized with BLCKSZ %d,"
4624                                            " but the server was compiled with BLCKSZ %d.",
4625                                            ControlFile->blcksz, BLCKSZ),
4626                                  errhint("It looks like you need to recompile or initdb.")));
4627         if (ControlFile->relseg_size != RELSEG_SIZE)
4628                 ereport(FATAL,
4629                                 (errmsg("database files are incompatible with server"),
4630                 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
4631                                   " but the server was compiled with RELSEG_SIZE %d.",
4632                                   ControlFile->relseg_size, RELSEG_SIZE),
4633                                  errhint("It looks like you need to recompile or initdb.")));
4634         if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4635                 ereport(FATAL,
4636                                 (errmsg("database files are incompatible with server"),
4637                 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
4638                                   " but the server was compiled with XLOG_BLCKSZ %d.",
4639                                   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4640                                  errhint("It looks like you need to recompile or initdb.")));
4641         if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
4642                 ereport(FATAL,
4643                                 (errmsg("database files are incompatible with server"),
4644                                  errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
4645                                            " but the server was compiled with XLOG_SEG_SIZE %d.",
4646                                                    ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
4647                                  errhint("It looks like you need to recompile or initdb.")));
4648         if (ControlFile->nameDataLen != NAMEDATALEN)
4649                 ereport(FATAL,
4650                                 (errmsg("database files are incompatible with server"),
4651                 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
4652                                   " but the server was compiled with NAMEDATALEN %d.",
4653                                   ControlFile->nameDataLen, NAMEDATALEN),
4654                                  errhint("It looks like you need to recompile or initdb.")));
4655         if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4656                 ereport(FATAL,
4657                                 (errmsg("database files are incompatible with server"),
4658                                  errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
4659                                           " but the server was compiled with INDEX_MAX_KEYS %d.",
4660                                                    ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
4661                                  errhint("It looks like you need to recompile or initdb.")));
4662         if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4663                 ereport(FATAL,
4664                                 (errmsg("database files are incompatible with server"),
4665                                  errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
4666                                 " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
4667                           ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4668                                  errhint("It looks like you need to recompile or initdb.")));
4669
4670 #ifdef HAVE_INT64_TIMESTAMP
4671         if (ControlFile->enableIntTimes != true)
4672                 ereport(FATAL,
4673                                 (errmsg("database files are incompatible with server"),
4674                                  errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
4675                                   " but the server was compiled with HAVE_INT64_TIMESTAMP."),
4676                                  errhint("It looks like you need to recompile or initdb.")));
4677 #else
4678         if (ControlFile->enableIntTimes != false)
4679                 ereport(FATAL,
4680                                 (errmsg("database files are incompatible with server"),
4681                                  errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
4682                            " but the server was compiled without HAVE_INT64_TIMESTAMP."),
4683                                  errhint("It looks like you need to recompile or initdb.")));
4684 #endif
4685
4686 #ifdef USE_FLOAT4_BYVAL
4687         if (ControlFile->float4ByVal != true)
4688                 ereport(FATAL,
4689                                 (errmsg("database files are incompatible with server"),
4690                                  errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4691                                           " but the server was compiled with USE_FLOAT4_BYVAL."),
4692                                  errhint("It looks like you need to recompile or initdb.")));
4693 #else
4694         if (ControlFile->float4ByVal != false)
4695                 ereport(FATAL,
4696                                 (errmsg("database files are incompatible with server"),
4697                 errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
4698                                   " but the server was compiled without USE_FLOAT4_BYVAL."),
4699                                  errhint("It looks like you need to recompile or initdb.")));
4700 #endif
4701
4702 #ifdef USE_FLOAT8_BYVAL
4703         if (ControlFile->float8ByVal != true)
4704                 ereport(FATAL,
4705                                 (errmsg("database files are incompatible with server"),
4706                                  errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4707                                           " but the server was compiled with USE_FLOAT8_BYVAL."),
4708                                  errhint("It looks like you need to recompile or initdb.")));
4709 #else
4710         if (ControlFile->float8ByVal != false)
4711                 ereport(FATAL,
4712                                 (errmsg("database files are incompatible with server"),
4713                 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4714                                   " but the server was compiled without USE_FLOAT8_BYVAL."),
4715                                  errhint("It looks like you need to recompile or initdb.")));
4716 #endif
4717 }
4718
4719 void
4720 UpdateControlFile(void)
4721 {
4722         int                     fd;
4723
4724         INIT_CRC32(ControlFile->crc);
4725         COMP_CRC32(ControlFile->crc,
4726                            (char *) ControlFile,
4727                            offsetof(ControlFileData, crc));
4728         FIN_CRC32(ControlFile->crc);
4729
4730         fd = BasicOpenFile(XLOG_CONTROL_FILE,
4731                                            O_RDWR | PG_BINARY,
4732                                            S_IRUSR | S_IWUSR);
4733         if (fd < 0)
4734                 ereport(PANIC,
4735                                 (errcode_for_file_access(),
4736                                  errmsg("could not open control file \"%s\": %m",
4737                                                 XLOG_CONTROL_FILE)));
4738
4739         errno = 0;
4740         if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4741         {
4742                 /* if write didn't set errno, assume problem is no disk space */
4743                 if (errno == 0)
4744                         errno = ENOSPC;
4745                 ereport(PANIC,
4746                                 (errcode_for_file_access(),
4747                                  errmsg("could not write to control file: %m")));
4748         }
4749
4750         if (pg_fsync(fd) != 0)
4751                 ereport(PANIC,
4752                                 (errcode_for_file_access(),
4753                                  errmsg("could not fsync control file: %m")));
4754
4755         if (close(fd))
4756                 ereport(PANIC,
4757                                 (errcode_for_file_access(),
4758                                  errmsg("could not close control file: %m")));
4759 }
4760
4761 /*
4762  * Returns the unique system identifier from control file.
4763  */
4764 uint64
4765 GetSystemIdentifier(void)
4766 {
4767         Assert(ControlFile != NULL);
4768         return ControlFile->system_identifier;
4769 }
4770
4771 /*
4772  * Initialization of shared memory for XLOG
4773  */
4774 Size
4775 XLOGShmemSize(void)
4776 {
4777         Size            size;
4778
4779         /* XLogCtl */
4780         size = sizeof(XLogCtlData);
4781         /* xlblocks array */
4782         size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
4783         /* extra alignment padding for XLOG I/O buffers */
4784         size = add_size(size, ALIGNOF_XLOG_BUFFER);
4785         /* and the buffers themselves */
4786         size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
4787
4788         /*
4789          * Note: we don't count ControlFileData, it comes out of the "slop factor"
4790          * added by CreateSharedMemoryAndSemaphores.  This lets us use this
4791          * routine again below to compute the actual allocation size.
4792          */
4793
4794         return size;
4795 }
4796
4797 void
4798 XLOGShmemInit(void)
4799 {
4800         bool            foundCFile,
4801                                 foundXLog;
4802         char       *allocptr;
4803
4804         ControlFile = (ControlFileData *)
4805                 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
4806         XLogCtl = (XLogCtlData *)
4807                 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
4808
4809         if (foundCFile || foundXLog)
4810         {
4811                 /* both should be present or neither */
4812                 Assert(foundCFile && foundXLog);
4813                 return;
4814         }
4815
4816         memset(XLogCtl, 0, sizeof(XLogCtlData));
4817
4818         /*
4819          * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
4820          * multiple of the alignment for same, so no extra alignment padding is
4821          * needed here.
4822          */
4823         allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
4824         XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
4825         memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
4826         allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
4827
4828         /*
4829          * Align the start of the page buffers to an ALIGNOF_XLOG_BUFFER boundary.
4830          */
4831         allocptr = (char *) TYPEALIGN(ALIGNOF_XLOG_BUFFER, allocptr);
4832         XLogCtl->pages = allocptr;
4833         memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
4834
4835         /*
4836          * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
4837          * in additional info.)
4838          */
4839         XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
4840         XLogCtl->SharedRecoveryInProgress = true;
4841         XLogCtl->Insert.currpage = (XLogPageHeader) (XLogCtl->pages);
4842         SpinLockInit(&XLogCtl->info_lck);
4843
4844         /*
4845          * If we are not in bootstrap mode, pg_control should already exist. Read
4846          * and validate it immediately (see comments in ReadControlFile() for the
4847          * reasons why).
4848          */
4849         if (!IsBootstrapProcessingMode())
4850                 ReadControlFile();
4851 }
4852
4853 /*
4854  * This func must be called ONCE on system install.  It creates pg_control
4855  * and the initial XLOG segment.
4856  */
4857 void
4858 BootStrapXLOG(void)
4859 {
4860         CheckPoint      checkPoint;
4861         char       *buffer;
4862         XLogPageHeader page;
4863         XLogLongPageHeader longpage;
4864         XLogRecord *record;
4865         bool            use_existent;
4866         uint64          sysidentifier;
4867         struct timeval tv;
4868         pg_crc32        crc;
4869
4870         /*
4871          * Select a hopefully-unique system identifier code for this installation.
4872          * We use the result of gettimeofday(), including the fractional seconds
4873          * field, as being about as unique as we can easily get.  (Think not to
4874          * use random(), since it hasn't been seeded and there's no portable way
4875          * to seed it other than the system clock value...)  The upper half of the
4876          * uint64 value is just the tv_sec part, while the lower half is the XOR
4877          * of tv_sec and tv_usec.  This is to ensure that we don't lose uniqueness
4878          * unnecessarily if "uint64" is really only 32 bits wide.  A person
4879          * knowing this encoding can determine the initialization time of the
4880          * installation, which could perhaps be useful sometimes.
4881          */
4882         gettimeofday(&tv, NULL);
4883         sysidentifier = ((uint64) tv.tv_sec) << 32;
4884         sysidentifier |= (uint32) (tv.tv_sec | tv.tv_usec);
4885
4886         /* First timeline ID is always 1 */
4887         ThisTimeLineID = 1;
4888
4889         /* page buffer must be aligned suitably for O_DIRECT */
4890         buffer = (char *) palloc(XLOG_BLCKSZ + ALIGNOF_XLOG_BUFFER);
4891         page = (XLogPageHeader) TYPEALIGN(ALIGNOF_XLOG_BUFFER, buffer);
4892         memset(page, 0, XLOG_BLCKSZ);
4893
4894         /*
4895          * Set up information for the initial checkpoint record
4896          *
4897          * The initial checkpoint record is written to the beginning of the
4898          * WAL segment with logid=0 logseg=1. The very first WAL segment, 0/0, is
4899          * not used, so that we can use 0/0 to mean "before any valid WAL segment".
4900          */
4901         checkPoint.redo.xlogid = 0;
4902         checkPoint.redo.xrecoff = XLogSegSize + SizeOfXLogLongPHD;
4903         checkPoint.ThisTimeLineID = ThisTimeLineID;
4904         checkPoint.nextXidEpoch = 0;
4905         checkPoint.nextXid = FirstNormalTransactionId;
4906         checkPoint.nextOid = FirstBootstrapObjectId;
4907         checkPoint.nextMulti = FirstMultiXactId;
4908         checkPoint.nextMultiOffset = 0;
4909         checkPoint.oldestXid = FirstNormalTransactionId;
4910         checkPoint.oldestXidDB = TemplateDbOid;
4911         checkPoint.time = (pg_time_t) time(NULL);
4912         checkPoint.oldestActiveXid = InvalidTransactionId;
4913
4914         ShmemVariableCache->nextXid = checkPoint.nextXid;
4915         ShmemVariableCache->nextOid = checkPoint.nextOid;
4916         ShmemVariableCache->oidCount = 0;
4917         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
4918         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
4919
4920         /* Set up the XLOG page header */
4921         page->xlp_magic = XLOG_PAGE_MAGIC;
4922         page->xlp_info = XLP_LONG_HEADER;
4923         page->xlp_tli = ThisTimeLineID;
4924         page->xlp_pageaddr.xlogid = 0;
4925         page->xlp_pageaddr.xrecoff = XLogSegSize;
4926         longpage = (XLogLongPageHeader) page;
4927         longpage->xlp_sysid = sysidentifier;
4928         longpage->xlp_seg_size = XLogSegSize;
4929         longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
4930
4931         /* Insert the initial checkpoint record */
4932         record = (XLogRecord *) ((char *) page + SizeOfXLogLongPHD);
4933         record->xl_prev.xlogid = 0;
4934         record->xl_prev.xrecoff = 0;
4935         record->xl_xid = InvalidTransactionId;
4936         record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
4937         record->xl_len = sizeof(checkPoint);
4938         record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
4939         record->xl_rmid = RM_XLOG_ID;
4940         memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
4941
4942         INIT_CRC32(crc);
4943         COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
4944         COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
4945                            SizeOfXLogRecord - sizeof(pg_crc32));
4946         FIN_CRC32(crc);
4947         record->xl_crc = crc;
4948
4949         /* Create first XLOG segment file */
4950         use_existent = false;
4951         openLogFile = XLogFileInit(0, 1, &use_existent, false);
4952
4953         /* Write the first page with the initial record */
4954         errno = 0;
4955         if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
4956         {
4957                 /* if write didn't set errno, assume problem is no disk space */
4958                 if (errno == 0)
4959                         errno = ENOSPC;
4960                 ereport(PANIC,
4961                                 (errcode_for_file_access(),
4962                           errmsg("could not write bootstrap transaction log file: %m")));
4963         }
4964
4965         if (pg_fsync(openLogFile) != 0)
4966                 ereport(PANIC,
4967                                 (errcode_for_file_access(),
4968                           errmsg("could not fsync bootstrap transaction log file: %m")));
4969
4970         if (close(openLogFile))
4971                 ereport(PANIC,
4972                                 (errcode_for_file_access(),
4973                           errmsg("could not close bootstrap transaction log file: %m")));
4974
4975         openLogFile = -1;
4976
4977         /* Now create pg_control */
4978
4979         memset(ControlFile, 0, sizeof(ControlFileData));
4980         /* Initialize pg_control status fields */
4981         ControlFile->system_identifier = sysidentifier;
4982         ControlFile->state = DB_SHUTDOWNED;
4983         ControlFile->time = checkPoint.time;
4984         ControlFile->checkPoint = checkPoint.redo;
4985         ControlFile->checkPointCopy = checkPoint;
4986
4987         /* Set important parameter values for use when replaying WAL */
4988         ControlFile->MaxConnections = MaxConnections;
4989         ControlFile->max_prepared_xacts = max_prepared_xacts;
4990         ControlFile->max_locks_per_xact = max_locks_per_xact;
4991         ControlFile->wal_level = wal_level;
4992
4993         /* some additional ControlFile fields are set in WriteControlFile() */
4994
4995         WriteControlFile();
4996
4997         /* Bootstrap the commit log, too */
4998         BootStrapCLOG();
4999         BootStrapSUBTRANS();
5000         BootStrapMultiXact();
5001
5002         pfree(buffer);
5003 }
5004
5005 static char *
5006 str_time(pg_time_t tnow)
5007 {
5008         static char buf[128];
5009
5010         pg_strftime(buf, sizeof(buf),
5011                                 "%Y-%m-%d %H:%M:%S %Z",
5012                                 pg_localtime(&tnow, log_timezone));
5013
5014         return buf;
5015 }
5016
5017 /*
5018  * Parse one line from recovery.conf. 'cmdline' is the raw line from the
5019  * file. If the line is parsed successfully, returns true, false indicates
5020  * syntax error. On success, *key_p and *value_p are set to the parameter
5021  * name and value on the line, respectively. If the line is an empty line,
5022  * consisting entirely of whitespace and comments, function returns true
5023  * and *keyp_p and *value_p are set to NULL.
5024  *
5025  * The pointers returned in *key_p and *value_p point to an internal buffer
5026  * that is valid only until the next call of parseRecoveryCommandFile().
5027  */
5028 static bool
5029 parseRecoveryCommandFileLine(char *cmdline, char **key_p, char **value_p)
5030 {
5031         char       *ptr;
5032         char       *bufp;
5033         char       *key;
5034         char       *value;
5035         static char *buf = NULL;
5036
5037         *key_p = *value_p = NULL;
5038
5039         /*
5040          * Allocate the buffer on first use. It's used to hold both the parameter
5041          * name and value.
5042          */
5043         if (buf == NULL)
5044                 buf = malloc(MAXPGPATH + 1);
5045         bufp = buf;
5046
5047         /* Skip any whitespace at the beginning of line */
5048         for (ptr = cmdline; *ptr; ptr++)
5049         {
5050                 if (!isspace((unsigned char) *ptr))
5051                         break;
5052         }
5053         /* Ignore empty lines */
5054         if (*ptr == '\0' || *ptr == '#')
5055                 return true;
5056
5057         /* Read the parameter name */
5058         key = bufp;
5059         while (*ptr && !isspace((unsigned char) *ptr) &&
5060                    *ptr != '=' && *ptr != '\'')
5061                 *(bufp++) = *(ptr++);
5062         *(bufp++) = '\0';
5063
5064         /* Skip to the beginning quote of the parameter value */
5065         ptr = strchr(ptr, '\'');
5066         if (!ptr)
5067                 return false;
5068         ptr++;
5069
5070         /* Read the parameter value to *bufp. Collapse any '' escapes as we go. */
5071         value = bufp;
5072         for (;;)
5073         {
5074                 if (*ptr == '\'')
5075                 {
5076                         ptr++;
5077                         if (*ptr == '\'')
5078                                 *(bufp++) = '\'';
5079                         else
5080                         {
5081                                 /* end of parameter */
5082                                 *bufp = '\0';
5083                                 break;
5084                         }
5085                 }
5086                 else if (*ptr == '\0')
5087                         return false;           /* unterminated quoted string */
5088                 else
5089                         *(bufp++) = *ptr;
5090
5091                 ptr++;
5092         }
5093         *(bufp++) = '\0';
5094
5095         /* Check that there's no garbage after the value */
5096         while (*ptr)
5097         {
5098                 if (*ptr == '#')
5099                         break;
5100                 if (!isspace((unsigned char) *ptr))
5101                         return false;
5102                 ptr++;
5103         }
5104
5105         /* Success! */
5106         *key_p = key;
5107         *value_p = value;
5108         return true;
5109 }
5110
5111 /*
5112  * See if there is a recovery command file (recovery.conf), and if so
5113  * read in parameters for archive recovery and XLOG streaming.
5114  *
5115  * XXX longer term intention is to expand this to
5116  * cater for additional parameters and controls
5117  * possibly use a flex lexer similar to the GUC one
5118  */
5119 static void
5120 readRecoveryCommandFile(void)
5121 {
5122         FILE       *fd;
5123         char            cmdline[MAXPGPATH];
5124         TimeLineID      rtli = 0;
5125         bool            rtliGiven = false;
5126         bool            syntaxError = false;
5127
5128         fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
5129         if (fd == NULL)
5130         {
5131                 if (errno == ENOENT)
5132                         return;                         /* not there, so no archive recovery */
5133                 ereport(FATAL,
5134                                 (errcode_for_file_access(),
5135                                  errmsg("could not open recovery command file \"%s\": %m",
5136                                                 RECOVERY_COMMAND_FILE)));
5137         }
5138
5139         /*
5140          * Parse the file...
5141          */
5142         while (fgets(cmdline, sizeof(cmdline), fd) != NULL)
5143         {
5144                 char       *tok1;
5145                 char       *tok2;
5146
5147                 if (!parseRecoveryCommandFileLine(cmdline, &tok1, &tok2))
5148                 {
5149                         syntaxError = true;
5150                         break;
5151                 }
5152                 if (tok1 == NULL)
5153                         continue;
5154
5155                 if (strcmp(tok1, "restore_command") == 0)
5156                 {
5157                         recoveryRestoreCommand = pstrdup(tok2);
5158                         ereport(DEBUG2,
5159                                         (errmsg("restore_command = '%s'",
5160                                                         recoveryRestoreCommand)));
5161                 }
5162                 else if (strcmp(tok1, "recovery_end_command") == 0)
5163                 {
5164                         recoveryEndCommand = pstrdup(tok2);
5165                         ereport(DEBUG2,
5166                                         (errmsg("recovery_end_command = '%s'",
5167                                                         recoveryEndCommand)));
5168                 }
5169                 else if (strcmp(tok1, "archive_cleanup_command") == 0)
5170                 {
5171                         archiveCleanupCommand = pstrdup(tok2);
5172                         ereport(DEBUG2,
5173                                         (errmsg("archive_cleanup_command = '%s'",
5174                                                         archiveCleanupCommand)));
5175                 }
5176                 else if (strcmp(tok1, "recovery_target_timeline") == 0)
5177                 {
5178                         rtliGiven = true;
5179                         if (strcmp(tok2, "latest") == 0)
5180                                 rtli = 0;
5181                         else
5182                         {
5183                                 errno = 0;
5184                                 rtli = (TimeLineID) strtoul(tok2, NULL, 0);
5185                                 if (errno == EINVAL || errno == ERANGE)
5186                                         ereport(FATAL,
5187                                                         (errmsg("recovery_target_timeline is not a valid number: \"%s\"",
5188                                                                         tok2)));
5189                         }
5190                         if (rtli)
5191                                 ereport(DEBUG2,
5192                                                 (errmsg("recovery_target_timeline = %u", rtli)));
5193                         else
5194                                 ereport(DEBUG2,
5195                                                 (errmsg("recovery_target_timeline = latest")));
5196                 }
5197                 else if (strcmp(tok1, "recovery_target_xid") == 0)
5198                 {
5199                         errno = 0;
5200                         recoveryTargetXid = (TransactionId) strtoul(tok2, NULL, 0);
5201                         if (errno == EINVAL || errno == ERANGE)
5202                                 ereport(FATAL,
5203                                  (errmsg("recovery_target_xid is not a valid number: \"%s\"",
5204                                                  tok2)));
5205                         ereport(DEBUG2,
5206                                         (errmsg("recovery_target_xid = %u",
5207                                                         recoveryTargetXid)));
5208                         recoveryTarget = RECOVERY_TARGET_XID;
5209                 }
5210                 else if (strcmp(tok1, "recovery_target_time") == 0)
5211                 {
5212                         /*
5213                          * if recovery_target_xid specified, then this overrides
5214                          * recovery_target_time
5215                          */
5216                         if (recoveryTarget == RECOVERY_TARGET_XID)
5217                                 continue;
5218                         recoveryTarget = RECOVERY_TARGET_TIME;
5219
5220                         /*
5221                          * Convert the time string given by the user to TimestampTz form.
5222                          */
5223                         recoveryTargetTime =
5224                                 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
5225                                                                                                                 CStringGetDatum(tok2),
5226                                                                                                 ObjectIdGetDatum(InvalidOid),
5227                                                                                                                 Int32GetDatum(-1)));
5228                         ereport(DEBUG2,
5229                                         (errmsg("recovery_target_time = '%s'",
5230                                                         timestamptz_to_str(recoveryTargetTime))));
5231                 }
5232                 else if (strcmp(tok1, "recovery_target_inclusive") == 0)
5233                 {
5234                         /*
5235                          * does nothing if a recovery_target is not also set
5236                          */
5237                         if (!parse_bool(tok2, &recoveryTargetInclusive))
5238                                 ereport(ERROR,
5239                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5240                                                  errmsg("parameter \"%s\" requires a Boolean value", "recovery_target_inclusive")));
5241                         ereport(DEBUG2,
5242                                         (errmsg("recovery_target_inclusive = %s", tok2)));
5243                 }
5244                 else if (strcmp(tok1, "standby_mode") == 0)
5245                 {
5246                         if (!parse_bool(tok2, &StandbyMode))
5247                                 ereport(ERROR,
5248                                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5249                                                  errmsg("parameter \"%s\" requires a Boolean value", "standby_mode")));
5250                         ereport(DEBUG2,
5251                                         (errmsg("standby_mode = '%s'", tok2)));
5252                 }
5253                 else if (strcmp(tok1, "primary_conninfo") == 0)
5254                 {
5255                         PrimaryConnInfo = pstrdup(tok2);
5256                         ereport(DEBUG2,
5257                                         (errmsg("primary_conninfo = '%s'",
5258                                                         PrimaryConnInfo)));
5259                 }
5260                 else if (strcmp(tok1, "trigger_file") == 0)
5261                 {
5262                         TriggerFile = pstrdup(tok2);
5263                         ereport(DEBUG2,
5264                                         (errmsg("trigger_file = '%s'",
5265                                                         TriggerFile)));
5266                 }
5267                 else
5268                         ereport(FATAL,
5269                                         (errmsg("unrecognized recovery parameter \"%s\"",
5270                                                         tok1)));
5271         }
5272
5273         FreeFile(fd);
5274
5275         if (syntaxError)
5276                 ereport(FATAL,
5277                                 (errmsg("syntax error in recovery command file: %s",
5278                                                 cmdline),
5279                           errhint("Lines should have the format parameter = 'value'.")));
5280
5281         /*
5282          * Check for compulsory parameters
5283          */
5284         if (StandbyMode)
5285         {
5286                 if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
5287                         ereport(WARNING,
5288                                         (errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
5289                                                         RECOVERY_COMMAND_FILE),
5290                                          errhint("The database server will regularly poll the pg_xlog subdirectory to check for files placed there.")));
5291         }
5292         else
5293         {
5294                 if (recoveryRestoreCommand == NULL)
5295                         ereport(FATAL,
5296                                         (errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
5297                                                         RECOVERY_COMMAND_FILE)));
5298         }
5299
5300         /* Enable fetching from archive recovery area */
5301         InArchiveRecovery = true;
5302
5303         /*
5304          * If user specified recovery_target_timeline, validate it or compute the
5305          * "latest" value.      We can't do this until after we've gotten the restore
5306          * command and set InArchiveRecovery, because we need to fetch timeline
5307          * history files from the archive.
5308          */
5309         if (rtliGiven)
5310         {
5311                 if (rtli)
5312                 {
5313                         /* Timeline 1 does not have a history file, all else should */
5314                         if (rtli != 1 && !existsTimeLineHistory(rtli))
5315                                 ereport(FATAL,
5316                                                 (errmsg("recovery target timeline %u does not exist",
5317                                                                 rtli)));
5318                         recoveryTargetTLI = rtli;
5319                 }
5320                 else
5321                 {
5322                         /* We start the "latest" search from pg_control's timeline */
5323                         recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
5324                 }
5325         }
5326 }
5327
5328 /*
5329  * Exit archive-recovery state
5330  */
5331 static void
5332 exitArchiveRecovery(TimeLineID endTLI, uint32 endLogId, uint32 endLogSeg)
5333 {
5334         char            recoveryPath[MAXPGPATH];
5335         char            xlogpath[MAXPGPATH];
5336         XLogRecPtr      InvalidXLogRecPtr = {0, 0};
5337
5338         /*
5339          * We are no longer in archive recovery state.
5340          */
5341         InArchiveRecovery = false;
5342
5343         /*
5344          * Update min recovery point one last time.
5345          */
5346         UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
5347
5348         /*
5349          * If the ending log segment is still open, close it (to avoid problems on
5350          * Windows with trying to rename or delete an open file).
5351          */
5352         if (readFile >= 0)
5353         {
5354                 close(readFile);
5355                 readFile = -1;
5356         }
5357
5358         /*
5359          * If the segment was fetched from archival storage, we want to replace
5360          * the existing xlog segment (if any) with the archival version.  This is
5361          * because whatever is in XLOGDIR is very possibly older than what we have
5362          * from the archives, since it could have come from restoring a PGDATA
5363          * backup.      In any case, the archival version certainly is more
5364          * descriptive of what our current database state is, because that is what
5365          * we replayed from.
5366          *
5367          * Note that if we are establishing a new timeline, ThisTimeLineID is
5368          * already set to the new value, and so we will create a new file instead
5369          * of overwriting any existing file.  (This is, in fact, always the case
5370          * at present.)
5371          */
5372         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
5373         XLogFilePath(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
5374
5375         if (restoredFromArchive)
5376         {
5377                 ereport(DEBUG3,
5378                                 (errmsg_internal("moving last restored xlog to \"%s\"",
5379                                                                  xlogpath)));
5380                 unlink(xlogpath);               /* might or might not exist */
5381                 if (rename(recoveryPath, xlogpath) != 0)
5382                         ereport(FATAL,
5383                                         (errcode_for_file_access(),
5384                                          errmsg("could not rename file \"%s\" to \"%s\": %m",
5385                                                         recoveryPath, xlogpath)));
5386                 /* XXX might we need to fix permissions on the file? */
5387         }
5388         else
5389         {
5390                 /*
5391                  * If the latest segment is not archival, but there's still a
5392                  * RECOVERYXLOG laying about, get rid of it.
5393                  */
5394                 unlink(recoveryPath);   /* ignore any error */
5395
5396                 /*
5397                  * If we are establishing a new timeline, we have to copy data from
5398                  * the last WAL segment of the old timeline to create a starting WAL
5399                  * segment for the new timeline.
5400                  *
5401                  * Notify the archiver that the last WAL segment of the old timeline
5402                  * is ready to copy to archival storage. Otherwise, it is not archived
5403                  * for a while.
5404                  */
5405                 if (endTLI != ThisTimeLineID)
5406                 {
5407                         XLogFileCopy(endLogId, endLogSeg,
5408                                                  endTLI, endLogId, endLogSeg);
5409
5410                         if (XLogArchivingActive())
5411                         {
5412                                 XLogFileName(xlogpath, endTLI, endLogId, endLogSeg);
5413                                 XLogArchiveNotify(xlogpath);
5414                         }
5415                 }
5416         }
5417
5418         /*
5419          * Let's just make real sure there are not .ready or .done flags posted
5420          * for the new segment.
5421          */
5422         XLogFileName(xlogpath, ThisTimeLineID, endLogId, endLogSeg);
5423         XLogArchiveCleanup(xlogpath);
5424
5425         /* Get rid of any remaining recovered timeline-history file, too */
5426         snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
5427         unlink(recoveryPath);           /* ignore any error */
5428
5429         /*
5430          * Rename the config file out of the way, so that we don't accidentally
5431          * re-enter archive recovery mode in a subsequent crash.
5432          */
5433         unlink(RECOVERY_COMMAND_DONE);
5434         if (rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE) != 0)
5435                 ereport(FATAL,
5436                                 (errcode_for_file_access(),
5437                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
5438                                                 RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE)));
5439
5440         ereport(LOG,
5441                         (errmsg("archive recovery complete")));
5442 }
5443
5444 /*
5445  * For point-in-time recovery, this function decides whether we want to
5446  * stop applying the XLOG at or after the current record.
5447  *
5448  * Returns TRUE if we are stopping, FALSE otherwise.  On TRUE return,
5449  * *includeThis is set TRUE if we should apply this record before stopping.
5450  *
5451  * We also track the timestamp of the latest applied COMMIT/ABORT record
5452  * in XLogCtl->recoveryLastXTime, for logging purposes.
5453  * Also, some information is saved in recoveryStopXid et al for use in
5454  * annotating the new timeline's history file.
5455  */
5456 static bool
5457 recoveryStopsHere(XLogRecord *record, bool *includeThis)
5458 {
5459         bool            stopsHere;
5460         uint8           record_info;
5461         TimestampTz recordXtime;
5462
5463         /* We only consider stopping at COMMIT or ABORT records */
5464         if (record->xl_rmid != RM_XACT_ID)
5465                 return false;
5466         record_info = record->xl_info & ~XLR_INFO_MASK;
5467         if (record_info == XLOG_XACT_COMMIT)
5468         {
5469                 xl_xact_commit *recordXactCommitData;
5470
5471                 recordXactCommitData = (xl_xact_commit *) XLogRecGetData(record);
5472                 recordXtime = recordXactCommitData->xact_time;
5473         }
5474         else if (record_info == XLOG_XACT_ABORT)
5475         {
5476                 xl_xact_abort *recordXactAbortData;
5477
5478                 recordXactAbortData = (xl_xact_abort *) XLogRecGetData(record);
5479                 recordXtime = recordXactAbortData->xact_time;
5480         }
5481         else
5482                 return false;
5483
5484         /* Do we have a PITR target at all? */
5485         if (recoveryTarget == RECOVERY_TARGET_UNSET)
5486         {
5487                 SetLatestXTime(recordXtime);
5488                 return false;
5489         }
5490
5491         if (recoveryTarget == RECOVERY_TARGET_XID)
5492         {
5493                 /*
5494                  * there can be only one transaction end record with this exact
5495                  * transactionid
5496                  *
5497                  * when testing for an xid, we MUST test for equality only, since
5498                  * transactions are numbered in the order they start, not the order
5499                  * they complete. A higher numbered xid will complete before you about
5500                  * 50% of the time...
5501                  */
5502                 stopsHere = (record->xl_xid == recoveryTargetXid);
5503                 if (stopsHere)
5504                         *includeThis = recoveryTargetInclusive;
5505         }
5506         else
5507         {
5508                 /*
5509                  * there can be many transactions that share the same commit time, so
5510                  * we stop after the last one, if we are inclusive, or stop at the
5511                  * first one if we are exclusive
5512                  */
5513                 if (recoveryTargetInclusive)
5514                         stopsHere = (recordXtime > recoveryTargetTime);
5515                 else
5516                         stopsHere = (recordXtime >= recoveryTargetTime);
5517                 if (stopsHere)
5518                         *includeThis = false;
5519         }
5520
5521         if (stopsHere)
5522         {
5523                 recoveryStopXid = record->xl_xid;
5524                 recoveryStopTime = recordXtime;
5525                 recoveryStopAfter = *includeThis;
5526
5527                 if (record_info == XLOG_XACT_COMMIT)
5528                 {
5529                         if (recoveryStopAfter)
5530                                 ereport(LOG,
5531                                                 (errmsg("recovery stopping after commit of transaction %u, time %s",
5532                                                                 recoveryStopXid,
5533                                                                 timestamptz_to_str(recoveryStopTime))));
5534                         else
5535                                 ereport(LOG,
5536                                                 (errmsg("recovery stopping before commit of transaction %u, time %s",
5537                                                                 recoveryStopXid,
5538                                                                 timestamptz_to_str(recoveryStopTime))));
5539                 }
5540                 else
5541                 {
5542                         if (recoveryStopAfter)
5543                                 ereport(LOG,
5544                                                 (errmsg("recovery stopping after abort of transaction %u, time %s",
5545                                                                 recoveryStopXid,
5546                                                                 timestamptz_to_str(recoveryStopTime))));
5547                         else
5548                                 ereport(LOG,
5549                                                 (errmsg("recovery stopping before abort of transaction %u, time %s",
5550                                                                 recoveryStopXid,
5551                                                                 timestamptz_to_str(recoveryStopTime))));
5552                 }
5553
5554                 if (recoveryStopAfter)
5555                         SetLatestXTime(recordXtime);
5556         }
5557         else
5558                 SetLatestXTime(recordXtime);
5559
5560         return stopsHere;
5561 }
5562
5563 /*
5564  * Save timestamp of latest processed commit/abort record.
5565  *
5566  * We keep this in XLogCtl, not a simple static variable, so that it can be
5567  * seen by processes other than the startup process.  Note in particular
5568  * that CreateRestartPoint is executed in the bgwriter.
5569  */
5570 static void
5571 SetLatestXTime(TimestampTz xtime)
5572 {
5573         /* use volatile pointer to prevent code rearrangement */
5574         volatile XLogCtlData *xlogctl = XLogCtl;
5575
5576         SpinLockAcquire(&xlogctl->info_lck);
5577         xlogctl->recoveryLastXTime = xtime;
5578         SpinLockRelease(&xlogctl->info_lck);
5579 }
5580
5581 /*
5582  * Fetch timestamp of latest processed commit/abort record.
5583  */
5584 static TimestampTz
5585 GetLatestXTime(void)
5586 {
5587         /* use volatile pointer to prevent code rearrangement */
5588         volatile XLogCtlData *xlogctl = XLogCtl;
5589         TimestampTz xtime;
5590
5591         SpinLockAcquire(&xlogctl->info_lck);
5592         xtime = xlogctl->recoveryLastXTime;
5593         SpinLockRelease(&xlogctl->info_lck);
5594
5595         return xtime;
5596 }
5597
5598 /*
5599  * Returns bool with current recovery mode, a global state.
5600  */
5601 Datum
5602 pg_is_in_recovery(PG_FUNCTION_ARGS)
5603 {
5604         PG_RETURN_BOOL(RecoveryInProgress());
5605 }
5606
5607 /*
5608  * Returns time of receipt of current chunk of XLOG data, as well as
5609  * whether it was received from streaming replication or from archives.
5610  */
5611 void
5612 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
5613 {
5614         /*
5615          * This must be executed in the startup process, since we don't export the
5616          * relevant state to shared memory.
5617          */
5618         Assert(InRecovery);
5619
5620         *rtime = XLogReceiptTime;
5621         *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
5622 }
5623
/*
 * Raise an ERROR if an integer parameter on this server is set lower than
 * the value it had on the master.
 *
 *	param_name	name of the parameter, used verbatim in the message; it is
 *				a parameter name and so does not require translation
 *	currValue	the value in effect on this server
 *	minValue	the minimum acceptable value (the master's setting)
 *
 * Every use of a macro argument is parenthesized so that arguments
 * containing low-precedence operators (for example "a ? b : c") are compared
 * as a whole rather than being re-associated with the "<" operator.
 */
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
do { \
	if ((currValue) < (minValue)) \
		ereport(ERROR, \
				(errmsg("hot standby is not possible because " \
						"%s = %d is a lower setting than on the master server " \
						"(its value was %d)", \
						(param_name), \
						(currValue), \
						(minValue)))); \
} while(0)
5639
5640 /*
5641  * Check to see if required parameters are set high enough on this server
5642  * for various aspects of recovery operation.
5643  */
5644 static void
5645 CheckRequiredParameterValues(void)
5646 {
5647         /*
5648          * For archive recovery, the WAL must be generated with at least 'archive'
5649          * wal_level.
5650          */
5651         if (InArchiveRecovery && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
5652         {
5653                 ereport(WARNING,
5654                                 (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
5655                                  errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
5656         }
5657
5658         /*
5659          * For Hot Standby, the WAL must be generated with 'hot_standby' mode, and
5660          * we must have at least as many backend slots as the primary.
5661          */
5662         if (InArchiveRecovery && EnableHotStandby)
5663         {
5664                 if (ControlFile->wal_level < WAL_LEVEL_HOT_STANDBY)
5665                         ereport(ERROR,
5666                                         (errmsg("hot standby is not possible because wal_level was not set to \"hot_standby\" on the master server"),
5667                                          errhint("Either set wal_level to \"hot_standby\" on the master, or turn off hot_standby here.")));
5668
5669                 /* We ignore autovacuum_max_workers when we make this test. */
5670                 RecoveryRequiresIntParameter("max_connections",
5671                                                                          MaxConnections,
5672                                                                          ControlFile->MaxConnections);
5673                 RecoveryRequiresIntParameter("max_prepared_xacts",
5674                                                                          max_prepared_xacts,
5675                                                                          ControlFile->max_prepared_xacts);
5676                 RecoveryRequiresIntParameter("max_locks_per_xact",
5677                                                                          max_locks_per_xact,
5678                                                                          ControlFile->max_locks_per_xact);
5679         }
5680 }
5681
5682 /*
5683  * This must be called ONCE during postmaster or standalone-backend startup
5684  */
5685 void
5686 StartupXLOG(void)
5687 {
5688         XLogCtlInsert *Insert;
5689         CheckPoint      checkPoint;
5690         bool            wasShutdown;
5691         bool            reachedStopPoint = false;
5692         bool            haveBackupLabel = false;
5693         XLogRecPtr      RecPtr,
5694                                 checkPointLoc,
5695                                 EndOfLog;
5696         uint32          endLogId;
5697         uint32          endLogSeg;
5698         XLogRecord *record;
5699         uint32          freespace;
5700         TransactionId oldestActiveXID;
5701
5702         /*
5703          * Read control file and check XLOG status looks valid.
5704          *
5705          * Note: in most control paths, *ControlFile is already valid and we need
5706          * not do ReadControlFile() here, but might as well do it to be sure.
5707          */
5708         ReadControlFile();
5709
5710         if (ControlFile->state < DB_SHUTDOWNED ||
5711                 ControlFile->state > DB_IN_PRODUCTION ||
5712                 !XRecOffIsValid(ControlFile->checkPoint.xrecoff))
5713                 ereport(FATAL,
5714                                 (errmsg("control file contains invalid data")));
5715
5716         if (ControlFile->state == DB_SHUTDOWNED)
5717                 ereport(LOG,
5718                                 (errmsg("database system was shut down at %s",
5719                                                 str_time(ControlFile->time))));
5720         else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
5721                 ereport(LOG,
5722                                 (errmsg("database system was shut down in recovery at %s",
5723                                                 str_time(ControlFile->time))));
5724         else if (ControlFile->state == DB_SHUTDOWNING)
5725                 ereport(LOG,
5726                                 (errmsg("database system shutdown was interrupted; last known up at %s",
5727                                                 str_time(ControlFile->time))));
5728         else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
5729                 ereport(LOG,
5730                    (errmsg("database system was interrupted while in recovery at %s",
5731                                    str_time(ControlFile->time)),
5732                         errhint("This probably means that some data is corrupted and"
5733                                         " you will have to use the last backup for recovery.")));
5734         else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
5735                 ereport(LOG,
5736                                 (errmsg("database system was interrupted while in recovery at log time %s",
5737                                                 str_time(ControlFile->checkPointCopy.time)),
5738                                  errhint("If this has occurred more than once some data might be corrupted"
5739                           " and you might need to choose an earlier recovery target.")));
5740         else if (ControlFile->state == DB_IN_PRODUCTION)
5741                 ereport(LOG,
5742                           (errmsg("database system was interrupted; last known up at %s",
5743                                           str_time(ControlFile->time))));
5744
5745         /* This is just to allow attaching to startup process with a debugger */
5746 #ifdef XLOG_REPLAY_DELAY
5747         if (ControlFile->state != DB_SHUTDOWNED)
5748                 pg_usleep(60000000L);
5749 #endif
5750
5751         /*
5752          * Verify that pg_xlog and pg_xlog/archive_status exist.  In cases where
5753          * someone has performed a copy for PITR, these directories may have been
5754          * excluded and need to be re-created.
5755          */
5756         ValidateXLOGDirectoryStructure();
5757
5758         /*
5759          * Clear out any old relcache cache files.      This is *necessary* if we do
5760          * any WAL replay, since that would probably result in the cache files
5761          * being out of sync with database reality.  In theory we could leave them
5762          * in place if the database had been cleanly shut down, but it seems
5763          * safest to just remove them always and let them be rebuilt during the
5764          * first backend startup.
5765          */
5766         RelationCacheInitFileRemove();
5767
5768         /*
5769          * Initialize on the assumption we want to recover to the same timeline
5770          * that's active according to pg_control.
5771          */
5772         recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
5773
5774         /*
5775          * Check for recovery control file, and if so set up state for offline
5776          * recovery
5777          */
5778         readRecoveryCommandFile();
5779
5780         /* Now we can determine the list of expected TLIs */
5781         expectedTLIs = readTimeLineHistory(recoveryTargetTLI);
5782
5783         /*
5784          * If pg_control's timeline is not in expectedTLIs, then we cannot
5785          * proceed: the backup is not part of the history of the requested
5786          * timeline.
5787          */
5788         if (!list_member_int(expectedTLIs,
5789                                                  (int) ControlFile->checkPointCopy.ThisTimeLineID))
5790                 ereport(FATAL,
5791                                 (errmsg("requested timeline %u is not a child of database system timeline %u",
5792                                                 recoveryTargetTLI,
5793                                                 ControlFile->checkPointCopy.ThisTimeLineID)));
5794
5795         /*
5796          * Save the selected recovery target timeline ID and
5797          * archive_cleanup_command in shared memory so that other processes can
5798          * see them
5799          */
5800         XLogCtl->RecoveryTargetTLI = recoveryTargetTLI;
5801         strncpy(XLogCtl->archiveCleanupCommand,
5802                         archiveCleanupCommand ? archiveCleanupCommand : "",
5803                         sizeof(XLogCtl->archiveCleanupCommand));
5804
5805         if (InArchiveRecovery)
5806         {
5807                 if (StandbyMode)
5808                         ereport(LOG,
5809                                         (errmsg("entering standby mode")));
5810                 else if (recoveryTarget == RECOVERY_TARGET_XID)
5811                         ereport(LOG,
5812                                         (errmsg("starting point-in-time recovery to XID %u",
5813                                                         recoveryTargetXid)));
5814                 else if (recoveryTarget == RECOVERY_TARGET_TIME)
5815                         ereport(LOG,
5816                                         (errmsg("starting point-in-time recovery to %s",
5817                                                         timestamptz_to_str(recoveryTargetTime))));
5818                 else
5819                         ereport(LOG,
5820                                         (errmsg("starting archive recovery")));
5821         }
5822
5823         if (read_backup_label(&checkPointLoc))
5824         {
5825                 /*
5826                  * When a backup_label file is present, we want to roll forward from
5827                  * the checkpoint it identifies, rather than using pg_control.
5828                  */
5829                 record = ReadCheckpointRecord(checkPointLoc, 0);
5830                 if (record != NULL)
5831                 {
5832                         memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
5833                         wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
5834                         ereport(DEBUG1,
5835                                         (errmsg("checkpoint record is at %X/%X",
5836                                                         checkPointLoc.xlogid, checkPointLoc.xrecoff)));
5837                         InRecovery = true;      /* force recovery even if SHUTDOWNED */
5838
5839                         /*
5840                          * Make sure that REDO location exists. This may not be
5841                          * the case if there was a crash during an online backup,
5842                          * which left a backup_label around that references a WAL
5843                          * segment that's already been archived.
5844                          */
5845                         if (XLByteLT(checkPoint.redo, checkPointLoc))
5846                         {
5847                                 if (!ReadRecord(&(checkPoint.redo), LOG, false))
5848                                         ereport(FATAL,
5849                                                         (errmsg("could not find redo location referenced by checkpoint record"),
5850                                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
5851                         }
5852                 }
5853                 else
5854                 {
5855                         ereport(FATAL,
5856                                         (errmsg("could not locate required checkpoint record"),
5857                                          errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
5858                         wasShutdown = false; /* keep compiler quiet */
5859                 }
5860                 /* set flag to delete it later */
5861                 haveBackupLabel = true;
5862         }
5863         else
5864         {
5865                 /*
5866                  * Get the last valid checkpoint record.  If the latest one according
5867                  * to pg_control is broken, try the next-to-last one.
5868                  */
5869                 checkPointLoc = ControlFile->checkPoint;
5870                 RedoStartLSN = ControlFile->checkPointCopy.redo;
5871                 record = ReadCheckpointRecord(checkPointLoc, 1);
5872                 if (record != NULL)
5873                 {
5874                         ereport(DEBUG1,
5875                                         (errmsg("checkpoint record is at %X/%X",
5876                                                         checkPointLoc.xlogid, checkPointLoc.xrecoff)));
5877                 }
5878                 else if (StandbyMode)
5879                 {
5880                         /*
5881                          * The last valid checkpoint record required for a streaming
5882                          * recovery exists in neither standby nor the primary.
5883                          */
5884                         ereport(PANIC,
5885                                         (errmsg("could not locate a valid checkpoint record")));
5886                 }
5887                 else
5888                 {
5889                         checkPointLoc = ControlFile->prevCheckPoint;
5890                         record = ReadCheckpointRecord(checkPointLoc, 2);
5891                         if (record != NULL)
5892                         {
5893                                 ereport(LOG,
5894                                                 (errmsg("using previous checkpoint record at %X/%X",
5895                                                           checkPointLoc.xlogid, checkPointLoc.xrecoff)));
5896                                 InRecovery = true;              /* force recovery even if SHUTDOWNED */
5897                         }
5898                         else
5899                                 ereport(PANIC,
5900                                          (errmsg("could not locate a valid checkpoint record")));
5901                 }
5902                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
5903                 wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
5904         }
5905
5906         LastRec = RecPtr = checkPointLoc;
5907
5908         ereport(DEBUG1,
5909                         (errmsg("redo record is at %X/%X; shutdown %s",
5910                                         checkPoint.redo.xlogid, checkPoint.redo.xrecoff,
5911                                         wasShutdown ? "TRUE" : "FALSE")));
5912         ereport(DEBUG1,
5913                         (errmsg("next transaction ID: %u/%u; next OID: %u",
5914                                         checkPoint.nextXidEpoch, checkPoint.nextXid,
5915                                         checkPoint.nextOid)));
5916         ereport(DEBUG1,
5917                         (errmsg("next MultiXactId: %u; next MultiXactOffset: %u",
5918                                         checkPoint.nextMulti, checkPoint.nextMultiOffset)));
5919         ereport(DEBUG1,
5920                         (errmsg("oldest unfrozen transaction ID: %u, in database %u",
5921                                         checkPoint.oldestXid, checkPoint.oldestXidDB)));
5922         if (!TransactionIdIsNormal(checkPoint.nextXid))
5923                 ereport(PANIC,
5924                                 (errmsg("invalid next transaction ID")));
5925
5926         ShmemVariableCache->nextXid = checkPoint.nextXid;
5927         ShmemVariableCache->nextOid = checkPoint.nextOid;
5928         ShmemVariableCache->oidCount = 0;
5929         MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5930         SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5931
5932         /*
5933          * We must replay WAL entries using the same TimeLineID they were created
5934          * under, so temporarily adopt the TLI indicated by the checkpoint (see
5935          * also xlog_redo()).
5936          */
5937         ThisTimeLineID = checkPoint.ThisTimeLineID;
5938
5939         RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
5940
5941         if (XLByteLT(RecPtr, checkPoint.redo))
5942                 ereport(PANIC,
5943                                 (errmsg("invalid redo in checkpoint record")));
5944
5945         /*
5946          * Check whether we need to force recovery from WAL.  If it appears to
5947          * have been a clean shutdown and we did not have a recovery.conf file,
5948          * then assume no recovery needed.
5949          */
5950         if (XLByteLT(checkPoint.redo, RecPtr))
5951         {
5952                 if (wasShutdown)
5953                         ereport(PANIC,
5954                                         (errmsg("invalid redo record in shutdown checkpoint")));
5955                 InRecovery = true;
5956         }
5957         else if (ControlFile->state != DB_SHUTDOWNED)
5958                 InRecovery = true;
5959         else if (InArchiveRecovery)
5960         {
5961                 /* force recovery due to presence of recovery.conf */
5962                 InRecovery = true;
5963         }
5964
5965         /* REDO */
5966         if (InRecovery)
5967         {
5968                 int                     rmid;
5969
5970                 /* use volatile pointer to prevent code rearrangement */
5971                 volatile XLogCtlData *xlogctl = XLogCtl;
5972
5973                 /*
5974                  * Update pg_control to show that we are recovering and to show the
5975                  * selected checkpoint as the place we are starting from. We also mark
5976                  * pg_control with any minimum recovery stop point obtained from a
5977                  * backup history file.
5978                  */
5979                 if (InArchiveRecovery)
5980                         ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
5981                 else
5982                 {
5983                         ereport(LOG,
5984                                         (errmsg("database system was not properly shut down; "
5985                                                         "automatic recovery in progress")));
5986                         ControlFile->state = DB_IN_CRASH_RECOVERY;
5987                 }
5988                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
5989                 ControlFile->checkPoint = checkPointLoc;
5990                 ControlFile->checkPointCopy = checkPoint;
5991                 if (InArchiveRecovery)
5992                 {
5993                         /* initialize minRecoveryPoint if not set yet */
5994                         if (XLByteLT(ControlFile->minRecoveryPoint, checkPoint.redo))
5995                                 ControlFile->minRecoveryPoint = checkPoint.redo;
5996                 }
5997
5998                 /*
5999                  * set backupStartupPoint if we're starting archive recovery from a
6000                  * base backup
6001                  */
6002                 if (haveBackupLabel)
6003                         ControlFile->backupStartPoint = checkPoint.redo;
6004                 ControlFile->time = (pg_time_t) time(NULL);
6005                 /* No need to hold ControlFileLock yet, we aren't up far enough */
6006                 UpdateControlFile();
6007
6008                 /* initialize our local copy of minRecoveryPoint */
6009                 minRecoveryPoint = ControlFile->minRecoveryPoint;
6010
6011                 /*
6012                  * Reset pgstat data, because it may be invalid after recovery.
6013                  */
6014                 pgstat_reset_all();
6015
6016                 /*
6017                  * If there was a backup label file, it's done its job and the info
6018                  * has now been propagated into pg_control.  We must get rid of the
6019                  * label file so that if we crash during recovery, we'll pick up at
6020                  * the latest recovery restartpoint instead of going all the way back
6021                  * to the backup start point.  It seems prudent though to just rename
6022                  * the file out of the way rather than delete it completely.
6023                  */
6024                 if (haveBackupLabel)
6025                 {
6026                         unlink(BACKUP_LABEL_OLD);
6027                         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) != 0)
6028                                 ereport(FATAL,
6029                                                 (errcode_for_file_access(),
6030                                                  errmsg("could not rename file \"%s\" to \"%s\": %m",
6031                                                                 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
6032                 }
6033
6034                 /* Check that the GUCs used to generate the WAL allow recovery */
6035                 CheckRequiredParameterValues();
6036
6037                 /*
6038                  * Initialize for Hot Standby, if enabled. We won't let backends in
6039                  * yet, not until we've reached the min recovery point specified in
6040                  * control file and we've established a recovery snapshot from a
6041                  * running-xacts WAL record.
6042                  */
6043                 if (InArchiveRecovery && EnableHotStandby)
6044                 {
6045                         TransactionId *xids;
6046                         int                     nxids;
6047
6048                         ereport(DEBUG1,
6049                                         (errmsg("initializing for hot standby")));
6050
6051                         InitRecoveryTransactionEnvironment();
6052
6053                         if (wasShutdown)
6054                                 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
6055                         else
6056                                 oldestActiveXID = checkPoint.oldestActiveXid;
6057                         Assert(TransactionIdIsValid(oldestActiveXID));
6058
6059                         /* Startup commit log and related stuff */
6060                         StartupCLOG();
6061                         StartupSUBTRANS(oldestActiveXID);
6062                         StartupMultiXact();
6063
6064                         ProcArrayInitRecoveryInfo(oldestActiveXID);
6065
6066                         /*
6067                          * If we're beginning at a shutdown checkpoint, we know that
6068                          * nothing was running on the master at this point. So fake-up an
6069                          * empty running-xacts record and use that here and now. Recover
6070                          * additional standby state for prepared transactions.
6071                          */
6072                         if (wasShutdown)
6073                         {
6074                                 RunningTransactionsData running;
6075                                 TransactionId latestCompletedXid;
6076
6077                                 /*
6078                                  * Construct a RunningTransactions snapshot representing a
6079                                  * shut down server, with only prepared transactions still
6080                                  * alive. We're never overflowed at this point because all
6081                                  * subxids are listed with their parent prepared transactions.
6082                                  */
6083                                 running.xcnt = nxids;
6084                                 running.subxid_overflow = false;
6085                                 running.nextXid = checkPoint.nextXid;
6086                                 running.oldestRunningXid = oldestActiveXID;
6087                                 latestCompletedXid = checkPoint.nextXid;
6088                                 TransactionIdRetreat(latestCompletedXid);
6089                                 Assert(TransactionIdIsNormal(latestCompletedXid));
6090                                 running.latestCompletedXid = latestCompletedXid;
6091                                 running.xids = xids;
6092
6093                                 ProcArrayApplyRecoveryInfo(&running);
6094
6095                                 StandbyRecoverPreparedTransactions(false);
6096                         }
6097                 }
6098
6099                 /* Initialize resource managers */
6100                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6101                 {
6102                         if (RmgrTable[rmid].rm_startup != NULL)
6103                                 RmgrTable[rmid].rm_startup();
6104                 }
6105
6106                 /*
6107                  * Initialize shared replayEndRecPtr, recoveryLastRecPtr, and
6108                  * recoveryLastXTime.
6109                  *
6110                  * This is slightly confusing if we're starting from an online
6111                  * checkpoint; we've just read and replayed the checkpoint record, but
6112                  * we're going to start replay from its redo pointer, which precedes
6113                  * the location of the checkpoint record itself. So even though the
6114                  * last record we've replayed is indeed ReadRecPtr, we haven't
6115                  * replayed all the preceding records yet. That's OK for the current
6116                  * use of these variables.
6117                  */
6118                 SpinLockAcquire(&xlogctl->info_lck);
6119                 xlogctl->replayEndRecPtr = ReadRecPtr;
6120                 xlogctl->recoveryLastRecPtr = ReadRecPtr;
6121                 xlogctl->recoveryLastXTime = 0;
6122                 SpinLockRelease(&xlogctl->info_lck);
6123
6124                 /* Also ensure XLogReceiptTime has a sane value */
6125                 XLogReceiptTime = GetCurrentTimestamp();
6126
6127                 /*
6128                  * Let postmaster know we've started redo now, so that it can launch
6129                  * bgwriter to perform restartpoints.  We don't bother during crash
6130                  * recovery as restartpoints can only be performed during archive
6131                  * recovery.  And we'd like to keep crash recovery simple, to avoid
6132                  * introducing bugs that could affect you when recovering after crash.
6133                  *
6134                  * After this point, we can no longer assume that we're the only
6135                  * process in addition to postmaster!  Also, fsync requests are
6136                  * subsequently to be handled by the bgwriter, not locally.
6137                  */
6138                 if (InArchiveRecovery && IsUnderPostmaster)
6139                 {
6140                         SetForwardFsyncRequests();
6141                         SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
6142                         bgwriterLaunched = true;
6143                 }
6144
6145                 /*
6146                  * Allow read-only connections immediately if we're consistent
6147                  * already.
6148                  */
6149                 CheckRecoveryConsistency();
6150
6151                 /*
6152                  * Find the first record that logically follows the checkpoint --- it
6153                  * might physically precede it, though.
6154                  */
6155                 if (XLByteLT(checkPoint.redo, RecPtr))
6156                 {
6157                         /* back up to find the record */
6158                         record = ReadRecord(&(checkPoint.redo), PANIC, false);
6159                 }
6160                 else
6161                 {
6162                         /* just have to read next record after CheckPoint */
6163                         record = ReadRecord(NULL, LOG, false);
6164                 }
6165
6166                 if (record != NULL)
6167                 {
6168                         bool            recoveryContinue = true;
6169                         bool            recoveryApply = true;
6170                         ErrorContextCallback errcontext;
6171                         TimestampTz xtime;
6172
6173                         InRedo = true;
6174
6175                         ereport(LOG,
6176                                         (errmsg("redo starts at %X/%X",
6177                                                         ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
6178
6179                         /*
6180                          * main redo apply loop
6181                          */
6182                         do
6183                         {
6184 #ifdef WAL_DEBUG
6185                                 if (XLOG_DEBUG ||
6186                                  (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
6187                                         (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
6188                                 {
6189                                         StringInfoData buf;
6190
6191                                         initStringInfo(&buf);
6192                                         appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
6193                                                                          ReadRecPtr.xlogid, ReadRecPtr.xrecoff,
6194                                                                          EndRecPtr.xlogid, EndRecPtr.xrecoff);
6195                                         xlog_outrec(&buf, record);
6196                                         appendStringInfo(&buf, " - ");
6197                                         RmgrTable[record->xl_rmid].rm_desc(&buf,
6198                                                                                                            record->xl_info,
6199                                                                                                          XLogRecGetData(record));
6200                                         elog(LOG, "%s", buf.data);
6201                                         pfree(buf.data);
6202                                 }
6203 #endif
6204
6205                                 /* Handle interrupt signals of startup process */
6206                                 HandleStartupProcInterrupts();
6207
6208                                 /* Allow read-only connections if we're consistent now */
6209                                 CheckRecoveryConsistency();
6210
6211                                 /*
6212                                  * Have we reached our recovery target?
6213                                  */
6214                                 if (recoveryStopsHere(record, &recoveryApply))
6215                                 {
6216                                         reachedStopPoint = true;        /* see below */
6217                                         recoveryContinue = false;
6218                                         if (!recoveryApply)
6219                                                 break;
6220                                 }
6221
6222                                 /* Setup error traceback support for ereport() */
6223                                 errcontext.callback = rm_redo_error_callback;
6224                                 errcontext.arg = (void *) record;
6225                                 errcontext.previous = error_context_stack;
6226                                 error_context_stack = &errcontext;
6227
6228                                 /* nextXid must be beyond record's xid */
6229                                 if (TransactionIdFollowsOrEquals(record->xl_xid,
6230                                                                                                  ShmemVariableCache->nextXid))
6231                                 {
6232                                         ShmemVariableCache->nextXid = record->xl_xid;
6233                                         TransactionIdAdvance(ShmemVariableCache->nextXid);
6234                                 }
6235
6236                                 /*
6237                                  * Update shared replayEndRecPtr before replaying this record,
6238                                  * so that XLogFlush will update minRecoveryPoint correctly.
6239                                  */
6240                                 SpinLockAcquire(&xlogctl->info_lck);
6241                                 xlogctl->replayEndRecPtr = EndRecPtr;
6242                                 SpinLockRelease(&xlogctl->info_lck);
6243
6244                                 /*
6245                                  * If we are attempting to enter Hot Standby mode, process
6246                                  * XIDs we see
6247                                  */
6248                                 if (standbyState >= STANDBY_INITIALIZED &&
6249                                         TransactionIdIsValid(record->xl_xid))
6250                                         RecordKnownAssignedTransactionIds(record->xl_xid);
6251
6252                                 RmgrTable[record->xl_rmid].rm_redo(EndRecPtr, record);
6253
6254                                 /* Pop the error context stack */
6255                                 error_context_stack = errcontext.previous;
6256
6257                                 /*
6258                                  * Update shared recoveryLastRecPtr after this record has been
6259                                  * replayed.
6260                                  */
6261                                 SpinLockAcquire(&xlogctl->info_lck);
6262                                 xlogctl->recoveryLastRecPtr = EndRecPtr;
6263                                 SpinLockRelease(&xlogctl->info_lck);
6264
6265                                 LastRec = ReadRecPtr;
6266
6267                                 record = ReadRecord(NULL, LOG, false);
6268                         } while (record != NULL && recoveryContinue);
6269
6270                         /*
6271                          * end of main redo apply loop
6272                          */
6273
6274                         ereport(LOG,
6275                                         (errmsg("redo done at %X/%X",
6276                                                         ReadRecPtr.xlogid, ReadRecPtr.xrecoff)));
6277                         xtime = GetLatestXTime();
6278                         if (xtime)
6279                                 ereport(LOG,
6280                                          (errmsg("last completed transaction was at log time %s",
6281                                                          timestamptz_to_str(xtime))));
6282                         InRedo = false;
6283                 }
6284                 else
6285                 {
6286                         /* there are no WAL records following the checkpoint */
6287                         ereport(LOG,
6288                                         (errmsg("redo is not required")));
6289                 }
6290         }
6291
6292         /*
6293          * If we launched a WAL receiver, it should be gone by now. It will trump
6294          * over the startup checkpoint and subsequent records if it's still alive,
6295          * so be extra sure that it's gone.
6296          */
6297         if (WalRcvInProgress())
6298                 elog(PANIC, "wal receiver still active");
6299
6300         /*
6301          * We are now done reading the xlog from stream. Turn off streaming
6302          * recovery to force fetching the files (which would be required at end of
6303          * recovery, e.g., timeline history file) from archive or pg_xlog.
6304          */
6305         StandbyMode = false;
6306
6307         /*
6308          * Re-fetch the last valid or last applied record, so we can identify the
6309          * exact endpoint of what we consider the valid portion of WAL.
6310          */
6311         record = ReadRecord(&LastRec, PANIC, false);
6312         EndOfLog = EndRecPtr;
6313         XLByteToPrevSeg(EndOfLog, endLogId, endLogSeg);
6314
6315         /*
6316          * Complain if we did not roll forward far enough to render the backup
6317          * dump consistent.  Note: it is indeed okay to look at the local variable
6318          * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
6319          * be further ahead --- ControlFile->minRecoveryPoint cannot have been
6320          * advanced beyond the WAL we processed.
6321          */
6322         if (InArchiveRecovery &&
6323                 (XLByteLT(EndOfLog, minRecoveryPoint) ||
6324                  !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
6325         {
6326                 if (reachedStopPoint)   /* stopped because of stop request */
6327                         ereport(FATAL,
6328                                         (errmsg("requested recovery stop point is before consistent recovery point")));
6329                 else    /* ran off end of WAL */
6330                         ereport(FATAL,
6331                                         (errmsg("WAL ends before consistent recovery point")));
6332         }
6333
6334         /*
6335          * Consider whether we need to assign a new timeline ID.
6336          *
6337          * If we are doing an archive recovery, we always assign a new ID.      This
6338          * handles a couple of issues.  If we stopped short of the end of WAL
6339          * during recovery, then we are clearly generating a new timeline and must
6340          * assign it a unique new ID.  Even if we ran to the end, modifying the
6341          * current last segment is problematic because it may result in trying to
6342          * overwrite an already-archived copy of that segment, and we encourage
6343          * DBAs to make their archive_commands reject that.  We can dodge the
6344          * problem by making the new active segment have a new timeline ID.
6345          *
6346          * In a normal crash recovery, we can just extend the timeline we were in.
6347          */
6348         if (InArchiveRecovery)
6349         {
6350                 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
6351                 ereport(LOG,
6352                                 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
6353                 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
6354                                                          curFileTLI, endLogId, endLogSeg);
6355         }
6356
6357         /* Save the selected TimeLineID in shared memory, too */
6358         XLogCtl->ThisTimeLineID = ThisTimeLineID;
6359
6360         /*
6361          * We are now done reading the old WAL.  Turn off archive fetching if it
6362          * was active, and make a writable copy of the last WAL segment. (Note
6363          * that we also have a copy of the last block of the old WAL in readBuf;
6364          * we will use that below.)
6365          */
6366         if (InArchiveRecovery)
6367                 exitArchiveRecovery(curFileTLI, endLogId, endLogSeg);
6368
6369         /*
6370          * Prepare to write WAL starting at EndOfLog position, and init xlog
6371          * buffer cache using the block containing the last record from the
6372          * previous incarnation.
6373          */
6374         openLogId = endLogId;
6375         openLogSeg = endLogSeg;
6376         openLogFile = XLogFileOpen(openLogId, openLogSeg);
6377         openLogOff = 0;
6378         Insert = &XLogCtl->Insert;
6379         Insert->PrevRecord = LastRec;
6380         XLogCtl->xlblocks[0].xlogid = openLogId;
6381         XLogCtl->xlblocks[0].xrecoff =
6382                 ((EndOfLog.xrecoff - 1) / XLOG_BLCKSZ + 1) * XLOG_BLCKSZ;
6383
6384         /*
6385          * Tricky point here: readBuf contains the *last* block that the LastRec
6386          * record spans, not the one it starts in.      The last block is indeed the
6387          * one we want to use.
6388          */
6389         Assert(readOff == (XLogCtl->xlblocks[0].xrecoff - XLOG_BLCKSZ) % XLogSegSize);
6390         memcpy((char *) Insert->currpage, readBuf, XLOG_BLCKSZ);
6391         Insert->currpos = (char *) Insert->currpage +
6392                 (EndOfLog.xrecoff + XLOG_BLCKSZ - XLogCtl->xlblocks[0].xrecoff);
6393
6394         LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
6395
6396         XLogCtl->Write.LogwrtResult = LogwrtResult;
6397         Insert->LogwrtResult = LogwrtResult;
6398         XLogCtl->LogwrtResult = LogwrtResult;
6399
6400         XLogCtl->LogwrtRqst.Write = EndOfLog;
6401         XLogCtl->LogwrtRqst.Flush = EndOfLog;
6402
6403         freespace = INSERT_FREESPACE(Insert);
6404         if (freespace > 0)
6405         {
6406                 /* Make sure rest of page is zero */
6407                 MemSet(Insert->currpos, 0, freespace);
6408                 XLogCtl->Write.curridx = 0;
6409         }
6410         else
6411         {
6412                 /*
6413                  * Whenever Write.LogwrtResult points to exactly the end of a page,
6414                  * Write.curridx must point to the *next* page (see XLogWrite()).
6415                  *
6416                  * Note: it might seem we should do AdvanceXLInsertBuffer() here, but
6417                  * this is sufficient.  The first actual attempt to insert a log
6418                  * record will advance the insert state.
6419                  */
6420                 XLogCtl->Write.curridx = NextBufIdx(0);
6421         }
6422
6423         /* Pre-scan prepared transactions to find out the range of XIDs present */
6424         oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
6425
6426         if (InRecovery)
6427         {
6428                 int                     rmid;
6429
6430                 /*
6431                  * Resource managers might need to write WAL records, eg, to record
6432                  * index cleanup actions.  So temporarily enable XLogInsertAllowed in
6433                  * this process only.
6434                  */
6435                 LocalSetXLogInsertAllowed();
6436
6437                 /*
6438                  * Allow resource managers to do any required cleanup.
6439                  */
6440                 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6441                 {
6442                         if (RmgrTable[rmid].rm_cleanup != NULL)
6443                                 RmgrTable[rmid].rm_cleanup();
6444                 }
6445
6446                 /* Disallow XLogInsert again */
6447                 LocalXLogInsertAllowed = -1;
6448
6449                 /*
6450                  * Check to see if the XLOG sequence contained any unresolved
6451                  * references to uninitialized pages.
6452                  */
6453                 XLogCheckInvalidPages();
6454
6455                 /*
6456                  * Perform a checkpoint to update all our recovery activity to disk.
6457                  *
6458                  * Note that we write a shutdown checkpoint rather than an on-line
6459                  * one. This is not particularly critical, but since we may be
6460                  * assigning a new TLI, using a shutdown checkpoint allows us to have
6461                  * the rule that TLI only changes in shutdown checkpoints, which
6462                  * allows some extra error checking in xlog_redo.
6463                  */
6464                 if (bgwriterLaunched)
6465                         RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
6466                                                           CHECKPOINT_IMMEDIATE |
6467                                                           CHECKPOINT_WAIT);
6468                 else
6469                         CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
6470
6471                 /*
6472                  * And finally, execute the recovery_end_command, if any.
6473                  */
6474                 if (recoveryEndCommand)
6475                         ExecuteRecoveryCommand(recoveryEndCommand,
6476                                                                    "recovery_end_command",
6477                                                                    true);
6478         }
6479
6480         /*
6481          * Preallocate additional log files, if wanted.
6482          */
6483         PreallocXlogFiles(EndOfLog);
6484
6485         /*
6486          * Okay, we're officially UP.
6487          */
6488         InRecovery = false;
6489
6490         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
6491         ControlFile->state = DB_IN_PRODUCTION;
6492         ControlFile->time = (pg_time_t) time(NULL);
6493         UpdateControlFile();
6494         LWLockRelease(ControlFileLock);
6495
6496         /* start the archive_timeout timer running */
6497         XLogCtl->Write.lastSegSwitchTime = (pg_time_t) time(NULL);
6498
6499         /* initialize shared-memory copy of latest checkpoint XID/epoch */
6500         XLogCtl->ckptXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
6501         XLogCtl->ckptXid = ControlFile->checkPointCopy.nextXid;
6502
6503         /* also initialize latestCompletedXid, to nextXid - 1 */
6504         ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
6505         TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
6506
6507         /*
6508          * Start up the commit log and related stuff, too. In hot standby mode we
6509          * did this already before WAL replay.
6510          */
6511         if (standbyState == STANDBY_DISABLED)
6512         {
6513                 StartupCLOG();
6514                 StartupSUBTRANS(oldestActiveXID);
6515                 StartupMultiXact();
6516         }
6517
6518         /* Reload shared-memory state for prepared transactions */
6519         RecoverPreparedTransactions();
6520
6521         /*
6522          * Shutdown the recovery environment. This must occur after
6523          * RecoverPreparedTransactions(), see notes for lock_twophase_recover()
6524          */
6525         if (standbyState != STANDBY_DISABLED)
6526                 ShutdownRecoveryTransactionEnvironment();
6527
6528         /* Shut down readFile facility, free space */
6529         if (readFile >= 0)
6530         {
6531                 close(readFile);
6532                 readFile = -1;
6533         }
6534         if (readBuf)
6535         {
6536                 free(readBuf);
6537                 readBuf = NULL;
6538         }
6539         if (readRecordBuf)
6540         {
6541                 free(readRecordBuf);
6542                 readRecordBuf = NULL;
6543                 readRecordBufSize = 0;
6544         }
6545
6546         /*
6547          * If any of the critical GUCs have changed, log them before we allow
6548          * backends to write WAL.
6549          */
6550         LocalSetXLogInsertAllowed();
6551         XLogReportParameters();
6552
6553         /*
6554          * All done.  Allow backends to write WAL.      (Although the bool flag is
6555          * probably atomic in itself, we use the info_lck here to ensure that
6556          * there are no race conditions concerning visibility of other recent
6557          * updates to shared memory.)
6558          */
6559         {
6560                 /* use volatile pointer to prevent code rearrangement */
6561                 volatile XLogCtlData *xlogctl = XLogCtl;
6562
6563                 SpinLockAcquire(&xlogctl->info_lck);
6564                 xlogctl->SharedRecoveryInProgress = false;
6565                 SpinLockRelease(&xlogctl->info_lck);
6566         }
6567 }
6568
6569 /*
6570  * Test whether WAL replay has arrived at a consistent state.  Once that
6571  * is so, and a valid starting standby snapshot exists, signal postmaster
6572  * that read-only connections may be accepted; this is signalled only once.
6573  */
6574 static void
6575 CheckRecoveryConsistency(void)
6576 {
6577         static bool standbySignalSent = false;
6578
6579         /*
6580          * Note the first time replay passes the safe starting point.
6581          */
6582         if (!reachedMinRecoveryPoint)
6583         {
6584                 if (XLByteLE(minRecoveryPoint, EndRecPtr) &&
6585                         XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
6586                 {
6587                         reachedMinRecoveryPoint = true;
6588                         ereport(LOG,
6589                                         (errmsg("consistent recovery state reached at %X/%X",
6590                                                         EndRecPtr.xlogid, EndRecPtr.xrecoff)));
6591                 }
6592         }
6593
6594         /*
6595          * The postmaster is told only once, and only when we are consistent,
6596          * have a usable starting snapshot, and are running under postmaster.
6597          */
6598         if (standbySignalSent || !reachedMinRecoveryPoint)
6599                 return;
6600         if (standbyState != STANDBY_SNAPSHOT_READY || !IsUnderPostmaster)
6601                 return;
6602
6603         standbySignalSent = true;
6604         SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
6605 }
6606
6607 /*
6608  * Report whether the system is still in recovery.
6609  *
6610  * This consults shared memory rather than InRecovery, so it is usable in
6611  * any process that is attached to shared memory.
6612  *
6613  * Side effect: the first call that observes the end of recovery also sets
6614  * up the local TimeLineID and RedoRecPtr variables.
6615  *
6616  * Returns true while recovery is ongoing, false once it has finished.
6617  */
6618 bool
6619 RecoveryInProgress(void)
6620 {
6621         /*
6622          * Recovery cannot be re-entered, so shared state is consulted only while
6623          * the local flag is still true.  Once the shared variable has been seen
6624          * false, the cached local answer is final.
6625          */
6626         if (LocalRecoveryInProgress)
6627         {
6628                 /* volatile pointer keeps the compiler from rearranging the code */
6629                 volatile XLogCtlData *shared = XLogCtl;
6630
6631                 /* the spinlock is essential on machines with weak memory ordering */
6632                 SpinLockAcquire(&shared->info_lck);
6633                 LocalRecoveryInProgress = shared->SharedRecoveryInProgress;
6634                 SpinLockRelease(&shared->info_lck);
6635
6636                 /*
6637                  * Having just found that recovery is finished, initialize TimeLineID
6638                  * and RedoRecPtr.  InitPostgres() relies on this happening here so
6639                  * that InitXLOGAccess() gets called at backend startup.  (If you
6640                  * change this, see also LocalSetXLogInsertAllowed.)
6641                  */
6642                 if (!LocalRecoveryInProgress)
6643                         InitXLOGAccess();
6644         }
6645
6646         return LocalRecoveryInProgress;
6647 }
6648
6649 /*
6650  * Report whether this process may insert new WAL records.
6651  *
6652  * Normally the answer is simply !RecoveryInProgress(), but a process can
6653  * have the result pinned to "true" or "false" locally, whatever the global
6654  * state is.  LocalXLogInsertAllowed holds that per-process setting: 1 or 0
6655  * when pinned, negative when not.
6656  */
6657 bool
6658 XLogInsertAllowed(void)
6659 {
6660         /*
6661          * A negative LocalXLogInsertAllowed means nothing is pinned, so the
6662          * recovery state decides.  Non-negative values are returned as they
6663          * are, which is the usual fast path once recovery is known to be over.
6664          */
6665         if (LocalXLogInsertAllowed < 0)
6666         {
6667                 /* still recovering: leave the setting alone and refuse */
6668                 if (RecoveryInProgress())
6669                         return false;
6670
6671                 /*
6672                  * Recovery has ended.  Pin the result to "true" so that later
6673                  * calls need not check again.
6674                  */
6675                 LocalXLogInsertAllowed = 1;
6676         }
6677
6678         return (bool) LocalXLogInsertAllowed;
6679 }
6680
6681 /*
6682  * Force XLogInsertAllowed() to return true, in the current process only.
6683  *
6684  * The setting must be -1 on entry (asserted).  It may be switched back to
6685  * -1 later, after which this function may be called again.
6686  */
6687 static void
6688 LocalSetXLogInsertAllowed(void)
6689 {
6690         Assert(LocalXLogInsertAllowed == -1);
6691         LocalXLogInsertAllowed = 1;
6692
6693         /* Same initialization RecoveryInProgress() does on leaving recovery */
6694         InitXLOGAccess();
6695 }
6696
6697 /*
6698  * Try to fetch a prior checkpoint record and validate it.
6699  *
6700  * RecPtr is the location the record is read from.  whichChkpt identifies
6701  * the checkpoint, merely for reporting purposes: 1 for "primary", 2 for
6702  * "secondary", and anything else (0 by convention) for "other", i.e. the
6703  * one named in backup_label.
6704  *
6705  * Returns the record when every check passes.  Otherwise a LOG message
6706  * describing the first failed check is emitted and NULL is returned.
6707  */
6708 static XLogRecord *
6709 ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt)
6710 {
6711         XLogRecord *ckptRecord;
6712
6713         /*
6714          * The pointer's offset must pass XRecOffIsValid before we attempt to
6715          * read a record at it.
6716          */
6717         if (!XRecOffIsValid(RecPtr.xrecoff))
6718         {
6719                 if (whichChkpt == 1)
6720                         ereport(LOG,
6721                                         (errmsg("invalid primary checkpoint link in control file")));
6722                 else if (whichChkpt == 2)
6723                         ereport(LOG,
6724                                         (errmsg("invalid secondary checkpoint link in control file")));
6725                 else
6726                         ereport(LOG,
6727                                         (errmsg("invalid checkpoint link in backup_label file")));
6728                 return NULL;
6729         }
6730
6731         ckptRecord = ReadRecord(&RecPtr, LOG, true);
6732
6733         /*
6734          * A NULL result from ReadRecord is reported as an invalid checkpoint
6735          * record.
6736          */
6737         if (ckptRecord == NULL)
6738         {
6739                 if (whichChkpt == 1)
6740                         ereport(LOG,
6741                                         (errmsg("invalid primary checkpoint record")));
6742                 else if (whichChkpt == 2)
6743                         ereport(LOG,
6744                                         (errmsg("invalid secondary checkpoint record")));
6745                 else
6746                         ereport(LOG,
6747                                         (errmsg("invalid checkpoint record")));
6748                 return NULL;
6749         }
6750
6751         /*
6752          * The record must carry the XLOG resource manager ID (RM_XLOG_ID), or
6753          * it is rejected.
6754          */
6755         if (ckptRecord->xl_rmid != RM_XLOG_ID)
6756         {
6757                 if (whichChkpt == 1)
6758                         ereport(LOG,
6759                                         (errmsg("invalid resource manager ID in primary checkpoint record")));
6760                 else if (whichChkpt == 2)
6761                         ereport(LOG,
6762                                         (errmsg("invalid resource manager ID in secondary checkpoint record")));
6763                 else
6764                         ereport(LOG,
6765                                         (errmsg("invalid resource manager ID in checkpoint record")));
6766                 return NULL;
6767         }
6768
6769         /*
6770          * Only the shutdown and online checkpoint record types are accepted
6771          * in xl_info.
6772          */
6773         if (ckptRecord->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
6774                 ckptRecord->xl_info != XLOG_CHECKPOINT_ONLINE)
6775         {
6776                 if (whichChkpt == 1)
6777                         ereport(LOG,
6778                                         (errmsg("invalid xl_info in primary checkpoint record")));
6779                 else if (whichChkpt == 2)
6780                         ereport(LOG,
6781                                         (errmsg("invalid xl_info in secondary checkpoint record")));
6782                 else
6783                         ereport(LOG,
6784                                         (errmsg("invalid xl_info in checkpoint record")));
6785                 return NULL;
6786         }
6787
6788         /*
6789          * The data must be exactly one CheckPoint struct, and the total record
6790          * length must be SizeOfXLogRecord plus that struct.
6791          */
6792         if (ckptRecord->xl_len != sizeof(CheckPoint) ||
6793                 ckptRecord->xl_tot_len != SizeOfXLogRecord + sizeof(CheckPoint))
6794         {
6795                 if (whichChkpt == 1)
6796                         ereport(LOG,
6797                                         (errmsg("invalid length of primary checkpoint record")));
6798                 else if (whichChkpt == 2)
6799                         ereport(LOG,
6800                                         (errmsg("invalid length of secondary checkpoint record")));
6801                 else
6802                         ereport(LOG,
6803                                         (errmsg("invalid length of checkpoint record")));
6804                 return NULL;
6805         }
6806
6807         /* every check passed */
6808         return ckptRecord;
6809 }
6810
6811 /*
6812  * This must be called during startup of a backend process, except that
6813  * it need not be called in a standalone backend (which does StartupXLOG
6814  * instead).  We need to initialize the local copies of ThisTimeLineID and
6815  * RedoRecPtr.
6816  *
6817  * Note: before Postgres 8.0, we went to some effort to keep the postmaster
6818  * process's copies of ThisTimeLineID and RedoRecPtr valid too.  This was
6819  * unnecessary however, since the postmaster itself never touches XLOG anyway.
6820  */
6821 void
6822 InitXLOGAccess(void)
6823 {
6824         /* ThisTimeLineID doesn't change so we need no lock to copy it */
6825         ThisTimeLineID = XLogCtl->ThisTimeLineID;
6826         Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
6827
6828         /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
6829         (void) GetRedoRecPtr();
6830 }
6831
6832 /*
6833  * Once spawned, a backend may update its local RedoRecPtr from
6834  * XLogCtl->Insert.RedoRecPtr; it must hold the insert lock or info_lck
6835  * to do so.  This is done in XLogInsert() or GetRedoRecPtr().
6836  */
6837 XLogRecPtr
6838 GetRedoRecPtr(void)
6839 {
6840         /* use volatile pointer to prevent code rearrangement */
6841         volatile XLogCtlData *xlogctl = XLogCtl;
6842
6843         SpinLockAcquire(&xlogctl->info_lck);
6844         Assert(XLByteLE(RedoRecPtr, xlogctl->Insert.RedoRecPtr));
6845         RedoRecPtr = xlogctl->Insert.RedoRecPtr;
6846         SpinLockRelease(&xlogctl->info_lck);
6847
6848         return RedoRecPtr;
6849 }
6850
6851 /*
6852  * GetInsertRecPtr -- Returns the current insert position.
6853  *
6854  * NOTE: The value *actually* returned is the position of the last full
6855  * xlog page. It lags behind the real insert position by at most 1 page.
6856  * For that, we don't need to acquire WALInsertLock which can be quite
6857  * heavily contended, and an approximation is enough for the current
6858  * usage of this function.
6859  */
6860 XLogRecPtr
6861 GetInsertRecPtr(void)
6862 {
6863         /* use volatile pointer to prevent code rearrangement */
6864         volatile XLogCtlData *xlogctl = XLogCtl;
6865         XLogRecPtr      recptr;
6866
6867         SpinLockAcquire(&xlogctl->info_lck);
6868         recptr = xlogctl->LogwrtRqst.Write;
6869         SpinLockRelease(&xlogctl->info_lck);
6870
6871         return recptr;
6872 }
6873
6874 /*
6875  * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
6876  * position known to be fsync'd to disk.
6877  */
6878 XLogRecPtr
6879 GetFlushRecPtr(void)
6880 {
6881         /* use volatile pointer to prevent code rearrangement */
6882         volatile XLogCtlData *xlogctl = XLogCtl;
6883         XLogRecPtr      recptr;
6884
6885         SpinLockAcquire(&xlogctl->info_lck);
6886         recptr = xlogctl->LogwrtResult.Flush;
6887         SpinLockRelease(&xlogctl->info_lck);
6888
6889         return recptr;
6890 }
6891
6892 /*
6893  * Get the time of the last xlog segment switch
6894  */
6895 pg_time_t
6896 GetLastSegSwitchTime(void)
6897 {
6898         pg_time_t       result;
6899
6900         /* Need WALWriteLock, but shared lock is sufficient */
6901         LWLockAcquire(WALWriteLock, LW_SHARED);
6902         result = XLogCtl->Write.lastSegSwitchTime;
6903         LWLockRelease(WALWriteLock);
6904
6905         return result;
6906 }
6907
6908 /*
6909  * GetNextXidAndEpoch - get the current nextXid value and associated epoch
6910  *
6911  * This is exported for use by code that would like to have 64-bit XIDs.
6912  * We don't really support such things, but all XIDs within the system
6913  * can be presumed "close to" the result, and thus the epoch associated
6914  * with them can be determined.
6915  */
6916 void
6917 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
6918 {
6919         uint32          ckptXidEpoch;
6920         TransactionId ckptXid;
6921         TransactionId nextXid;
6922
6923         /* Must read checkpoint info first, else have race condition */
6924         {
6925                 /* use volatile pointer to prevent code rearrangement */
6926                 volatile XLogCtlData *xlogctl = XLogCtl;
6927
6928                 SpinLockAcquire(&xlogctl->info_lck);
6929                 ckptXidEpoch = xlogctl->ckptXidEpoch;
6930                 ckptXid = xlogctl->ckptXid;
6931                 SpinLockRelease(&xlogctl->info_lck);
6932         }
6933
6934         /* Now fetch current nextXid */
6935         nextXid = ReadNewTransactionId();
6936
6937         /*
6938          * nextXid is certainly logically later than ckptXid.  So if it's
6939          * numerically less, it must have wrapped into the next epoch.
6940          */
6941         if (nextXid < ckptXid)
6942                 ckptXidEpoch++;
6943
6944         *xid = nextXid;
6945         *epoch = ckptXidEpoch;
6946 }
6947
6948 /*
6949  * GetRecoveryTargetTLI - get the recovery target timeline ID
6950  */
6951 TimeLineID
6952 GetRecoveryTargetTLI(void)
6953 {
6954         /* RecoveryTargetTLI doesn't change so we need no lock to copy it */
6955         return XLogCtl->RecoveryTargetTLI;
6956 }
6957
6958 /*
6959  * This must be called ONCE during postmaster or standalone-backend shutdown
6960  */
6961 void
6962 ShutdownXLOG(int code, Datum arg)
6963 {
6964         ereport(LOG,
6965                         (errmsg("shutting down")));
6966
6967         if (RecoveryInProgress())
6968                 CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
6969         else
6970         {
6971                 /*
6972                  * If archiving is enabled, rotate the last XLOG file so that all the
6973                  * remaining records are archived (postmaster wakes up the archiver
6974                  * process one more time at the end of shutdown). The checkpoint
6975                  * record will go to the next XLOG file and won't be archived (yet).
6976                  */
6977                 if (XLogArchivingActive() && XLogArchiveCommandSet())
6978                         RequestXLogSwitch();
6979
6980                 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
6981         }
6982         ShutdownCLOG();
6983         ShutdownSUBTRANS();
6984         ShutdownMultiXact();
6985
6986         ereport(LOG,
6987                         (errmsg("database system is shut down")));
6988 }
6989
6990 /*
6991  * Log start of a checkpoint.
6992  */
6993 static void
6994 LogCheckpointStart(int flags, bool restartpoint)
6995 {
6996         const char *msg;
6997
6998         /*
6999          * XXX: This is hopelessly untranslatable. We could call gettext_noop for
7000          * the main message, but what about all the flags?
7001          */
7002         if (restartpoint)
7003                 msg = "restartpoint starting:%s%s%s%s%s%s%s";
7004         else
7005                 msg = "checkpoint starting:%s%s%s%s%s%s%s";
7006
7007         elog(LOG, msg,
7008                  (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
7009                  (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
7010                  (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
7011                  (flags & CHECKPOINT_FORCE) ? " force" : "",
7012                  (flags & CHECKPOINT_WAIT) ? " wait" : "",
7013                  (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
7014                  (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "");
7015 }
7016
7017 /*
7018  * Log end of a checkpoint.
7019  */
7020 static void
7021 LogCheckpointEnd(bool restartpoint)
7022 {
7023         long            write_secs,
7024                                 sync_secs,
7025                                 total_secs;
7026         int                     write_usecs,
7027                                 sync_usecs,
7028                                 total_usecs;
7029
7030         CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
7031
7032         TimestampDifference(CheckpointStats.ckpt_start_t,
7033                                                 CheckpointStats.ckpt_end_t,
7034                                                 &total_secs, &total_usecs);
7035
7036         TimestampDifference(CheckpointStats.ckpt_write_t,
7037                                                 CheckpointStats.ckpt_sync_t,
7038                                                 &write_secs, &write_usecs);
7039
7040         TimestampDifference(CheckpointStats.ckpt_sync_t,
7041                                                 CheckpointStats.ckpt_sync_end_t,
7042                                                 &sync_secs, &sync_usecs);
7043
7044         if (restartpoint)
7045                 elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
7046                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
7047                          CheckpointStats.ckpt_bufs_written,
7048                          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
7049                          write_secs, write_usecs / 1000,
7050                          sync_secs, sync_usecs / 1000,
7051                          total_secs, total_usecs / 1000);
7052         else
7053                 elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
7054                          "%d transaction log file(s) added, %d removed, %d recycled; "
7055                          "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
7056                          CheckpointStats.ckpt_bufs_written,
7057                          (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
7058                          CheckpointStats.ckpt_segs_added,
7059                          CheckpointStats.ckpt_segs_removed,
7060                          CheckpointStats.ckpt_segs_recycled,
7061                          write_secs, write_usecs / 1000,
7062                          sync_secs, sync_usecs / 1000,
7063                          total_secs, total_usecs / 1000);
7064 }
7065
/*
 * Perform a checkpoint --- either during shutdown, or on-the-fly
 *
 * flags is a bitwise OR of the following:
 *      CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
 *      CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
 *      CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
 *              ignoring checkpoint_completion_target parameter.
 *      CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
 *              since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
 *              CHECKPOINT_END_OF_RECOVERY).
 *
 * Note: flags contains other bits, of interest here only for logging purposes.
 * In particular note that this routine is synchronous and does not pay
 * attention to CHECKPOINT_WAIT.
 *
 * Outline of the steps below:
 *      1. take CheckpointLock and reset CheckpointStats;
 *      2. under WALInsertLock, either decide the checkpoint can be skipped
 *         (and return early) or establish the new REDO pointer and publish
 *         it in shared memory;
 *      3. wait for transactions that are in their commit critical sections;
 *      4. collect the remaining checkpoint fields and flush everything via
 *         CheckPointGuts();
 *      5. insert and flush the checkpoint WAL record, then update the control
 *         file and the shared checkpoint XID/epoch;
 *      6. remove or recycle old WAL segments, preallocate new ones, truncate
 *         pg_subtrans, log completion, and release CheckpointLock.
 */
void
CreateCheckPoint(int flags)
{
        bool            shutdown;
        CheckPoint      checkPoint;
        XLogRecPtr      recptr;
        XLogCtlInsert *Insert = &XLogCtl->Insert;
        XLogRecData rdata;
        uint32          freespace;
        uint32          _logId;
        uint32          _logSeg;
        TransactionId *inCommitXids;
        int                     nInCommit;

        /*
         * An end-of-recovery checkpoint is really a shutdown checkpoint, just
         * issued at a different time.
         */
        if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
                shutdown = true;
        else
                shutdown = false;

        /* sanity check */
        if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
                elog(ERROR, "can't create a checkpoint during recovery");

        /*
         * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
         * (This is just pro forma, since in the present system structure there is
         * only one process that is allowed to issue checkpoints at any given
         * time.)
         */
        LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);

        /*
         * Prepare to accumulate statistics.
         *
         * Note: because it is possible for log_checkpoints to change while a
         * checkpoint proceeds, we always accumulate stats, even if
         * log_checkpoints is currently off.
         */
        MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
        CheckpointStats.ckpt_start_t = GetCurrentTimestamp();

        /*
         * Use a critical section to force system panic if we have trouble.
         */
        START_CRIT_SECTION();

        if (shutdown)
        {
                /* Record in the control file that a shutdown is in progress */
                LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
                ControlFile->state = DB_SHUTDOWNING;
                ControlFile->time = (pg_time_t) time(NULL);
                UpdateControlFile();
                LWLockRelease(ControlFileLock);
        }

        /*
         * Let smgr prepare for checkpoint; this has to happen before we determine
         * the REDO pointer.  Note that smgr must not do anything that'd have to
         * be undone if we decide no checkpoint is needed.
         */
        smgrpreckpt();

        /* Begin filling in the checkpoint WAL record */
        MemSet(&checkPoint, 0, sizeof(checkPoint));
        checkPoint.time = (pg_time_t) time(NULL);

        /*
         * We must hold WALInsertLock while examining insert state to determine
         * the checkpoint REDO pointer.
         */
        LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);

        /*
         * If this isn't a shutdown or forced checkpoint, and we have not inserted
         * any XLOG records since the start of the last checkpoint, skip the
         * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
         * when the system is idle. That wastes log space, and more importantly it
         * exposes us to possible loss of both current and previous checkpoint
         * records if the machine crashes just as we're writing the update.
         * (Perhaps it'd make even more sense to checkpoint only when the previous
         * checkpoint record is in a different xlog page?)
         *
         * We have to make two tests to determine that nothing has happened since
         * the start of the last checkpoint: current insertion point must match
         * the end of the last checkpoint record, and its redo pointer must point
         * to itself.
         */
        if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
                                  CHECKPOINT_FORCE)) == 0)
        {
                XLogRecPtr      curInsert;

                INSERT_RECPTR(curInsert, Insert, Insert->curridx);
                if (curInsert.xlogid == ControlFile->checkPoint.xlogid &&
                        curInsert.xrecoff == ControlFile->checkPoint.xrecoff +
                        MAXALIGN(SizeOfXLogRecord + sizeof(CheckPoint)) &&
                        ControlFile->checkPoint.xlogid ==
                        ControlFile->checkPointCopy.redo.xlogid &&
                        ControlFile->checkPoint.xrecoff ==
                        ControlFile->checkPointCopy.redo.xrecoff)
                {
                        /*
                         * Nothing to do: release both locks taken above and leave
                         * the critical section before returning early.
                         */
                        LWLockRelease(WALInsertLock);
                        LWLockRelease(CheckpointLock);
                        END_CRIT_SECTION();
                        return;
                }
        }

        /*
         * An end-of-recovery checkpoint is created before anyone is allowed to
         * write WAL. To allow us to write the checkpoint record, temporarily
         * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
         * initialized, which we need here and in AdvanceXLInsertBuffer.)
         */
        if (flags & CHECKPOINT_END_OF_RECOVERY)
                LocalSetXLogInsertAllowed();

        checkPoint.ThisTimeLineID = ThisTimeLineID;

        /*
         * Compute new REDO record ptr = location of next XLOG record.
         *
         * NB: this is NOT necessarily where the checkpoint record itself will be,
         * since other backends may insert more XLOG records while we're off doing
         * the buffer flush work.  Those XLOG records are logically after the
         * checkpoint, even though physically before it.  Got that?
         */
        freespace = INSERT_FREESPACE(Insert);
        if (freespace < SizeOfXLogRecord)
        {
                (void) AdvanceXLInsertBuffer(false);
                /* OK to ignore update return flag, since we will do flush anyway */
                freespace = INSERT_FREESPACE(Insert);
        }
        INSERT_RECPTR(checkPoint.redo, Insert, Insert->curridx);

        /*
         * Here we update the shared RedoRecPtr for future XLogInsert calls; this
         * must be done while holding the insert lock AND the info_lck.
         *
         * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
         * pointing past where it really needs to point.  This is okay; the only
         * consequence is that XLogInsert might back up whole buffers that it
         * didn't really need to.  We can't postpone advancing RedoRecPtr because
         * XLogInserts that happen while we are dumping buffers must assume that
         * their buffer changes are not included in the checkpoint.
         */
        {
                /* use volatile pointer to prevent code rearrangement */
                volatile XLogCtlData *xlogctl = XLogCtl;

                SpinLockAcquire(&xlogctl->info_lck);
                RedoRecPtr = xlogctl->Insert.RedoRecPtr = checkPoint.redo;
                SpinLockRelease(&xlogctl->info_lck);
        }

        /*
         * Now we can release WAL insert lock, allowing other xacts to proceed
         * while we are flushing disk buffers.
         */
        LWLockRelease(WALInsertLock);

        /*
         * If enabled, log checkpoint start.  We postpone this until now so as not
         * to log anything if we decided to skip the checkpoint.
         */
        if (log_checkpoints)
                LogCheckpointStart(flags, false);

        TRACE_POSTGRESQL_CHECKPOINT_START(flags);

        /*
         * Before flushing data, we must wait for any transactions that are
         * currently in their commit critical sections.  If an xact inserted its
         * commit record into XLOG just before the REDO point, then a crash
         * restart from the REDO point would not replay that record, which means
         * that our flushing had better include the xact's update of pg_clog.  So
         * we wait till he's out of his commit critical section before proceeding.
         * See notes in RecordTransactionCommit().
         *
         * Because we've already released WALInsertLock, this test is a bit fuzzy:
         * it is possible that we will wait for xacts we didn't really need to
         * wait for.  But the delay should be short and it seems better to make
         * checkpoint take a bit longer than to hold locks longer than necessary.
         * (In fact, the whole reason we have this issue is that xact.c does
         * commit record XLOG insertion and clog update as two separate steps
         * protected by different locks, but again that seems best on grounds of
         * minimizing lock contention.)
         *
         * A transaction that has not yet set inCommit when we look cannot be at
         * risk, since he's not inserted his commit record yet; and one that's
         * already cleared it is not at risk either, since he's done fixing clog
         * and we will correctly flush the update below.  So we cannot miss any
         * xacts we need to wait for.
         */
        nInCommit = GetTransactionsInCommit(&inCommitXids);
        if (nInCommit > 0)
        {
                /* poll until none of the XIDs collected above is still in commit */
                do
                {
                        pg_usleep(10000L);      /* wait for 10 msec */
                } while (HaveTransactionsInCommit(inCommitXids, nInCommit));
        }
        /* the XID array is freed whether or not we had to wait */
        pfree(inCommitXids);

        /*
         * Get the other info we need for the checkpoint record.
         */
        LWLockAcquire(XidGenLock, LW_SHARED);
        checkPoint.nextXid = ShmemVariableCache->nextXid;
        checkPoint.oldestXid = ShmemVariableCache->oldestXid;
        checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
        LWLockRelease(XidGenLock);

        /* Increase XID epoch if we've wrapped around since last checkpoint */
        checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
        if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
                checkPoint.nextXidEpoch++;

        /*
         * For a non-shutdown checkpoint the recorded nextOid is advanced by
         * oidCount; a shutdown checkpoint records nextOid as-is.
         */
        LWLockAcquire(OidGenLock, LW_SHARED);
        checkPoint.nextOid = ShmemVariableCache->nextOid;
        if (!shutdown)
                checkPoint.nextOid += ShmemVariableCache->oidCount;
        LWLockRelease(OidGenLock);

        MultiXactGetCheckptMulti(shutdown,
                                                         &checkPoint.nextMulti,
                                                         &checkPoint.nextMultiOffset);

        /*
         * Having constructed the checkpoint record, ensure all shmem disk buffers
         * and commit-log buffers are flushed to disk.
         *
         * This I/O could fail for various reasons.  If so, we will fail to
         * complete the checkpoint, but there is no reason to force a system
         * panic. Accordingly, exit critical section while doing it.
         */
        END_CRIT_SECTION();

        CheckPointGuts(checkPoint.redo, flags);

        /*
         * Take a snapshot of running transactions and write this to WAL. This
         * allows us to reconstruct the state of running transactions during
         * archive recovery, if required. Skip, if this info disabled.
         *
         * If we are shutting down, or Startup process is completing crash
         * recovery we don't need to write running xact data.
         *
         * Update checkPoint.nextXid since we have a later value
         */
        if (!shutdown && XLogStandbyInfoActive())
                LogStandbySnapshot(&checkPoint.oldestActiveXid, &checkPoint.nextXid);
        else
                checkPoint.oldestActiveXid = InvalidTransactionId;

        /* From here until the control file is updated, failures must panic */
        START_CRIT_SECTION();

        /*
         * Now insert the checkpoint record into XLOG.
         */
        rdata.data = (char *) (&checkPoint);
        rdata.len = sizeof(checkPoint);
        rdata.buffer = InvalidBuffer;
        rdata.next = NULL;

        recptr = XLogInsert(RM_XLOG_ID,
                                                shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
                                                XLOG_CHECKPOINT_ONLINE,
                                                &rdata);

        XLogFlush(recptr);

        /*
         * We mustn't write any new WAL after a shutdown checkpoint, or it will be
         * overwritten at next startup.  No-one should even try, this just allows
         * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
         * to just temporarily disable writing until the system has exited
         * recovery.
         */
        if (shutdown)
        {
                if (flags & CHECKPOINT_END_OF_RECOVERY)
                        LocalXLogInsertAllowed = -1;            /* return to "check" state */
                else
                        LocalXLogInsertAllowed = 0; /* never again write WAL */
        }

        /*
         * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
         * = end of actual checkpoint record.
         */
        if (shutdown && !XLByteEQ(checkPoint.redo, ProcLastRecPtr))
                ereport(PANIC,
                                (errmsg("concurrent transaction log activity while database system is shutting down")));

        /*
         * Select point at which we can truncate the log, which we base on the
         * prior checkpoint's earliest info.
         *
         * This has to be computed here, because the control file update just
         * below overwrites checkPointCopy with the new checkpoint.
         */
        XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);

        /*
         * Update the control file.
         */
        LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
        if (shutdown)
                ControlFile->state = DB_SHUTDOWNED;
        ControlFile->prevCheckPoint = ControlFile->checkPoint;
        ControlFile->checkPoint = ProcLastRecPtr;
        ControlFile->checkPointCopy = checkPoint;
        ControlFile->time = (pg_time_t) time(NULL);
        /* crash recovery should always recover to the end of WAL */
        MemSet(&ControlFile->minRecoveryPoint, 0, sizeof(XLogRecPtr));
        UpdateControlFile();
        LWLockRelease(ControlFileLock);

        /* Update shared-memory copy of checkpoint XID/epoch */
        {
                /* use volatile pointer to prevent code rearrangement */
                volatile XLogCtlData *xlogctl = XLogCtl;

                SpinLockAcquire(&xlogctl->info_lck);
                xlogctl->ckptXidEpoch = checkPoint.nextXidEpoch;
                xlogctl->ckptXid = checkPoint.nextXid;
                SpinLockRelease(&xlogctl->info_lck);
        }

        /*
         * We are now done with critical updates; no need for system panic if we
         * have trouble while fooling with old log segments.
         */
        END_CRIT_SECTION();

        /*
         * Let smgr do post-checkpoint cleanup (eg, deleting old files).
         */
        smgrpostckpt();

        /*
         * Delete old log files (those no longer needed even for previous
         * checkpoint or the standbys in XLOG streaming).
         */
        if (_logId || _logSeg)
        {
                /*
                 * Calculate the last segment that we need to retain because of
                 * wal_keep_segments, by subtracting wal_keep_segments from the new
                 * checkpoint location.
                 */
                if (wal_keep_segments > 0)
                {
                        uint32          log;
                        uint32          seg;
                        int                     d_log;
                        int                     d_seg;

                        XLByteToSeg(recptr, log, seg);

                        /* split wal_keep_segments into whole log files plus segments */
                        d_seg = wal_keep_segments % XLogSegsPerFile;
                        d_log = wal_keep_segments / XLogSegsPerFile;
                        if (seg < d_seg)
                        {
                                /* borrow one log file's worth of segments */
                                d_log += 1;
                                seg = seg - d_seg + XLogSegsPerFile;
                        }
                        else
                                seg = seg - d_seg;
                        /* avoid underflow, don't go below (0,1) */
                        if (log < d_log || (log == d_log && seg == 0))
                        {
                                log = 0;
                                seg = 1;
                        }
                        else
                                log = log - d_log;

                        /* don't delete WAL segments newer than the calculated segment */
                        if (log < _logId || (log == _logId && seg < _logSeg))
                        {
                                _logId = log;
                                _logSeg = seg;
                        }
                }

                PrevLogSeg(_logId, _logSeg);
                RemoveOldXlogFiles(_logId, _logSeg, recptr);
        }

        /*
         * Make more log segments if needed.  (Do this after recycling old log
         * segments, since that may supply some of the needed files.)
         */
        if (!shutdown)
                PreallocXlogFiles(recptr);

        /*
         * Truncate pg_subtrans if possible.  We can throw away all data before
         * the oldest XMIN of any running transaction.  No future transaction will
         * attempt to reference any pg_subtrans entry older than that (see Asserts
         * in subtrans.c).  During recovery, though, we mustn't do this because
         * StartupSUBTRANS hasn't been called yet.
         */
        if (!RecoveryInProgress())
                TruncateSUBTRANS(GetOldestXmin(true, false));

        /* All real work is done, but log before releasing lock. */
        if (log_checkpoints)
                LogCheckpointEnd(false);

        TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
                                                                         NBuffers,
                                                                         CheckpointStats.ckpt_segs_added,
                                                                         CheckpointStats.ckpt_segs_removed,
                                                                         CheckpointStats.ckpt_segs_recycled);

        LWLockRelease(CheckpointLock);
}
7504
/*
 * Flush all data in shared memory to disk, and fsync
 *
 * This is the common code shared between regular checkpoints and
 * recovery restartpoints.
 *
 * checkPointRedo is handed to CheckPointTwoPhase(); flags is handed to
 * CheckPointBuffers().  The other checkpoint routines take no arguments.
 */
static void
CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
{
        /* Checkpoint CLOG, SUBTRANS, MultiXact and the relation map first */
        CheckPointCLOG();
        CheckPointSUBTRANS();
        CheckPointMultiXact();
        CheckPointRelationMap();
        CheckPointBuffers(flags);       /* performs all required fsyncs */
        /* We deliberately delay 2PC checkpointing as long as possible */
        CheckPointTwoPhase(checkPointRedo);
}
7522
7523 /*
7524  * Save a checkpoint for recovery restart if appropriate
7525  *
7526  * This function is called each time a checkpoint record is read from XLOG.
7527  * It must determine whether the checkpoint represents a safe restartpoint or
7528  * not.  If so, the checkpoint record is stashed in shared memory so that
7529  * CreateRestartPoint can consult it.  (Note that the latter function is
7530  * executed by the bgwriter, while this one will be executed by the startup
7531  * process.)
7532  */
7533 static void
7534 RecoveryRestartPoint(const CheckPoint *checkPoint)
7535 {
7536         int                     rmid;
7537
7538         /* use volatile pointer to prevent code rearrangement */
7539         volatile XLogCtlData *xlogctl = XLogCtl;
7540
7541         /*
7542          * Is it safe to checkpoint?  We must ask each of the resource managers
7543          * whether they have any partial state information that might prevent a
7544          * correct restart from this point.  If so, we skip this opportunity, but
7545          * return at the next checkpoint record for another try.
7546          */
7547         for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7548         {
7549                 if (RmgrTable[rmid].rm_safe_restartpoint != NULL)
7550                         if (!(RmgrTable[rmid].rm_safe_restartpoint()))
7551                         {
7552                                 elog(trace_recovery(DEBUG2), "RM %d not safe to record restart point at %X/%X",
7553                                          rmid,
7554                                          checkPoint->redo.xlogid,
7555                                          checkPoint->redo.xrecoff);
7556                                 return;
7557                         }
7558         }
7559
7560         /*
7561          * Copy the checkpoint record to shared memory, so that bgwriter can use
7562          * it the next time it wants to perform a restartpoint.
7563          */
7564         SpinLockAcquire(&xlogctl->info_lck);
7565         XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
7566         memcpy(&XLogCtl->lastCheckPoint, checkPoint, sizeof(CheckPoint));
7567         SpinLockRelease(&xlogctl->info_lck);
7568 }
7569
7570 /*
7571  * Establish a restartpoint if possible.
7572  *
7573  * This is similar to CreateCheckPoint, but is used during WAL recovery
7574  * to establish a point from which recovery can roll forward without
7575  * replaying the entire recovery log.
7576  *
7577  * Returns true if a new restartpoint was established. We can only establish
7578  * a restartpoint if we have replayed a safe checkpoint record since last
7579  * restartpoint.
7580  */
7581 bool
7582 CreateRestartPoint(int flags)
7583 {
7584         XLogRecPtr      lastCheckPointRecPtr;
7585         CheckPoint      lastCheckPoint;
7586         uint32          _logId;
7587         uint32          _logSeg;
7588         TimestampTz xtime;
7589
7590         /* use volatile pointer to prevent code rearrangement */
7591         volatile XLogCtlData *xlogctl = XLogCtl;
7592
7593         /*
7594          * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
7595          * happens at a time.
7596          */
7597         LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
7598
7599         /* Get a local copy of the last safe checkpoint record. */
7600         SpinLockAcquire(&xlogctl->info_lck);
7601         lastCheckPointRecPtr = xlogctl->lastCheckPointRecPtr;
7602         memcpy(&lastCheckPoint, &XLogCtl->lastCheckPoint, sizeof(CheckPoint));
7603         SpinLockRelease(&xlogctl->info_lck);
7604
7605         /*
7606          * Check that we're still in recovery mode. It's ok if we exit recovery
7607          * mode after this check, the restart point is valid anyway.
7608          */
7609         if (!RecoveryInProgress())
7610         {
7611                 ereport(DEBUG2,
7612                           (errmsg("skipping restartpoint, recovery has already ended")));
7613                 LWLockRelease(CheckpointLock);
7614                 return false;
7615         }
7616
7617         /*
7618          * If the last checkpoint record we've replayed is already our last
7619          * restartpoint, we can't perform a new restart point. We still update
7620          * minRecoveryPoint in that case, so that if this is a shutdown restart
7621          * point, we won't start up earlier than before. That's not strictly
7622          * necessary, but when hot standby is enabled, it would be rather weird if
7623          * the database opened up for read-only connections at a point-in-time
7624          * before the last shutdown. Such time travel is still possible in case of
7625          * immediate shutdown, though.
7626          *
7627          * We don't explicitly advance minRecoveryPoint when we do create a
7628          * restartpoint. It's assumed that flushing the buffers will do that as a
7629          * side-effect.
7630          */
7631         if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
7632                 XLByteLE(lastCheckPoint.redo, ControlFile->checkPointCopy.redo))
7633         {
7634                 XLogRecPtr      InvalidXLogRecPtr = {0, 0};
7635
7636                 ereport(DEBUG2,
7637                                 (errmsg("skipping restartpoint, already performed at %X/%X",
7638                                   lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff)));
7639
7640                 UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
7641                 if (flags & CHECKPOINT_IS_SHUTDOWN)
7642                 {
7643                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7644                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
7645                         ControlFile->time = (pg_time_t) time(NULL);
7646                         UpdateControlFile();
7647                         LWLockRelease(ControlFileLock);
7648                 }
7649                 LWLockRelease(CheckpointLock);
7650                 return false;
7651         }
7652
7653         /*
7654          * Update the shared RedoRecPtr so that the startup process can calculate
7655          * the number of segments replayed since last restartpoint, and request a
7656          * restartpoint if it exceeds checkpoint_segments.
7657          *
7658          * You need to hold WALInsertLock and info_lck to update it, although
7659          * during recovery acquiring WALInsertLock is just pro forma, because
7660          * there is no other processes updating Insert.RedoRecPtr.
7661          */
7662         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
7663         SpinLockAcquire(&xlogctl->info_lck);
7664         xlogctl->Insert.RedoRecPtr = lastCheckPoint.redo;
7665         SpinLockRelease(&xlogctl->info_lck);
7666         LWLockRelease(WALInsertLock);
7667
7668         if (log_checkpoints)
7669         {
7670                 /*
7671                  * Prepare to accumulate statistics.
7672                  */
7673                 MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
7674                 CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
7675
7676                 LogCheckpointStart(flags, true);
7677         }
7678
7679         CheckPointGuts(lastCheckPoint.redo, flags);
7680
7681         /*
7682          * Select point at which we can truncate the xlog, which we base on the
7683          * prior checkpoint's earliest info.
7684          */
7685         XLByteToSeg(ControlFile->checkPointCopy.redo, _logId, _logSeg);
7686
7687         /*
7688          * Update pg_control, using current time.  Check that it still shows
7689          * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
7690          * this is a quick hack to make sure nothing really bad happens if somehow
7691          * we get here after the end-of-recovery checkpoint.
7692          */
7693         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7694         if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
7695                 XLByteLT(ControlFile->checkPointCopy.redo, lastCheckPoint.redo))
7696         {
7697                 ControlFile->prevCheckPoint = ControlFile->checkPoint;
7698                 ControlFile->checkPoint = lastCheckPointRecPtr;
7699                 ControlFile->checkPointCopy = lastCheckPoint;
7700                 ControlFile->time = (pg_time_t) time(NULL);
7701                 if (flags & CHECKPOINT_IS_SHUTDOWN)
7702                         ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
7703                 UpdateControlFile();
7704         }
7705         LWLockRelease(ControlFileLock);
7706
7707         /*
7708          * Delete old log files (those no longer needed even for previous
7709          * checkpoint/restartpoint) to prevent the disk holding the xlog from
7710          * growing full. We don't need do this during normal recovery, but during
7711          * streaming recovery we have to or the disk will eventually fill up from
7712          * old log files streamed from master.
7713          */
7714         if (WalRcvInProgress() && (_logId || _logSeg))
7715         {
7716                 XLogRecPtr      endptr;
7717
7718                 /* Get the current (or recent) end of xlog */
7719                 endptr = GetWalRcvWriteRecPtr(NULL);
7720
7721                 PrevLogSeg(_logId, _logSeg);
7722                 RemoveOldXlogFiles(_logId, _logSeg, endptr);
7723
7724                 /*
7725                  * Make more log segments if needed.  (Do this after recycling old log
7726                  * segments, since that may supply some of the needed files.)
7727                  */
7728                 PreallocXlogFiles(endptr);
7729         }
7730
7731         /*
7732          * Truncate pg_subtrans if possible.  We can throw away all data before
7733          * the oldest XMIN of any running transaction.  No future transaction will
7734          * attempt to reference any pg_subtrans entry older than that (see Asserts
7735          * in subtrans.c).      When hot standby is disabled, though, we mustn't do
7736          * this because StartupSUBTRANS hasn't been called yet.
7737          */
7738         if (EnableHotStandby)
7739                 TruncateSUBTRANS(GetOldestXmin(true, false));
7740
7741         /* All real work is done, but log before releasing lock. */
7742         if (log_checkpoints)
7743                 LogCheckpointEnd(true);
7744
7745         xtime = GetLatestXTime();
7746         ereport((log_checkpoints ? LOG : DEBUG2),
7747                         (errmsg("recovery restart point at %X/%X",
7748                                         lastCheckPoint.redo.xlogid, lastCheckPoint.redo.xrecoff),
7749                    xtime ? errdetail("last completed transaction was at log time %s",
7750                                                          timestamptz_to_str(xtime)) : 0));
7751
7752         LWLockRelease(CheckpointLock);
7753
7754         /*
7755          * Finally, execute archive_cleanup_command, if any.
7756          */
7757         if (XLogCtl->archiveCleanupCommand[0])
7758                 ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
7759                                                            "archive_cleanup_command",
7760                                                            false);
7761
7762         return true;
7763 }
7764
7765 /*
7766  * Write a NEXTOID log record
7767  */
7768 void
7769 XLogPutNextOid(Oid nextOid)
7770 {
7771         XLogRecData rdata;
7772
7773         rdata.data = (char *) (&nextOid);
7774         rdata.len = sizeof(Oid);
7775         rdata.buffer = InvalidBuffer;
7776         rdata.next = NULL;
7777         (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID, &rdata);
7778
7779         /*
7780          * We need not flush the NEXTOID record immediately, because any of the
7781          * just-allocated OIDs could only reach disk as part of a tuple insert or
7782          * update that would have its own XLOG record that must follow the NEXTOID
7783          * record.      Therefore, the standard buffer LSN interlock applied to those
7784          * records will ensure no such OID reaches disk before the NEXTOID record
7785          * does.
7786          *
7787          * Note, however, that the above statement only covers state "within" the
7788          * database.  When we use a generated OID as a file or directory name, we
7789          * are in a sense violating the basic WAL rule, because that filesystem
7790          * change may reach disk before the NEXTOID WAL record does.  The impact
7791          * of this is that if a database crash occurs immediately afterward, we
7792          * might after restart re-generate the same OID and find that it conflicts
7793          * with the leftover file or directory.  But since for safety's sake we
7794          * always loop until finding a nonconflicting filename, this poses no real
7795          * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
7796          */
7797 }
7798
7799 /*
7800  * Write an XLOG SWITCH record.
7801  *
7802  * Here we just blindly issue an XLogInsert request for the record.
7803  * All the magic happens inside XLogInsert.
7804  *
7805  * The return value is either the end+1 address of the switch record,
7806  * or the end+1 address of the prior segment if we did not need to
7807  * write a switch record because we are already at segment start.
7808  */
7809 XLogRecPtr
7810 RequestXLogSwitch(void)
7811 {
7812         XLogRecPtr      RecPtr;
7813         XLogRecData rdata;
7814
7815         /* XLOG SWITCH, alone among xlog record types, has no data */
7816         rdata.buffer = InvalidBuffer;
7817         rdata.data = NULL;
7818         rdata.len = 0;
7819         rdata.next = NULL;
7820
7821         RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH, &rdata);
7822
7823         return RecPtr;
7824 }
7825
7826 /*
7827  * Check if any of the GUC parameters that are critical for hot standby
7828  * have changed, and update the value in pg_control file if necessary.
7829  */
7830 static void
7831 XLogReportParameters(void)
7832 {
7833         if (wal_level != ControlFile->wal_level ||
7834                 MaxConnections != ControlFile->MaxConnections ||
7835                 max_prepared_xacts != ControlFile->max_prepared_xacts ||
7836                 max_locks_per_xact != ControlFile->max_locks_per_xact)
7837         {
7838                 /*
7839                  * The change in number of backend slots doesn't need to be WAL-logged
7840                  * if archiving is not enabled, as you can't start archive recovery
7841                  * with wal_level=minimal anyway. We don't really care about the
7842                  * values in pg_control either if wal_level=minimal, but seems better
7843                  * to keep them up-to-date to avoid confusion.
7844                  */
7845                 if (wal_level != ControlFile->wal_level || XLogIsNeeded())
7846                 {
7847                         XLogRecData rdata;
7848                         xl_parameter_change xlrec;
7849
7850                         xlrec.MaxConnections = MaxConnections;
7851                         xlrec.max_prepared_xacts = max_prepared_xacts;
7852                         xlrec.max_locks_per_xact = max_locks_per_xact;
7853                         xlrec.wal_level = wal_level;
7854
7855                         rdata.buffer = InvalidBuffer;
7856                         rdata.data = (char *) &xlrec;
7857                         rdata.len = sizeof(xlrec);
7858                         rdata.next = NULL;
7859
7860                         XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE, &rdata);
7861                 }
7862
7863                 ControlFile->MaxConnections = MaxConnections;
7864                 ControlFile->max_prepared_xacts = max_prepared_xacts;
7865                 ControlFile->max_locks_per_xact = max_locks_per_xact;
7866                 ControlFile->wal_level = wal_level;
7867                 UpdateControlFile();
7868         }
7869 }
7870
7871 /*
7872  * XLOG resource manager's routines
7873  *
7874  * Definitions of info values are in include/catalog/pg_control.h, though
7875  * not all record types are related to control file updates.
7876  */
7877 void
7878 xlog_redo(XLogRecPtr lsn, XLogRecord *record)
7879 {
7880         uint8           info = record->xl_info & ~XLR_INFO_MASK;
7881
7882         /* Backup blocks are not used in xlog records */
7883         Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
7884
7885         if (info == XLOG_NEXTOID)
7886         {
7887                 Oid                     nextOid;
7888
7889                 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
7890                 if (ShmemVariableCache->nextOid < nextOid)
7891                 {
7892                         ShmemVariableCache->nextOid = nextOid;
7893                         ShmemVariableCache->oidCount = 0;
7894                 }
7895         }
7896         else if (info == XLOG_CHECKPOINT_SHUTDOWN)
7897         {
7898                 CheckPoint      checkPoint;
7899
7900                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
7901                 /* In a SHUTDOWN checkpoint, believe the counters exactly */
7902                 ShmemVariableCache->nextXid = checkPoint.nextXid;
7903                 ShmemVariableCache->nextOid = checkPoint.nextOid;
7904                 ShmemVariableCache->oidCount = 0;
7905                 MultiXactSetNextMXact(checkPoint.nextMulti,
7906                                                           checkPoint.nextMultiOffset);
7907                 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
7908
7909                 /*
7910                  * If we see a shutdown checkpoint while waiting for an end-of-backup
7911                  * record, the backup was cancelled and the end-of-backup record will
7912                  * never arrive.
7913                  */
7914                 if (InArchiveRecovery &&
7915                         !XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7916                         ereport(ERROR,
7917                                         (errmsg("online backup was cancelled, recovery cannot continue")));
7918
7919                 /*
7920                  * If we see a shutdown checkpoint, we know that nothing was running
7921                  * on the master at this point. So fake-up an empty running-xacts
7922                  * record and use that here and now. Recover additional standby state
7923                  * for prepared transactions.
7924                  */
7925                 if (standbyState >= STANDBY_INITIALIZED)
7926                 {
7927                         TransactionId *xids;
7928                         int                     nxids;
7929                         TransactionId oldestActiveXID;
7930                         TransactionId latestCompletedXid;
7931                         RunningTransactionsData running;
7932
7933                         oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
7934
7935                         /*
7936                          * Construct a RunningTransactions snapshot representing a shut
7937                          * down server, with only prepared transactions still alive. We're
7938                          * never overflowed at this point because all subxids are listed
7939                          * with their parent prepared transactions.
7940                          */
7941                         running.xcnt = nxids;
7942                         running.subxid_overflow = false;
7943                         running.nextXid = checkPoint.nextXid;
7944                         running.oldestRunningXid = oldestActiveXID;
7945                         latestCompletedXid = checkPoint.nextXid;
7946                         TransactionIdRetreat(latestCompletedXid);
7947                         Assert(TransactionIdIsNormal(latestCompletedXid));
7948                         running.latestCompletedXid = latestCompletedXid;
7949                         running.xids = xids;
7950
7951                         ProcArrayApplyRecoveryInfo(&running);
7952
7953                         StandbyRecoverPreparedTransactions(true);
7954                 }
7955
7956                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
7957                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
7958                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
7959
7960                 /*
7961                  * TLI may change in a shutdown checkpoint, but it shouldn't decrease
7962                  */
7963                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
7964                 {
7965                         if (checkPoint.ThisTimeLineID < ThisTimeLineID ||
7966                                 !list_member_int(expectedTLIs,
7967                                                                  (int) checkPoint.ThisTimeLineID))
7968                                 ereport(PANIC,
7969                                                 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
7970                                                                 checkPoint.ThisTimeLineID, ThisTimeLineID)));
7971                         /* Following WAL records should be run with new TLI */
7972                         ThisTimeLineID = checkPoint.ThisTimeLineID;
7973                 }
7974
7975                 RecoveryRestartPoint(&checkPoint);
7976         }
7977         else if (info == XLOG_CHECKPOINT_ONLINE)
7978         {
7979                 CheckPoint      checkPoint;
7980
7981                 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
7982                 /* In an ONLINE checkpoint, treat the counters like NEXTOID */
7983                 if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
7984                                                                   checkPoint.nextXid))
7985                         ShmemVariableCache->nextXid = checkPoint.nextXid;
7986                 if (ShmemVariableCache->nextOid < checkPoint.nextOid)
7987                 {
7988                         ShmemVariableCache->nextOid = checkPoint.nextOid;
7989                         ShmemVariableCache->oidCount = 0;
7990                 }
7991                 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
7992                                                                   checkPoint.nextMultiOffset);
7993                 if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
7994                                                                   checkPoint.oldestXid))
7995                         SetTransactionIdLimit(checkPoint.oldestXid,
7996                                                                   checkPoint.oldestXidDB);
7997
7998                 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
7999                 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
8000                 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
8001
8002                 /* TLI should not change in an on-line checkpoint */
8003                 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
8004                         ereport(PANIC,
8005                                         (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
8006                                                         checkPoint.ThisTimeLineID, ThisTimeLineID)));
8007
8008                 RecoveryRestartPoint(&checkPoint);
8009         }
8010         else if (info == XLOG_NOOP)
8011         {
8012                 /* nothing to do here */
8013         }
8014         else if (info == XLOG_SWITCH)
8015         {
8016                 /* nothing to do here */
8017         }
8018         else if (info == XLOG_BACKUP_END)
8019         {
8020                 XLogRecPtr      startpoint;
8021
8022                 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
8023
8024                 if (XLByteEQ(ControlFile->backupStartPoint, startpoint))
8025                 {
8026                         /*
8027                          * We have reached the end of base backup, the point where
8028                          * pg_stop_backup() was done. The data on disk is now consistent.
8029                          * Reset backupStartPoint, and update minRecoveryPoint to make
8030                          * sure we don't allow starting up at an earlier point even if
8031                          * recovery is stopped and restarted soon after this.
8032                          */
8033                         elog(DEBUG1, "end of backup reached");
8034
8035                         LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8036
8037                         if (XLByteLT(ControlFile->minRecoveryPoint, lsn))
8038                                 ControlFile->minRecoveryPoint = lsn;
8039                         MemSet(&ControlFile->backupStartPoint, 0, sizeof(XLogRecPtr));
8040                         UpdateControlFile();
8041
8042                         LWLockRelease(ControlFileLock);
8043                 }
8044         }
8045         else if (info == XLOG_PARAMETER_CHANGE)
8046         {
8047                 xl_parameter_change xlrec;
8048
8049                 /* Update our copy of the parameters in pg_control */
8050                 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
8051
8052                 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8053                 ControlFile->MaxConnections = xlrec.MaxConnections;
8054                 ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
8055                 ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
8056                 ControlFile->wal_level = xlrec.wal_level;
8057
8058                 /*
8059                  * Update minRecoveryPoint to ensure that if recovery is aborted, we
8060                  * recover back up to this point before allowing hot standby again.
8061                  * This is particularly important if wal_level was set to 'archive'
8062                  * before, and is now 'hot_standby', to ensure you don't run queries
8063                  * against the WAL preceding the wal_level change. Same applies to
8064                  * decreasing max_* settings.
8065                  */
8066                 minRecoveryPoint = ControlFile->minRecoveryPoint;
8067                 if ((minRecoveryPoint.xlogid != 0 || minRecoveryPoint.xrecoff != 0)
8068                         && XLByteLT(minRecoveryPoint, lsn))
8069                 {
8070                         ControlFile->minRecoveryPoint = lsn;
8071                 }
8072
8073                 UpdateControlFile();
8074                 LWLockRelease(ControlFileLock);
8075
8076                 /* Check to see if any changes to max_connections give problems */
8077                 CheckRequiredParameterValues();
8078         }
8079 }
8080
8081 void
8082 xlog_desc(StringInfo buf, uint8 xl_info, char *rec)
8083 {
8084         uint8           info = xl_info & ~XLR_INFO_MASK;
8085
8086         if (info == XLOG_CHECKPOINT_SHUTDOWN ||
8087                 info == XLOG_CHECKPOINT_ONLINE)
8088         {
8089                 CheckPoint *checkpoint = (CheckPoint *) rec;
8090
8091                 appendStringInfo(buf, "checkpoint: redo %X/%X; "
8092                                                  "tli %u; xid %u/%u; oid %u; multi %u; offset %u; "
8093                                                  "oldest xid %u in DB %u; oldest running xid %u; %s",
8094                                                  checkpoint->redo.xlogid, checkpoint->redo.xrecoff,
8095                                                  checkpoint->ThisTimeLineID,
8096                                                  checkpoint->nextXidEpoch, checkpoint->nextXid,
8097                                                  checkpoint->nextOid,
8098                                                  checkpoint->nextMulti,
8099                                                  checkpoint->nextMultiOffset,
8100                                                  checkpoint->oldestXid,
8101                                                  checkpoint->oldestXidDB,
8102                                                  checkpoint->oldestActiveXid,
8103                                  (info == XLOG_CHECKPOINT_SHUTDOWN) ? "shutdown" : "online");
8104         }
8105         else if (info == XLOG_NOOP)
8106         {
8107                 appendStringInfo(buf, "xlog no-op");
8108         }
8109         else if (info == XLOG_NEXTOID)
8110         {
8111                 Oid                     nextOid;
8112
8113                 memcpy(&nextOid, rec, sizeof(Oid));
8114                 appendStringInfo(buf, "nextOid: %u", nextOid);
8115         }
8116         else if (info == XLOG_SWITCH)
8117         {
8118                 appendStringInfo(buf, "xlog switch");
8119         }
8120         else if (info == XLOG_BACKUP_END)
8121         {
8122                 XLogRecPtr      startpoint;
8123
8124                 memcpy(&startpoint, rec, sizeof(XLogRecPtr));
8125                 appendStringInfo(buf, "backup end: %X/%X",
8126                                                  startpoint.xlogid, startpoint.xrecoff);
8127         }
8128         else if (info == XLOG_PARAMETER_CHANGE)
8129         {
8130                 xl_parameter_change xlrec;
8131                 const char *wal_level_str;
8132                 const struct config_enum_entry *entry;
8133
8134                 memcpy(&xlrec, rec, sizeof(xl_parameter_change));
8135
8136                 /* Find a string representation for wal_level */
8137                 wal_level_str = "?";
8138                 for (entry = wal_level_options; entry->name; entry++)
8139                 {
8140                         if (entry->val == xlrec.wal_level)
8141                         {
8142                                 wal_level_str = entry->name;
8143                                 break;
8144                         }
8145                 }
8146
8147                 appendStringInfo(buf, "parameter change: max_connections=%d max_prepared_xacts=%d max_locks_per_xact=%d wal_level=%s",
8148                                                  xlrec.MaxConnections,
8149                                                  xlrec.max_prepared_xacts,
8150                                                  xlrec.max_locks_per_xact,
8151                                                  wal_level_str);
8152         }
8153         else
8154                 appendStringInfo(buf, "UNKNOWN");
8155 }
8156
8157 #ifdef WAL_DEBUG
8158
8159 static void
8160 xlog_outrec(StringInfo buf, XLogRecord *record)
8161 {
8162         int                     i;
8163
8164         appendStringInfo(buf, "prev %X/%X; xid %u",
8165                                          record->xl_prev.xlogid, record->xl_prev.xrecoff,
8166                                          record->xl_xid);
8167
8168         appendStringInfo(buf, "; len %u",
8169                                          record->xl_len);
8170
8171         for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
8172         {
8173                 if (record->xl_info & XLR_SET_BKP_BLOCK(i))
8174                         appendStringInfo(buf, "; bkpb%d", i + 1);
8175         }
8176
8177         appendStringInfo(buf, ": %s", RmgrTable[record->xl_rmid].rm_name);
8178 }
8179 #endif   /* WAL_DEBUG */
8180
8181
8182 /*
8183  * Return the (possible) sync flag used for opening a file, depending on the
8184  * value of the GUC wal_sync_method.
8185  */
8186 static int
8187 get_sync_bit(int method)
8188 {
8189         int                     o_direct_flag = 0;
8190
8191         /* If fsync is disabled, never open in sync mode */
8192         if (!enableFsync)
8193                 return 0;
8194
8195         /*
8196          * Optimize writes by bypassing kernel cache with O_DIRECT when using
8197          * O_SYNC, O_DSYNC or O_FSYNC. But only if archiving and streaming are
8198          * disabled, otherwise the archive command or walsender process will read
8199          * the WAL soon after writing it, which is guaranteed to cause a physical
8200          * read if we bypassed the kernel cache. We also skip the
8201          * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
8202          * reason.
8203          *
8204          * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
8205          * written by walreceiver is normally read by the startup process soon
8206          * after its written. Also, walreceiver performs unaligned writes, which
8207          * don't work with O_DIRECT, so it is required for correctness too.
8208          */
8209         if (!XLogIsNeeded() && !am_walreceiver)
8210                 o_direct_flag = PG_O_DIRECT;
8211
8212         switch (method)
8213         {
8214                         /*
8215                          * enum values for all sync options are defined even if they are
8216                          * not supported on the current platform.  But if not, they are
8217                          * not included in the enum option array, and therefore will never
8218                          * be seen here.
8219                          */
8220                 case SYNC_METHOD_FSYNC:
8221                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
8222                 case SYNC_METHOD_FDATASYNC:
8223                         return 0;
8224 #ifdef OPEN_SYNC_FLAG
8225                 case SYNC_METHOD_OPEN:
8226                         return OPEN_SYNC_FLAG | o_direct_flag;
8227 #endif
8228 #ifdef OPEN_DATASYNC_FLAG
8229                 case SYNC_METHOD_OPEN_DSYNC:
8230                         return OPEN_DATASYNC_FLAG | o_direct_flag;
8231 #endif
8232                 default:
8233                         /* can't happen (unless we are out of sync with option array) */
8234                         elog(ERROR, "unrecognized wal_sync_method: %d", method);
8235                         return 0;                       /* silence warning */
8236         }
8237 }
8238
8239 /*
8240  * GUC support
8241  */
8242 bool
8243 assign_xlog_sync_method(int new_sync_method, bool doit, GucSource source)
8244 {
8245         if (!doit)
8246                 return true;
8247
8248         if (sync_method != new_sync_method)
8249         {
8250                 /*
8251                  * To ensure that no blocks escape unsynced, force an fsync on the
8252                  * currently open log segment (if any).  Also, if the open flag is
8253                  * changing, close the log file so it will be reopened (with new flag
8254                  * bit) at next use.
8255                  */
8256                 if (openLogFile >= 0)
8257                 {
8258                         if (pg_fsync(openLogFile) != 0)
8259                                 ereport(PANIC,
8260                                                 (errcode_for_file_access(),
8261                                                  errmsg("could not fsync log file %u, segment %u: %m",
8262                                                                 openLogId, openLogSeg)));
8263                         if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
8264                                 XLogFileClose();
8265                 }
8266         }
8267
8268         return true;
8269 }
8270
8271
8272 /*
8273  * Issue appropriate kind of fsync (if any) for an XLOG output file.
8274  *
8275  * 'fd' is a file descriptor for the XLOG file to be fsync'd.
8276  * 'log' and 'seg' are for error reporting purposes.
8277  */
8278 void
8279 issue_xlog_fsync(int fd, uint32 log, uint32 seg)
8280 {
8281         switch (sync_method)
8282         {
8283                 case SYNC_METHOD_FSYNC:
8284                         if (pg_fsync_no_writethrough(fd) != 0)
8285                                 ereport(PANIC,
8286                                                 (errcode_for_file_access(),
8287                                                  errmsg("could not fsync log file %u, segment %u: %m",
8288                                                                 log, seg)));
8289                         break;
8290 #ifdef HAVE_FSYNC_WRITETHROUGH
8291                 case SYNC_METHOD_FSYNC_WRITETHROUGH:
8292                         if (pg_fsync_writethrough(fd) != 0)
8293                                 ereport(PANIC,
8294                                                 (errcode_for_file_access(),
8295                                                  errmsg("could not fsync write-through log file %u, segment %u: %m",
8296                                                                 log, seg)));
8297                         break;
8298 #endif
8299 #ifdef HAVE_FDATASYNC
8300                 case SYNC_METHOD_FDATASYNC:
8301                         if (pg_fdatasync(fd) != 0)
8302                                 ereport(PANIC,
8303                                                 (errcode_for_file_access(),
8304                                         errmsg("could not fdatasync log file %u, segment %u: %m",
8305                                                    log, seg)));
8306                         break;
8307 #endif
8308                 case SYNC_METHOD_OPEN:
8309                 case SYNC_METHOD_OPEN_DSYNC:
8310                         /* write synced it already */
8311                         break;
8312                 default:
8313                         elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
8314                         break;
8315         }
8316 }
8317
8318
8319 /*
8320  * pg_start_backup: set up for taking an on-line backup dump
8321  *
8322  * Essentially what this does is to create a backup label file in $PGDATA,
8323  * where it will be archived as part of the backup dump.  The label file
8324  * contains the user-supplied label string (typically this would be used
8325  * to tell where the backup dump will be stored) and the starting time and
8326  * starting WAL location for the dump.
8327  */
8328 Datum
8329 pg_start_backup(PG_FUNCTION_ARGS)
8330 {
8331         text       *backupid = PG_GETARG_TEXT_P(0);
8332         bool            fast = PG_GETARG_BOOL(1);
8333         char       *backupidstr;
8334         XLogRecPtr      checkpointloc;
8335         XLogRecPtr      startpoint;
8336         pg_time_t       stamp_time;
8337         char            strfbuf[128];
8338         char            xlogfilename[MAXFNAMELEN];
8339         uint32          _logId;
8340         uint32          _logSeg;
8341         struct stat stat_buf;
8342         FILE       *fp;
8343
8344         if (!superuser())
8345                 ereport(ERROR,
8346                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
8347                                  errmsg("must be superuser to run a backup")));
8348
8349         if (RecoveryInProgress())
8350                 ereport(ERROR,
8351                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8352                                  errmsg("recovery is in progress"),
8353                                  errhint("WAL control functions cannot be executed during recovery.")));
8354
8355         if (!XLogIsNeeded())
8356                 ereport(ERROR,
8357                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8358                           errmsg("WAL level not sufficient for making an online backup"),
8359                                  errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
8360
8361         backupidstr = text_to_cstring(backupid);
8362
8363         /*
8364          * Mark backup active in shared memory.  We must do full-page WAL writes
8365          * during an on-line backup even if not doing so at other times, because
8366          * it's quite possible for the backup dump to obtain a "torn" (partially
8367          * written) copy of a database page if it reads the page concurrently with
8368          * our write to the same page.  This can be fixed as long as the first
8369          * write to the page in the WAL sequence is a full-page write. Hence, we
8370          * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
8371          * are no dirty pages in shared memory that might get dumped while the
8372          * backup is in progress without having a corresponding WAL record.  (Once
8373          * the backup is complete, we need not force full-page writes anymore,
8374          * since we expect that any pages not modified during the backup interval
8375          * must have been correctly captured by the backup.)
8376          *
8377          * We must hold WALInsertLock to change the value of forcePageWrites, to
8378          * ensure adequate interlocking against XLogInsert().
8379          */
8380         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
8381         if (XLogCtl->Insert.forcePageWrites)
8382         {
8383                 LWLockRelease(WALInsertLock);
8384                 ereport(ERROR,
8385                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8386                                  errmsg("a backup is already in progress"),
8387                                  errhint("Run pg_stop_backup() and try again.")));
8388         }
8389         XLogCtl->Insert.forcePageWrites = true;
8390         LWLockRelease(WALInsertLock);
8391
8392         /*
8393          * Force an XLOG file switch before the checkpoint, to ensure that the WAL
8394          * segment the checkpoint is written to doesn't contain pages with old
8395          * timeline IDs. That would otherwise happen if you called
8396          * pg_start_backup() right after restoring from a PITR archive: the first
8397          * WAL segment containing the startup checkpoint has pages in the
8398          * beginning with the old timeline ID. That can cause trouble at recovery:
8399          * we won't have a history file covering the old timeline if pg_xlog
8400          * directory was not included in the base backup and the WAL archive was
8401          * cleared too before starting the backup.
8402          */
8403         RequestXLogSwitch();
8404
8405         /* Ensure we release forcePageWrites if fail below */
8406         PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) 0);
8407         {
8408                 /*
8409                  * Force a CHECKPOINT.  Aside from being necessary to prevent torn
8410                  * page problems, this guarantees that two successive backup runs will
8411                  * have different checkpoint positions and hence different history
8412                  * file names, even if nothing happened in between.
8413                  *
8414                  * We use CHECKPOINT_IMMEDIATE only if requested by user (via passing
8415                  * fast = true).  Otherwise this can take awhile.
8416                  */
8417                 RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
8418                                                   (fast ? CHECKPOINT_IMMEDIATE : 0));
8419
8420                 /*
8421                  * Now we need to fetch the checkpoint record location, and also its
8422                  * REDO pointer.  The oldest point in WAL that would be needed to
8423                  * restore starting from the checkpoint is precisely the REDO pointer.
8424                  */
8425                 LWLockAcquire(ControlFileLock, LW_SHARED);
8426                 checkpointloc = ControlFile->checkPoint;
8427                 startpoint = ControlFile->checkPointCopy.redo;
8428                 LWLockRelease(ControlFileLock);
8429
8430                 XLByteToSeg(startpoint, _logId, _logSeg);
8431                 XLogFileName(xlogfilename, ThisTimeLineID, _logId, _logSeg);
8432
8433                 /* Use the log timezone here, not the session timezone */
8434                 stamp_time = (pg_time_t) time(NULL);
8435                 pg_strftime(strfbuf, sizeof(strfbuf),
8436                                         "%Y-%m-%d %H:%M:%S %Z",
8437                                         pg_localtime(&stamp_time, log_timezone));
8438
8439                 /*
8440                  * Check for existing backup label --- implies a backup is already
8441                  * running.  (XXX given that we checked forcePageWrites above, maybe
8442                  * it would be OK to just unlink any such label file?)
8443                  */
8444                 if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
8445                 {
8446                         if (errno != ENOENT)
8447                                 ereport(ERROR,
8448                                                 (errcode_for_file_access(),
8449                                                  errmsg("could not stat file \"%s\": %m",
8450                                                                 BACKUP_LABEL_FILE)));
8451                 }
8452                 else
8453                         ereport(ERROR,
8454                                         (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8455                                          errmsg("a backup is already in progress"),
8456                                          errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
8457                                                          BACKUP_LABEL_FILE)));
8458
8459                 /*
8460                  * Okay, write the file
8461                  */
8462                 fp = AllocateFile(BACKUP_LABEL_FILE, "w");
8463                 if (!fp)
8464                         ereport(ERROR,
8465                                         (errcode_for_file_access(),
8466                                          errmsg("could not create file \"%s\": %m",
8467                                                         BACKUP_LABEL_FILE)));
8468                 fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
8469                                 startpoint.xlogid, startpoint.xrecoff, xlogfilename);
8470                 fprintf(fp, "CHECKPOINT LOCATION: %X/%X\n",
8471                                 checkpointloc.xlogid, checkpointloc.xrecoff);
8472                 fprintf(fp, "START TIME: %s\n", strfbuf);
8473                 fprintf(fp, "LABEL: %s\n", backupidstr);
8474                 if (fflush(fp) || ferror(fp) || FreeFile(fp))
8475                         ereport(ERROR,
8476                                         (errcode_for_file_access(),
8477                                          errmsg("could not write file \"%s\": %m",
8478                                                         BACKUP_LABEL_FILE)));
8479         }
8480         PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) 0);
8481
8482         /*
8483          * We're done.  As a convenience, return the starting WAL location.
8484          */
8485         snprintf(xlogfilename, sizeof(xlogfilename), "%X/%X",
8486                          startpoint.xlogid, startpoint.xrecoff);
8487         PG_RETURN_TEXT_P(cstring_to_text(xlogfilename));
8488 }
8489
8490 /* Error cleanup callback for pg_start_backup */
8491 static void
8492 pg_start_backup_callback(int code, Datum arg)
8493 {
8494         /* Turn off forcePageWrites on failure */
8495         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
8496         XLogCtl->Insert.forcePageWrites = false;
8497         LWLockRelease(WALInsertLock);
8498 }
8499
8500 /*
8501  * pg_stop_backup: finish taking an on-line backup dump
8502  *
8503  * We write an end-of-backup WAL record, and remove the backup label file
8504  * created by pg_start_backup, creating a backup history file in pg_xlog
8505  * instead (whence it will immediately be archived). The backup history file
8506  * contains the same info found in the label file, plus the backup-end time
8507  * and WAL location. Before 9.0, the backup-end time was read from the backup
8508  * history file at the beginning of archive recovery, but we now use the WAL
8509  * record for that and the file is for informational and debug purposes only.
8510  *
8511  * Note: different from CancelBackup which just cancels online backup mode.
8512  */
8513 Datum
8514 pg_stop_backup(PG_FUNCTION_ARGS)
8515 {
8516         XLogRecPtr      startpoint;
8517         XLogRecPtr      stoppoint;
8518         XLogRecData rdata;
8519         pg_time_t       stamp_time;
8520         char            strfbuf[128];
8521         char            histfilepath[MAXPGPATH];
8522         char            startxlogfilename[MAXFNAMELEN];
8523         char            stopxlogfilename[MAXFNAMELEN];
8524         char            lastxlogfilename[MAXFNAMELEN];
8525         char            histfilename[MAXFNAMELEN];
8526         uint32          _logId;
8527         uint32          _logSeg;
8528         FILE       *lfp;
8529         FILE       *fp;
8530         char            ch;
8531         int                     ich;
8532         int                     seconds_before_warning;
8533         int                     waits = 0;
8534         bool            reported_waiting = false;
8535
8536         if (!superuser())
8537                 ereport(ERROR,
8538                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
8539                                  (errmsg("must be superuser to run a backup"))));
8540
8541         if (RecoveryInProgress())
8542                 ereport(ERROR,
8543                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8544                                  errmsg("recovery is in progress"),
8545                                  errhint("WAL control functions cannot be executed during recovery.")));
8546
8547         if (!XLogIsNeeded())
8548                 ereport(ERROR,
8549                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8550                           errmsg("WAL level not sufficient for making an online backup"),
8551                                  errhint("wal_level must be set to \"archive\" or \"hot_standby\" at server start.")));
8552
8553         /*
8554          * OK to clear forcePageWrites
8555          */
8556         LWLockAcquire(WALInsertLock, LW_EXCLUSIVE);
8557         XLogCtl->Insert.forcePageWrites = false;
8558         LWLockRelease(WALInsertLock);
8559
8560         /*
8561          * Open the existing label file
8562          */
8563         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
8564         if (!lfp)
8565         {
8566                 if (errno != ENOENT)
8567                         ereport(ERROR,
8568                                         (errcode_for_file_access(),
8569                                          errmsg("could not read file \"%s\": %m",
8570                                                         BACKUP_LABEL_FILE)));
8571                 ereport(ERROR,
8572                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8573                                  errmsg("a backup is not in progress")));
8574         }
8575
8576         /*
8577          * Read and parse the START WAL LOCATION line (this code is pretty crude,
8578          * but we are not expecting any variability in the file format).
8579          */
8580         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %24s)%c",
8581                            &startpoint.xlogid, &startpoint.xrecoff, startxlogfilename,
8582                            &ch) != 4 || ch != '\n')
8583                 ereport(ERROR,
8584                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8585                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
8586
8587         /*
8588          * Write the backup-end xlog record
8589          */
8590         rdata.data = (char *) (&startpoint);
8591         rdata.len = sizeof(startpoint);
8592         rdata.buffer = InvalidBuffer;
8593         rdata.next = NULL;
8594         stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END, &rdata);
8595
8596         /*
8597          * Force a switch to a new xlog segment file, so that the backup is valid
8598          * as soon as archiver moves out the current segment file.
8599          */
8600         RequestXLogSwitch();
8601
8602         XLByteToPrevSeg(stoppoint, _logId, _logSeg);
8603         XLogFileName(stopxlogfilename, ThisTimeLineID, _logId, _logSeg);
8604
8605         /* Use the log timezone here, not the session timezone */
8606         stamp_time = (pg_time_t) time(NULL);
8607         pg_strftime(strfbuf, sizeof(strfbuf),
8608                                 "%Y-%m-%d %H:%M:%S %Z",
8609                                 pg_localtime(&stamp_time, log_timezone));
8610
8611         /*
8612          * Write the backup history file
8613          */
8614         XLByteToSeg(startpoint, _logId, _logSeg);
8615         BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logId, _logSeg,
8616                                                   startpoint.xrecoff % XLogSegSize);
8617         fp = AllocateFile(histfilepath, "w");
8618         if (!fp)
8619                 ereport(ERROR,
8620                                 (errcode_for_file_access(),
8621                                  errmsg("could not create file \"%s\": %m",
8622                                                 histfilepath)));
8623         fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
8624                         startpoint.xlogid, startpoint.xrecoff, startxlogfilename);
8625         fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
8626                         stoppoint.xlogid, stoppoint.xrecoff, stopxlogfilename);
8627         /* transfer remaining lines from label to history file */
8628         while ((ich = fgetc(lfp)) != EOF)
8629                 fputc(ich, fp);
8630         fprintf(fp, "STOP TIME: %s\n", strfbuf);
8631         if (fflush(fp) || ferror(fp) || FreeFile(fp))
8632                 ereport(ERROR,
8633                                 (errcode_for_file_access(),
8634                                  errmsg("could not write file \"%s\": %m",
8635                                                 histfilepath)));
8636
8637         /*
8638          * Close and remove the backup label file
8639          */
8640         if (ferror(lfp) || FreeFile(lfp))
8641                 ereport(ERROR,
8642                                 (errcode_for_file_access(),
8643                                  errmsg("could not read file \"%s\": %m",
8644                                                 BACKUP_LABEL_FILE)));
8645         if (unlink(BACKUP_LABEL_FILE) != 0)
8646                 ereport(ERROR,
8647                                 (errcode_for_file_access(),
8648                                  errmsg("could not remove file \"%s\": %m",
8649                                                 BACKUP_LABEL_FILE)));
8650
8651         /*
8652          * Clean out any no-longer-needed history files.  As a side effect, this
8653          * will post a .ready file for the newly created history file, notifying
8654          * the archiver that history file may be archived immediately.
8655          */
8656         CleanupBackupHistory();
8657
8658         /*
8659          * If archiving is enabled, wait for all the required WAL files to be
8660          * archived before returning. If archiving isn't enabled, the required WAL
8661          * needs to be transported via streaming replication (hopefully with
8662          * wal_keep_segments set high enough), or some more exotic mechanism like
8663          * polling and copying files from pg_xlog with script. We have no
8664          * knowledge of those mechanisms, so it's up to the user to ensure that he
8665          * gets all the required WAL.
8666          *
8667          * We wait until both the last WAL file filled during backup and the
8668          * history file have been archived, and assume that the alphabetic sorting
8669          * property of the WAL files ensures any earlier WAL files are safely
8670          * archived as well.
8671          *
8672          * We wait forever, since archive_command is supposed to work and we
8673          * assume the admin wanted his backup to work completely. If you don't
8674          * wish to wait, you can set statement_timeout.  Also, some notices are
8675          * issued to clue in anyone who might be doing this interactively.
8676          */
8677         if (XLogArchivingActive())
8678         {
8679                 XLByteToPrevSeg(stoppoint, _logId, _logSeg);
8680                 XLogFileName(lastxlogfilename, ThisTimeLineID, _logId, _logSeg);
8681
8682                 XLByteToSeg(startpoint, _logId, _logSeg);
8683                 BackupHistoryFileName(histfilename, ThisTimeLineID, _logId, _logSeg,
8684                                                           startpoint.xrecoff % XLogSegSize);
8685
8686                 seconds_before_warning = 60;
8687                 waits = 0;
8688
8689                 while (XLogArchiveIsBusy(lastxlogfilename) ||
8690                            XLogArchiveIsBusy(histfilename))
8691                 {
8692                         CHECK_FOR_INTERRUPTS();
8693
8694                         if (!reported_waiting && waits > 5)
8695                         {
8696                                 ereport(NOTICE,
8697                                                 (errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
8698                                 reported_waiting = true;
8699                         }
8700
8701                         pg_usleep(1000000L);
8702
8703                         if (++waits >= seconds_before_warning)
8704                         {
8705                                 seconds_before_warning *= 2;    /* This wraps in >10 years... */
8706                                 ereport(WARNING,
8707                                                 (errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
8708                                                                 waits),
8709                                                  errhint("Check that your archive_command is executing properly.  "
8710                                                                  "pg_stop_backup can be cancelled safely, "
8711                                                                  "but the database backup will not be usable without all the WAL segments.")));
8712                         }
8713                 }
8714
8715                 ereport(NOTICE,
8716                                 (errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
8717         }
8718         else
8719                 ereport(NOTICE,
8720                                 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
8721
8722         /*
8723          * We're done.  As a convenience, return the ending WAL location.
8724          */
8725         snprintf(stopxlogfilename, sizeof(stopxlogfilename), "%X/%X",
8726                          stoppoint.xlogid, stoppoint.xrecoff);
8727         PG_RETURN_TEXT_P(cstring_to_text(stopxlogfilename));
8728 }
8729
8730 /*
8731  * pg_switch_xlog: switch to next xlog file
8732  */
8733 Datum
8734 pg_switch_xlog(PG_FUNCTION_ARGS)
8735 {
8736         XLogRecPtr      switchpoint;
8737         char            location[MAXFNAMELEN];
8738
8739         if (!superuser())
8740                 ereport(ERROR,
8741                                 (errcode(ERRCODE_INSUFFICIENT_PRIVILEGE),
8742                          (errmsg("must be superuser to switch transaction log files"))));
8743
8744         if (RecoveryInProgress())
8745                 ereport(ERROR,
8746                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8747                                  errmsg("recovery is in progress"),
8748                                  errhint("WAL control functions cannot be executed during recovery.")));
8749
8750         switchpoint = RequestXLogSwitch();
8751
8752         /*
8753          * As a convenience, return the WAL location of the switch record
8754          */
8755         snprintf(location, sizeof(location), "%X/%X",
8756                          switchpoint.xlogid, switchpoint.xrecoff);
8757         PG_RETURN_TEXT_P(cstring_to_text(location));
8758 }
8759
8760 /*
8761  * Report the current WAL write location (same format as pg_start_backup etc)
8762  *
8763  * This is useful for determining how much of WAL is visible to an external
8764  * archiving process.  Note that the data before this point is written out
8765  * to the kernel, but is not necessarily synced to disk.
8766  */
8767 Datum
8768 pg_current_xlog_location(PG_FUNCTION_ARGS)
8769 {
8770         char            location[MAXFNAMELEN];
8771
8772         if (RecoveryInProgress())
8773                 ereport(ERROR,
8774                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8775                                  errmsg("recovery is in progress"),
8776                                  errhint("WAL control functions cannot be executed during recovery.")));
8777
8778         /* Make sure we have an up-to-date local LogwrtResult */
8779         {
8780                 /* use volatile pointer to prevent code rearrangement */
8781                 volatile XLogCtlData *xlogctl = XLogCtl;
8782
8783                 SpinLockAcquire(&xlogctl->info_lck);
8784                 LogwrtResult = xlogctl->LogwrtResult;
8785                 SpinLockRelease(&xlogctl->info_lck);
8786         }
8787
8788         snprintf(location, sizeof(location), "%X/%X",
8789                          LogwrtResult.Write.xlogid, LogwrtResult.Write.xrecoff);
8790         PG_RETURN_TEXT_P(cstring_to_text(location));
8791 }
8792
8793 /*
8794  * Report the current WAL insert location (same format as pg_start_backup etc)
8795  *
8796  * This function is mostly for debugging purposes.
8797  */
8798 Datum
8799 pg_current_xlog_insert_location(PG_FUNCTION_ARGS)
8800 {
8801         XLogCtlInsert *Insert = &XLogCtl->Insert;
8802         XLogRecPtr      current_recptr;
8803         char            location[MAXFNAMELEN];
8804
8805         if (RecoveryInProgress())
8806                 ereport(ERROR,
8807                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8808                                  errmsg("recovery is in progress"),
8809                                  errhint("WAL control functions cannot be executed during recovery.")));
8810
8811         /*
8812          * Get the current end-of-WAL position ... shared lock is sufficient
8813          */
8814         LWLockAcquire(WALInsertLock, LW_SHARED);
8815         INSERT_RECPTR(current_recptr, Insert, Insert->curridx);
8816         LWLockRelease(WALInsertLock);
8817
8818         snprintf(location, sizeof(location), "%X/%X",
8819                          current_recptr.xlogid, current_recptr.xrecoff);
8820         PG_RETURN_TEXT_P(cstring_to_text(location));
8821 }
8822
8823 /*
8824  * Report the last WAL receive location (same format as pg_start_backup etc)
8825  *
8826  * This is useful for determining how much of WAL is guaranteed to be received
8827  * and synced to disk by walreceiver.
8828  */
8829 Datum
8830 pg_last_xlog_receive_location(PG_FUNCTION_ARGS)
8831 {
8832         XLogRecPtr      recptr;
8833         char            location[MAXFNAMELEN];
8834
8835         recptr = GetWalRcvWriteRecPtr(NULL);
8836
8837         if (recptr.xlogid == 0 && recptr.xrecoff == 0)
8838                 PG_RETURN_NULL();
8839
8840         snprintf(location, sizeof(location), "%X/%X",
8841                          recptr.xlogid, recptr.xrecoff);
8842         PG_RETURN_TEXT_P(cstring_to_text(location));
8843 }
8844
8845 /*
8846  * Report the last WAL replay location (same format as pg_start_backup etc)
8847  *
8848  * This is useful for determining how much of WAL is visible to read-only
8849  * connections during recovery.
8850  */
8851 Datum
8852 pg_last_xlog_replay_location(PG_FUNCTION_ARGS)
8853 {
8854         /* use volatile pointer to prevent code rearrangement */
8855         volatile XLogCtlData *xlogctl = XLogCtl;
8856         XLogRecPtr      recptr;
8857         char            location[MAXFNAMELEN];
8858
8859         SpinLockAcquire(&xlogctl->info_lck);
8860         recptr = xlogctl->recoveryLastRecPtr;
8861         SpinLockRelease(&xlogctl->info_lck);
8862
8863         if (recptr.xlogid == 0 && recptr.xrecoff == 0)
8864                 PG_RETURN_NULL();
8865
8866         snprintf(location, sizeof(location), "%X/%X",
8867                          recptr.xlogid, recptr.xrecoff);
8868         PG_RETURN_TEXT_P(cstring_to_text(location));
8869 }
8870
8871 /*
8872  * Compute an xlog file name and decimal byte offset given a WAL location,
8873  * such as is returned by pg_stop_backup() or pg_xlog_switch().
8874  *
8875  * Note that a location exactly at a segment boundary is taken to be in
8876  * the previous segment.  This is usually the right thing, since the
8877  * expected usage is to determine which xlog file(s) are ready to archive.
8878  */
8879 Datum
8880 pg_xlogfile_name_offset(PG_FUNCTION_ARGS)
8881 {
8882         text       *location = PG_GETARG_TEXT_P(0);
8883         char       *locationstr;
8884         unsigned int uxlogid;
8885         unsigned int uxrecoff;
8886         uint32          xlogid;
8887         uint32          xlogseg;
8888         uint32          xrecoff;
8889         XLogRecPtr      locationpoint;
8890         char            xlogfilename[MAXFNAMELEN];
8891         Datum           values[2];
8892         bool            isnull[2];
8893         TupleDesc       resultTupleDesc;
8894         HeapTuple       resultHeapTuple;
8895         Datum           result;
8896
8897         if (RecoveryInProgress())
8898                 ereport(ERROR,
8899                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8900                                  errmsg("recovery is in progress"),
8901                                  errhint("pg_xlogfile_name_offset() cannot be executed during recovery.")));
8902
8903         /*
8904          * Read input and parse
8905          */
8906         locationstr = text_to_cstring(location);
8907
8908         if (sscanf(locationstr, "%X/%X", &uxlogid, &uxrecoff) != 2)
8909                 ereport(ERROR,
8910                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
8911                                  errmsg("could not parse transaction log location \"%s\"",
8912                                                 locationstr)));
8913
8914         locationpoint.xlogid = uxlogid;
8915         locationpoint.xrecoff = uxrecoff;
8916
8917         /*
8918          * Construct a tuple descriptor for the result row.  This must match this
8919          * function's pg_proc entry!
8920          */
8921         resultTupleDesc = CreateTemplateTupleDesc(2, false);
8922         TupleDescInitEntry(resultTupleDesc, (AttrNumber) 1, "file_name",
8923                                            TEXTOID, -1, 0);
8924         TupleDescInitEntry(resultTupleDesc, (AttrNumber) 2, "file_offset",
8925                                            INT4OID, -1, 0);
8926
8927         resultTupleDesc = BlessTupleDesc(resultTupleDesc);
8928
8929         /*
8930          * xlogfilename
8931          */
8932         XLByteToPrevSeg(locationpoint, xlogid, xlogseg);
8933         XLogFileName(xlogfilename, ThisTimeLineID, xlogid, xlogseg);
8934
8935         values[0] = CStringGetTextDatum(xlogfilename);
8936         isnull[0] = false;
8937
8938         /*
8939          * offset
8940          */
8941         xrecoff = locationpoint.xrecoff - xlogseg * XLogSegSize;
8942
8943         values[1] = UInt32GetDatum(xrecoff);
8944         isnull[1] = false;
8945
8946         /*
8947          * Tuple jam: Having first prepared your Datums, then squash together
8948          */
8949         resultHeapTuple = heap_form_tuple(resultTupleDesc, values, isnull);
8950
8951         result = HeapTupleGetDatum(resultHeapTuple);
8952
8953         PG_RETURN_DATUM(result);
8954 }
8955
8956 /*
8957  * Compute an xlog file name given a WAL location,
8958  * such as is returned by pg_stop_backup() or pg_xlog_switch().
8959  */
8960 Datum
8961 pg_xlogfile_name(PG_FUNCTION_ARGS)
8962 {
8963         text       *location = PG_GETARG_TEXT_P(0);
8964         char       *locationstr;
8965         unsigned int uxlogid;
8966         unsigned int uxrecoff;
8967         uint32          xlogid;
8968         uint32          xlogseg;
8969         XLogRecPtr      locationpoint;
8970         char            xlogfilename[MAXFNAMELEN];
8971
8972         if (RecoveryInProgress())
8973                 ereport(ERROR,
8974                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
8975                                  errmsg("recovery is in progress"),
8976                  errhint("pg_xlogfile_name() cannot be executed during recovery.")));
8977
8978         locationstr = text_to_cstring(location);
8979
8980         if (sscanf(locationstr, "%X/%X", &uxlogid, &uxrecoff) != 2)
8981                 ereport(ERROR,
8982                                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
8983                                  errmsg("could not parse transaction log location \"%s\"",
8984                                                 locationstr)));
8985
8986         locationpoint.xlogid = uxlogid;
8987         locationpoint.xrecoff = uxrecoff;
8988
8989         XLByteToPrevSeg(locationpoint, xlogid, xlogseg);
8990         XLogFileName(xlogfilename, ThisTimeLineID, xlogid, xlogseg);
8991
8992         PG_RETURN_TEXT_P(cstring_to_text(xlogfilename));
8993 }
8994
8995 /*
8996  * read_backup_label: check to see if a backup_label file is present
8997  *
8998  * If we see a backup_label during recovery, we assume that we are recovering
8999  * from a backup dump file, and we therefore roll forward from the checkpoint
9000  * identified by the label file, NOT what pg_control says.      This avoids the
9001  * problem that pg_control might have been archived one or more checkpoints
9002  * later than the start of the dump, and so if we rely on it as the start
9003  * point, we will fail to restore a consistent database state.
9004  *
9005  * Returns TRUE if a backup_label was found (and fills the checkpoint
9006  * location and its REDO location into *checkPointLoc and RedoStartLSN,
9007  * respectively); returns FALSE if not.
9008  */
9009 static bool
9010 read_backup_label(XLogRecPtr *checkPointLoc)
9011 {
9012         char            startxlogfilename[MAXFNAMELEN];
9013         TimeLineID      tli;
9014         FILE       *lfp;
9015         char            ch;
9016
9017         /*
9018          * See if label file is present
9019          */
9020         lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
9021         if (!lfp)
9022         {
9023                 if (errno != ENOENT)
9024                         ereport(FATAL,
9025                                         (errcode_for_file_access(),
9026                                          errmsg("could not read file \"%s\": %m",
9027                                                         BACKUP_LABEL_FILE)));
9028                 return false;                   /* it's not there, all is fine */
9029         }
9030
9031         /*
9032          * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
9033          * is pretty crude, but we are not expecting any variability in the file
9034          * format).
9035          */
9036         if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
9037                            &RedoStartLSN.xlogid, &RedoStartLSN.xrecoff, &tli,
9038                            startxlogfilename, &ch) != 5 || ch != '\n')
9039                 ereport(FATAL,
9040                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9041                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
9042         if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
9043                            &checkPointLoc->xlogid, &checkPointLoc->xrecoff,
9044                            &ch) != 3 || ch != '\n')
9045                 ereport(FATAL,
9046                                 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
9047                                  errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
9048         if (ferror(lfp) || FreeFile(lfp))
9049                 ereport(FATAL,
9050                                 (errcode_for_file_access(),
9051                                  errmsg("could not read file \"%s\": %m",
9052                                                 BACKUP_LABEL_FILE)));
9053
9054         return true;
9055 }
9056
9057 /*
9058  * Error context callback for errors occurring during rm_redo().
9059  */
9060 static void
9061 rm_redo_error_callback(void *arg)
9062 {
9063         XLogRecord *record = (XLogRecord *) arg;
9064         StringInfoData buf;
9065
9066         initStringInfo(&buf);
9067         RmgrTable[record->xl_rmid].rm_desc(&buf,
9068                                                                            record->xl_info,
9069                                                                            XLogRecGetData(record));
9070
9071         /* don't bother emitting empty description */
9072         if (buf.len > 0)
9073                 errcontext("xlog redo %s", buf.data);
9074
9075         pfree(buf.data);
9076 }
9077
9078 /*
9079  * BackupInProgress: check if online backup mode is active
9080  *
9081  * This is done by checking for existence of the "backup_label" file.
9082  */
9083 bool
9084 BackupInProgress(void)
9085 {
9086         struct stat stat_buf;
9087
9088         return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
9089 }
9090
9091 /*
9092  * CancelBackup: rename the "backup_label" file to cancel backup mode
9093  *
9094  * If the "backup_label" file exists, it will be renamed to "backup_label.old".
9095  * Note that this will render an online backup in progress useless.
9096  * To correctly finish an online backup, pg_stop_backup must be called.
9097  */
9098 void
9099 CancelBackup(void)
9100 {
9101         struct stat stat_buf;
9102
9103         /* if the file is not there, return */
9104         if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
9105                 return;
9106
9107         /* remove leftover file from previously cancelled backup if it exists */
9108         unlink(BACKUP_LABEL_OLD);
9109
9110         if (rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD) == 0)
9111         {
9112                 ereport(LOG,
9113                                 (errmsg("online backup mode cancelled"),
9114                                  errdetail("\"%s\" was renamed to \"%s\".",
9115                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
9116         }
9117         else
9118         {
9119                 ereport(WARNING,
9120                                 (errcode_for_file_access(),
9121                                  errmsg("online backup mode was not cancelled"),
9122                                  errdetail("Could not rename \"%s\" to \"%s\": %m.",
9123                                                    BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
9124         }
9125 }
9126
9127 /* ------------------------------------------------------
9128  *      Startup Process main entry point and signal handlers
9129  * ------------------------------------------------------
9130  */
9131
9132 /*
9133  * startupproc_quickdie() occurs when signalled SIGQUIT by the postmaster.
9134  *
9135  * Some backend has bought the farm,
9136  * so we need to stop what we're doing and exit.
9137  */
9138 static void
9139 startupproc_quickdie(SIGNAL_ARGS)
9140 {
9141         PG_SETMASK(&BlockSig);
9142
9143         /*
9144          * We DO NOT want to run proc_exit() callbacks -- we're here because
9145          * shared memory may be corrupted, so we don't want to try to clean up our
9146          * transaction.  Just nail the windows shut and get out of town.  Now that
9147          * there's an atexit callback to prevent third-party code from breaking
9148          * things by calling exit() directly, we have to reset the callbacks
9149          * explicitly to make this work as intended.
9150          */
9151         on_exit_reset();
9152
9153         /*
9154          * Note we do exit(2) not exit(0).      This is to force the postmaster into a
9155          * system reset cycle if some idiot DBA sends a manual SIGQUIT to a random
9156          * backend.  This is necessary precisely because we don't clean up our
9157          * shared memory state.  (The "dead man switch" mechanism in pmsignal.c
9158          * should ensure the postmaster sees this as a crash, too, but no harm in
9159          * being doubly sure.)
9160          */
9161         exit(2);
9162 }
9163
9164
9165 /* SIGHUP: set flag to re-read config file at next convenient time */
9166 static void
9167 StartupProcSigHupHandler(SIGNAL_ARGS)
9168 {
9169         got_SIGHUP = true;
9170 }
9171
9172 /* SIGTERM: set flag to abort redo and exit */
9173 static void
9174 StartupProcShutdownHandler(SIGNAL_ARGS)
9175 {
9176         if (in_restore_command)
9177                 proc_exit(1);
9178         else
9179                 shutdown_requested = true;
9180 }
9181
9182 /* Handle SIGHUP and SIGTERM signals of startup process */
9183 void
9184 HandleStartupProcInterrupts(void)
9185 {
9186         /*
9187          * Check if we were requested to re-read config file.
9188          */
9189         if (got_SIGHUP)
9190         {
9191                 got_SIGHUP = false;
9192                 ProcessConfigFile(PGC_SIGHUP);
9193         }
9194
9195         /*
9196          * Check if we were requested to exit without finishing recovery.
9197          */
9198         if (shutdown_requested)
9199                 proc_exit(1);
9200
9201         /*
9202          * Emergency bailout if postmaster has died.  This is to avoid the
9203          * necessity for manual cleanup of all postmaster children.
9204          */
9205         if (IsUnderPostmaster && !PostmasterIsAlive(true))
9206                 exit(1);
9207 }
9208
9209 /* Main entry point for startup process */
9210 void
9211 StartupProcessMain(void)
9212 {
9213         /*
9214          * If possible, make this process a group leader, so that the postmaster
9215          * can signal any child processes too.
9216          */
9217 #ifdef HAVE_SETSID
9218         if (setsid() < 0)
9219                 elog(FATAL, "setsid() failed: %m");
9220 #endif
9221
9222         /*
9223          * Properly accept or ignore signals the postmaster might send us.
9224          *
9225          * Note: ideally we'd not enable handle_standby_sig_alarm unless actually
9226          * doing hot standby, but we don't know that yet.  Rely on it to not do
9227          * anything if it shouldn't.
9228          */
9229         pqsignal(SIGHUP, StartupProcSigHupHandler); /* reload config file */
9230         pqsignal(SIGINT, SIG_IGN);      /* ignore query cancel */
9231         pqsignal(SIGTERM, StartupProcShutdownHandler);          /* request shutdown */
9232         pqsignal(SIGQUIT, startupproc_quickdie);        /* hard crash time */
9233         if (EnableHotStandby)
9234                 pqsignal(SIGALRM, handle_standby_sig_alarm);    /* ignored unless
9235                                                                                                                  * InHotStandby */
9236         else
9237                 pqsignal(SIGALRM, SIG_IGN);
9238         pqsignal(SIGPIPE, SIG_IGN);
9239         pqsignal(SIGUSR1, SIG_IGN);
9240         pqsignal(SIGUSR2, SIG_IGN);
9241
9242         /*
9243          * Reset some signals that are accepted by postmaster but not here
9244          */
9245         pqsignal(SIGCHLD, SIG_DFL);
9246         pqsignal(SIGTTIN, SIG_DFL);
9247         pqsignal(SIGTTOU, SIG_DFL);
9248         pqsignal(SIGCONT, SIG_DFL);
9249         pqsignal(SIGWINCH, SIG_DFL);
9250
9251         /*
9252          * Unblock signals (they were blocked when the postmaster forked us)
9253          */
9254         PG_SETMASK(&UnBlockSig);
9255
9256         StartupXLOG();
9257
9258         /*
9259          * Exit normally. Exit code 0 tells postmaster that we completed recovery
9260          * successfully.
9261          */
9262         proc_exit(0);
9263 }
9264
9265 /*
9266  * Read the XLOG page containing RecPtr into readBuf (if not read already).
9267  * Returns true if the page is read successfully.
9268  *
9269  * This is responsible for restoring files from archive as needed, as well
9270  * as for waiting for the requested WAL record to arrive in standby mode.
9271  *
9272  * 'emode' specifies the log level used for reporting "file not found" or
9273  * "end of WAL" situations in archive recovery, or in standby mode when a
9274  * trigger file is found. If set to WARNING or below, XLogPageRead() returns
9275  * false in those situations, on higher log levels the ereport() won't
9276  * return.
9277  *
9278  * In standby mode, if after a successful return of XLogPageRead() the
9279  * caller finds the record it's interested in to be broken, it should
9280  * ereport the error with the level determined by
9281  * emode_for_corrupt_record(), and then set "failedSources |= readSource"
9282  * and call XLogPageRead() again with the same arguments. This lets
9283  * XLogPageRead() try fetching the record from another source, or
9284  * sleep and retry.
9285  */
9286 static bool
9287 XLogPageRead(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt,
9288                          bool randAccess)
9289 {
9290         static XLogRecPtr receivedUpto = {0, 0};
9291         bool            switched_segment = false;
9292         uint32          targetPageOff;
9293         uint32          targetRecOff;
9294         uint32          targetId;
9295         uint32          targetSeg;
9296         static pg_time_t last_fail_time = 0;
9297
9298         XLByteToSeg(*RecPtr, targetId, targetSeg);
9299         targetPageOff = ((RecPtr->xrecoff % XLogSegSize) / XLOG_BLCKSZ) * XLOG_BLCKSZ;
9300         targetRecOff = RecPtr->xrecoff % XLOG_BLCKSZ;
9301
9302         /* Fast exit if we have read the record in the current buffer already */
9303         if (failedSources == 0 && targetId == readId && targetSeg == readSeg &&
9304                 targetPageOff == readOff && targetRecOff < readLen)
9305                 return true;
9306
9307         /*
9308          * See if we need to switch to a new segment because the requested record
9309          * is not in the currently open one.
9310          */
9311         if (readFile >= 0 && !XLByteInSeg(*RecPtr, readId, readSeg))
9312         {
9313                 /*
9314                  * Signal bgwriter to start a restartpoint if we've replayed too much
9315                  * xlog since the last one.
9316                  */
9317                 if (StandbyMode && bgwriterLaunched)
9318                 {
9319                         if (XLogCheckpointNeeded(readId, readSeg))
9320                         {
9321                                 (void) GetRedoRecPtr();
9322                                 if (XLogCheckpointNeeded(readId, readSeg))
9323                                         RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
9324                         }
9325                 }
9326
9327                 close(readFile);
9328                 readFile = -1;
9329                 readSource = 0;
9330         }
9331
9332         XLByteToSeg(*RecPtr, readId, readSeg);
9333
9334 retry:
9335         /* See if we need to retrieve more data */
9336         if (readFile < 0 ||
9337                 (readSource == XLOG_FROM_STREAM && !XLByteLT(*RecPtr, receivedUpto)))
9338         {
9339                 if (StandbyMode)
9340                 {
9341                         /*
9342                          * In standby mode, wait for the requested record to become
9343                          * available, either via restore_command succeeding to restore the
9344                          * segment, or via walreceiver having streamed the record.
9345                          */
9346                         for (;;)
9347                         {
9348                                 if (WalRcvInProgress())
9349                                 {
9350                                         bool            havedata;
9351
9352                                         /*
9353                                          * If we find an invalid record in the WAL streamed from
9354                                          * master, something is seriously wrong. There's little
9355                                          * chance that the problem will just go away, but PANIC is
9356                                          * not good for availability either, especially in hot
9357                                          * standby mode. Disconnect, and retry from
9358                                          * archive/pg_xlog again. The WAL in the archive should be
9359                                          * identical to what was streamed, so it's unlikely that
9360                                          * it helps, but one can hope...
9361                                          */
9362                                         if (failedSources & XLOG_FROM_STREAM)
9363                                         {
9364                                                 ShutdownWalRcv();
9365                                                 continue;
9366                                         }
9367
9368                                         /*
9369                                          * Walreceiver is active, so see if new data has arrived.
9370                                          *
9371                                          * We only advance XLogReceiptTime when we obtain fresh
9372                                          * WAL from walreceiver and observe that we had already
9373                                          * processed everything before the most recent "chunk"
9374                                          * that it flushed to disk.  In steady state where we are
9375                                          * keeping up with the incoming data, XLogReceiptTime will
9376                                          * be updated on each cycle.  When we are behind,
9377                                          * XLogReceiptTime will not advance, so the grace time
9378                                          * alloted to conflicting queries will decrease.
9379                                          */
9380                                         if (XLByteLT(*RecPtr, receivedUpto))
9381                                                 havedata = true;
9382                                         else
9383                                         {
9384                                                 XLogRecPtr      latestChunkStart;
9385
9386                                                 receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart);
9387                                                 if (XLByteLT(*RecPtr, receivedUpto))
9388                                                 {
9389                                                         havedata = true;
9390                                                         if (!XLByteLT(*RecPtr, latestChunkStart))
9391                                                                 XLogReceiptTime = GetCurrentTimestamp();
9392                                                 }
9393                                                 else
9394                                                         havedata = false;
9395                                         }
9396                                         if (havedata)
9397                                         {
9398                                                 /*
9399                                                  * Great, streamed far enough. Open the file if it's
9400                                                  * not open already.  Use XLOG_FROM_STREAM so that
9401                                                  * source info is set correctly and XLogReceiptTime
9402                                                  * isn't changed.
9403                                                  */
9404                                                 if (readFile < 0)
9405                                                 {
9406                                                         readFile =
9407                                                                 XLogFileRead(readId, readSeg, PANIC,
9408                                                                                          recoveryTargetTLI,
9409                                                                                          XLOG_FROM_STREAM, false);
9410                                                         Assert(readFile >= 0);
9411                                                         switched_segment = true;
9412                                                 }
9413                                                 else
9414                                                 {
9415                                                         /* just make sure source info is correct... */
9416                                                         readSource = XLOG_FROM_STREAM;
9417                                                         XLogReceiptSource = XLOG_FROM_STREAM;
9418                                                 }
9419                                                 break;
9420                                         }
9421
9422                                         /*
9423                                          * Data not here yet, so check for trigger then sleep.
9424                                          */
9425                                         if (CheckForStandbyTrigger())
9426                                                 goto triggered;
9427
9428                                         /*
9429                                          * When streaming is active, we want to react quickly when
9430                                          * the next WAL record arrives, so sleep only a bit.
9431                                          */
9432                                         pg_usleep(100000L); /* 100ms */
9433                                 }
9434                                 else
9435                                 {
9436                                         int                     sources;
9437                                         pg_time_t       now;
9438
9439                                         /*
9440                                          * Until walreceiver manages to reconnect, poll the
9441                                          * archive.
9442                                          */
9443                                         if (readFile >= 0)
9444                                         {
9445                                                 close(readFile);
9446                                                 readFile = -1;
9447                                         }
9448                                         /* Reset curFileTLI if random fetch. */
9449                                         if (randAccess)
9450                                                 curFileTLI = 0;
9451
9452                                         /*
9453                                          * Try to restore the file from archive, or read an
9454                                          * existing file from pg_xlog.
9455                                          */
9456                                         sources = XLOG_FROM_ARCHIVE | XLOG_FROM_PG_XLOG;
9457                                         if (!(sources & ~failedSources))
9458                                         {
9459                                                 /*
9460                                                  * We've exhausted all options for retrieving the
9461                                                  * file. Retry ...
9462                                                  */
9463                                                 failedSources = 0;
9464
9465                                                 /*
9466                                                  * ... but sleep first if it hasn't been long since
9467                                                  * last attempt.
9468                                                  */
9469                                                 now = (pg_time_t) time(NULL);
9470                                                 if ((now - last_fail_time) < 5)
9471                                                 {
9472                                                         pg_usleep(1000000L * (5 - (now - last_fail_time)));
9473                                                         now = (pg_time_t) time(NULL);
9474                                                 }
9475                                                 last_fail_time = now;
9476
9477                                                 /*
9478                                                  * If primary_conninfo is set, launch walreceiver to
9479                                                  * try to stream the missing WAL, before retrying to
9480                                                  * restore from archive/pg_xlog.
9481                                                  *
9482                                                  * If fetching_ckpt is TRUE, RecPtr points to the
9483                                                  * initial checkpoint location. In that case, we use
9484                                                  * RedoStartLSN as the streaming start position
9485                                                  * instead of RecPtr, so that when we later jump
9486                                                  * backwards to start redo at RedoStartLSN, we will
9487                                                  * have the logs streamed already.
9488                                                  */
9489                                                 if (PrimaryConnInfo)
9490                                                 {
9491                                                         RequestXLogStreaming(
9492                                                                           fetching_ckpt ? RedoStartLSN : *RecPtr,
9493                                                                                                  PrimaryConnInfo);
9494                                                         continue;
9495                                                 }
9496                                         }
9497                                         /* Don't try to read from a source that just failed */
9498                                         sources &= ~failedSources;
9499                                         readFile = XLogFileReadAnyTLI(readId, readSeg, DEBUG2,
9500                                                                                                   sources);
9501                                         switched_segment = true;
9502                                         if (readFile >= 0)
9503                                                 break;
9504
9505                                         /*
9506                                          * Nope, not found in archive and/or pg_xlog.
9507                                          */
9508                                         failedSources |= sources;
9509
9510                                         /*
9511                                          * Check to see if the trigger file exists. Note that we
9512                                          * do this only after failure, so when you create the
9513                                          * trigger file, we still finish replaying as much as we
9514                                          * can from archive and pg_xlog before failover.
9515                                          */
9516                                         if (CheckForStandbyTrigger())
9517                                                 goto triggered;
9518                                 }
9519
9520                                 /*
9521                                  * This possibly-long loop needs to handle interrupts of
9522                                  * startup process.
9523                                  */
9524                                 HandleStartupProcInterrupts();
9525                         }
9526                 }
9527                 else
9528                 {
9529                         /* In archive or crash recovery. */
9530                         if (readFile < 0)
9531                         {
9532                                 int                     sources;
9533
9534                                 /* Reset curFileTLI if random fetch. */
9535                                 if (randAccess)
9536                                         curFileTLI = 0;
9537
9538                                 sources = XLOG_FROM_PG_XLOG;
9539                                 if (InArchiveRecovery)
9540                                         sources |= XLOG_FROM_ARCHIVE;
9541
9542                                 readFile = XLogFileReadAnyTLI(readId, readSeg, emode,
9543                                                                                           sources);
9544                                 switched_segment = true;
9545                                 if (readFile < 0)
9546                                         return false;
9547                         }
9548                 }
9549         }
9550
9551         /*
9552          * At this point, we have the right segment open and if we're streaming we
9553          * know the requested record is in it.
9554          */
9555         Assert(readFile != -1);
9556
9557         /*
9558          * If the current segment is being streamed from master, calculate how
9559          * much of the current page we have received already. We know the
9560          * requested record has been received, but this is for the benefit of
9561          * future calls, to allow quick exit at the top of this function.
9562          */
9563         if (readSource == XLOG_FROM_STREAM)
9564         {
9565                 if (RecPtr->xlogid != receivedUpto.xlogid ||
9566                         (RecPtr->xrecoff / XLOG_BLCKSZ) != (receivedUpto.xrecoff / XLOG_BLCKSZ))
9567                 {
9568                         readLen = XLOG_BLCKSZ;
9569                 }
9570                 else
9571                         readLen = receivedUpto.xrecoff % XLogSegSize - targetPageOff;
9572         }
9573         else
9574                 readLen = XLOG_BLCKSZ;
9575
9576         if (switched_segment && targetPageOff != 0)
9577         {
9578                 /*
9579                  * Whenever switching to a new WAL segment, we read the first page of
9580                  * the file and validate its header, even if that's not where the
9581                  * target record is.  This is so that we can check the additional
9582                  * identification info that is present in the first page's "long"
9583                  * header.
9584                  */
9585                 readOff = 0;
9586                 if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
9587                 {
9588                         ereport(emode_for_corrupt_record(emode, *RecPtr),
9589                                         (errcode_for_file_access(),
9590                                          errmsg("could not read from log file %u, segment %u, offset %u: %m",
9591                                                         readId, readSeg, readOff)));
9592                         goto next_record_is_invalid;
9593                 }
9594                 if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
9595                         goto next_record_is_invalid;
9596         }
9597
9598         /* Read the requested page */
9599         readOff = targetPageOff;
9600         if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
9601         {
9602                 ereport(emode_for_corrupt_record(emode, *RecPtr),
9603                                 (errcode_for_file_access(),
9604                  errmsg("could not seek in log file %u, segment %u to offset %u: %m",
9605                                 readId, readSeg, readOff)));
9606                 goto next_record_is_invalid;
9607         }
9608         if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
9609         {
9610                 ereport(emode_for_corrupt_record(emode, *RecPtr),
9611                                 (errcode_for_file_access(),
9612                  errmsg("could not read from log file %u, segment %u, offset %u: %m",
9613                                 readId, readSeg, readOff)));
9614                 goto next_record_is_invalid;
9615         }
9616         if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode))
9617                 goto next_record_is_invalid;
9618
9619         Assert(targetId == readId);
9620         Assert(targetSeg == readSeg);
9621         Assert(targetPageOff == readOff);
9622         Assert(targetRecOff < readLen);
9623
9624         return true;
9625
9626 next_record_is_invalid:
9627         failedSources |= readSource;
9628
9629         if (readFile >= 0)
9630                 close(readFile);
9631         readFile = -1;
9632         readLen = 0;
9633         readSource = 0;
9634
9635         /* In standby-mode, keep trying */
9636         if (StandbyMode)
9637                 goto retry;
9638         else
9639                 return false;
9640
9641 triggered:
9642         if (readFile >= 0)
9643                 close(readFile);
9644         readFile = -1;
9645         readLen = 0;
9646         readSource = 0;
9647
9648         return false;
9649 }
9650
9651 /*
9652  * Determine what log level should be used to report a corrupt WAL record
9653  * in the current WAL page, previously read by XLogPageRead().
9654  *
9655  * 'emode' is the error mode that would be used to report a file-not-found
9656  * or legitimate end-of-WAL situation.   Generally, we use it as-is, but if
9657  * we're retrying the exact same record that we've tried previously, only
9658  * complain the first time to keep the noise down.      However, we only do when
9659  * reading from pg_xlog, because we don't expect any invalid records in archive
9660  * or in records streamed from master. Files in the archive should be complete,
9661  * and we should never hit the end of WAL because we stop and wait for more WAL
9662  * to arrive before replaying it.
9663  *
9664  * NOTE: This function remembers the RecPtr value it was last called with,
9665  * to suppress repeated messages about the same record. Only call this when
9666  * you are about to ereport(), or you might cause a later message to be
9667  * erroneously suppressed.
9668  */
9669 static int
9670 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
9671 {
9672         static XLogRecPtr lastComplaint = {0, 0};
9673
9674         if (readSource == XLOG_FROM_PG_XLOG && emode == LOG)
9675         {
9676                 if (XLByteEQ(RecPtr, lastComplaint))
9677                         emode = DEBUG1;
9678                 else
9679                         lastComplaint = RecPtr;
9680         }
9681         return emode;
9682 }
9683
9684 /*
9685  * Check to see if the trigger file exists. If it does, request postmaster
9686  * to shut down walreceiver, wait for it to exit, remove the trigger
9687  * file, and return true.
9688  */
9689 static bool
9690 CheckForStandbyTrigger(void)
9691 {
9692         struct stat stat_buf;
9693
9694         if (TriggerFile == NULL)
9695                 return false;
9696
9697         if (stat(TriggerFile, &stat_buf) == 0)
9698         {
9699                 ereport(LOG,
9700                                 (errmsg("trigger file found: %s", TriggerFile)));
9701                 ShutdownWalRcv();
9702                 unlink(TriggerFile);
9703                 return true;
9704         }
9705         return false;
9706 }