From 07e8b6aabcca3ad9a67681694d955f607e29ce7b Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Thu, 17 Jun 2010 16:41:25 +0000 Subject: [PATCH] Don't allow walsender to send WAL data until it's been safely fsync'd on the master. Otherwise a subsequent crash could cause the master to lose WAL that has already been applied on the slave, resulting in the slave being out of sync and soon corrupt. Per recent discussion and an example from Robert Haas. Fujii Masao --- src/backend/access/transam/xlog.c | 9 +++++---- src/backend/replication/walsender.c | 24 +++++++++++++----------- src/include/access/xlog.h | 4 ++-- 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 5787b3d164..ab474c35b0 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.424 2010/06/14 06:04:21 heikki Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.425 2010/06/17 16:41:25 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -6803,17 +6803,18 @@ GetInsertRecPtr(void) } /* - * GetWriteRecPtr -- Returns the current write position. + * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL + * position known to be fsync'd to disk. */ XLogRecPtr -GetWriteRecPtr(void) +GetFlushRecPtr(void) { /* use volatile pointer to prevent code rearrangement */ volatile XLogCtlData *xlogctl = XLogCtl; XLogRecPtr recptr; SpinLockAcquire(&xlogctl->info_lck); - recptr = xlogctl->LogwrtResult.Write; + recptr = xlogctl->LogwrtResult.Flush; SpinLockRelease(&xlogctl->info_lck); return recptr; diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c index 7422f76251..298737c89c 100644 --- a/src/backend/replication/walsender.c +++ b/src/backend/replication/walsender.c @@ -3,8 +3,9 @@ * walsender.c * * The WAL sender process (walsender) is new as of Postgres 9.0. It takes - * charge of XLOG streaming sender in the primary server. At first, it is - * started by the postmaster when the walreceiver in the standby server + * care of sending XLOG from the primary server to a single recipient. + * (Note that there can be more than one walsender process concurrently.) + * It is started by the postmaster when the walreceiver of a standby server * connects to the primary server and requests XLOG streaming replication. * It attempts to keep reading XLOG records from the disk and sending them * to the standby server, as long as the connection is alive (i.e., like @@ -23,13 +24,11 @@ * This instruct walsender to send any outstanding WAL, including the * shutdown checkpoint record, and then exit. * - * Note that there can be more than one walsender process concurrently. * * Portions Copyright (c) 2010-2010, PostgreSQL Global Development Group * - * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/replication/walsender.c,v 1.26 2010/06/03 23:00:14 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/replication/walsender.c,v 1.27 2010/06/17 16:41:25 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -641,7 +640,7 @@ XLogRead(char *buf, XLogRecPtr recptr, Size nbytes) } /* - * Read up to MAX_SEND_SIZE bytes of WAL that's been written to disk, + * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk, * but not yet sent to the client, and send it. * * msgbuf is a work area in which the output message is constructed. It's @@ -663,11 +662,14 @@ XLogSend(char *msgbuf, bool *caughtup) WalDataMessageHeader msghdr; /* - * Attempt to send all data that's already been written out from WAL - * buffers (note it might not yet be fsync'd to disk). We cannot go - * further than that given the current implementation of XLogRead(). + * Attempt to send all data that's already been written out and fsync'd + * to disk. We cannot go further than what's been written out given the + * current implementation of XLogRead(). And in any case it's unsafe to + * send WAL that is not securely down to disk on the master: if the master + * subsequently crashes and restarts, slaves must not have applied any WAL + * that gets lost on the master. */ - SendRqstPtr = GetWriteRecPtr(); + SendRqstPtr = GetFlushRecPtr(); /* Quick exit if nothing to do */ if (XLByteLE(SendRqstPtr, sentPtr)) @@ -679,7 +681,7 @@ XLogSend(char *msgbuf, bool *caughtup) /* * Figure out how much to send in one message. If there's no more than * MAX_SEND_SIZE bytes to send, send everything. Otherwise send - * MAX_SEND_SIZE bytes, but round to logfile or page boundary. + * MAX_SEND_SIZE bytes, but round back to logfile or page boundary. * * The rounding is not only for performance reasons. Walreceiver * relies on the fact that we never split a WAL record across two diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 1a8f455dba..cbadd7f91f 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -6,7 +6,7 @@ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.112 2010/06/10 07:49:23 heikki Exp $ + * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.113 2010/06/17 16:41:25 tgl Exp $ */ #ifndef XLOG_H #define XLOG_H @@ -294,7 +294,7 @@ extern bool CreateRestartPoint(int flags); extern void XLogPutNextOid(Oid nextOid); extern XLogRecPtr GetRedoRecPtr(void); extern XLogRecPtr GetInsertRecPtr(void); -extern XLogRecPtr GetWriteRecPtr(void); +extern XLogRecPtr GetFlushRecPtr(void); extern void GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch); extern TimeLineID GetRecoveryTargetTLI(void); -- 2.11.0