From ad458cfe81bcefd6d8bd17ff2e42c6599d441bd6 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 19 Feb 2010 10:51:04 +0000 Subject: [PATCH] Don't use O_DIRECT when writing WAL files if archiving or streaming is enabled. Bypassing the kernel cache is counter-productive in that case, because the archiver/walsender process will read from the WAL file soon after it's written, and if it's not cached the read will cause a physical read, eating I/O bandwidth available on the WAL drive. Also, walreceiver process does unaligned writes, so disable O_DIRECT in walreceiver process for that reason too. --- src/backend/access/transam/xlog.c | 32 ++++++++++++++++++++++++-------- src/backend/replication/walreceiver.c | 22 ++++++++++++++-------- src/include/access/xlogdefs.h | 15 ++++++--------- src/include/replication/walreceiver.h | 4 +++- 4 files changed, 47 insertions(+), 26 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 106d39b760..046d80fa95 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.376 2010/02/19 01:04:03 itagaki Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.377 2010/02/19 10:51:03 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -2686,13 +2686,10 @@ XLogFileClose(void) * WAL segment files will not be re-read in normal operation, so we advise * the OS to release any cached pages. But do not do so if WAL archiving * or streaming is active, because archiver and walsender process could use - * the cache to read the WAL segment. Also, don't bother with it if we - * are using O_DIRECT, since the kernel is presumably not caching in that - * case. + * the cache to read the WAL segment. 
*/ #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) - if (!XLogIsNeeded() && - (get_sync_bit(sync_method) & PG_O_DIRECT) == 0) + if (!XLogIsNeeded()) (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED); #endif @@ -7652,10 +7649,29 @@ xlog_outrec(StringInfo buf, XLogRecord *record) static int get_sync_bit(int method) { + int o_direct_flag = 0; + /* If fsync is disabled, never open in sync mode */ if (!enableFsync) return 0; + /* + * Optimize writes by bypassing kernel cache with O_DIRECT when using + * O_SYNC, O_DSYNC or O_FSYNC. But only if archiving and streaming are + * disabled, otherwise the archive command or walsender process will + * read the WAL soon after writing it, which is guaranteed to cause a + * physical read if we bypassed the kernel cache. We also skip the + * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the + * same reason. + * + * Never use O_DIRECT in walreceiver process for similar reasons; the WAL + * written by walreceiver is normally read by the startup process soon + * after it's written. Also, walreceiver performs unaligned writes, which + * don't work with O_DIRECT, so it is required for correctness too. 
+ */ + if (!XLogIsNeeded() && !am_walreceiver) + o_direct_flag = PG_O_DIRECT; + switch (method) { /* @@ -7670,11 +7686,11 @@ get_sync_bit(int method) return 0; #ifdef OPEN_SYNC_FLAG case SYNC_METHOD_OPEN: - return OPEN_SYNC_FLAG; + return OPEN_SYNC_FLAG | o_direct_flag; #endif #ifdef OPEN_DATASYNC_FLAG case SYNC_METHOD_OPEN_DSYNC: - return OPEN_DATASYNC_FLAG; + return OPEN_DATASYNC_FLAG | o_direct_flag; #endif default: /* can't happen (unless we are out of sync with option array) */ diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index 0e57611da4..3f82693dce 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -29,7 +29,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/replication/walreceiver.c,v 1.4 2010/02/17 04:19:39 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/replication/walreceiver.c,v 1.5 2010/02/19 10:51:04 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -50,6 +50,9 @@ #include "utils/ps_status.h" #include "utils/resowner.h" +/* Global variable to indicate if this process is a walreceiver process */ +bool am_walreceiver; + /* libpqreceiver hooks to these when loaded */ walrcv_connect_type walrcv_connect = NULL; walrcv_receive_type walrcv_receive = NULL; @@ -158,6 +161,8 @@ WalReceiverMain(void) /* use volatile pointer to prevent code rearrangement */ volatile WalRcvData *walrcv = WalRcv; + am_walreceiver = true; + /* * WalRcv should be set up already (if we are a backend, we inherit * this by fork() or EXEC_BACKEND mechanism from the postmaster). @@ -424,16 +429,18 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr) bool use_existent; /* - * XLOG segment files will be re-read in recovery operation soon, - * so we don't need to advise the OS to release any cache page. + * fsync() and close current file before we switch to next one. 
+ * We would otherwise have to reopen this file to fsync it later */ if (recvFile >= 0) { + XLogWalRcvFlush(); + /* - * fsync() before we switch to next file. We would otherwise - * have to reopen this file to fsync it later + * XLOG segment files will be re-read by recovery in startup + * process soon, so we don't advise the OS to release cache + * pages associated with the file like XLogFileClose() does. */ - XLogWalRcvFlush(); if (close(recvFile) != 0) ereport(PANIC, (errcode_for_file_access(), @@ -445,8 +452,7 @@ XLogWalRcvWrite(char *buf, Size nbytes, XLogRecPtr recptr) /* Create/use new log file */ XLByteToSeg(recptr, recvId, recvSeg); use_existent = true; - recvFile = XLogFileInit(recvId, recvSeg, - &use_existent, true); + recvFile = XLogFileInit(recvId, recvSeg, &use_existent, true); recvOff = 0; } diff --git a/src/include/access/xlogdefs.h b/src/include/access/xlogdefs.h index 8ecc3a21b1..0760b25930 100644 --- a/src/include/access/xlogdefs.h +++ b/src/include/access/xlogdefs.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/xlogdefs.h,v 1.25 2010/01/15 09:19:06 heikki Exp $ + * $PostgreSQL: pgsql/src/include/access/xlogdefs.h,v 1.26 2010/02/19 10:51:04 heikki Exp $ */ #ifndef XLOG_DEFS_H #define XLOG_DEFS_H @@ -106,23 +106,20 @@ typedef uint32 TimeLineID; * configure determined whether fdatasync() is. */ #if defined(O_SYNC) -#define BARE_OPEN_SYNC_FLAG O_SYNC +#define OPEN_SYNC_FLAG O_SYNC #elif defined(O_FSYNC) -#define BARE_OPEN_SYNC_FLAG O_FSYNC -#endif -#ifdef BARE_OPEN_SYNC_FLAG -#define OPEN_SYNC_FLAG (BARE_OPEN_SYNC_FLAG | PG_O_DIRECT) +#define OPEN_SYNC_FLAG O_FSYNC #endif #if defined(O_DSYNC) #if defined(OPEN_SYNC_FLAG) /* O_DSYNC is distinct? 
*/ -#if O_DSYNC != BARE_OPEN_SYNC_FLAG -#define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT) +#if O_DSYNC != OPEN_SYNC_FLAG +#define OPEN_DATASYNC_FLAG O_DSYNC #endif #else /* !defined(OPEN_SYNC_FLAG) */ /* Win32 only has O_DSYNC */ -#define OPEN_DATASYNC_FLAG (O_DSYNC | PG_O_DIRECT) +#define OPEN_DATASYNC_FLAG O_DSYNC #endif #endif diff --git a/src/include/replication/walreceiver.h b/src/include/replication/walreceiver.h index bf7ad41b06..56af60560e 100644 --- a/src/include/replication/walreceiver.h +++ b/src/include/replication/walreceiver.h @@ -5,7 +5,7 @@ * * Portions Copyright (c) 2010-2010, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/include/replication/walreceiver.h,v 1.6 2010/02/03 09:47:19 heikki Exp $ + * $PostgreSQL: pgsql/src/include/replication/walreceiver.h,v 1.7 2010/02/19 10:51:04 heikki Exp $ * *------------------------------------------------------------------------- */ @@ -15,6 +15,8 @@ #include "access/xlogdefs.h" #include "storage/spin.h" +extern bool am_walreceiver; + /* * MAXCONNINFO: maximum size of a connection string. * -- 2.11.0