OSDN Git Service

Speed up CREATE DATABASE by deferring the fsyncs until after copying
author: Greg Stark <stark@mit.edu>
Mon, 15 Feb 2010 00:50:57 +0000 (00:50 +0000)
committer: Greg Stark <stark@mit.edu>
Mon, 15 Feb 2010 00:50:57 +0000 (00:50 +0000)
all the data and using posix_fadvise to nudge the OS into flushing it
earlier. This also hopefully makes CREATE DATABASE avoid spamming the
cache.

Tests show a big speedup on Linux at least on some filesystems.

Idea and patch from Andres Freund.

src/backend/storage/file/fd.c
src/include/storage/fd.h
src/port/copydir.c

index ec27859..adea849 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.153 2010/01/12 02:42:52 momjian Exp $
+ *       $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.154 2010/02/15 00:50:57 stark Exp $
  *
  * NOTES:
  *
@@ -320,6 +320,22 @@ pg_fdatasync(int fd)
 }
 
 /*
+ * pg_flush_data --- advise OS that the data described won't be needed soon
+ *
+ * Hands the byte range that starts at "offset" and covers "amount" bytes
+ * of the open file descriptor "fd" to posix_fadvise() with the
+ * POSIX_FADV_DONTNEED advice, and returns that call's result unchanged
+ * (per POSIX that is 0 on success or an error number on failure, not the
+ * usual -1/errno convention).
+ *
+ * Not all platforms have posix_fadvise; treat as noop if not available.
+ * That is, when either USE_POSIX_FADVISE or POSIX_FADV_DONTNEED is not
+ * defined the function does nothing and always returns 0.
+ */
+int
+pg_flush_data(int fd, off_t offset, off_t amount)
+{
+#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
+       return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED);
+#else
+       return 0;
+#endif
+}
+
+
+/*
  * InitFileAccess --- initialize this module during backend startup
  *
  * This is called during either normal or standalone backend start.
index 20f6091..9dd240e 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.66 2010/01/02 16:58:08 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.67 2010/02/15 00:50:57 stark Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -98,6 +98,7 @@ extern int    pg_fsync(int fd);
 extern int     pg_fsync_no_writethrough(int fd);
 extern int     pg_fsync_writethrough(int fd);
 extern int     pg_fdatasync(int fd);
+extern int  pg_flush_data(int fd, off_t offset, off_t amount);
 
 /* Filename components for OpenTemporaryFile */
 #define PG_TEMP_FILES_DIR "pgsql_tmp"
index 0bf764e..a52b1f7 100644 (file)
@@ -11,7 +11,7 @@
  *     as a service.
  *
  * IDENTIFICATION
- *       $PostgreSQL: pgsql/src/port/copydir.c,v 1.25 2010/02/14 17:50:52 stark Exp $
+ *       $PostgreSQL: pgsql/src/port/copydir.c,v 1.26 2010/02/15 00:50:57 stark Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -37,6 +37,7 @@
 
 
 static void copy_file(char *fromfile, char *tofile);
+static void fsync_fname(char *fname);
 
 
 /*
@@ -91,27 +92,32 @@ copydir(char *fromdir, char *todir, bool recurse)
                        copy_file(fromfile, tofile);
        }
 
-       FreeDir(xldir);
-
        /*
-        * fsync the directory to make sure not just the data but also the
-        * new directory file entries have reached the disk. While needed
-        * by most filesystems, the window got bigger with newer ones like
-        * ext4.
+        * Be paranoid here and fsync all files to ensure we catch problems.
         */
-       dirfd = BasicOpenFile(todir,
-                             O_RDONLY | PG_BINARY,
-                             S_IRUSR | S_IWUSR);
-       if(dirfd == -1)
-               ereport(ERROR,
-                       (errcode_for_file_access(),
-                        errmsg("could not open directory for fsync \"%s\": %m", todir)));
-
-       if(pg_fsync(dirfd) == -1)
+       if (xldir == NULL)
                ereport(ERROR,
                                (errcode_for_file_access(),
-                                errmsg("could not fsync directory \"%s\": %m", todir)));
-       close(dirfd);
+                                errmsg("could not open directory \"%s\": %m", fromdir)));
+
+       while ((xlde = ReadDir(xldir, fromdir)) != NULL)
+       {
+               if (strcmp(xlde->d_name, ".") == 0 ||
+                       strcmp(xlde->d_name, "..") == 0)
+                       continue;
+
+               snprintf(tofile, MAXPGPATH, "%s/%s", todir, xlde->d_name);
+               fsync_fname(tofile);
+       }
+       FreeDir(xldir);
+
+       /* It's important to fsync the destination directory itself as
+        * individual file fsyncs don't guarantee that the directory entry
+        * for the file is synced. Recent versions of ext4 have made the
+        * window much wider but it's been true for ext3 and other
+        * filesystems in the past.
+        */
+       fsync_fname(todir);
 }
 
 /*
@@ -124,6 +130,7 @@ copy_file(char *fromfile, char *tofile)
        int                     srcfd;
        int                     dstfd;
        int                     nbytes;
+       off_t           offset;
 
        /* Use palloc to ensure we get a maxaligned buffer */
 #define COPY_BUF_SIZE (8 * BLCKSZ)
@@ -149,7 +156,7 @@ copy_file(char *fromfile, char *tofile)
        /*
         * Do the data copying.
         */
-       for (;;)
+       for (offset=0; ; offset+=nbytes)
        {
                nbytes = read(srcfd, buffer, COPY_BUF_SIZE);
                if (nbytes < 0)
@@ -168,15 +175,14 @@ copy_file(char *fromfile, char *tofile)
                                        (errcode_for_file_access(),
                                         errmsg("could not write to file \"%s\": %m", tofile)));
                }
-       }
 
-       /*
-        * Be paranoid here to ensure we catch problems.
-        */
-       if (pg_fsync(dstfd) != 0)
-               ereport(ERROR,
-                               (errcode_for_file_access(),
-                                errmsg("could not fsync file \"%s\": %m", tofile)));
+               /*
+                * We fsync the files later but first flush them to avoid spamming
+                * the cache and hopefully get the kernel to start writing them
+                * out before the fsync comes.
+                */
+               pg_flush_data(dstfd, offset, nbytes);
+       }
 
        if (close(dstfd))
                ereport(ERROR,
@@ -187,3 +193,27 @@ copy_file(char *fromfile, char *tofile)
 
        pfree(buffer);
 }
+
+
+
+/*
+ * fsync a file
+ *
+ * Opens "fname" read-only with BasicOpenFile(), calls pg_fsync() on the
+ * resulting descriptor and then closes it.  A failure to open the path or
+ * to fsync it is reported through ereport() at ERROR level, with the path
+ * included in the message.  The return value of close() is not checked.
+ *
+ * copydir() calls this both for each copied file and for the destination
+ * directory itself, so "fname" may name a directory as well as a plain
+ * file.
+ */
+static void
+fsync_fname(char *fname)
+{
+       int     fd = BasicOpenFile(fname, 
+                                                  O_RDONLY | PG_BINARY,
+                                                  S_IRUSR | S_IWUSR);
+
+       if (fd < 0)
+               ereport(ERROR,
+                               (errcode_for_file_access(),
+                                errmsg("could not open file \"%s\": %m", fname)));
+
+       if (pg_fsync(fd) != 0)
+               ereport(ERROR,
+                               (errcode_for_file_access(),
+                                errmsg("could not fsync file \"%s\": %m", fname)));
+       close(fd);
+}