OSDN Git Service

New file format for COPY BINARY, in accordance with pghackers discussions
author Tom Lane <tgl@sss.pgh.pa.us>
Wed, 3 Jan 2001 20:04:10 +0000 (20:04 +0000)
committer Tom Lane <tgl@sss.pgh.pa.us>
Wed, 3 Jan 2001 20:04:10 +0000 (20:04 +0000)
of early December 2000.  COPY BINARY is now TOAST-safe.

doc/src/sgml/ref/copy.sgml
src/backend/commands/copy.c

index 07d46eb..155fca5 100644 (file)
@@ -1,5 +1,5 @@
 <!--
-$Header: /cvsroot/pgsql/doc/src/sgml/ref/copy.sgml,v 1.18 2000/10/05 19:48:17 momjian Exp $
+$Header: /cvsroot/pgsql/doc/src/sgml/ref/copy.sgml,v 1.19 2001/01/03 20:04:09 tgl Exp $
 Postgres documentation
 -->
 
@@ -49,6 +49,7 @@ COPY [ BINARY ] <replaceable class="parameter">table</replaceable> [ WITH OIDS ]
        <para>
        Changes the behavior of field formatting, forcing all data to be
        stored or read in binary format rather than as text.
+       The DELIMITERS and WITH NULL options are irrelevant for binary format.
        </para>
       </listitem>
      </varlistentry>
@@ -66,7 +67,7 @@ COPY [ BINARY ] <replaceable class="parameter">table</replaceable> [ WITH OIDS ]
       <term>WITH OIDS</term>
       <listitem>
        <para>
-       Copies the internal unique object id (OID) for each row.
+       Specifies copying the internal unique object id (OID) for each row.
        </para>
       </listitem>
      </varlistentry>
@@ -84,7 +85,7 @@ COPY [ BINARY ] <replaceable class="parameter">table</replaceable> [ WITH OIDS ]
       <term><filename>stdin</filename></term>
       <listitem>
        <para>
-       Specifies that input comes from a pipe or terminal.
+       Specifies that input comes from the client application.
        </para>
       </listitem>
      </varlistentry>
@@ -93,7 +94,7 @@ COPY [ BINARY ] <replaceable class="parameter">table</replaceable> [ WITH OIDS ]
       <term><filename>stdout</filename></term>
       <listitem>
        <para>
-       Specifies that output goes to a pipe or terminal.
+       Specifies that output goes to the client application.
        </para>
       </listitem>
      </varlistentry>
@@ -102,16 +103,16 @@ COPY [ BINARY ] <replaceable class="parameter">table</replaceable> [ WITH OIDS ]
       <term><replaceable class="parameter">delimiter</replaceable></term>
       <listitem>
        <para>
-       A character that delimits the input or output fields.
+       The character that separates fields within each row (line) of the file.
        </para>
       </listitem>
      </varlistentry>
 
      <varlistentry>
-      <term><replaceable class="parameter">null print</replaceable></term>
+      <term><replaceable class="parameter">null string</replaceable></term>
       <listitem>
        <para>
-        A string to represent NULL values. The default is
+        The string that represents a NULL value. The default is
         <quote><literal>\N</literal></quote> (backslash-N).
        You might prefer an empty string, for example.
        </para>
@@ -166,7 +167,7 @@ ERROR: <replaceable>reason</replaceable>
  
  <refsect1 id="R1-SQL-COPY-1">
   <refsect1info>
-   <date>1998-09-08</date>
+   <date>2001-01-02</date>
   </refsect1info>
   <title>
    Description
@@ -176,17 +177,36 @@ ERROR: <replaceable>reason</replaceable>
    <productname>Postgres</productname> tables and
    standard file-system files.
 
+   <command>COPY TO</command> copies the entire contents of a table to
+   a file, while <command>COPY FROM</command> copies data from a file to a
+   table (appending the data to whatever is in the table already).
+  </para>
+
+  <para>
    <command>COPY</command> instructs
    the <productname>Postgres</productname> backend
-   to directly read from or write to a file. The file must be directly visible to
-   the backend and the name must be specified from the viewpoint of the backend.
-   If <filename>stdin</filename> or <filename>stdout</filename> are
+   to directly read from or write to a file. If a file name is specified,
+   the file must be accessible to the backend and the name must be specified
+   from the viewpoint of the backend.
+   If <filename>stdin</filename> or <filename>stdout</filename> is
    specified, data flows through the client frontend to  the backend.
-  </para>
+    
+    <tip>
+     <para>
+      Do not confuse <command>COPY</command> with the
+      <application>psql</application> instruction <command>\copy</command>.
+      <command>\copy</command> invokes <command>COPY FROM stdin</command> 
+      or <command>COPY TO stdout</command>, and then fetches/stores the data
+      in a file accessible to the <application>psql</application> client.
+      Thus, file accessibility and access rights depend on the client
+      rather than the backend when <command>\copy</command> is used.
+     </para>
+    </tip>
+   </para>
 
   <refsect2 id="R2-SQL-COPY-3">
    <refsect2info>
-    <date>1998-09-08</date>
+    <date>2001-01-02</date>
    </refsect2info>
    <title>
     Notes
@@ -194,16 +214,19 @@ ERROR: <replaceable>reason</replaceable>
    <para>
     The BINARY keyword will force all data to be
     stored/read as binary format rather than as text.  It is
-    somewhat faster than the normal copy command, but is not
-    generally portable, and the files generated are somewhat larger,
-    although this factor is highly dependent on the data itself.  
+    somewhat faster than the normal copy command, but a binary copy
+    file is not portable across machine architectures.
     </para>
+
     <para>
-    By default, a text copy uses a tab ("\t") character as a delimiter.
-    The delimiter may also be changed to any other single character
-    with the keyword phrase USING DELIMITERS.  Characters
+    By default, a text copy uses a tab ("\t") character as a delimiter
+    between fields.  The field delimiter may be changed to any other single
+    character with the keyword phrase USING DELIMITERS.  Characters
     in data fields which happen to match the delimiter character will
     be backslash quoted.
+    Note that the delimiter is always a single character.
+    If multiple characters are specified in the delimiter string,
+    only the first character is used.
    </para>
    
    <para>
@@ -217,67 +240,63 @@ ERROR: <replaceable>reason</replaceable>
    </para>
 
    <para>
-    The keyword phrase USING DELIMITERS specifies a single character
-    to be used for all delimiters between columns. If multiple characters
-    are specified in the delimiter string,  only the first character is
-    used.
-    
-    <tip>
-     <para>
-      Do not confuse <command>COPY</command> with the
-      <application>psql</application> instruction <command>\copy</command>.
-     </para>
-    </tip>
+    <command>COPY FROM</command> neither invokes rules nor acts on column
+    defaults.  It does invoke triggers and check constraints.
    </para>
 
    <para>
-    <command>COPY</command> neither invokes rules nor acts on column defaults.
-    It does invoke triggers, however.
-   </para>
-   <para>
     <command>COPY</command> stops operation at the first error.  This
     should not lead to problems in the event of
    a <command>COPY TO</command>, but the
-    target relation will, of course, be partially modified in a
-    <command>COPY TO</command>.
-    <command>VACUUM</command> should be used to clean up
-    after a failed copy.
-   </para>
-   <para>
-    Because the Postgres backend's current working directory
-    is not usually the same as the user's
-    working directory, the result of copying to a file
-    "<filename>foo</filename>" (without
-    additional path information) may yield unexpected results for the
-    naive user.  In this case, <filename>foo</filename>
-    will wind up in <filename>$PGDATA/foo</filename>.  In
-    general, the full pathname as it would appear to the backend server machine
-    should be used when specifying files to
-    be copied.
+    target relation will already have received earlier rows in a
+    <command>COPY FROM</command>.  These rows will not be visible or
+    accessible, but they still occupy disk space.  This may add up to a
+    considerable amount
+    of wasted disk space if the failure happened well into a large copy
+    operation.  You may wish to invoke <command>VACUUM</command> to recover
+    the wasted space.
    </para>
+
    <para>
-    Files used as arguments to <command>COPY</command>
-    must reside on or be
-    accessible to the database server machine by being either on
-    local disks or on a networked file system.
+    Files named in a <command>COPY</command> command are read or written
+    directly by the backend, not by the client application.  Therefore,
+    they must reside on or be accessible to the database server machine,
+    not the client.  They must be accessible to and readable or writable
+    by the Postgres user (the userid the backend runs as), not the client.
+    <command>COPY</command> naming a file is only allowed to database
+    superusers, since it allows writing on any file that the backend has
+    privileges to write on.
+    
+    <tip>
+     <para>
+      The
+      <application>psql</application> instruction <command>\copy</command>
+      reads or writes files on the client machine with the client's
+      permissions, so it is not restricted to superusers.
+     </para>
+    </tip>
    </para>
+
    <para>
-    When a TCP/IP connection from one machine to another is used, and a
-    target file is specified, the target file will be written on the
-    machine where the backend is running rather than the user's
-    machine. 
+    It is recommended that the filename used in <command>COPY</command>
+    always be specified as an absolute path.  This is enforced by the backend
+    in the case of <command>COPY TO</command>, but for <command>COPY
+    FROM</command> you do have the option of reading from a file specified
+    by a relative path.  The path will be interpreted relative to the
+    backend's working directory (somewhere below
+    <filename>$PGDATA</filename>), not the client's working directory.
    </para>
   </refsect2>
  </refsect1>
  
  <refsect1 id="R1-SQL-COPY-2">
   <refsect1info>
-   <date>1998-05-04</date>
+   <date>2001-01-02</date>
   </refsect1info>
   <title>File Formats</title>
   <refsect2>
    <refsect2info>
-    <date>1998-05-04</date>
+    <date>2001-01-02</date>
    </refsect2info>
    <title>Text Format</title>
    <para>
@@ -293,27 +312,34 @@ ERROR: <replaceable>reason</replaceable>
    <para>
     The actual format for each instance is
     <programlisting>
-&lt;attr1&gt;&lt;<replaceable class=parameter>separator</replaceable>&gt;&lt;attr2&gt;&lt;<replaceable class=parameter>separator</replaceable>&gt;...&lt;<replaceable class=parameter>separator</replaceable>&gt;&lt;attr<replaceable class="parameter">n</replaceable>&gt;&lt;newline&gt;.
+&lt;attr1&gt;&lt;<replaceable class=parameter>separator</replaceable>&gt;&lt;attr2&gt;&lt;<replaceable class=parameter>separator</replaceable>&gt;...&lt;<replaceable class=parameter>separator</replaceable>&gt;&lt;attr<replaceable class="parameter">n</replaceable>&gt;&lt;newline&gt;
     </programlisting>
-    The oid is placed on the beginning of the line
-    if WITH OIDS is specified.
+    Note that the end of each row is marked by a Unix-style newline
+    ("\n").  <command>COPY FROM</command> will not behave as desired
+    if given a file containing DOS- or Mac-style newlines.
+   </para>
+   <para>
+    The OID is emitted as the first column if WITH OIDS is specified.
    </para>
    <para>
-    If <command>COPY</command> is sending its output to standard
-    output instead of a file, it will send a backslash("\") and a period
-    (".")  followed immediately by a newline, on a separate line,
-    when it is done.  Similarly, if <command>COPY</command> is reading
+    If <command>COPY TO</command> is sending its output to standard
+    output instead of a file, after the last row it will send a backslash ("\")
+    and a period (".") followed by a newline.
+    Similarly, if <command>COPY FROM</command> is reading
     from standard input, it will expect a backslash ("\") and a period
     (".") followed by a newline, as the first three characters on a
-    line to denote end-of-file.  However, <command>COPY</command>
-    will terminate (followed by the backend itself) if a true EOF is
-    encountered before this special end-of-file pattern is found.
+    line to denote end-of-file.  However, <command>COPY FROM</command>
+    will terminate correctly (followed by the backend itself) if the
+    input connection is closed before this special end-of-file pattern is
+    found.
    </para>
    <para>
     The backslash character has other special meanings.  A literal backslash
     character is represented as two
     consecutive backslashes ("\\").  A literal tab character is represented
-    as a backslash and a tab.  A literal newline character is
+    as a backslash and a tab.  (If you are using something other than tab
+    as the column delimiter, backslash that delimiter character to include
+    it in data.)  A literal newline character is
     represented as a backslash and a newline.  When loading text data
     not generated by <acronym>Postgres</acronym>,
     you will need to convert backslash
@@ -324,82 +350,207 @@ ERROR: <replaceable>reason</replaceable>
 
   <refsect2>
    <refsect2info>
-    <date>1998-05-04</date>
+    <date>2001-01-02</date>
    </refsect2info>
    <title>Binary Format</title>
    <para>
-    In the case of <command>COPY BINARY</command>, the first four
-    bytes in the file will be the number of instances in the file.  If
-    this number is zero, the <command>COPY BINARY</command> command
-    will read until end-of-file is encountered.  Otherwise, it will
-    stop reading when this number of instances has been read.
-    Remaining data in the file will be ignored.
-   </para>
-   <para>
-    The format for each instance in the file is as follows.  Note that
-    this format must be followed <emphasis>exactly</emphasis>.
-    Unsigned four-byte integer quantities are called uint32 in the
-    table below.
-   </para>
-   <table frame="all">
-    <title>Contents of a binary copy file</title>
-    <tgroup cols="2" colsep="1" rowsep="1" align="center">
-     <colspec colname="col1">
-     <colspec colname="col2">
-     <spanspec namest="col1" nameend="col2" spanname="subhead">
-     <tbody>
-      <row>
-       <entry spanname="subhead" align="center">At the start of the file</entry>
-      </row>
-      <row>
-       <entry>uint32</entry>
-       <entry>number of tuples</entry>
-      </row>
-      <row>
-       <entry spanname="subhead" align="center">For each tuple</entry>
-      </row>
-      <row>
-       <entry>uint32</entry>
-       <entry>total length of tuple data</entry>
-      </row>
-      <row>
-       <entry>uint32</entry>
-       <entry>oid (if specified)</entry>
-      </row>
-      <row>
-       <entry>uint32</entry>
-       <entry>number of null attributes</entry>
-      </row>
-      <row>
-       <entry>[uint32,...,uint32]</entry>
-       <entry>attribute numbers of attributes, counting from 0</entry>
-      </row>
-      <row>
-       <entry>-</entry>
-       <entry>&lt;tuple data&gt;</entry>
-      </row>
-     </tbody>
-    </tgroup>
-   </table>
-   
-  </refsect2>
-  <refsect2>
-   <refsect2info>
-    <date>1998-05-04</date>
-   </refsect2info>
-   <title>Alignment of Binary Data</title>
-   <para>
-    On Sun-3s, 2-byte attributes are aligned on two-byte boundaries,
-    and all larger attributes are aligned on four-byte boundaries.
-    Character attributes are aligned on single-byte boundaries.  On
-    most other machines, all attributes larger than 1 byte are aligned on
-    four-byte boundaries.  Note that variable length attributes are
-    preceded by the attribute's length; arrays are simply contiguous
-    streams of the array element type.
+    The file format used for <command>COPY BINARY</command> changed in
+    Postgres v7.1.  The new format consists of a file header, zero or more
+    tuples, and a file trailer.
    </para>
+
+   <refsect3>
+    <refsect3info>
+     <date>2001-01-02</date>
+    </refsect3info>
+    <title>
+     File Header
+    </title>
+    <para>
+     The file header consists of 24 bytes of fixed fields, followed
+     by a variable-length header extension area.  The fixed fields are:
+
+    <variablelist>
+     <varlistentry>
+      <term>Signature</term>
+      <listitem>
+       <para>
+12-byte sequence "PGBCOPY\n\377\r\n\0" --- note that the null
+is a required part of the signature.  (The signature is designed to allow
+easy identification of files that have been munged by a non-8-bit-clean
+transfer.  This signature will be changed by newline-translation
+filters, dropped nulls, dropped high bits, or parity changes.)
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term>Integer layout field</term>
+      <listitem>
+       <para>
+int32 constant 0x01020304 in source's byte order.
+Potentially, a reader could engage in byte-flipping of subsequent fields
+if the wrong byte order is detected here.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term>Flags field</term>
+      <listitem>
+       <para>
+int32 bit mask to denote important aspects of the file
+format.  Bits are numbered from 0 (LSB) to 31 (MSB) --- note that this
+field is stored with source's endianness, as are all subsequent integer
+fields.  Bits 16-31 are reserved to denote critical file format issues;
+a reader should abort if it finds an unexpected bit set in this range.
+Bits 0-15 are reserved to signal backwards-compatible format issues;
+a reader should simply ignore any unexpected bits set in this range.
+Currently only one flag bit is defined, and the rest must be zero:
+        <variablelist>
+         <varlistentry>
+          <term>Bit 16</term>
+          <listitem>
+           <para>
+            if 1, OIDs are included in the dump; if 0, not
+           </para>
+          </listitem>
+         </varlistentry>
+        </variablelist>
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term>Header extension area length</term>
+      <listitem>
+       <para>
+int32 length in bytes of remainder of header, not including self.  In
+the initial version this will be zero, and the first tuple follows
+immediately.  Future changes to the format might allow additional data
+to be present in the header.  A reader should silently skip over any header
+extension data it does not know what to do with.
+       </para>
+      </listitem>
+     </varlistentry>
+    </variablelist>
+    </para>
+
+    <para>
+The header extension area is envisioned to contain a sequence of
+self-identifying chunks.  The flags field is not intended to tell readers
+what is in the extension area.  Specific design of header extension contents
+is left for a later release.
+    </para>
+
+    <para>
+     This design allows for both backwards-compatible header additions (add
+     header extension chunks, or set low-order flag bits) and
+     non-backwards-compatible changes (set high-order flag bits to signal such
+     changes, and add supporting data to the extension area if needed).
+    </para>
+   </refsect3>
+
+   <refsect3>
+    <refsect3info>
+     <date>2001-01-02</date>
+    </refsect3info>
+    <title>
+     Tuples
+    </title>
+    <para>
+Each tuple begins with an int16 count of the number of fields in the
+tuple.  (Presently, all tuples in a table will have the same count, but
+that might not always be true.)  Then, repeated for each field in the
+tuple, there is an int16 typlen word possibly followed by field data.
+The typlen field is interpreted thus:
+
+    <variablelist>
+     <varlistentry>
+      <term>Zero</term>
+      <listitem>
+       <para>
+       Field is NULL.  No data follows.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term>&gt; 0</term>
+      <listitem>
+       <para>
+        Field is a fixed-length datatype.  Exactly N
+       bytes of data follow the typlen word, where N is the typlen value.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term>-1</term>
+      <listitem>
+       <para>
+       Field is a varlena datatype.  The next four
+       bytes are the varlena header, which contains
+       the total value length including itself.
+       </para>
+      </listitem>
+     </varlistentry>
+
+     <varlistentry>
+      <term>&lt; -1</term>
+      <listitem>
+       <para>
+       Reserved for future use.
+       </para>
+      </listitem>
+     </varlistentry>
+    </variablelist>
+    </para>
+
+    <para>
+For non-NULL fields, the reader can check that the typlen matches the
+expected typlen for the destination column.  This provides a simple
+but very useful check that the data is as expected.
+    </para>
+
+    <para>
+There is no alignment padding or any other extra data between fields.
+Note also that the format does not distinguish whether a datatype is
+pass-by-reference or pass-by-value.  Both of these provisions are
+deliberate: they might help improve portability of the files (although
+of course endianness and floating-point-format issues can still keep
+you from moving a binary file across machines).
+    </para>
+
+    <para>
+If OIDs are included in the dump, the OID field immediately follows the
+field-count word.  It is a normal field except that it's not included
+in the field-count.  In particular it has a typlen --- this will allow
+handling of 4-byte vs 8-byte OIDs without too much pain, and will allow
+OIDs to be shown as NULL if we someday allow OIDs to be optional.
+    </para>
+   </refsect3>
+
+   <refsect3>
+    <refsect3info>
+     <date>2001-01-02</date>
+    </refsect3info>
+    <title>
+     File Trailer
+    </title>
+    <para>
+     The file trailer consists of an int16 word containing -1.  This is
+     easily distinguished from a tuple's field-count word.
+    </para>
+
+    <para>
+     A reader should report an error if a field-count word is neither -1
+     nor the expected number of columns.  This provides an extra
+     check against somehow getting out of sync with the data.
+    </para>
+   </refsect3>
   </refsect2>
  </refsect1>
-
  
  <refsect1 id="R1-SQL-COPY-3">
   <title>
@@ -407,7 +558,7 @@ ERROR: <replaceable>reason</replaceable>
   </title>
   <para>
 The following example copies a table to standard output,
- using a pipe (|) as the field
+ using a vertical bar (|) as the field
  delimiter:
   </para>
   <programlisting>
@@ -425,36 +576,36 @@ COPY country FROM '/usr1/proj/bray/sql/country_data';
 has the termination sequence on the last line):
   </para>
   <programlisting>
-   AF      AFGHANISTAN
-   AL      ALBANIA
-   DZ      ALGERIA
-   ...
-   ZM      ZAMBIA
-   ZW      ZIMBABWE
-   \.
+AF      AFGHANISTAN
+AL      ALBANIA
+DZ      ALGERIA
+ZM      ZAMBIA
+ZW      ZIMBABWE
+\.
   </programlisting>
   <para>
-   The following is the same data, output in binary format on a Linux/i586 machine.
-   The data is shown after filtering through
- the Unix utility <command>od -c</command>. The table has
-   three fields; the first is <classname>char(2)</classname>
- and the second is <classname>text</classname>. All the
+   Note that the white space on each line is actually a TAB.
+  </para>
+  <para>
+   The following is the same data, output in binary format on a Linux/i586
+   machine. The data is shown after filtering through
+   the Unix utility <command>od -c</command>. The table has
+   three fields; the first is <classname>char(2)</classname>,
+   the second is <classname>text</classname>, and the third is
+   <classname>int4</classname>. All the
    rows have a null value in the third field.
-  Notice how the <classname>char(2)</classname>
-   field is padded with nulls to four bytes and the text field is
-   preceded by its length:
   </para>
   <programlisting>
-   355  \0  \0  \0 027  \0  \0  \0 001  \0  \0  \0 002  \0  \0  \0
-   006  \0  \0  \0   A   F  \0  \0 017  \0  \0  \0   A   F   G   H
-     A   N   I   S   T   A   N 023  \0  \0  \0 001  \0  \0  \0 002
-    \0  \0  \0 006  \0  \0  \0   A   L  \0  \0  \v  \0  \0  \0   A
-     L   B   A   N   I   A 023  \0  \0  \0 001  \0  \0  \0 002  \0
-    \0  \0 006  \0  \0  \0   D   Z  \0  \0  \v  \0  \0  \0   A   L
-     G   E   R   I   A
-   ...              \n  \0  \0  \0   Z   A   M   B   I   A 024  \0
-    \0  \0 001  \0  \0  \0 002  \0  \0  \0 006  \0  \0  \0   Z   W
-    \0  \0  \f  \0  \0  \0   Z   I   M   B   A   B   W   E
+0000000   P   G   B   C   O   P   Y  \n 377  \r  \n  \0 004 003 002 001
+0000020  \0  \0  \0  \0  \0  \0  \0  \0 003  \0 377 377 006  \0  \0  \0
+0000040   A   F 377 377 017  \0  \0  \0   A   F   G   H   A   N   I   S
+0000060   T   A   N  \0  \0 003  \0 377 377 006  \0  \0  \0   A   L 377
+0000100 377  \v  \0  \0  \0   A   L   B   A   N   I   A  \0  \0 003  \0
+0000120 377 377 006  \0  \0  \0   D   Z 377 377  \v  \0  \0  \0   A   L
+0000140   G   E   R   I   A  \0  \0 003  \0 377 377 006  \0  \0  \0   Z
+0000160   M 377 377  \n  \0  \0  \0   Z   A   M   B   I   A  \0  \0 003
+0000200  \0 377 377 006  \0  \0  \0   Z   W 377 377  \f  \0  \0  \0   Z
+0000220   I   M   B   A   B   W   E  \0  \0 377 377
   </programlisting>
  </refsect1>
  
index f950734..a742403 100644 (file)
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *       $Header: /cvsroot/pgsql/src/backend/commands/copy.c,v 1.126 2000/12/27 23:59:14 tgl Exp $
+ *       $Header: /cvsroot/pgsql/src/backend/commands/copy.c,v 1.127 2001/01/03 20:04:10 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -52,7 +52,8 @@ static Oid    GetTypeElement(Oid type);
 static void CopyReadNewline(FILE *fp, int *newline);
 static char *CopyReadAttribute(FILE *fp, bool *isnull, char *delim, int *newline, char *null_print);
 static void CopyAttributeOut(FILE *fp, char *string, char *delim);
-static int     CountTuples(Relation relation);
+
+static const char BinarySignature[12] = "PGBCOPY\n\377\r\n\0";
 
 /*
  * Static communication variables ... pretty grotty, but COPY has
@@ -387,7 +388,8 @@ DoCopy(char *relname, bool binary, bool oids, bool from, bool pipe,
  * Copy from relation TO file.
  */
 static void
-CopyTo(Relation rel, bool binary, bool oids, FILE *fp, char *delim, char *null_print)
+CopyTo(Relation rel, bool binary, bool oids, FILE *fp,
+          char *delim, char *null_print)
 {
        HeapTuple       tuple;
        TupleDesc       tupDesc;
@@ -398,20 +400,9 @@ CopyTo(Relation rel, bool binary, bool oids, FILE *fp, char *delim, char *null_p
        FmgrInfo   *out_functions;
        Oid                *elements;
        bool       *isvarlena;
-       int32      *typmod;
-       char       *nulls;
-
-       /*
-        * <nulls> is a (dynamically allocated) array with one character per
-        * attribute in the instance being copied.      nulls[I-1] is 'n' if
-        * Attribute Number I is null, and ' ' otherwise.
-        *
-        * <nulls> is meaningful only if we are doing a binary copy.
-        */
+       int16           fld_size;
        char       *string;
 
-       scandesc = heap_beginscan(rel, 0, QuerySnapshot, 0, NULL);
-
        tupDesc = rel->rd_att;
        attr_count = rel->rd_att->natts;
        attr = rel->rd_att->attrs;
@@ -420,7 +411,6 @@ CopyTo(Relation rel, bool binary, bool oids, FILE *fp, char *delim, char *null_p
        out_functions = (FmgrInfo *) palloc(attr_count * sizeof(FmgrInfo));
        elements = (Oid *) palloc(attr_count * sizeof(Oid));
        isvarlena = (bool *) palloc(attr_count * sizeof(bool));
-       typmod = (int32 *) palloc(attr_count * sizeof(int32));
        for (i = 0; i < attr_count; i++)
        {
                Oid                     out_func_oid;
@@ -430,40 +420,62 @@ CopyTo(Relation rel, bool binary, bool oids, FILE *fp, char *delim, char *null_p
                        elog(ERROR, "COPY: couldn't lookup info for type %u",
                                 attr[i]->atttypid);
                fmgr_info(out_func_oid, &out_functions[i]);
-               typmod[i] = attr[i]->atttypmod;
        }
 
-       if (!binary)
+       if (binary)
        {
-               nulls = NULL;                   /* meaningless, but compiler doesn't know
-                                                                * that */
+               /* Generate header for a binary copy */
+               int32           tmp;
+
+               /* Signature */
+               CopySendData((char *) BinarySignature, 12, fp);
+               /* Integer layout field */
+               tmp = 0x01020304;
+               CopySendData(&tmp, sizeof(int32), fp);
+               /* Flags field */
+               tmp = 0;
+               if (oids)
+                       tmp |= (1 << 16);
+               CopySendData(&tmp, sizeof(int32), fp);
+               /* No header extension */
+               tmp = 0;
+               CopySendData(&tmp, sizeof(int32), fp);
        }
-       else
-       {
-               int32           ntuples;
 
-               nulls = (char *) palloc(attr_count);
-               for (i = 0; i < attr_count; i++)
-                       nulls[i] = ' ';
-
-               /* XXX expensive */
-
-               ntuples = CountTuples(rel);
-               CopySendData(&ntuples, sizeof(int32), fp);
-       }
+       scandesc = heap_beginscan(rel, 0, QuerySnapshot, 0, NULL);
 
        while (HeapTupleIsValid(tuple = heap_getnext(scandesc, 0)))
        {
+               bool            need_delim = false;
+
                if (QueryCancel)
                        CancelQuery();
 
-               if (oids && !binary)
+               if (binary)
+               {
+                       /* Binary per-tuple header */
+                       int16   fld_count = attr_count;
+
+                       CopySendData(&fld_count, sizeof(int16), fp);
+                       /* Send OID if wanted --- note fld_count doesn't include it */
+                       if (oids)
+                       {
+                               fld_size = sizeof(Oid);
+                               CopySendData(&fld_size, sizeof(int16), fp);
+                               CopySendData(&tuple->t_data->t_oid, sizeof(Oid), fp);
+                       }
+               }
+               else
                {
-                       string = DatumGetCString(DirectFunctionCall1(oidout,
-                                                                        ObjectIdGetDatum(tuple->t_data->t_oid)));
-                       CopySendString(string, fp);
-                       CopySendChar(delim[0], fp);
-                       pfree(string);
+                       /* Text format has no per-tuple header, but send OID if wanted */
+                       if (oids)
+                       {
+                               string = DatumGetCString(DirectFunctionCall1(oidout,
+                                                                       ObjectIdGetDatum(tuple->t_data->t_oid)));
+                               CopySendString(string, fp);
+                               pfree(string);
+                               need_delim = true;
+                       }
                }
 
                for (i = 0; i < attr_count; i++)
@@ -474,18 +486,31 @@ CopyTo(Relation rel, bool binary, bool oids, FILE *fp, char *delim, char *null_p
 
                        origvalue = heap_getattr(tuple, i + 1, tupDesc, &isnull);
 
+                       if (!binary)
+                       {
+                               if (need_delim)
+                                       CopySendChar(delim[0], fp);
+                               need_delim = true;
+                       }
+
                        if (isnull)
                        {
                                if (!binary)
+                               {
                                        CopySendString(null_print, fp); /* null indicator */
+                               }
                                else
-                                       nulls[i] = 'n';
+                               {
+                                       fld_size = 0; /* null marker */
+                                       CopySendData(&fld_size, sizeof(int16), fp);
+                               }
                        }
                        else
                        {
                                /*
                                 * If we have a toasted datum, forcibly detoast it to avoid
-                                * memory leakage inside the type's output routine.
+                                * memory leakage inside the type's output routine (or
+                                * for binary case, because we must output untoasted value).
                                 */
                                if (isvarlena[i])
                                        value = PointerGetDatum(PG_DETOAST_DATUM(origvalue));
@@ -495,75 +520,71 @@ CopyTo(Relation rel, bool binary, bool oids, FILE *fp, char *delim, char *null_p
                                if (!binary)
                                {
                                        string = DatumGetCString(FunctionCall3(&out_functions[i],
-                                                                                               value,
-                                                                                               ObjectIdGetDatum(elements[i]),
-                                                                                               Int32GetDatum(typmod[i])));
+                                                                               value,
+                                                                               ObjectIdGetDatum(elements[i]),
+                                                                               Int32GetDatum(attr[i]->atttypmod)));
                                        CopyAttributeOut(fp, string, delim);
                                        pfree(string);
                                }
+                               else
+                               {
+                                       fld_size = attr[i]->attlen;
+                                       CopySendData(&fld_size, sizeof(int16), fp);
+                                       if (isvarlena[i])
+                                       {
+                                               /* varlena */
+                                               Assert(fld_size == -1);
+                                               CopySendData(DatumGetPointer(value),
+                                                                        VARSIZE(value),
+                                                                        fp);
+                                       }
+                                       else if (!attr[i]->attbyval)
+                                       {
+                                               /* fixed-length pass-by-reference */
+                                               Assert(fld_size > 0);
+                                               CopySendData(DatumGetPointer(value),
+                                                                        fld_size,
+                                                                        fp);
+                                       }
+                                       else
+                                       {
+                                               /* pass-by-value */
+                                               Datum           datumBuf;
+
+                                               /*
+                                                * We need this horsing around because we don't know
+                                                * how shorter data values are aligned within a Datum.
+                                                */
+                                               store_att_byval(&datumBuf, value, fld_size);
+                                               CopySendData(&datumBuf,
+                                                                        fld_size,
+                                                                        fp);
+                                       }
+                               }
 
                                /* Clean up detoasted copy, if any */
                                if (value != origvalue)
                                        pfree(DatumGetPointer(value));
                        }
-
-                       if (!binary)
-                       {
-                               if (i == attr_count - 1)
-                                       CopySendChar('\n', fp);
-                               else
-                               {
-
-                                       /*
-                                        * when copying out, only use the first char of the
-                                        * delim string
-                                        */
-                                       CopySendChar(delim[0], fp);
-                               }
-                       }
                }
 
-               if (binary)
-               {
-                       int32           null_ct = 0,
-                                               length;
+               if (!binary)
+                       CopySendChar('\n', fp);
+       }
 
-                       for (i = 0; i < attr_count; i++)
-                       {
-                               if (nulls[i] == 'n')
-                                       null_ct++;
-                       }
+       heap_endscan(scandesc);
 
-                       length = tuple->t_len - tuple->t_data->t_hoff;
-                       CopySendData(&length, sizeof(int32), fp);
-                       if (oids)
-                               CopySendData((char *) &tuple->t_data->t_oid, sizeof(int32), fp);
+       if (binary)
+       {
+               /* Generate trailer for a binary copy */
+               int16   fld_count = -1;
 
-                       CopySendData(&null_ct, sizeof(int32), fp);
-                       if (null_ct > 0)
-                       {
-                               for (i = 0; i < attr_count; i++)
-                               {
-                                       if (nulls[i] == 'n')
-                                       {
-                                               CopySendData(&i, sizeof(int32), fp);
-                                               nulls[i] = ' ';
-                                       }
-                               }
-                       }
-                       CopySendData((char *) tuple->t_data + tuple->t_data->t_hoff,
-                                                length, fp);
-               }
+               CopySendData(&fld_count, sizeof(int16), fp);
        }
 
-       heap_endscan(scandesc);
-
        pfree(out_functions);
        pfree(elements);
        pfree(isvarlena);
-       pfree(typmod);
-       if (binary)
-               pfree(nulls);
 }
 
 
@@ -580,27 +601,20 @@ CopyFrom(Relation rel, bool binary, bool oids, FILE *fp,
        AttrNumber      attr_count;
        FmgrInfo   *in_functions;
        Oid                *elements;
-       int32      *typmod;
        int                     i;
        Oid                     in_func_oid;
        Datum      *values;
        char       *nulls;
        bool            isnull;
        int                     done = 0;
-       char       *string = NULL,
-                          *ptr;
-       int32           len,
-                               null_ct,
-                               null_id;
-       int32           ntuples,
-                               tuples_read = 0;
-       bool            reading_to_eof = true;
+       char       *string;
        ResultRelInfo *resultRelInfo;
        EState     *estate = CreateExecutorState();     /* for ExecConstraints() */
        TupleTable      tupleTable;
        TupleTableSlot *slot;
        Oid                     loaded_oid = InvalidOid;
        bool            skip_tuple = false;
+       bool            file_has_oids;
 
        tupDesc = RelationGetDescr(rel);
        attr = tupDesc->attrs;
@@ -630,31 +644,58 @@ CopyFrom(Relation rel, bool binary, bool oids, FILE *fp,
        {
                in_functions = (FmgrInfo *) palloc(attr_count * sizeof(FmgrInfo));
                elements = (Oid *) palloc(attr_count * sizeof(Oid));
-               typmod = (int32 *) palloc(attr_count * sizeof(int32));
                for (i = 0; i < attr_count; i++)
                {
                        in_func_oid = (Oid) GetInputFunction(attr[i]->atttypid);
                        fmgr_info(in_func_oid, &in_functions[i]);
                        elements[i] = GetTypeElement(attr[i]->atttypid);
-                       typmod[i] = attr[i]->atttypmod;
                }
+               file_has_oids = oids;   /* must rely on user to tell us this... */
        }
        else
        {
+               /* Read and verify binary header */
+               char            readSig[12];
+               int32           tmp;
+
+               /* Signature */
+               CopyGetData(readSig, 12, fp);
+               if (CopyGetEof(fp) ||
+                       memcmp(readSig, BinarySignature, 12) != 0)
+                       elog(ERROR, "COPY BINARY: file signature not recognized");
+               /* Integer layout field */
+               CopyGetData(&tmp, sizeof(int32), fp);
+               if (CopyGetEof(fp) ||
+                       tmp != 0x01020304)
+                       elog(ERROR, "COPY BINARY: incompatible integer layout");
+               /* Flags field */
+               CopyGetData(&tmp, sizeof(int32), fp);
+               if (CopyGetEof(fp))
+                       elog(ERROR, "COPY BINARY: bogus file header (missing flags)");
+               file_has_oids = (tmp & (1 << 16)) != 0;
+               tmp &= ~ (1 << 16);
+               if ((tmp >> 16) != 0)
+                       elog(ERROR, "COPY BINARY: unrecognized critical flags in header");
+               /* Header extension length */
+               CopyGetData(&tmp, sizeof(int32), fp);
+               if (CopyGetEof(fp) ||
+                       tmp < 0)
+                       elog(ERROR, "COPY BINARY: bogus file header (missing length)");
+               /* Skip extension header, if present */
+               while (tmp-- > 0)
+               {
+                       CopyGetData(readSig, 1, fp);
+                       if (CopyGetEof(fp))
+                               elog(ERROR, "COPY BINARY: bogus file header (wrong length)");
+               }
+
                in_functions = NULL;
                elements = NULL;
-               typmod = NULL;
-               CopyGetData(&ntuples, sizeof(int32), fp);
-               if (ntuples != 0)
-                       reading_to_eof = false;
        }
 
        values = (Datum *) palloc(attr_count * sizeof(Datum));
        nulls = (char *) palloc(attr_count * sizeof(char));
 
-       for (i = 0; i < attr_count; i++)
-               nulls[i] = ' ';
-
        lineno = 0;
        fe_eof = false;
 
@@ -668,15 +709,22 @@ CopyFrom(Relation rel, bool binary, bool oids, FILE *fp,
 
                lineno++;
 
+               /* Initialize all values for row to NULL */
+               MemSet(values, 0, attr_count * sizeof(Datum));
+               MemSet(nulls, 'n', attr_count * sizeof(char));
+
                if (!binary)
                {
                        int                     newline = 0;
 
-                       if (oids)
+                       if (file_has_oids)
                        {
-                               string = CopyReadAttribute(fp, &isnull, delim, &newline, null_print);
-                               if (string == NULL)
-                                       done = 1;
+                               string = CopyReadAttribute(fp, &isnull, delim,
+                                                                                  &newline, null_print);
+                               if (isnull)
+                                       elog(ERROR, "COPY TEXT: NULL Oid");
+                               else if (string == NULL)
+                                       done = 1;       /* end of file */
                                else
                                {
                                        loaded_oid = DatumGetObjectId(DirectFunctionCall1(oidin,
@@ -685,22 +733,24 @@ CopyFrom(Relation rel, bool binary, bool oids, FILE *fp,
                                                elog(ERROR, "COPY TEXT: Invalid Oid");
                                }
                        }
+
                        for (i = 0; i < attr_count && !done; i++)
                        {
-                               string = CopyReadAttribute(fp, &isnull, delim, &newline, null_print);
+                               string = CopyReadAttribute(fp, &isnull, delim,
+                                                                                  &newline, null_print);
                                if (isnull)
                                {
-                                       values[i] = PointerGetDatum(NULL);
-                                       nulls[i] = 'n';
+                                       /* already set values[i] and nulls[i] */
                                }
                                else if (string == NULL)
-                                       done = 1;
+                                       done = 1;       /* end of file */
                                else
                                {
                                        values[i] = FunctionCall3(&in_functions[i],
                                                                                          CStringGetDatum(string),
                                                                                          ObjectIdGetDatum(elements[i]),
-                                                                                         Int32GetDatum(typmod[i]));
+                                                                                         Int32GetDatum(attr[i]->atttypmod));
+                                       nulls[i] = ' ';
                                }
                        }
                        if (!done)
@@ -708,47 +758,103 @@ CopyFrom(Relation rel, bool binary, bool oids, FILE *fp,
                }
                else
                {                                               /* binary */
-                       CopyGetData(&len, sizeof(int32), fp);
-                       if (CopyGetEof(fp))
+                       int16   fld_count,
+                                       fld_size;
+
+                       CopyGetData(&fld_count, sizeof(int16), fp);
+                       if (CopyGetEof(fp) ||
+                               fld_count == -1)
                                done = 1;
                        else
                        {
-                               if (oids)
+                               if (fld_count <= 0 || fld_count > attr_count)
+                                       elog(ERROR, "COPY BINARY: tuple field count is %d, expected %d",
+                                                (int) fld_count, attr_count);
+
+                               if (file_has_oids)
                                {
-                                       CopyGetData(&loaded_oid, sizeof(int32), fp);
+                                       CopyGetData(&fld_size, sizeof(int16), fp);
+                                       if (CopyGetEof(fp))
+                                               elog(ERROR, "COPY BINARY: unexpected EOF");
+                                       if (fld_size != (int16) sizeof(Oid))
+                                               elog(ERROR, "COPY BINARY: sizeof(Oid) is %d, expected %d",
+                                                        (int) fld_size, (int) sizeof(Oid));
+                                       CopyGetData(&loaded_oid, sizeof(Oid), fp);
+                                       if (CopyGetEof(fp))
+                                               elog(ERROR, "COPY BINARY: unexpected EOF");
                                        if (loaded_oid == InvalidOid)
                                                elog(ERROR, "COPY BINARY: Invalid Oid");
                                }
-                               CopyGetData(&null_ct, sizeof(int32), fp);
-                               if (null_ct > 0)
+
+                               for (i = 0; i < (int) fld_count; i++)
                                {
-                                       for (i = 0; i < null_ct; i++)
+                                       CopyGetData(&fld_size, sizeof(int16), fp);
+                                       if (CopyGetEof(fp))
+                                               elog(ERROR, "COPY BINARY: unexpected EOF");
+                                       if (fld_size == 0)
+                                               continue; /* it's NULL; nulls[i] already set */
+                                       if (fld_size != attr[i]->attlen)
+                                               elog(ERROR, "COPY BINARY: sizeof(field %d) is %d, expected %d",
+                                                        i+1, (int) fld_size, (int) attr[i]->attlen);
+                                       if (fld_size == -1)
                                        {
-                                               CopyGetData(&null_id, sizeof(int32), fp);
-                                               nulls[null_id] = 'n';
+                                               /* varlena field */
+                                               int32   varlena_size;
+                                               Pointer varlena_ptr;
+
+                                               CopyGetData(&varlena_size, sizeof(int32), fp);
+                                               if (CopyGetEof(fp))
+                                                       elog(ERROR, "COPY BINARY: unexpected EOF");
+                                               if (varlena_size < (int32) sizeof(int32))
+                                                       elog(ERROR, "COPY BINARY: bogus varlena length");
+                                               varlena_ptr = (Pointer) palloc(varlena_size);
+                                               VARATT_SIZEP(varlena_ptr) = varlena_size;
+                                               CopyGetData(VARDATA(varlena_ptr),
+                                                                       varlena_size - sizeof(int32),
+                                                                       fp);
+                                               if (CopyGetEof(fp))
+                                                       elog(ERROR, "COPY BINARY: unexpected EOF");
+                                               values[i] = PointerGetDatum(varlena_ptr);
+                                       }
+                                       else if (!attr[i]->attbyval)
+                                       {
+                                               /* fixed-length pass-by-reference */
+                                               Pointer refval_ptr;
+
+                                               Assert(fld_size > 0);
+                                               refval_ptr = (Pointer) palloc(fld_size);
+                                               CopyGetData(refval_ptr, fld_size, fp);
+                                               if (CopyGetEof(fp))
+                                                       elog(ERROR, "COPY BINARY: unexpected EOF");
+                                               values[i] = PointerGetDatum(refval_ptr);
+                                       }
+                                       else
+                                       {
+                                               /* pass-by-value */
+                                               Datum           datumBuf;
+
+                                               /*
+                                                * We need this horsing around because we don't know
+                                                * how shorter data values are aligned within a Datum.
+                                                */
+                                               Assert(fld_size > 0 && fld_size <= sizeof(Datum));
+                                               CopyGetData(&datumBuf, fld_size, fp);
+                                               if (CopyGetEof(fp))
+                                                       elog(ERROR, "COPY BINARY: unexpected EOF");
+                                               values[i] = fetch_att(&datumBuf, true, fld_size);
                                        }
-                               }
-
-                               string = (char *) palloc(len);
-                               CopyGetData(string, len, fp);
-
-                               ptr = string;
 
-                               for (i = 0; i < attr_count; i++)
-                               {
-                                       if (nulls[i] == 'n')
-                                               continue;
-                                       ptr = (char *) att_align((long) ptr, attr[i]->attlen, attr[i]->attalign);
-                                       values[i] = fetchatt(attr[i], ptr);
-                                       ptr = att_addlength(ptr, attr[i]->attlen, ptr);
+                                       nulls[i] = ' ';
                                }
                        }
                }
+
                if (done)
-                       continue;
+                       break;
 
                tuple = heap_formtuple(tupDesc, values, nulls);
-               if (oids)
+
+               if (oids && file_has_oids)
                        tuple->t_data->t_oid = loaded_oid;
 
                skip_tuple = false;
@@ -796,25 +902,13 @@ CopyFrom(Relation rel, bool binary, bool oids, FILE *fp,
                                ExecARInsertTriggers(rel, tuple);
                }
 
-               if (binary)
-                       pfree(string);
-
                for (i = 0; i < attr_count; i++)
                {
                        if (!attr[i]->attbyval && nulls[i] != 'n')
-                       {
-                               if (!binary)
-                                       pfree((void *) values[i]);
-                       }
-                       /* reset nulls[] array for next time */
-                       nulls[i] = ' ';
+                               pfree(DatumGetPointer(values[i]));
                }
 
                heap_freetuple(tuple);
-               tuples_read++;
-
-               if (!reading_to_eof && ntuples == tuples_read)
-                       done = true;
        }
 
        /*
@@ -829,7 +923,6 @@ CopyFrom(Relation rel, bool binary, bool oids, FILE *fp,
        {
                pfree(in_functions);
                pfree(elements);
-               pfree(typmod);
        }
 
        ExecDropTupleTable(tupleTable, true);
@@ -1099,26 +1192,3 @@ CopyAttributeOut(FILE *fp, char *server_string, char *delim)
                pfree(string_start);    /* pfree pg_server_to_client result */
 #endif
 }
-
-/*
- * Returns the number of tuples in a relation. Unfortunately, currently
- * must do a scan of the entire relation to determine this.
- *
- * relation is expected to be an open relation descriptor.
- */
-static int
-CountTuples(Relation relation)
-{
-       HeapScanDesc scandesc;
-       HeapTuple       tuple;
-
-       int                     i;
-
-       scandesc = heap_beginscan(relation, 0, QuerySnapshot, 0, NULL);
-
-       i = 0;
-       while (HeapTupleIsValid(tuple = heap_getnext(scandesc, 0)))
-               i++;
-       heap_endscan(scandesc);
-       return i;
-}