Modify COPY TO to emit carriage returns and newlines as backslash escapes

author Tom Lane

Tue, 12 Feb 2002 21:25:41 +0000 (21:25 +0000)

committer Tom Lane

Tue, 12 Feb 2002 21:25:41 +0000 (21:25 +0000)
author Tom Lane
Tue, 12 Feb 2002 21:25:41 +0000 (21:25 +0000)
committer Tom Lane
Tue, 12 Feb 2002 21:25:41 +0000 (21:25 +0000)
diff --git a/doc/src/sgml/ref/copy.sgml b/doc/src/sgml/ref/copy.sgml

index 850af1f077584c434b6109dd7df3564423fce16e..b4a226876a9076f5f1adce09e4ec411b604fb72f 100644 (file)
--- a/doc/src/sgml/ref/copy.sgml
+++ b/doc/src/sgml/ref/copy.sgml
@@ -1,5 +1,5 @@
  
  
@@ -74,7 +74,7 @@ COPY [ BINARY ] table [ WITH OIDS ]
        filename
        
         
-   The absolute Unix file name of the input or output file.
+   The absolute Unix path name of the input or output file.
         
        
       
@@ -225,7 +225,7 @@ ERROR: reason
      By default, a text copy uses a tab ("\t") character as a delimiter
      between fields.  The field delimiter may be changed to any other single
      character with the keyword phrase USING DELIMITERS.  Characters
-    in data fields which happen to match the delimiter character will
+    in data fields that happen to match the delimiter character will
      be backslash quoted.
     
     
@@ -265,8 +265,8 @@ ERROR: reason
      by the PostgreSQL user (the user ID the
      server runs as), not the client.
      COPY naming a file is only allowed to database
-    superusers, since it allows writing on any file that the backend has
-    privileges to write on.
+    superusers, since it allows reading or writing any file that the backend
+    has privileges to access.
      
      
       
@@ -297,57 +297,109 @@ ERROR: reason
    File Formats
    
     
-    2001-01-02
+    2002-02-12
     
     Text Format
     
-    When COPY TO is used without the BINARY option,
-    the file generated will have each row (instance) on a single line, with each
-    column (attribute) separated by the delimiter character.  Embedded
-    delimiter characters will be preceded by a backslash character
-    ("\").  The attribute values themselves are strings generated by the
-    output function associated with each attribute type.  The output
-    function for a type should not try to generate the backslash
-    character; this will be handled by COPY itself.
+    When COPY is used without the BINARY option,
+    the file read or written is a text file with one line per table row.
+    Columns (attributes) in a row are separated by the delimiter character.
+    The attribute values themselves are strings generated by the
+    output function, or acceptable to the input function, of each
+    attribute's data type.  The specified null-value string is used in
+    place of attributes that are NULL.
     
     
-    The actual format for each instance is
-    
-<attr1><separator><attr2><separator>...<separator><attrn><newline>
-    
-    Note that the end of each row is marked by a Unix-style newline
-    ("\n").  COPY FROM will not behave as desired
-    if given a file containing DOS- or Mac-style newlines.
+    If WITH OIDS is specified, the OID is read or written as the first column,
+    preceding the user data columns.  (An error is raised if WITH OIDS is
+    specified for a table that does not have OIDs.)
     
     
-    The OID is emitted as the first column if WITH OIDS is specified.
-    (An error is raised if WITH OIDS is specified for a table that does not
-    have OIDs.)
+    End of data can be represented by a single line containing just
+    backslash-period (\.).  An end-of-data marker is
+    not necessary when reading from a Unix file, since the end of file
+    serves perfectly well; but an end marker must be provided when copying
+    data to or from a client application.
     
     
-    If COPY TO is sending its output to standard
-    output instead of a file, after the last row it will send a backslash ("\")
-    and a period (".") followed by a newline.
-    Similarly, if COPY FROM is reading
-    from standard input, it will expect a backslash ("\") and a period
-    (".") followed by a newline, as the first three characters on a
-    line to denote end-of-file.  However, COPY FROM
-    will terminate correctly (followed by the backend itself) if the
-    input connection is closed before this special end-of-file pattern is
-    found.
+    Backslash characters (\) may be used in the
+    COPY data to quote data characters that might otherwise
+    be taken as row or column delimiters.  In particular, the following
+    characters must be preceded by a backslash if they appear
+    as part of an attribute value: backslash itself, newline, and the current
+    delimiter character.
     
     
-    The backslash character has other special meanings.  A literal backslash
-    character is represented as two
-    consecutive backslashes ("\\").  A literal tab character is represented
-    as a backslash and a tab.  (If you are using something other than tab
-    as the column delimiter, backslash that delimiter character to include
-    it in data.)  A literal newline character is
-    represented as a backslash and a newline.  When loading text data
-    not generated by PostgreSQL,
-    you will need to convert backslash
-    characters ("\") to double-backslashes ("\\") to ensure that they 
-    are loaded properly.
+    The following special backslash sequences are recognized by
+    COPY FROM:
+
+   
+    
+     
+      
+       Sequence
+       Represents
+      
+     
+
+     
+      
+       \b
+       Backspace (ASCII 8)
+      
+      
+       \f
+       Form feed (ASCII 12)
+      
+      
+       \n
+       Newline (ASCII 10)
+      
+      
+       \r
+       Carriage return (ASCII 13)
+      
+      
+       \t
+       Tab (ASCII 9)
+      
+      
+       \v
+       Vertical tab (ASCII 11)
+      
+      
+       \digits
+       Backslash followed by one to three octal digits specifies
+       the character with that numeric code
+      
+     
+    
+   
+
+    Presently, COPY TO will never emit an octal-digits
+    backslash sequence, but it does use the other sequences listed above
+    for those control characters.
+   
+   
+    Never put a backslash before a data character N or period
+    (.). Such pairs will be mistaken for the default null string
+    or the end-of-data marker, respectively.  Any other backslashed character
+    that is not mentioned in the above table will be taken to represent itself.
+   
+   
+    It is strongly recommended that applications generating COPY data convert
+    data newlines and carriage returns to the \n and
+    \r sequences respectively.  At present
+    (PostgreSQL 7.2 and older versions) it is
+    possible to represent a data carriage return without any special quoting,
+    and to represent a data newline by a backslash and newline.  However,
+    these representations will not be accepted by default in future releases.
+   
+   
+    Note that the end of each row is marked by a Unix-style newline
+    ("\n").  Presently, COPY FROM will not behave as
+    desired if given a file containing DOS- or Mac-style newlines.
+    This is expected to change in future releases.
     
    
  
diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c

index b944279d7e1e21f5a0302a98fc70427cff72e87f..f42b865687c6fa0caa6f0904fe0e4be786e1a69e 100644 (file)
--- a/src/backend/commands/copy.c
+++ b/src/backend/commands/copy.c
@@ -7,7 +7,7 @@
   *
   *
   * IDENTIFICATION
- *   $Header: /cvsroot/pgsql/src/backend/commands/copy.c,v 1.144 2001/12/04 21:19:57 tgl Exp $
+ *   $Header: /cvsroot/pgsql/src/backend/commands/copy.c,v 1.145 2002/02/12 21:25:41 tgl Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -41,7 +41,7 @@
  #endif
  
  #define ISOCTAL(c) (((c) >= '0') && ((c) <= '7'))
-#define VALUE(c) ((c) - '0')
+#define OCTVALUE(c) ((c) - '0')
  
  
  /* non-export function prototypes */
@@ -83,13 +83,13 @@ static int  server_encoding;
   * Internal communications functions
   */
  static void CopySendData(void *databuf, int datasize, FILE *fp);
-static void CopySendString(char *str, FILE *fp);
+static void CopySendString(const char *str, FILE *fp);
  static void CopySendChar(char c, FILE *fp);
  static void CopyGetData(void *databuf, int datasize, FILE *fp);
  static int CopyGetChar(FILE *fp);
  static int CopyGetEof(FILE *fp);
  static int CopyPeekChar(FILE *fp);
-static void CopyDonePeek(FILE *fp, int c, int pickup);
+static void CopyDonePeek(FILE *fp, int c, bool pickup);
  
  /*
   * CopySendData sends output data either to the file
@@ -118,9 +118,9 @@ CopySendData(void *databuf, int datasize, FILE *fp)
  }
  
  static void
-CopySendString(char *str, FILE *fp)
+CopySendString(const char *str, FILE *fp)
  {
-   CopySendData(str, strlen(str), fp);
+   CopySendData((void *) str, strlen(str), fp);
  }
  
  static void
@@ -178,10 +178,12 @@ CopyGetEof(FILE *fp)
  
  /*
   * CopyPeekChar reads a byte in "peekable" mode.
+ *
   * after each call to CopyPeekChar, a call to CopyDonePeek _must_
   * follow, unless EOF was returned.
- * CopyDonePeek will either take the peeked char off the steam
- * (if pickup is != 0) or leave it on the stream (if pickup == 0)
+ *
+ * CopyDonePeek will either take the peeked char off the stream
+ * (if pickup is true) or leave it on the stream (if pickup is false).
   */
  static int
  CopyPeekChar(FILE *fp)
@@ -199,15 +201,13 @@ CopyPeekChar(FILE *fp)
  }
  
  static void
-CopyDonePeek(FILE *fp, int c, int pickup)
+CopyDonePeek(FILE *fp, int c, bool pickup)
  {
     if (!fp)
     {
         if (pickup)
         {
-           /*
-            * We want to pick it up
-            */
+           /* We want to pick it up */
             (void) pq_getbyte();
         }
         /* If we didn't want to pick it up, just leave it where it sits */
@@ -219,7 +219,7 @@ CopyDonePeek(FILE *fp, int c, int pickup)
             /* We don't want to pick it up - so put it back in there */
             ungetc(c, fp);
         }
-       /* If we wanted to pick it up, it's already there */
+       /* If we wanted to pick it up, it's already done */
     }
  }
  
@@ -1078,31 +1078,30 @@ CopyReadAttribute(FILE *fp, bool *isnull, char *delim, int *newline, char *null_
                     {
                         int         val;
  
-                       val = VALUE(c);
+                       val = OCTVALUE(c);
                         c = CopyPeekChar(fp);
                         if (ISOCTAL(c))
                         {
-                           val = (val << 3) + VALUE(c);
-                           CopyDonePeek(fp, c, 1);     /* Pick up the
-                                                        * character! */
+                           val = (val << 3) + OCTVALUE(c);
+                           CopyDonePeek(fp, c, true /*pick up*/);
                             c = CopyPeekChar(fp);
                             if (ISOCTAL(c))
                             {
-                               CopyDonePeek(fp, c, 1); /* pick up! */
-                               val = (val << 3) + VALUE(c);
+                               val = (val << 3) + OCTVALUE(c);
+                               CopyDonePeek(fp, c, true /*pick up*/);
                             }
                             else
                             {
                                 if (c == EOF)
                                     goto endOfFile;
-                               CopyDonePeek(fp, c, 0); /* Return to stream! */
+                               CopyDonePeek(fp, c, false /*put back*/);
                             }
                         }
                         else
                         {
                             if (c == EOF)
                                 goto endOfFile;
-                           CopyDonePeek(fp, c, 0);     /* Return to stream! */
+                           CopyDonePeek(fp, c, false /*put back*/);
                         }
                         c = val & 0377;
                     }
@@ -1144,6 +1143,7 @@ CopyReadAttribute(FILE *fp, bool *isnull, char *delim, int *newline, char *null_
         }
         appendStringInfoCharMacro(&attribute_buf, c);
  #ifdef MULTIBYTE
+       /* XXX shouldn't this be done even when encoding is the same? */
         if (client_encoding != server_encoding)
         {
             /* get additional bytes of the char, if any */
@@ -1190,15 +1190,18 @@ CopyAttributeOut(FILE *fp, char *server_string, char *delim)
  {
     char       *string;
     char        c;
+   char        delimc = delim[0];
  
  #ifdef MULTIBYTE
+   bool        same_encoding;
     char       *string_start;
     int         mblen;
     int         i;
  #endif
  
  #ifdef MULTIBYTE
-   if (client_encoding != server_encoding)
+   same_encoding = (server_encoding == client_encoding);
+   if (!same_encoding)
     {
         string = (char *) pg_server_to_client((unsigned char *) server_string,
                                               strlen(server_string));
@@ -1207,31 +1210,64 @@ CopyAttributeOut(FILE *fp, char *server_string, char *delim)
     else
     {
         string = server_string;
-       string_start = NULL;    /* unused, but keep compiler quiet */
+       string_start = NULL;
     }
  #else
     string = server_string;
  #endif
  
  #ifdef MULTIBYTE
-   for (; (mblen = (server_encoding == client_encoding ? 1 : pg_encoding_mblen(client_encoding, string))) &&
-        ((c = *string) != '\0'); string += mblen)
+   for (; (c = *string) != '\0'; string += mblen)
  #else
     for (; (c = *string) != '\0'; string++)
  #endif
     {
-       if (c == delim[0] || c == '\n' || c == '\\')
-           CopySendChar('\\', fp);
  #ifdef MULTIBYTE
-       for (i = 0; i < mblen; i++)
-           CopySendChar(*(string + i), fp);
-#else
-       CopySendChar(c, fp);
+       mblen = 1;
  #endif
+       switch (c)
+       {
+           case '\b':
+               CopySendString("\\b", fp);
+               break;
+           case '\f':
+               CopySendString("\\f", fp);
+               break;
+           case '\n':
+               CopySendString("\\n", fp);
+               break;
+           case '\r':
+               CopySendString("\\r", fp);
+               break;
+           case '\t':
+               CopySendString("\\t", fp);
+               break;
+           case '\v':
+               CopySendString("\\v", fp);
+               break;
+           case '\\':
+               CopySendString("\\\\", fp);
+               break;
+           default:
+               if (c == delimc)
+                   CopySendChar('\\', fp);
+               CopySendChar(c, fp);
+#ifdef MULTIBYTE
+               /* XXX shouldn't this be done even when encoding is same? */
+               if (!same_encoding)
+               {
+                   /* send additional bytes of the char, if any */
+                   mblen = pg_encoding_mblen(client_encoding, string);
+                   for (i = 1; i < mblen; i++)
+                       CopySendChar(string[i], fp);
+               }
+#endif
+               break;
+       }
     }
  
  #ifdef MULTIBYTE
-   if (client_encoding != server_encoding)
+   if (string_start)
         pfree(string_start);    /* pfree pg_server_to_client result */
  #endif
  }
author	Tom Lane
	Tue, 12 Feb 2002 21:25:41 +0000 (21:25 +0000)
committer	Tom Lane
	Tue, 12 Feb 2002 21:25:41 +0000 (21:25 +0000)
doc/src/sgml/ref/copy.sgml		patch | blob | blame | history
src/backend/commands/copy.c		patch | blob | blame | history