Fix de-escaping checks so that we will reject \000 as well as other invalidly
author: Tom Lane
Sun, 19 Apr 2009 21:08:54 +0000 (21:08 +0000)
committer: Tom Lane
Sun, 19 Apr 2009 21:08:54 +0000 (21:08 +0000)
encoded sequences.  Per discussion of a couple of days ago.

src/backend/commands/copy.c
src/backend/parser/scan.l

index 90ceb77bbbef31c88a0f69d1330b2cbf7e0eec4f..9ba7c5fc03735dc700d3bb2b7c70e126e4c6b79c 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.307 2009/03/31 22:12:46 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.308 2009/04/19 21:08:54 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -2718,7 +2718,7 @@ CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals)
        char       *start_ptr;
        char       *end_ptr;
        int         input_len;
-       bool        saw_high_bit = false;
+       bool        saw_non_ascii = false;
 
        /* Make sure space remains in fieldvals[] */
        if (fieldno >= maxfields)
@@ -2783,8 +2783,8 @@ CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals)
                                }
                            }
                            c = val & 0377;
-                           if (IS_HIGHBIT_SET(c))
-                               saw_high_bit = true;
+                           if (c == '\0' || IS_HIGHBIT_SET(c))
+                               saw_non_ascii = true;
                        }
                        break;
                    case 'x':
@@ -2808,8 +2808,8 @@ CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals)
                                    }
                                }
                                c = val & 0xff;
-                               if (IS_HIGHBIT_SET(c))
-                                   saw_high_bit = true;
+                               if (c == '\0' || IS_HIGHBIT_SET(c))
+                                   saw_non_ascii = true;
                            }
                        }
                        break;
@@ -2847,11 +2847,11 @@ CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals)
        *output_ptr++ = '\0';
 
        /*
-        * If we de-escaped a char with the high bit set, make sure we still
+        * If we de-escaped a non-7-bit-ASCII char, make sure we still
         * have valid data for the db encoding. Avoid calling strlen here for
         * the sake of efficiency.
         */
-       if (saw_high_bit)
+       if (saw_non_ascii)
        {
            char       *fld = fieldvals[fieldno];
 
index a3d4d857c89b5b3c42eddd4bcbddd1fa61317e1e..8551cd27538eed6a8b1b2fa75560ec251c104a14 100644 (file)
@@ -24,7 +24,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.150 2009/04/14 22:18:47 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.151 2009/04/19 21:08:54 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -60,7 +60,7 @@ bool          escape_string_warning = true;
 bool           standard_conforming_strings = false;
 
 static bool        warn_on_first_escape;
-static bool     saw_high_bit = false;
+static bool        saw_non_ascii = false;
 
 /*
  * literalbuf is used to accumulate literal values when multiple rules
@@ -453,7 +453,7 @@ other           .
 
 {xqstart}      {
                    warn_on_first_escape = true;
-                   saw_high_bit = false;
+                   saw_non_ascii = false;
                    SET_YYLLOC();
                    if (standard_conforming_strings)
                        BEGIN(xq);
@@ -463,7 +463,7 @@ other           .
                }
 {xestart}      {
                    warn_on_first_escape = false;
-                   saw_high_bit = false;
+                   saw_non_ascii = false;
                    SET_YYLLOC();
                    BEGIN(xe);
                    startlit();
@@ -477,10 +477,11 @@ other         .
 {quotefail} {
                    yyless(1);
                    BEGIN(INITIAL);
-                   /* check that the data remains valid if it might have been
+                   /*
+                    * check that the data remains valid if it might have been
                     * made invalid by unescaping any chars.
                     */
-                   if (saw_high_bit)
+                   if (saw_non_ascii)
                        pg_verifymbstr(literalbuf, literallen, false);
                    yylval.str = litbufdup();
                    return SCONST;
@@ -526,16 +527,16 @@ other         .
 
                    check_escape_warning();
                    addlitchar(c);
-                   if (IS_HIGHBIT_SET(c))
-                       saw_high_bit = true;
+                   if (c == '\0' || IS_HIGHBIT_SET(c))
+                       saw_non_ascii = true;
                }
 {xehexesc}  {
                    unsigned char c = strtoul(yytext+2, NULL, 16);
 
                    check_escape_warning();
                    addlitchar(c);
-                   if (IS_HIGHBIT_SET(c))
-                       saw_high_bit = true;
+                   if (c == '\0' || IS_HIGHBIT_SET(c))
+                       saw_non_ascii = true;
                }
 {quotecontinue} {
                    /* ignore */
@@ -1083,6 +1084,11 @@ litbuf_udeescape(unsigned char escape)
    }
 
    *out = '\0';
+   /*
+    * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
+    * codes; but it's probably not worth the trouble, since this isn't
+    * likely to be a performance-critical path.
+    */
    pg_verifymbstr(new, out - new, false);
    return new;
 }
@@ -1090,14 +1096,6 @@ litbuf_udeescape(unsigned char escape)
 static unsigned char
 unescape_single_char(unsigned char c)
 {
-   /* Normally we wouldn't expect to see \n where n has its high bit set
-    * but we set the flag to check the string if we do get it, so
-    * that this doesn't become a way of getting around the coding validity
-    * checks.
-    */
-   if (IS_HIGHBIT_SET(c))
-       saw_high_bit = true;
-
    switch (c)
    {
        case 'b':
@@ -1111,6 +1109,10 @@ unescape_single_char(unsigned char c)
        case 't':
            return '\t';
        default:
+           /* check for backslash followed by non-7-bit-ASCII */
+           if (c == '\0' || IS_HIGHBIT_SET(c))
+               saw_non_ascii = true;
+
            return c;
    }
 }