Perform post-escaping encoding validity checks on SQL literals and COPY input
author: Andrew Dunstan
Wed, 12 Sep 2007 20:49:27 +0000 (20:49 +0000)
committer: Andrew Dunstan
Wed, 12 Sep 2007 20:49:27 +0000 (20:49 +0000)
so that invalidly encoded data cannot enter the database by these means.

src/backend/commands/copy.c
src/backend/parser/scan.l

index d28a6ad11c2f3682f60a49730619f4563e8e764b..fdfe5ea965fdfbcb18d5a16ef663180bbbd33955 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.286 2007/09/07 20:59:26 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/commands/copy.c,v 1.287 2007/09/12 20:49:27 adunstan Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -2685,6 +2685,7 @@ CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals)
        char       *start_ptr;
        char       *end_ptr;
        int         input_len;
+       bool        saw_high_bit = false;
 
        /* Make sure space remains in fieldvals[] */
        if (fieldno >= maxfields)
@@ -2749,6 +2750,8 @@ CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals)
                                }
                            }
                            c = val & 0377;
+                           if (IS_HIGHBIT_SET(c))
+                               saw_high_bit = true;
                        }
                        break;
                    case 'x':
@@ -2772,6 +2775,8 @@ CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals)
                                    }
                                }
                                c = val & 0xff;
+                               if (IS_HIGHBIT_SET(c))
+                                   saw_high_bit = true;                            
                            }
                        }
                        break;
@@ -2799,7 +2804,7 @@ CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals)
                         * literally
                         */
                }
-           }
+           }           
 
            /* Add c to output string */
            *output_ptr++ = c;
@@ -2808,6 +2813,16 @@ CopyReadAttributesText(CopyState cstate, int maxfields, char **fieldvals)
        /* Terminate attribute value in output area */
        *output_ptr++ = '\0';
 
+       /* If we de-escaped a char with the high bit set, make sure
+        * we still have valid data for the db encoding. Avoid calling strlen 
+        * here for the sake of efficiency.
+        */
+       if (saw_high_bit)
+       {
+           char *fld = fieldvals[fieldno];
+           pg_verifymbstr(fld, output_ptr - (fld + 1), false);
+       }
+
        /* Check whether raw input matched null marker */
        input_len = end_ptr - start_ptr;
        if (input_len == cstate->null_print_len &&
index baa5992277152ff5216ae24512b59e9717d215e6..a138a66131a4f36df05fa538a79f2d127dbb22b6 100644 (file)
@@ -24,7 +24,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.140 2007/08/12 20:18:06 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.141 2007/09/12 20:49:27 adunstan Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -60,6 +60,7 @@ bool          escape_string_warning = true;
 bool           standard_conforming_strings = false;
 
 static bool        warn_on_first_escape;
+static bool     saw_high_bit = false;
 
 /*
  * literalbuf is used to accumulate literal values when multiple rules
@@ -426,6 +427,7 @@ other           .
 
 {xqstart}      {
                    warn_on_first_escape = true;
+                   saw_high_bit = false;
                    SET_YYLLOC();
                    if (standard_conforming_strings)
                        BEGIN(xq);
@@ -435,6 +437,7 @@ other           .
                }
 {xestart}      {
                    warn_on_first_escape = false;
+                   saw_high_bit = false;
                    SET_YYLLOC();
                    BEGIN(xe);
                    startlit();
@@ -443,6 +446,11 @@ other          .
 {quotefail} {
                    yyless(1);
                    BEGIN(INITIAL);
+                   /* check that the data remains valid if it might have been
+                    * made invalid by unescaping any chars.
+                    */
+                   if (saw_high_bit)
+                       pg_verifymbstr(literalbuf, literallen, false);
                    yylval.str = litbufdup();
                    return SCONST;
                }
@@ -475,12 +483,16 @@ other         .
 
                    check_escape_warning();
                    addlitchar(c);
+                   if (IS_HIGHBIT_SET(c))
+                       saw_high_bit = true;
                }
 {xehexesc}  {
                    unsigned char c = strtoul(yytext+2, NULL, 16);
 
                    check_escape_warning();
                    addlitchar(c);
+                   if (IS_HIGHBIT_SET(c))
+                       saw_high_bit = true;
                }
 {quotecontinue} {
                    /* ignore */
@@ -892,6 +904,14 @@ litbufdup(void)
 static unsigned char
 unescape_single_char(unsigned char c)
 {
+   /* Normally we wouldn't expect to see \n where n has its high bit set,
+    * but we set the flag to check the string if we do get it, so
+    * that this doesn't become a way of getting around the encoding validity
+    * checks.
+    */
+   if (IS_HIGHBIT_SET(c))
+       saw_high_bit = true;
+
    switch (c)
    {
        case 'b':