Implement a solution to the 'Turkish locale downcases I incorrectly'
authorTom Lane
Sat, 21 Feb 2004 00:34:53 +0000 (00:34 +0000)
committerTom Lane
Sat, 21 Feb 2004 00:34:53 +0000 (00:34 +0000)
problem, per previous discussion.  Make some additional changes to
centralize the knowledge of just how identifier downcasing is done,
in hopes of simplifying any future tweaking in this area.

src/backend/commands/define.c
src/backend/commands/functioncmds.c
src/backend/commands/proclang.c
src/backend/parser/keywords.c
src/backend/parser/scan.l
src/backend/parser/scansup.c
src/backend/utils/adt/varlena.c
src/include/commands/defrem.h
src/include/parser/scansup.h
src/pl/plpgsql/src/pl_funcs.c

index 8e30d53d3dd149aa570f73c8d4c5e6c418efbcbc..fc24c2c30fb2cbde081275a229868de3516ebc69 100644 (file)
@@ -9,7 +9,7 @@
  *
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/commands/define.c,v 1.85 2003/11/29 19:51:47 pgsql Exp $
+ *   $PostgreSQL: pgsql/src/backend/commands/define.c,v 1.86 2004/02/21 00:34:52 tgl Exp $
  *
  * DESCRIPTION
  *   The "DefineFoo" routines take the parse tree and pick out the
 #include "catalog/namespace.h"
 #include "commands/defrem.h"
 #include "parser/parse_type.h"
+#include "parser/scansup.h"
 #include "utils/int8.h"
 
 
 /*
- * Translate the input language name to lower case.
+ * Translate the input language name to lower case, and truncate if needed.
  *
- * Output buffer must be NAMEDATALEN long.
+ * Returns a palloc'd string
  */
-void
-case_translate_language_name(const char *input, char *output)
+char *
+case_translate_language_name(const char *input)
 {
-   int         i;
-
-   MemSet(output, 0, NAMEDATALEN);     /* ensure result Name is
-                                        * zero-filled */
-
-   for (i = 0; i < NAMEDATALEN - 1 && input[i]; ++i)
-       output[i] = tolower((unsigned char) input[i]);
+   return downcase_truncate_identifier(input, strlen(input), false);
 }
 
 
index 2eb4c100a2b2ae9701970b686666ab4f86f06eff..c91b31ed6fd58f9238ee0b1f917a069d042820f7 100644 (file)
@@ -10,7 +10,7 @@
  *
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/commands/functioncmds.c,v 1.43 2004/01/06 23:55:18 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/commands/functioncmds.c,v 1.44 2004/02/21 00:34:52 tgl Exp $
  *
  * DESCRIPTION
  *   These routines take the parse tree and pick out the
@@ -401,7 +401,7 @@ CreateFunction(CreateFunctionStmt *stmt)
    Oid         prorettype;
    bool        returnsSet;
    char       *language;
-   char        languageName[NAMEDATALEN];
+   char       *languageName;
    Oid         languageOid;
    Oid         languageValidator;
    char       *funcname;
@@ -437,7 +437,7 @@ CreateFunction(CreateFunctionStmt *stmt)
               &as_clause, &language, &volatility, &isStrict, &security);
 
    /* Convert language name to canonical case */
-   case_translate_language_name(language, languageName);
+   languageName = case_translate_language_name(language);
 
    /* Look up the language and validate permissions */
    languageTuple = SearchSysCache(LANGNAME,
index 3c8e3185cb91985891bb87f18619ace0b87fc56c..ba6929325bd776250b949511b06935fb958c58fd 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/commands/proclang.c,v 1.52 2003/11/29 19:51:47 pgsql Exp $
+ *   $PostgreSQL: pgsql/src/backend/commands/proclang.c,v 1.53 2004/02/21 00:34:52 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 void
 CreateProceduralLanguage(CreatePLangStmt *stmt)
 {
-   char        languageName[NAMEDATALEN];
+   char       *languageName;
    Oid         procOid,
                valProcOid;
    Oid         funcrettype;
    Oid         typev[FUNC_MAX_ARGS];
+   NameData    langname;
    char        nulls[Natts_pg_language];
    Datum       values[Natts_pg_language];
    Relation    rel;
@@ -66,7 +67,7 @@ CreateProceduralLanguage(CreatePLangStmt *stmt)
     * Translate the language name and check that this language doesn't
     * already exist
     */
-   case_translate_language_name(stmt->plname, languageName);
+   languageName = case_translate_language_name(stmt->plname);
 
    if (SearchSysCacheExists(LANGNAME,
                             PointerGetDatum(languageName),
@@ -124,12 +125,13 @@ CreateProceduralLanguage(CreatePLangStmt *stmt)
    }
 
    i = 0;
-   values[i++] = PointerGetDatum(languageName);
-   values[i++] = BoolGetDatum(true);   /* lanispl */
-   values[i++] = BoolGetDatum(stmt->pltrusted);
-   values[i++] = ObjectIdGetDatum(procOid);
-   values[i++] = ObjectIdGetDatum(valProcOid);
-   nulls[i] = 'n';             /* lanacl */
+   namestrcpy(&langname, languageName);
+   values[i++] = NameGetDatum(&langname);          /* lanname */
+   values[i++] = BoolGetDatum(true);               /* lanispl */
+   values[i++] = BoolGetDatum(stmt->pltrusted);    /* lanpltrusted */
+   values[i++] = ObjectIdGetDatum(procOid);        /* lanplcallfoid */
+   values[i++] = ObjectIdGetDatum(valProcOid);     /* lanvalidator */
+   nulls[i] = 'n';                                 /* lanacl */
 
    rel = heap_openr(LanguageRelationName, RowExclusiveLock);
 
@@ -173,7 +175,7 @@ CreateProceduralLanguage(CreatePLangStmt *stmt)
 void
 DropProceduralLanguage(DropPLangStmt *stmt)
 {
-   char        languageName[NAMEDATALEN];
+   char       *languageName;
    HeapTuple   langTup;
    ObjectAddress object;
 
@@ -189,7 +191,7 @@ DropProceduralLanguage(DropPLangStmt *stmt)
     * Translate the language name, check that this language exist and is
     * a PL
     */
-   case_translate_language_name(stmt->plname, languageName);
+   languageName = case_translate_language_name(stmt->plname);
 
    langTup = SearchSysCache(LANGNAME,
                             CStringGetDatum(languageName),
index 57e020c10808d3879821a8d455eba7bd6b6aaf23..a94786690ed52e04190cdc4e43bab3a54aea0433 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/parser/keywords.c,v 1.144 2003/11/29 19:51:51 pgsql Exp $
+ *   $PostgreSQL: pgsql/src/backend/parser/keywords.c,v 1.145 2004/02/21 00:34:52 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -369,17 +369,13 @@ ScanKeywordLookup(const char *text)
 
    /*
     * Apply an ASCII-only downcasing.  We must not use tolower() since it
-    * may produce the wrong translation in some locales (eg, Turkish),
-    * and we don't trust isupper() very much either.  In an ASCII-based
-    * encoding the tests against A and Z are sufficient, but we also
-    * check isupper() so that we will work correctly under EBCDIC.  The
-    * actual case conversion step should work for either ASCII or EBCDIC.
+    * may produce the wrong translation in some locales (eg, Turkish).
     */
    for (i = 0; i < len; i++)
    {
        char        ch = text[i];
 
-       if (ch >= 'A' && ch <= 'Z' && isupper((unsigned char) ch))
+       if (ch >= 'A' && ch <= 'Z')
            ch += 'a' - 'A';
        word[i] = ch;
    }
index 13cbfb9895e52478c2e8f84bd893ce1f6fb18945..caab9a002cf075298accde359be5d39c02e1e89d 100644 (file)
@@ -10,7 +10,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.113 2004/02/19 19:11:30 tgl Exp $
+ *   $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.114 2004/02/21 00:34:52 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -27,6 +27,7 @@
 #include "parser/keywords.h"
 /* Not needed now that this file is compiled as part of gram.y */
 /* #include "parser/parse.h" */
+#include "parser/scansup.h"
 #include "utils/builtins.h"
 #include "mb/pg_wchar.h"
 
@@ -395,23 +396,15 @@ other         .
                    startlit();
                }
 {xdstop}   {
+                   char           *ident;
+
                    BEGIN(INITIAL);
                    if (literallen == 0)
                        yyerror("zero-length delimited identifier");
+                   ident = litbufdup();
                    if (literallen >= NAMEDATALEN)
-                   {
-                       int len;
-
-                       len = pg_mbcliplen(literalbuf, literallen,
-                                          NAMEDATALEN-1);
-                       ereport(NOTICE,
-                               (errcode(ERRCODE_NAME_TOO_LONG),
-                                errmsg("identifier \"%s\" will be truncated to \"%.*s\"",
-                                       literalbuf, len, literalbuf)));
-                       literalbuf[len] = '\0';
-                       literallen = len;
-                   }
-                   yylval.str = litbufdup();
+                       truncate_identifier(ident, literallen, true);
+                   yylval.str = ident;
                    return IDENT;
                }
 {xddouble} {
@@ -537,7 +530,6 @@ other           .
 {identifier}   {
                    const ScanKeyword *keyword;
                    char           *ident;
-                   int             i;
 
                    /* Is it a keyword? */
                    keyword = ScanKeywordLookup(yytext);
@@ -550,28 +542,8 @@ other          .
                    /*
                     * No.  Convert the identifier to lower case, and truncate
                     * if necessary.
-                    *
-                    * Note: here we use a locale-dependent case conversion,
-                    * which seems appropriate under standard SQL rules, whereas
-                    * the keyword comparison was NOT locale-dependent.
                     */
-                   ident = pstrdup(yytext);
-                   for (i = 0; ident[i]; i++)
-                   {
-                       if (isupper((unsigned char) ident[i]))
-                           ident[i] = tolower((unsigned char) ident[i]);
-                   }
-                   if (i >= NAMEDATALEN)
-                    {
-                       int len;
-
-                       len = pg_mbcliplen(ident, i, NAMEDATALEN-1);
-                       ereport(NOTICE,
-                               (errcode(ERRCODE_NAME_TOO_LONG),
-                                errmsg("identifier \"%s\" will be truncated to \"%.*s\"",
-                                       ident, len, ident)));
-                       ident[len] = '\0';
-                    }
+                   ident = downcase_truncate_identifier(yytext, yyleng, true);
                    yylval.str = ident;
                    return IDENT;
                }
index 9177b858a794f23e5644087e684576d4d979f416..76c620b394e595b04b87ddf3cc820f92a1275f33 100644 (file)
@@ -9,7 +9,7 @@
  *
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/parser/scansup.c,v 1.25 2003/11/29 19:51:52 pgsql Exp $
+ *   $PostgreSQL: pgsql/src/backend/parser/scansup.c,v 1.26 2004/02/21 00:34:53 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -19,6 +19,8 @@
 
 #include "miscadmin.h"
 #include "parser/scansup.h"
+#include "mb/pg_wchar.h"
+
 
 /* ----------------
  *     scanstr
@@ -32,7 +34,7 @@
  */
 
 char *
-scanstr(char *s)
+scanstr(const char *s)
 {
    char       *newStr;
    int         len,
@@ -109,3 +111,75 @@ scanstr(char *s)
    newStr[j] = '\0';
    return newStr;
 }
+
+
+/*
+ * downcase_truncate_identifier() --- downcase an unquoted identifier and
+ * clip it to at most NAMEDATALEN-1 bytes; issue a NOTICE on truncation
+ * if "warn" is true.
+ *
+ * Returns a palloc'd, null-terminated copy; the input is not modified.
+ *
+ * Note: "ident" need not be null-terminated; only "len" bytes are read.
+ *
+ * Note: the API of this function is designed to allow for downcasing
+ * transformations that increase the string length, but we don't yet
+ * support that.  If you want to implement it, you'll need to fix
+ * SplitIdentifierString() in utils/adt/varlena.c.
+ */
+char *
+downcase_truncate_identifier(const char *ident, int len, bool warn)
+{
+   char       *result;
+   int         i;
+
+   result = palloc(len + 1);
+   /*
+    * SQL99 specifies Unicode-aware case normalization, which we don't yet
+    * have the infrastructure for.  Instead we use tolower() to provide a
+    * locale-aware translation.  However, some locales get this wrong too
+    * (eg, Turkish may do strange things with 'i' and 'I'), so we compromise:
+    * ASCII-only downcasing for 7-bit characters, tolower() for bytes with
+    * the high bit set.  NOTE(review): the loop is byte-wise, so bytes of
+    * multibyte characters reach isupper()/tolower() too --- confirm safe.
+    */
+   for (i = 0; i < len; i++)
+   {
+       unsigned char   ch = (unsigned char) ident[i];
+
+       if (ch >= 'A' && ch <= 'Z')
+           ch += 'a' - 'A';
+       else if (ch >= 0x80 && isupper(ch))
+           ch = tolower(ch);
+       result[i] = (char) ch;
+   }
+   result[i] = '\0';
+
+   if (i >= NAMEDATALEN)
+       truncate_identifier(result, i, warn);
+
+   return result;
+}
+
+/*
+ * truncate_identifier() --- clip an identifier to at most NAMEDATALEN-1
+ * bytes, using pg_mbcliplen() to choose the cut point.
+ *
+ * The string is modified in-place, only if len >= NAMEDATALEN.  If "warn"
+ * is true a NOTICE (ERRCODE_NAME_TOO_LONG) is issued before truncating.
+ *
+ * "len" is the string's length in bytes; the caller passes it in since
+ * this saves a strlen() call in some common usages.
+ */
+void
+truncate_identifier(char *ident, int len, bool warn)
+{
+   if (len >= NAMEDATALEN)
+   {
+       len = pg_mbcliplen(ident, len, NAMEDATALEN-1);
+       if (warn)
+           ereport(NOTICE,
+                   (errcode(ERRCODE_NAME_TOO_LONG),
+                    errmsg("identifier \"%s\" will be truncated to \"%.*s\"",
+                           ident, len, ident)));
+       ident[len] = '\0';
+   }
+}
index 3d96ce23ac5e5a74d590b7466656361ef2c42268..f329486321d96c8af9fe9b9e28a5196e65cb7f36 100644 (file)
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.111 2004/01/31 05:09:40 neilc Exp $
+ *   $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.112 2004/02/21 00:34:53 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 
 #include <ctype.h>
 
-#include "mb/pg_wchar.h"
-#include "miscadmin.h"
 #include "access/tuptoaster.h"
 #include "catalog/pg_type.h"
 #include "lib/stringinfo.h"
 #include "libpq/crypt.h"
 #include "libpq/pqformat.h"
+#include "mb/pg_wchar.h"
+#include "miscadmin.h"
+#include "parser/scansup.h"
 #include "utils/array.h"
 #include "utils/builtins.h"
-#include "utils/pg_locale.h"
 #include "utils/lsyscache.h"
+#include "utils/pg_locale.h"
 
 
 typedef struct varlena unknown;
@@ -1695,7 +1696,6 @@ SplitIdentifierString(char *rawstring, char separator,
    {
        char       *curname;
        char       *endp;
-       int         curlen;
 
        if (*nextp == '\"')
        {
@@ -1718,21 +1718,30 @@ SplitIdentifierString(char *rawstring, char separator,
        else
        {
            /* Unquoted name --- extends to separator or whitespace */
+           char       *downname;
+           int         len;
+
            curname = nextp;
            while (*nextp && *nextp != separator &&
                   !isspace((unsigned char) *nextp))
-           {
-               /*
-                * It's important that this match the identifier
-                * downcasing code used by backend/parser/scan.l.
-                */
-               if (isupper((unsigned char) *nextp))
-                   *nextp = tolower((unsigned char) *nextp);
                nextp++;
-           }
            endp = nextp;
            if (curname == nextp)
                return false;   /* empty unquoted name not allowed */
+           /*
+            * Downcase the identifier, using same code as main lexer does.
+            *
+            * XXX because we want to overwrite the input in-place, we cannot
+            * support a downcasing transformation that increases the
+            * string length.  This is not a problem given the current
+            * implementation of downcase_truncate_identifier, but we'll
+            * probably have to do something about this someday.
+            */
+           len = endp - curname;
+           downname = downcase_truncate_identifier(curname, len, false);
+           Assert(strlen(downname) <= len);
+           strncpy(curname, downname, len);
+           pfree(downname);
        }
 
        while (isspace((unsigned char) *nextp))
@@ -1753,13 +1762,8 @@ SplitIdentifierString(char *rawstring, char separator,
        /* Now safe to overwrite separator with a null */
        *endp = '\0';
 
-       /* Truncate name if it's overlength; again, should match scan.l */
-       curlen = strlen(curname);
-       if (curlen >= NAMEDATALEN)
-       {
-           curlen = pg_mbcliplen(curname, curlen, NAMEDATALEN - 1);
-           curname[curlen] = '\0';
-       }
+       /* Truncate name if it's overlength */
+       truncate_identifier(curname, strlen(curname), false);
 
        /*
         * Finished isolating current name --- add it to list
index a462dd55acb1c42c9313b7bce26f8c65f1cab9f3..00f5fa1a4801c23e8a72d0791da80f3fc2d5e236 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/commands/defrem.h,v 1.53 2003/11/29 22:40:59 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/commands/defrem.h,v 1.54 2004/02/21 00:34:53 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -61,7 +61,7 @@ extern void RenameOpClass(List *name, const char *access_method, const char *new
 
 /* support routines in commands/define.c */
 
-extern void case_translate_language_name(const char *input, char *output);
+extern char *case_translate_language_name(const char *input);
 
 extern char *defGetString(DefElem *def);
 extern double defGetNumeric(DefElem *def);
index caa2f5d1727d9541e776d6ea3b03149324a5fa4e..d710c81060a6daee4e305a5fcffc7a13c4e80c47 100644 (file)
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/parser/scansup.h,v 1.14 2003/11/29 22:41:09 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/parser/scansup.h,v 1.15 2004/02/21 00:34:53 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #ifndef SCANSUP_H
 #define SCANSUP_H
 
-extern char *scanstr(char *s);
+extern char *scanstr(const char *s);
+
+extern char *downcase_truncate_identifier(const char *ident, int len,
+                                         bool warn);
+
+extern void truncate_identifier(char *ident, int len, bool warn);
 
 #endif   /* SCANSUP_H */
index b9a8a8b0ba940513881f5967b721874a0c9b1804..f49a2ac500c72da0526c68badbf5cba5bec49839 100644 (file)
@@ -3,7 +3,7 @@
  *           procedural language
  *
  * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/pl/plpgsql/src/pl_funcs.c,v 1.31 2003/11/29 19:52:12 pgsql Exp $
+ *   $PostgreSQL: pgsql/src/pl/plpgsql/src/pl_funcs.c,v 1.32 2004/02/21 00:34:53 tgl Exp $
  *
  *   This software is copyrighted by Jan Wieck - Hamburg.
  *
@@ -40,7 +40,7 @@
 
 #include <ctype.h>
 
-#include "mb/pg_wchar.h"
+#include "parser/scansup.h"
 
 
 /* ----------
@@ -348,15 +348,15 @@ plpgsql_convert_ident(const char *s, char **output, int numidents)
    {
        char       *curident;
        char       *cp;
-       int         i;
 
        /* Process current identifier */
-       curident = palloc(strlen(s) + 1);       /* surely enough room */
-       cp = curident;
 
        if (*s == '"')
        {
            /* Quoted identifier: copy, collapsing out doubled quotes */
+
+           curident = palloc(strlen(s) + 1); /* surely enough room */
+           cp = curident;
            s++;
            while (*s)
            {
@@ -373,35 +373,20 @@ plpgsql_convert_ident(const char *s, char **output, int numidents)
                        (errcode(ERRCODE_SYNTAX_ERROR),
                         errmsg("unterminated \" in name: %s", sstart)));
            s++;
+           *cp = '\0';
+           /* Truncate to NAMEDATALEN */
+           truncate_identifier(curident, cp-curident, false);
        }
        else
        {
-           /*
-            * Normal identifier: downcase, stop at dot or whitespace.
-            *
-            * Note that downcasing is locale-sensitive, following SQL99
-            * rules for identifiers.  We have already decided that the
-            * item is not a PLPGSQL keyword.
-            */
-           while (*s && *s != '.' && !isspace((unsigned char) *s))
-           {
-               if (isupper((unsigned char) *s))
-                   *cp++ = tolower((unsigned char) *s++);
-               else
-                   *cp++ = *s++;
-           }
-       }
-
-       /* Truncate to NAMEDATALEN */
-       *cp = '\0';
-       i = cp - curident;
-
-       if (i >= NAMEDATALEN)
-       {
-           int         len;
+           /* Normal identifier: extends till dot or whitespace */
+           const char *thisstart = s;
 
-           len = pg_mbcliplen(curident, i, NAMEDATALEN - 1);
-           curident[len] = '\0';
+           while (*s && *s != '.' && !isspace((unsigned char) *s))
+               s++;
+           /* Downcase and truncate to NAMEDATALEN */
+           curident = downcase_truncate_identifier(thisstart, s-thisstart,
+                                                   false);
        }
 
        /* Pass ident to caller */