Make citext's equality and hashing functions collation-insensitive.

author Tom Lane

Wed, 8 Jun 2011 19:24:27 +0000 (15:24 -0400)

committer Tom Lane

Wed, 8 Jun 2011 19:25:02 +0000 (15:25 -0400)
author Tom Lane
Wed, 8 Jun 2011 19:24:27 +0000 (15:24 -0400)
committer Tom Lane
Wed, 8 Jun 2011 19:25:02 +0000 (15:25 -0400)
diff --git a/contrib/citext/citext.c b/contrib/citext/citext.c

index 25e4dd3999b6f181b9d195bf83c473cb3866c5aa..31b952b3f74a1cec5e2a937f83f17883d36b0e93 100644 (file)
--- a/contrib/citext/citext.c
+++ b/contrib/citext/citext.c
@@ -4,6 +4,7 @@
  #include "postgres.h"
  
  #include "access/hash.h"
+#include "catalog/pg_collation.h"
  #include "fmgr.h"
  #include "utils/builtins.h"
  #include "utils/formatting.h"
@@ -48,8 +49,16 @@ citextcmp(text *left, text *right, Oid collid)
                *rcstr;
     int32       result;
  
-   lcstr = str_tolower(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), collid);
-   rcstr = str_tolower(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), collid);
+   /*
+    * We must do our str_tolower calls with DEFAULT_COLLATION_OID, not the
+    * input collation as you might expect.  This is so that the behavior of
+    * citext's equality and hashing functions is not collation-dependent.  We
+    * should change this once the core infrastructure is able to cope with
+    * collation-dependent equality and hashing functions.
+    */
+
+   lcstr = str_tolower(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
+   rcstr = str_tolower(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
  
     result = varstr_cmp(lcstr, strlen(lcstr),
                         rcstr, strlen(rcstr),
@@ -93,7 +102,7 @@ citext_hash(PG_FUNCTION_ARGS)
     char       *str;
     Datum       result;
  
-   str = str_tolower(VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt), PG_GET_COLLATION());
+   str = str_tolower(VARDATA_ANY(txt), VARSIZE_ANY_EXHDR(txt), DEFAULT_COLLATION_OID);
     result = hash_any((unsigned char *) str, strlen(str));
     pfree(str);
  
@@ -122,8 +131,8 @@ citext_eq(PG_FUNCTION_ARGS)
  
     /* We can't compare lengths in advance of downcasing ... */
  
-   lcstr = str_tolower(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), PG_GET_COLLATION());
-   rcstr = str_tolower(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), PG_GET_COLLATION());
+   lcstr = str_tolower(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
+   rcstr = str_tolower(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
  
     /*
      * Since we only care about equality or not-equality, we can avoid all the
@@ -152,8 +161,8 @@ citext_ne(PG_FUNCTION_ARGS)
  
     /* We can't compare lengths in advance of downcasing ... */
  
-   lcstr = str_tolower(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), PG_GET_COLLATION());
-   rcstr = str_tolower(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), PG_GET_COLLATION());
+   lcstr = str_tolower(VARDATA_ANY(left), VARSIZE_ANY_EXHDR(left), DEFAULT_COLLATION_OID);
+   rcstr = str_tolower(VARDATA_ANY(right), VARSIZE_ANY_EXHDR(right), DEFAULT_COLLATION_OID);
  
     /*
      * Since we only care about equality or not-equality, we can avoid all the
diff --git a/doc/src/sgml/citext.sgml b/doc/src/sgml/citext.sgml

index 8cbde88a3e77b8f18f104f9375366a7abf2c2b2d..0c6855fea62b53e128184b1d463b9e9b95a19432 100644 (file)
--- a/doc/src/sgml/citext.sgml
+++ b/doc/src/sgml/citext.sgml
@@ -58,9 +58,9 @@ SELECT * FROM tab WHERE lower(col) = LOWER(?);
      The citext data type allows you to eliminate calls
      to lower in SQL queries, and allows a primary key to
      be case-insensitive. citext is locale-aware, just
-    like text, which means that the comparison of upper case and
+    like text, which means that the matching of upper case and
      lower case characters is dependent on the rules of
-    the LC_CTYPE locale setting. Again, this behavior is
+    the database's LC_CTYPE setting. Again, this behavior is
      identical to the use of lower in queries. But because it's
      done transparently by the data type, you don't have to remember to do
      anything special in your queries.
@@ -97,17 +97,25 @@ SELECT * FROM users WHERE nick = 'Larry';
  
   
    String Comparison Behavior
+
+  
+   citext performs comparisons by converting each string to lower
+   case (as though lower were called) and then comparing the
+   results normally.  Thus, for example, two strings are considered equal
+   if lower would produce identical results for them.
+  
+
    
     In order to emulate a case-insensitive collation as closely as possible,
-   there are citext-specific versions of a number of the comparison
+   there are citext-specific versions of a number of string-processing
     operators and functions.  So, for example, the regular expression
     operators ~ and ~* exhibit the same behavior when
-   applied to citext: they both compare case-insensitively.
+   applied to citext: they both match case-insensitively.
     The same is true
     for !~ and !~*, as well as for the
     LIKE operators ~~ and ~~*, and
     !~~ and !~~*. If you'd like to match
-   case-sensitively, you can always cast to text before comparing.
+   case-sensitively, you can cast the operator's arguments to text.
    
  
    
@@ -168,10 +176,10 @@ SELECT * FROM users WHERE nick = 'Larry';
     
      
       
-      citext's behavior depends on
+      citext's case-folding behavior depends on
        the LC_CTYPE setting of your database. How it compares
-      values is therefore determined when
-      initdb is run to create the cluster. It is not truly
+      values is therefore determined when the database is created.
+      It is not truly
        case-insensitive in the terms defined by the Unicode standard.
        Effectively, what this means is that, as long as you're happy with your
        collation, you should be happy with citext's comparisons. But
@@ -181,6 +189,20 @@ SELECT * FROM users WHERE nick = 'Larry';
       
      
  
+    
+     
+      As of PostgreSQL 9.1, you can attach a
+      COLLATE specification to citext columns or data
+      values.  Currently, citext operators will honor a non-default
+      COLLATE specification while comparing case-folded strings,
+      but the initial folding to lower case is always done according to the
+      database's LC_CTYPE setting (that is, as though
+      COLLATE "default" were given).  This may be changed in a
+      future release so that both steps follow the input COLLATE
+      specification.
+     
+    
+
      
       
         citext is not as efficient as text because the
@@ -198,11 +220,11 @@ SELECT * FROM users WHERE nick = 'Larry';
        contexts.  The standard answer is to use the text type and
        manually use the lower function when you need to compare
        case-insensitively; this works all right if case-insensitive comparison
-      is needed only infrequently.  If you need case-insensitive most of
-      the time and case-sensitive infrequently, consider storing the data
+      is needed only infrequently.  If you need case-insensitive behavior most
+      of the time and case-sensitive behavior only infrequently, consider storing the data
        as citext and explicitly casting the column to text
-      when you want case-sensitive comparison.  In either situation, you
-      will need two indexes if you want both types of searches to be fast.
+      when you want case-sensitive comparison.  In either situation, you will
+      need two indexes if you want both types of searches to be fast.
      
      
  
@@ -210,8 +232,8 @@ SELECT * FROM users WHERE nick = 'Larry';
       
        The schema containing the citext operators must be
        in the current search_path (typically public);
-      if it is not, a normal case-sensitive text comparison
-      is performed.
+      if it is not, the normal case-sensitive text operators
+      will be invoked instead.
author	Tom Lane
	Wed, 8 Jun 2011 19:24:27 +0000 (15:24 -0400)
committer	Tom Lane
	Wed, 8 Jun 2011 19:25:02 +0000 (15:25 -0400)
contrib/citext/citext.c		patch | blob | blame | history
doc/src/sgml/citext.sgml		patch | blob | blame | history