Improve support of multibyte encoding:
author Teodor Sigaev
Mon, 12 Dec 2005 11:10:12 +0000 (11:10 +0000)
committer Teodor Sigaev
Mon, 12 Dec 2005 11:10:12 +0000 (11:10 +0000)
- tsvector_(in|out)
- tsquery_(in|out)
- to_tsvector
- to_tsquery, plainto_tsquery
- 'simple' dictionary

19 files changed:
contrib/tsearch2/dict.h
contrib/tsearch2/dict_ex.c
contrib/tsearch2/dict_ispell.c
contrib/tsearch2/dict_snowball.c
contrib/tsearch2/dict_syn.c
contrib/tsearch2/gendict/dict_snowball.c.IN
contrib/tsearch2/gendict/dict_tmpl.c.IN
contrib/tsearch2/ispell/spell.c
contrib/tsearch2/prs_dcfg.c
contrib/tsearch2/query.c
contrib/tsearch2/query.h
contrib/tsearch2/stopword.c
contrib/tsearch2/ts_locale.c
contrib/tsearch2/ts_locale.h
contrib/tsearch2/ts_stat.c
contrib/tsearch2/tsvector.c
contrib/tsearch2/tsvector_op.c
contrib/tsearch2/wordparser/parser.c
contrib/tsearch2/wordparser/parser.h

index 0227bb484508f7d8087926b6e097a276566ffabb..8aef0b0cb70dd0d919ef1ffd360a8ee4887ed84c 100644 (file)
@@ -14,7 +14,6 @@ void      sortstoplist(StopList * s);
 void       freestoplist(StopList * s);
 void       readstoplist(text *in, StopList * s);
 bool       searchstoplist(StopList * s, char *key);
-char      *lowerstr(char *str);
 
 typedef struct
 {
index 8ec3950f9f88353fd2fbc1086d18c9992723120e..334bb5248d8a7ac56cf759670c682090585ca981 100644 (file)
@@ -6,6 +6,7 @@
 
 #include "dict.h"
 #include "common.h"
+#include "ts_locale.h"
 
 typedef struct
 {
index 28ce70a285e7f49a92ca7051739c01bdb2de6867..0e887da584a81cfc199538213f259b4ab1c8ee82 100644 (file)
@@ -9,6 +9,7 @@
 #include "dict.h"
 #include "common.h"
 #include "ispell/spell.h"
+#include "ts_locale.h"
 
 typedef struct
 {
index 0c08c293d360cd4dd11c0d641dc1d9af61a2ab1e..bbd44246b8ecb4100eec8d7face2c068d2a5021f 100644 (file)
@@ -10,6 +10,7 @@
 #include "snowball/header.h"
 #include "snowball/english_stem.h"
 #include "snowball/russian_stem.h"
+#include "ts_locale.h"
 
 typedef struct
 {
index f3281520809d9fa78dd5979f7b36f5c93e18ef6c..b0c50334eac9a8b72b6e74df8c2dd542d1a2cd59 100644 (file)
@@ -8,6 +8,7 @@
 
 #include "dict.h"
 #include "common.h"
+#include "ts_locale.h"
 
 #define SYNBUFLEN  4096
 typedef struct
index ec25edc0fffeb709075df7ae5f5ef1b0f22bd599..818fd6b15770c07622115b9622c61a452a0db159 100644 (file)
@@ -12,6 +12,7 @@
 #include "common.h"
 #include "snowball/header.h"
 #include "subinclude.h"
+#include "ts_locale.h"
 
 typedef struct {
    struct SN_env *z;
index e534ed30a78ab5560d25cea199d687446ee16f8f..9d90df712bf3796cc2adaf8aded27c3423d92380 100644 (file)
@@ -12,6 +12,7 @@
 #include "common.h"
 
 #include "subinclude.h"
+#include "ts_locale.h"
 
 HASINIT typedef struct {
 HASINIT    StopList    stoplist;
index 9999983cc83632dcf9de0e9ce743d54974f921c5..baa36f31f10f988344b7814b141b83cba08410c9 100644 (file)
@@ -6,6 +6,7 @@
 #include "postgres.h"
 
 #include "spell.h"
+#include "ts_locale.h"
 
 #define MAX_NORM 1024
 #define MAXNORMLEN 256
@@ -30,18 +31,6 @@ cmpspellaffix(const void *s1, const void *s2)
    return (strcmp(((const SPELL *) s1)->p.flag, ((const SPELL *) s2)->p.flag));
 }
 
-static void
-strlower(char *str)
-{
-   unsigned char *ptr = (unsigned char *) str;
-
-   while (*ptr)
-   {
-       *ptr = tolower(*ptr);
-       ptr++;
-   }
-}
-
 static char *
 strnduplicate(char *s, int len)
 {
@@ -175,7 +164,7 @@ NIImportDictionary(IspellDict * Conf, const char *filename)
        }
        else
            flag = "";
-       strlower(str);
+       lowerstr(str);
        /* Dont load words if first letter is not required */
        /* It allows to optimize loading at  search time   */
        s = str;
@@ -385,7 +374,7 @@ NIImportAffixes(IspellDict * Conf, const char *filename)
            *s = 0;
        if (!*str)
            continue;
-       strlower(str);
+       lowerstr(str);
        strcpy(mask, "");
        strcpy(find, "");
        strcpy(repl, "");
@@ -851,7 +840,7 @@ NormalizeSubWord(IspellDict * Conf, char *word, char flag)
 
    if (wrdlen > MAXNORMLEN)
        return NULL;
-   strlower(word);
+   lowerstr(word);
    cur = forms = (char **) palloc(MAX_NORM * sizeof(char *));
    *cur = NULL;
 
index 240aaa44973a508f9ceb5e8ac14d1f053575f1f1..c54ca11803c0bfd1ba602be15eba3176d517bf87 100644 (file)
@@ -8,6 +8,7 @@
 
 #include "dict.h"
 #include "common.h"
+#include "ts_locale.h"
 
 #define CS_WAITKEY 0
 #define CS_INKEY   1
@@ -30,11 +31,11 @@ nstrdup(char *ptr, int len)
    cptr = ptr = res;
    while (*ptr)
    {
-       if (*ptr == '\\')
+       if (t_iseq(ptr, '\\'))
            ptr++;
-       *cptr = *ptr;
-       ptr++;
-       cptr++;
+       COPYCHAR( cptr, ptr );
+       cptr+=pg_mblen(ptr);
+       ptr+=pg_mblen(ptr);
    }
    *cptr = '\0';
 
@@ -52,9 +53,9 @@ parse_cfgdict(text *in, Map ** m)
 
    while (ptr - VARDATA(in) < VARSIZE(in) - VARHDRSZ)
    {
-       if (*ptr == ',')
+       if ( t_iseq(ptr, ',') )
            num++;
-       ptr++;
+       ptr+=pg_mblen(ptr);
    }
 
    *m = mptr = (Map *) palloc(sizeof(Map) * (num + 2));
@@ -64,56 +65,56 @@ parse_cfgdict(text *in, Map ** m)
    {
        if (state == CS_WAITKEY)
        {
-           if (isalpha((unsigned char) *ptr))
+           if (t_isalpha(ptr))
            {
                begin = ptr;
                state = CS_INKEY;
            }
-           else if (!isspace((unsigned char) *ptr))
+           else if (!t_isspace(ptr))
                ereport(ERROR,
                        (errcode(ERRCODE_SYNTAX_ERROR),
                         errmsg("syntax error"),
-                        errdetail("Syntax error in position %d near \"%c\"",
-                                  (int) (ptr - VARDATA(in)), *ptr)));
+                        errdetail("Syntax error in position %d",
+                                  (int) (ptr - VARDATA(in)))));
        }
        else if (state == CS_INKEY)
        {
-           if (isspace((unsigned char) *ptr))
+           if (t_isspace(ptr))
            {
                mptr->key = nstrdup(begin, ptr - begin);
                state = CS_WAITEQ;
            }
-           else if (*ptr == '=')
+           else if (t_iseq(ptr,'='))
            {
                mptr->key = nstrdup(begin, ptr - begin);
                state = CS_WAITVALUE;
            }
-           else if (!isalpha((unsigned char) *ptr))
+           else if (!t_isalpha(ptr))
                ereport(ERROR,
                        (errcode(ERRCODE_SYNTAX_ERROR),
                         errmsg("syntax error"),
-                        errdetail("Syntax error in position %d near \"%c\"",
-                                  (int) (ptr - VARDATA(in)), *ptr)));
+                        errdetail("Syntax error in position %d",
+                                  (int) (ptr - VARDATA(in)))));
        }
        else if (state == CS_WAITEQ)
        {
-           if (*ptr == '=')
+           if (t_iseq(ptr, '='))
                state = CS_WAITVALUE;
-           else if (!isspace((unsigned char) *ptr))
+           else if (!t_isspace(ptr))
                ereport(ERROR,
                        (errcode(ERRCODE_SYNTAX_ERROR),
                         errmsg("syntax error"),
-                        errdetail("Syntax error in position %d near \"%c\"",
-                                  (int) (ptr - VARDATA(in)), *ptr)));
+                        errdetail("Syntax error in position %d",
+                                  (int) (ptr - VARDATA(in)))));
        }
        else if (state == CS_WAITVALUE)
        {
-           if (*ptr == '"')
+           if (t_iseq(ptr, '"'))
            {
                begin = ptr + 1;
                state = CS_INVALUE;
            }
-           else if (!isspace((unsigned char) *ptr))
+           else if (!t_isspace(ptr))
            {
                begin = ptr;
                state = CS_IN2VALUE;
@@ -121,36 +122,36 @@ parse_cfgdict(text *in, Map ** m)
        }
        else if (state == CS_INVALUE)
        {
-           if (*ptr == '"')
+           if (t_iseq(ptr, '"'))
            {
                mptr->value = nstrdup(begin, ptr - begin);
                mptr++;
                state = CS_WAITDELIM;
            }
-           else if (*ptr == '\\')
+           else if (t_iseq(ptr, '\\'))
                state = CS_INESC;
        }
        else if (state == CS_IN2VALUE)
        {
-           if (isspace((unsigned char) *ptr) || *ptr == ',')
+           if (t_isspace(ptr) || t_iseq(ptr, ','))
            {
                mptr->value = nstrdup(begin, ptr - begin);
                mptr++;
-               state = (*ptr == ',') ? CS_WAITKEY : CS_WAITDELIM;
+               state = (t_iseq(ptr, ',')) ? CS_WAITKEY : CS_WAITDELIM;
            }
-           else if (*ptr == '\\')
+           else if (t_iseq(ptr, '\\'))
                state = CS_INESC;
        }
        else if (state == CS_WAITDELIM)
        {
-           if (*ptr == ',')
+           if (t_iseq(ptr, ','))
                state = CS_WAITKEY;
-           else if (!isspace((unsigned char) *ptr))
+           else if (!t_isspace(ptr))
                ereport(ERROR,
                        (errcode(ERRCODE_SYNTAX_ERROR),
                         errmsg("syntax error"),
-                        errdetail("Syntax error in position %d near \"%c\"",
-                                  (int) (ptr - VARDATA(in)), *ptr)));
+                        errdetail("Syntax error in position %d",
+                                  (int) (ptr - VARDATA(in)))));
        }
        else if (state == CS_INESC)
            state = CS_INVALUE;
@@ -160,9 +161,9 @@ parse_cfgdict(text *in, Map ** m)
            ereport(ERROR,
                    (errcode(ERRCODE_SYNTAX_ERROR),
                     errmsg("bad parser state"),
-                    errdetail("%d at position %d near \"%c\"",
-                              state, (int) (ptr - VARDATA(in)), *ptr)));
-       ptr++;
+                    errdetail("%d at position %d",
+                              state, (int) (ptr - VARDATA(in)))));
+       ptr+=pg_mblen(ptr);
    }
 
    if (state == CS_IN2VALUE)
index de6d96ed52eb29c94f12ece44843d98d3d21712c..e6285fd9d2ec4b3e8d41b6352c5242ddacc9fc91 100644 (file)
@@ -25,7 +25,7 @@
 #include "query.h"
 #include "query_cleanup.h"
 #include "common.h"
-
+#include "ts_locale.h"
 
 PG_FUNCTION_INFO_V1(tsquery_in);
 Datum      tsquery_in(PG_FUNCTION_ARGS);
@@ -108,24 +108,28 @@ get_weight(char *buf, int2 *weight)
 {
    *weight = 0;
 
-   if (*buf != ':')
+   if ( !t_iseq(buf, ':') )
        return buf;
 
    buf++;
-   while (*buf)
+   while ( *buf && pg_mblen(buf) == 1 )
    {
-       switch (tolower(*buf))
+       switch (*buf)
        {
            case 'a':
+           case 'A':
                *weight |= 1 << 3;
                break;
            case 'b':
+           case 'B':
                *weight |= 1 << 2;
                break;
            case 'c':
+           case 'C':
                *weight |= 1 << 1;
                break;
            case 'd':
+           case 'D':
                *weight |= 1;
                break;
            default:
@@ -149,25 +153,25 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
        {
            case WAITFIRSTOPERAND:
            case WAITOPERAND:
-               if (*(state->buf) == '!')
+               if ( t_iseq(state->buf, '!') )
                {
-                   (state->buf)++;
+                   (state->buf)++; /* can safely ++, t_iseq guarantee that pg_mblen()==1 */
                    *val = (int4) '!';
                    return OPR;
                }
-               else if (*(state->buf) == '(')
+               else if ( t_iseq(state->buf, '(') )
                {
                    state->count++;
                    (state->buf)++;
                    return OPEN;
                }
-               else if (*(state->buf) == ':')
+               else if ( t_iseq(state->buf, ':') )
                {
                    ereport(ERROR,
                            (errcode(ERRCODE_SYNTAX_ERROR),
                             errmsg("error at start of operand")));
                }
-               else if (*(state->buf) != ' ')
+               else if ( !t_isspace(state->buf) )
                {
                    state->valstate.prsbuf = state->buf;
                    if (gettoken_tsvector(&(state->valstate)))
@@ -187,14 +191,14 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
                }
                break;
            case WAITOPERATOR:
-               if (*(state->buf) == '&' || *(state->buf) == '|')
+               if ( t_iseq(state->buf, '&') || t_iseq(state->buf, '|') )
                {
                    state->state = WAITOPERAND;
                    *val = (int4) *(state->buf);
                    (state->buf)++;
                    return OPR;
                }
-               else if (*(state->buf) == ')')
+               else if ( t_iseq(state->buf, ')') )
                {
                    (state->buf)++;
                    state->count--;
@@ -202,7 +206,7 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
                }
                else if (*(state->buf) == '\0')
                    return (state->count) ? ERR : END;
-               else if (*(state->buf) != ' ')
+               else if ( !t_isspace(state->buf) )
                    return ERR;
                break;
            case WAITSINGLEOPERAND:
@@ -217,7 +221,7 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
                return ERR;
                break;
        }
-       (state->buf)++;
+       state->buf+=pg_mblen(state->buf);
    }
    return END;
 }
@@ -697,8 +701,11 @@ static QUERYTYPE *
 Datum
 tsquery_in(PG_FUNCTION_ARGS)
 {
+   char * in = (char*)PG_GETARG_POINTER(0);
+   pg_verifymbstr( in, strlen(in), false);
+
    SET_FUNCOID();
-   PG_RETURN_POINTER(queryin((char *) PG_GETARG_POINTER(0), pushval_asis, 0, false));
+   PG_RETURN_POINTER(queryin((char *) in, pushval_asis, 0, false));
 }
 
 /*
@@ -732,20 +739,23 @@ infix(INFIX * in, bool first)
    if (in->curpol->type == VAL)
    {
        char       *op = in->op + in->curpol->distance;
+       int     clen;
 
-       RESIZEBUF(in, in->curpol->length * 2 + 2 + 5);
+       RESIZEBUF(in, in->curpol->length * (pg_database_encoding_max_length()+1) + 2 + 5);
        *(in->cur) = '\'';
        in->cur++;
        while (*op)
        {
-           if (*op == '\'')
+           if ( t_iseq(op, '\'') )
            {
                *(in->cur) = '\\';
                in->cur++;
            }
-           *(in->cur) = *op;
-           op++;
-           in->cur++;
+           COPYCHAR(in->cur,op);
+
+           clen = pg_mblen(op);
+           op+=clen;
+           in->cur+=clen;
        }
        *(in->cur) = '\'';
        in->cur++;
index 9eff69cc719b505f29988f829995cce19e8e691f..b4d586a684b085c043c0eb611a269a73a0edf97d 100644 (file)
@@ -4,7 +4,7 @@
 #define BS_DEBUG
 */
 
-
+#include "ts_locale.h"
 /*
  * item in polish notation with back link
  * to left operand
@@ -38,7 +38,7 @@ typedef struct
 #define GETQUERY(x)  (ITEM*)( (char*)(x)+HDRSIZEQT )
 #define GETOPERAND(x)  ( (char*)GETQUERY(x) + ((QUERYTYPE*)(x))->size * sizeof(ITEM) )
 
-#define ISOPERATOR(x) ( (x)=='!' || (x)=='&' || (x)=='|' || (x)=='(' || (x)==')' )
+#define ISOPERATOR(x) (  pg_mblen(x)==1 && ( *(x)=='!' || *(x)=='&' || *(x)=='|' || *(x)=='(' || *(x)==')' ) )
 
 #define END                0
 #define ERR                1
index b8789f9e648ce3999e60ccfe87c0a85539b4d802..2a9a464596eee94cea81ffabd65d73c22442cb65 100644 (file)
 
 #include "common.h"
 #include "dict.h"
+#include "ts_locale.h"
 
 #define STOPBUFLEN 4096
 
-char *
-lowerstr(char *str)
-{
-   char       *ptr = str;
-
-   while (*ptr)
-   {
-       *ptr = tolower(*(unsigned char *) ptr);
-       ptr++;
-   }
-   return str;
-}
-
 void
 freestoplist(StopList * s)
 {
@@ -60,10 +48,16 @@ readstoplist(text *in, StopList * s)
        {
            char        sharepath[MAXPGPATH];
            char       *absfn;
+#ifdef WIN32
+           char    delim = '\\';
+#else
+           char    delim = '/';
+#endif
 
            get_share_path(my_exec_path, sharepath);
            absfn = palloc(strlen(sharepath) + strlen(filename) + 2);
-           sprintf(absfn, "%s/%s", sharepath, filename);
+           sprintf(absfn, "%s%c%s", sharepath, delim, filename);
+
            pfree(filename);
            filename = absfn;
        }
index 5dc67abc8dc292ad9fc505308083bf75f9ad522d..29c07c0eab6955cb3286467a261dbc95204d4821 100644 (file)
@@ -5,7 +5,9 @@
 #include "mb/pg_wchar.h"
 
 
-#if defined(TS_USE_WIDE) && defined(WIN32)
+#ifdef TS_USE_WIDE
+
+#ifdef WIN32
 
 size_t
 wchar2char(char *to, const wchar_t *from, size_t len)
@@ -69,4 +71,59 @@ char2wchar(wchar_t *to, const char *from, size_t len)
    return mbstowcs(to, from, len);
 }
 
+#endif /* WIN32 */
+
+int
+_t_isalpha( char *ptr ) {
+   wchar_t character;
+
+   char2wchar(&character, ptr, 1);
+
+   return iswalpha( (wint_t)character );   
+}
+
+int
+_t_isprint( char *ptr ) {
+   wchar_t character;
+
+   char2wchar(&character, ptr, 1);
+
+   return iswprint( (wint_t)character );   
+}
+
+#endif /* TS_USE_WIDE */
+
+char *
+lowerstr(char *str)
+{
+   char       *ptr = str;
+
+#ifdef TS_USE_WIDE
+   /*
+    * Use wide char code only when max encoding length > 1 and ctype != C.
+    * Some operating systems fail with multi-byte encodings and a C locale.
+    * Also, for a C locale there is no need to process as multibyte. From
+    * backend/utils/adt/oracle_compat.c Teodor
+    */
+   if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c()) {
+           wchar_t *wstr, *wptr;
+           int len = strlen(str);
+
+           wptr = wstr = (wchar_t *) palloc(sizeof(wchar_t) * (len+1));
+           char2wchar(wstr, str, len+1);
+           while (*wptr) {
+               *wptr = towlower((wint_t) *wptr);
+               wptr++;
+           }
+           wchar2char(str, wstr, len);
+           pfree( wstr );
+   } else
 #endif
+       while (*ptr)
+       {
+           *ptr = tolower(*(unsigned char *) ptr);
+           ptr++;
+       }
+   return str;
+}
+
index 905eb94af089eca6a2e7a85417078e2a6ef0e36c..2d5bc17a961c1bbb76bdc38d3b9cab3689e3acba 100644 (file)
@@ -2,6 +2,8 @@
 #define __TSLOCALE_H__
 
 #include "postgres.h"
+#include "utils/pg_locale.h"
+#include "mb/pg_wchar.h"
 
 #include <ctype.h>
 #include <limits.h>
 
 #if defined(HAVE_WCSTOMBS) && defined(HAVE_TOWLOWER)
 #define TS_USE_WIDE
+#endif
+
+#ifdef TS_USE_WIDE
+#endif   /* TS_USE_WIDE */
+
+
+#define TOUCHAR(x) (*((unsigned char*)(x)))
+
+#ifdef TS_USE_WIDE
 
 #ifdef WIN32
 
 size_t     wchar2char(char *to, const wchar_t *from, size_t len);
 size_t     char2wchar(wchar_t *to, const char *from, size_t len);
-#else                          /* WIN32 */
+#else    /* WIN32 */
 
 /* correct mbstowcs */
 #define char2wchar mbstowcs
 #define wchar2char wcstombs
 #endif   /* WIN32 */
-#endif   /* defined(HAVE_WCSTOMBS) &&
-                                * defined(HAVE_TOWLOWER) */
+
+#define    t_isdigit(x)    ( pg_mblen(x)==1 && isdigit( TOUCHAR(x) ) )
+#define    t_isspace(x)    ( pg_mblen(x)==1 && isspace( TOUCHAR(x) ) )
+int _t_isalpha( char *ptr );
+#define    t_isalpha(x)    ( (pg_mblen(x)==1) ? isalpha( TOUCHAR(x) ) : _t_isalpha(x) )
+int _t_isprint( char *ptr );
+#define    t_isprint(x)    ( (pg_mblen(x)==1) ? isprint( TOUCHAR(x) ) : _t_isprint(x) )
+/*
+ * t_iseq() should be called only for ASCII symbols 
+ */
+#define t_iseq(x,c)    ( (pg_mblen(x)==1) ? ( TOUCHAR(x) == ((unsigned char)(c)) ) : false ) 
+
+#define COPYCHAR(d,s)  do {                \
+   int lll = pg_mblen( s );            \
+                           \
+   while( lll-- )                  \
+       TOUCHAR(d+lll) = TOUCHAR(s+lll);    \
+} while(0)
+
+       
+#else /* not def TS_USE_WIDE */
+
+#define t_isdigit(x)   isdigit( TOUCHAR(x) )
+#define t_isspace(x)   isspace( TOUCHAR(x) )
+#define t_isalpha(x)   isalpha( TOUCHAR(x) )
+#define t_isprint(x)   isprint( TOUCHAR(x) )
+#define t_iseq(x,c)    ( TOUCHAR(x) == ((unsigned char)(c)) )
+
+#define COPYCHAR(d,s)  TOUCHAR(d) = TOUCHAR(s) 
+
+#endif
+
+char* lowerstr(char *str);
 
 #endif   /* __TSLOCALE_H__ */
index b8ecf96e6db4e23d37aac5dc948f92e3c42c9e1e..ae9575b35322a1c4ebfbbfa67101f7c8b56a4efe 100644 (file)
@@ -8,6 +8,7 @@
 #include "catalog/pg_type.h"
 #include "executor/spi.h"
 #include "common.h"
+#include "ts_locale.h"
 
 PG_FUNCTION_INFO_V1(tsstat_in);
 Datum      tsstat_in(PG_FUNCTION_ARGS);
@@ -476,24 +477,30 @@ ts_stat_sql(text *txt, text *ws)
        buf = VARDATA(ws);
        while (buf - VARDATA(ws) < VARSIZE(ws) - VARHDRSZ)
        {
-           switch (tolower(*buf))
-           {
-               case 'a':
-                   stat->weight |= 1 << 3;
-                   break;
-               case 'b':
-                   stat->weight |= 1 << 2;
-                   break;
-               case 'c':
-                   stat->weight |= 1 << 1;
-                   break;
-               case 'd':
-                   stat->weight |= 1;
-                   break;
-               default:
-                   stat->weight |= 0;
+           if ( pg_mblen(buf) == 1 ) {
+               switch (*buf)
+               {
+                   case 'A':
+                   case 'a':
+                       stat->weight |= 1 << 3;
+                       break;
+                   case 'B':
+                   case 'b':
+                       stat->weight |= 1 << 2;
+                       break;
+                   case 'C':
+                   case 'c':
+                       stat->weight |= 1 << 1;
+                       break;
+                   case 'D':
+                   case 'd':
+                       stat->weight |= 1;
+                       break;
+                   default:
+                       stat->weight |= 0;
+               }
            }
-           buf++;
+           buf+=pg_mblen(buf);
        }
    }
 
index cfed6e428a3b1de23c002406326f7318eb41478a..dd895ff38ab0eb6a420edfa40d05302c530b8cba 100644 (file)
@@ -16,8 +16,9 @@
 #include "catalog/namespace.h"
 
 #include "utils/pg_locale.h"
+#include "mb/pg_wchar.h"
 
-#include <ctype.h>             /* tolower */
+#include <ctype.h>
 #include "tsvector.h"
 #include "query.h"
 #include "ts_cfg.h"
@@ -173,7 +174,7 @@ uniqueentry(WordEntryIN * a, int4 l, char *buf, int4 *outbuflen)
 
 #define RESIZEPRSBUF \
 do { \
-   if ( state->curpos - state->word + 1 >= state->len ) \
+   if ( state->curpos - state->word + pg_database_encoding_max_length() >= state->len ) \
    { \
        int4 clen = state->curpos - state->word; \
        state->len *= 2; \
@@ -182,6 +183,7 @@ do { \
    } \
 } while (0)
 
+
 int4
 gettoken_tsvector(TI_IN_STATE * state)
 {
@@ -197,21 +199,21 @@ gettoken_tsvector(TI_IN_STATE * state)
        {
            if (*(state->prsbuf) == '\0')
                return 0;
-           else if (*(state->prsbuf) == '\'')
+           else if ( t_iseq(state->prsbuf, '\'') )
                state->state = WAITENDCMPLX;
-           else if (*(state->prsbuf) == '\\')
+           else if ( t_iseq(state->prsbuf, '\\') )
            {
                state->state = WAITNEXTCHAR;
                oldstate = WAITENDWORD;
            }
-           else if (state->oprisdelim && ISOPERATOR(*(state->prsbuf)))
+           else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
                ereport(ERROR,
                        (errcode(ERRCODE_SYNTAX_ERROR),
                         errmsg("syntax error")));
-           else if (*(state->prsbuf) != ' ')
+           else if (!t_isspace(state->prsbuf))
            {
-               *(state->curpos) = *(state->prsbuf);
-               state->curpos++;
+               COPYCHAR(state->curpos, state->prsbuf);
+               state->curpos+=pg_mblen(state->prsbuf);
                state->state = WAITENDWORD;
            }
        }
@@ -224,20 +226,20 @@ gettoken_tsvector(TI_IN_STATE * state)
            else
            {
                RESIZEPRSBUF;
-               *(state->curpos) = *(state->prsbuf);
-               state->curpos++;
+               COPYCHAR(state->curpos, state->prsbuf);
+               state->curpos+=pg_mblen(state->prsbuf);
                state->state = oldstate;
            }
        }
        else if (state->state == WAITENDWORD)
        {
-           if (*(state->prsbuf) == '\\')
+           if ( t_iseq(state->prsbuf, '\\') )
            {
                state->state = WAITNEXTCHAR;
                oldstate = WAITENDWORD;
            }
-           else if (*(state->prsbuf) == ' ' || *(state->prsbuf) == '\0' ||
-                    (state->oprisdelim && ISOPERATOR(*(state->prsbuf))))
+           else if ( t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
+                    (state->oprisdelim && ISOPERATOR(state->prsbuf)))
            {
                RESIZEPRSBUF;
                if (state->curpos == state->word)
@@ -247,7 +249,7 @@ gettoken_tsvector(TI_IN_STATE * state)
                *(state->curpos) = '\0';
                return 1;
            }
-           else if (*(state->prsbuf) == ':')
+           else if ( t_iseq(state->prsbuf,':') )
            {
                if (state->curpos == state->word)
                    ereport(ERROR,
@@ -262,13 +264,13 @@ gettoken_tsvector(TI_IN_STATE * state)
            else
            {
                RESIZEPRSBUF;
-               *(state->curpos) = *(state->prsbuf);
-               state->curpos++;
+               COPYCHAR(state->curpos, state->prsbuf);
+               state->curpos+=pg_mblen(state->prsbuf);
            }
        }
        else if (state->state == WAITENDCMPLX)
        {
-           if (*(state->prsbuf) == '\'')
+           if ( t_iseq(state->prsbuf, '\'') )
            {
                RESIZEPRSBUF;
                *(state->curpos) = '\0';
@@ -278,13 +280,13 @@ gettoken_tsvector(TI_IN_STATE * state)
                             errmsg("syntax error")));
                if (state->oprisdelim)
                {
-                   state->prsbuf++;
+                   state->prsbuf+=pg_mblen(state->prsbuf);
                    return 1;
                }
                else
                    state->state = WAITPOSINFO;
            }
-           else if (*(state->prsbuf) == '\\')
+           else if ( t_iseq(state->prsbuf, '\\') )
            {
                state->state = WAITNEXTCHAR;
                oldstate = WAITENDCMPLX;
@@ -296,20 +298,20 @@ gettoken_tsvector(TI_IN_STATE * state)
            else
            {
                RESIZEPRSBUF;
-               *(state->curpos) = *(state->prsbuf);
-               state->curpos++;
+               COPYCHAR(state->curpos, state->prsbuf);
+               state->curpos+=pg_mblen(state->prsbuf);
            }
        }
        else if (state->state == WAITPOSINFO)
        {
-           if (*(state->prsbuf) == ':')
+           if ( t_iseq(state->prsbuf, ':') )
                state->state = INPOSINFO;
            else
                return 1;
        }
        else if (state->state == INPOSINFO)
        {
-           if (isdigit((unsigned char) *(state->prsbuf)))
+           if (t_isdigit(state->prsbuf))
            {
                if (state->alen == 0)
                {
@@ -338,9 +340,9 @@ gettoken_tsvector(TI_IN_STATE * state)
        }
        else if (state->state == WAITPOSDELIM)
        {
-           if (*(state->prsbuf) == ',')
+           if ( t_iseq(state->prsbuf, ',') )
                state->state = INPOSINFO;
-           else if (tolower(*(state->prsbuf)) == 'a' || *(state->prsbuf) == '*')
+           else if ( t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*') )
            {
                if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
                    ereport(ERROR,
@@ -348,7 +350,7 @@ gettoken_tsvector(TI_IN_STATE * state)
                             errmsg("syntax error")));
                WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 3);
            }
-           else if (tolower(*(state->prsbuf)) == 'b')
+           else if ( t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B') )
            {
                if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
                    ereport(ERROR,
@@ -356,7 +358,7 @@ gettoken_tsvector(TI_IN_STATE * state)
                             errmsg("syntax error")));
                WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 2);
            }
-           else if (tolower(*(state->prsbuf)) == 'c')
+           else if ( t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C') )
            {
                if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
                    ereport(ERROR,
@@ -364,7 +366,7 @@ gettoken_tsvector(TI_IN_STATE * state)
                             errmsg("syntax error")));
                WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 1);
            }
-           else if (tolower(*(state->prsbuf)) == 'd')
+           else if ( t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D') )
            {
                if (WEP_GETWEIGHT(state->pos[*(uint16 *) (state->pos)]))
                    ereport(ERROR,
@@ -372,10 +374,10 @@ gettoken_tsvector(TI_IN_STATE * state)
                             errmsg("syntax error")));
                WEP_SETWEIGHT(state->pos[*(uint16 *) (state->pos)], 0);
            }
-           else if (isspace((unsigned char) *(state->prsbuf)) ||
+           else if (t_isspace(state->prsbuf) ||
                     *(state->prsbuf) == '\0')
                return 1;
-           else if (!isdigit((unsigned char) *(state->prsbuf)))
+           else if (!t_isdigit(state->prsbuf))
                ereport(ERROR,
                        (errcode(ERRCODE_SYNTAX_ERROR),
                         errmsg("syntax error")));
@@ -383,7 +385,7 @@ gettoken_tsvector(TI_IN_STATE * state)
        else
            /* internal error */
            elog(ERROR, "internal error");
-       state->prsbuf++;
+       state->prsbuf+=pg_mblen(state->prsbuf);
    }
 
    return 0;
@@ -405,6 +407,8 @@ tsvector_in(PG_FUNCTION_ARGS)
                buflen = 256;
 
    SET_FUNCOID();
+
+   pg_verifymbstr( buf, strlen(buf), false );
    state.prsbuf = buf;
    state.len = 32;
    state.word = (char *) palloc(state.len);
@@ -495,17 +499,16 @@ tsvector_out(PG_FUNCTION_ARGS)
    tsvector   *out = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
    char       *outbuf;
    int4        i,
-               j,
                lenbuf = 0,
                pp;
    WordEntry  *ptr = ARRPTR(out);
-   char       *curin,
+   char       *curbegin, *curin,
               *curout;
 
    lenbuf = out->size * 2 /* '' */ + out->size - 1 /* space */ + 2 /* \0 */ ;
    for (i = 0; i < out->size; i++)
    {
-       lenbuf += ptr[i].len * 2 /* for escape */ ;
+       lenbuf += ptr[i].len * 2 * pg_database_encoding_max_length()/* for escape */ ;
        if (ptr[i].haspos)
            lenbuf += 7 * POSDATALEN(out, &(ptr[i]));
    }
@@ -513,14 +516,14 @@ tsvector_out(PG_FUNCTION_ARGS)
    curout = outbuf = (char *) palloc(lenbuf);
    for (i = 0; i < out->size; i++)
    {
-       curin = STRPTR(out) + ptr->pos;
+       curbegin = curin = STRPTR(out) + ptr->pos;
        if (i != 0)
            *curout++ = ' ';
        *curout++ = '\'';
-       j = ptr->len;
-       while (j--)
+       while ( curin-curbegin < ptr->len )
        {
-           if (*curin == '\'')
+           int len = pg_mblen(curin);
+           if ( t_iseq(curin, '\'') )
            {
                int4        pos = curout - outbuf;
 
@@ -528,7 +531,8 @@ tsvector_out(PG_FUNCTION_ARGS)
                curout = outbuf + pos;
                *curout++ = '\\';
            }
-           *curout++ = *curin++;
+           while(len--)
+               *curout++ = *curin++;
        }
        *curout++ = '\'';
        if ((pp = POSDATALEN(out, ptr)) != 0)
index b2562e8984bcf82545a9568a6b44c673d36d3fc5..c9119753941cb32ea33d10560f1c1f1bb271e22a 100644 (file)
@@ -15,7 +15,6 @@
 
 #include "utils/pg_locale.h"
 
-#include <ctype.h>             /* tolower */
 #include "tsvector.h"
 #include "query.h"
 #include "ts_cfg.h"
@@ -76,17 +75,21 @@ setweight(PG_FUNCTION_ARGS)
    WordEntryPos *p;
    int         w = 0;
 
-   switch (tolower(cw))
+   switch (cw)
    {
+       case 'A':
        case 'a':
            w = 3;
            break;
+       case 'B':
        case 'b':
            w = 2;
            break;
+       case 'C':
        case 'c':
            w = 1;
            break;
+       case 'D':
        case 'd':
            w = 0;
            break;
index 23b031be79671f5245ed659f7d40c943950ec2fe..8a5fcdabe66aae9eee5ab9ef6ca51767ec474fa7 100644 (file)
@@ -71,8 +71,11 @@ TParserClose(TParser * prs)
        prs->state = ptr;
    }
 
+#ifdef TS_USE_WIDE
    if (prs->wstr)
        pfree(prs->wstr);
+#endif
+
    pfree(prs);
 }
 
index 923edea5896bf4931ebfe2ba5c66ff5fbb2be81e..baeabf72cd7d2e16b51bd14bf50ea0d423deb063 100644 (file)
@@ -134,8 +134,10 @@ typedef struct TParser
    /* string and position information */
    char       *str;            /* multibyte string */
    int         lenstr;         /* length of mbstring */
+#ifdef TS_USE_WIDE
    wchar_t    *wstr;           /* wide character string */
    int         lenwstr;        /* length of wsting */
+#endif
 
    /* State of parse */
    int         charmaxlen;