Add prefix support for synonym dictionary

author Teodor Sigaev

Fri, 14 Aug 2009 14:53:20 +0000 (14:53 +0000)

committer Teodor Sigaev

Fri, 14 Aug 2009 14:53:20 +0000 (14:53 +0000)
author Teodor Sigaev
Fri, 14 Aug 2009 14:53:20 +0000 (14:53 +0000)
committer Teodor Sigaev
Fri, 14 Aug 2009 14:53:20 +0000 (14:53 +0000)
diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml

index 547c0153ac8be831491f799155702f55d7c4f400..ed78c1d10a0baab3f63ab38c7ce5d8ffe0c4dd0d 100644 (file)
--- a/doc/src/sgml/textsearch.sgml
+++ b/doc/src/sgml/textsearch.sgml
@@ -1,4 +1,4 @@
-
+
  
  
   Full Text Search
@@ -2288,6 +2288,63 @@ SELECT * FROM ts_debug('english', 'Paris');
   asciiword | Word, all ASCII | Paris | {my_synonym,english_stem} | my_synonym | {paris}
  
     
+   
+   
+    An asterisk (*) at the end of a definition word indicates
+    that the definition word is a prefix, and the to_tsquery()
+    function will transform that definition into the prefix search format (see
+    the section on parsing queries).
+    Note that the asterisk is ignored in to_tsvector().
+   
+
+   
+    Contents of $SHAREDIR/tsearch_data/synonym_sample.syn:
+   
+
+postgres        pgsql
+postgresql      pgsql
+postgre pgsql
+gogle   googl
+indices index*
+
+
+   
+    Results:
+   
+
+=# create text search dictionary syn( template=synonym,synonyms='synonym_sample');
+=# select ts_lexize('syn','indices');
+ ts_lexize
+-----------
+ {index}
+(1 row)
+
+=# create text search configuration tst ( copy=simple);
+=# alter text search configuration tst alter mapping for asciiword with syn;
+=# select to_tsquery('tst','indices');
+ to_tsquery
+------------
+ 'index':*
+(1 row)
+
+=# select 'indexes are very useful'::tsvector;
+            tsvector             
+---------------------------------
+ 'are' 'indexes' 'useful' 'very'
+(1 row)
+
+=# select 'indexes are very useful'::tsvector @@ to_tsquery('tst','indices');
+ ?column?
+----------
+ t
+(1 row)
+
+=# select to_tsvector('tst','indices');
+ to_tsvector
+-------------
+ 'index':1
+(1 row)
+
  
     
      The only parameter required by the synonym template is
diff --git a/src/backend/tsearch/dict_synonym.c b/src/backend/tsearch/dict_synonym.c

index a9d094880def285b44d9b5428c0d33bf0ff27135..13ecfd0eed2594d069d3d6647625d8a8593090f4 100644 (file)
--- a/src/backend/tsearch/dict_synonym.c
+++ b/src/backend/tsearch/dict_synonym.c
@@ -7,7 +7,7 @@
   *
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/tsearch/dict_synonym.c,v 1.10 2009/01/01 17:23:48 momjian Exp $
+ *   $PostgreSQL: pgsql/src/backend/tsearch/dict_synonym.c,v 1.11 2009/08/14 14:53:20 teodor Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -23,6 +23,8 @@ typedef struct
  {
     char       *in;
     char       *out;
+   int         outlen;
+   uint16      flags;
  } Syn;
  
  typedef struct
@@ -36,11 +38,14 @@ typedef struct
   * Finds the next whitespace-delimited word within the 'in' string.
   * Returns a pointer to the first character of the word, and a pointer
   * to the next byte after the last character in the word (in *end).
+ * Character '*' at the end of word will not be treated as word
+ * character if flags is not null.
   */
  static char *
-findwrd(char *in, char **end)
+findwrd(char *in, char **end, uint16 *flags)
  {
     char       *start;
+   char       *lastchar;
  
     /* Skip leading spaces */
     while (*in && t_isspace(in))
@@ -53,13 +58,27 @@ findwrd(char *in, char **end)
         return NULL;
     }
  
-   start = in;
+   lastchar = start = in;
  
     /* Find end of word */
     while (*in && !t_isspace(in))
+   {
+       lastchar = in;
         in += pg_mblen(in);
+   }
+
+   if ( in - lastchar == 1 && t_iseq(lastchar, '*') && flags )
+   {
+       *flags = TSL_PREFIX;
+       *end = lastchar;
+   }
+   else
+   {
+       if (flags)
+               *flags = 0;
+       *end = in;
+   }
  
-   *end = in;
     return start;
  }
  
@@ -84,6 +103,7 @@ dsynonym_init(PG_FUNCTION_ARGS)
                *end = NULL;
     int         cur = 0;
     char       *line = NULL;
+   uint16      flags = 0;
  
     foreach(l, dictoptions)
     {
@@ -117,7 +137,7 @@ dsynonym_init(PG_FUNCTION_ARGS)
  
     while ((line = tsearch_readline(&trst)) != NULL)
     {
-       starti = findwrd(line, &end);
+       starti = findwrd(line, &end, NULL);
         if (!starti)
         {
             /* Empty line */
@@ -130,7 +150,7 @@ dsynonym_init(PG_FUNCTION_ARGS)
         }
         *end = '\0';
  
-       starto = findwrd(end + 1, &end);
+       starto = findwrd(end + 1, &end, &flags);
         if (!starto)
         {
             /* A line with only one word (+whitespace). Ignore silently. */
@@ -168,6 +188,9 @@ dsynonym_init(PG_FUNCTION_ARGS)
             d->syn[cur].out = lowerstr(starto);
         }
  
+       d->syn[cur].outlen = strlen(starto);
+       d->syn[cur].flags = flags; 
+
         cur++;
  
  skipline:
@@ -212,7 +235,8 @@ dsynonym_lexize(PG_FUNCTION_ARGS)
         PG_RETURN_POINTER(NULL);
  
     res = palloc0(sizeof(TSLexeme) * 2);
-   res[0].lexeme = pstrdup(found->out);
+   res[0].lexeme = pnstrdup(found->out, found->outlen);
+   res[0].flags = found->flags;
  
     PG_RETURN_POINTER(res);
  }
diff --git a/src/backend/tsearch/synonym_sample.syn b/src/backend/tsearch/synonym_sample.syn

index 4e2eaeec0c1ac0d7fa3b5f66500d1a55829e7bd0..3ecbcf901cbb69e1ba649515371ba8d1d865de8c 100644 (file)
--- a/src/backend/tsearch/synonym_sample.syn
+++ b/src/backend/tsearch/synonym_sample.syn
@@ -2,3 +2,4 @@ postgres    pgsql
  postgresql pgsql
  postgre    pgsql
  gogle  googl
+indices    index*
diff --git a/src/test/regress/expected/tsdicts.out b/src/test/regress/expected/tsdicts.out

index 3ae6a671dad4dd63deda69b4a41436ba92543aed..aba67fcab79cd6e18ce9b61de720c413dd36933b 100644 (file)
--- a/src/test/regress/expected/tsdicts.out
+++ b/src/test/regress/expected/tsdicts.out
@@ -208,6 +208,12 @@ SELECT ts_lexize('synonym', 'Gogle');
   {googl}
  (1 row)
  
+SELECT ts_lexize('synonym', 'indices');
+ ts_lexize 
+-----------
+ {index}
+(1 row)
+
  -- Create and simple test thesaurus dictionary
  -- More tests in configuration checks because ts_lexize()
  -- cannot pass more than one word to thesaurus.
@@ -290,6 +296,18 @@ SELECT to_tsvector('synonym_tst', 'Most common mistake is to write Gogle instead
   'common':2 'googl':7,10 'instead':8 'mistak':3 'write':6
  (1 row)
  
+SELECT to_tsvector('synonym_tst', 'Indexes or indices - Which is right plural form of index?');
+                 to_tsvector                  
+----------------------------------------------
+ 'form':8 'index':1,3,10 'plural':7 'right':6
+(1 row)
+
+SELECT to_tsquery('synonym_tst', 'Index & indices');
+     to_tsquery      
+---------------------
+ 'index' & 'index':*
+(1 row)
+
  -- test thesaurus in configuration
  -- see thesaurus_sample.ths to understand 'odd' resulting tsvector
  CREATE TEXT SEARCH CONFIGURATION thesaurus_tst (
diff --git a/src/test/regress/sql/tsdicts.sql b/src/test/regress/sql/tsdicts.sql

index f36e63a31107ef024328b8ea9b43e9bfe31fde45..000f6eb2e7b9e99ebd0e5f92665c39655b83c6a2 100644 (file)
--- a/src/test/regress/sql/tsdicts.sql
+++ b/src/test/regress/sql/tsdicts.sql
@@ -56,6 +56,7 @@ CREATE TEXT SEARCH DICTIONARY synonym (
  
  SELECT ts_lexize('synonym', 'PoStGrEs');
  SELECT ts_lexize('synonym', 'Gogle');
+SELECT ts_lexize('synonym', 'indices');
  
  -- Create and simple test thesaurus dictionary
  -- More tests in configuration checks because ts_lexize()
@@ -104,6 +105,8 @@ ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR
  
  SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre');
  SELECT to_tsvector('synonym_tst', 'Most common mistake is to write Gogle instead of Google');
+SELECT to_tsvector('synonym_tst', 'Indexes or indices - Which is right plural form of index?');
+SELECT to_tsquery('synonym_tst', 'Index & indices');
  
  -- test thesaurus in configuration
  -- see thesaurus_sample.ths to understand 'odd' resulting tsvector
author	Teodor Sigaev
	Fri, 14 Aug 2009 14:53:20 +0000 (14:53 +0000)
committer	Teodor Sigaev
	Fri, 14 Aug 2009 14:53:20 +0000 (14:53 +0000)
doc/src/sgml/textsearch.sgml		patch \| blob \| blame \| history
src/backend/tsearch/dict_synonym.c		patch \| blob \| blame \| history
src/backend/tsearch/synonym_sample.syn		patch \| blob \| blame \| history
src/test/regress/expected/tsdicts.out		patch \| blob \| blame \| history
src/test/regress/sql/tsdicts.sql		patch \| blob \| blame \| history