Have text search thesaurus files use "?" for stop words.

author Bruce Momjian

Sat, 10 Nov 2007 15:39:34 +0000 (15:39 +0000)

committer Bruce Momjian

Sat, 10 Nov 2007 15:39:34 +0000 (15:39 +0000)
author Bruce Momjian
Sat, 10 Nov 2007 15:39:34 +0000 (15:39 +0000)
committer Bruce Momjian
Sat, 10 Nov 2007 15:39:34 +0000 (15:39 +0000)
diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml

index 26fdad0c6ff071f34607f07361dca00ce8135e3a..e556c6dd78a8bdf5b7ae6f573d596e8983453024 100644 (file)
--- a/doc/src/sgml/textsearch.sgml
+++ b/doc/src/sgml/textsearch.sgml
@@ -1,4 +1,4 @@
-
+
  
  
   Full Text Search
@@ -2258,20 +2258,17 @@ more sample word(s) : more indexed word(s)
     
  
     
-    Stop words recognized by the subdictionary are replaced by a stop
-    word placeholder to record their position. To illustrate this,
-    consider these phrases:
+    Specific stop words recognized by the subdictionary cannot be
+    specified; instead use <literal>?</literal> to mark the location where
+    any stop word can appear.  For example, assuming that <literal>a</literal>
+    and <literal>the</literal> are stop words according to the subdictionary:
  
  
-a one the two : swsw
-the one a two : swsw2
+? one ? two : swsw
  
  
-    Assuming that a and the are stop words according
-    to the subdictionary, these two phrases are identical to the thesaurus:
-    they both look like stopword one
-    stopword two.  Input matching this pattern
-    will be replaced by swsw2, according to the tie-breaking rule.
+    matches <literal>a one the two</literal> and <literal>the one a two</literal>;
+    both would be replaced by <literal>swsw</literal>.
     
  
     
@@ -3576,6 +3573,12 @@ Parser: "pg_catalog.default"
      
     
  
+   
+    
+     Thesaurus files now use <literal>?</literal> for stop words.
+    
+   
+
     
      
       What else?
diff --git a/src/backend/tsearch/dict_thesaurus.c b/src/backend/tsearch/dict_thesaurus.c

index 7a0ae4afd3e40d6d0a43734ac802c7cd9f8815e9..31564a789935c13edf0ae81feae7a6738f753266 100644 (file)
--- a/src/backend/tsearch/dict_thesaurus.c
+++ b/src/backend/tsearch/dict_thesaurus.c
@@ -7,7 +7,7 @@
   *
   *
   * IDENTIFICATION
- *   $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.5 2007/11/09 01:32:22 momjian Exp $
+ *   $PostgreSQL: pgsql/src/backend/tsearch/dict_thesaurus.c,v 1.6 2007/11/10 15:39:34 momjian Exp $
   *
   *-------------------------------------------------------------------------
   */
@@ -412,47 +412,48 @@ compileTheLexeme(DictThesaurus * d)
     {
         TSLexeme   *ptr;
  
-       ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
-                                      PointerGetDatum(d->subdict->dictData),
-                                         PointerGetDatum(d->wrds[i].lexeme),
-                                   Int32GetDatum(strlen(d->wrds[i].lexeme)),
-                                                    PointerGetDatum(NULL)));
-
-       if (!ptr)
-           elog(ERROR, "thesaurus word-sample \"%s\" isn't recognized by subdictionary (rule %d)",
-                d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
-       else if (!(ptr->lexeme))
-       {
-           elog(NOTICE, "thesaurus word-sample \"%s\" is recognized as stop-word, assign any stop-word (rule %d)",
-                d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
-
+       if (strcmp(d->wrds[i].lexeme, "?") == 0)    /* Is stop word marker? */
             newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, NULL, d->wrds[i].entries, 0);
-       }
         else
         {
-           while (ptr->lexeme)
+           ptr = (TSLexeme *) DatumGetPointer(FunctionCall4(&(d->subdict->lexize),
+                                          PointerGetDatum(d->subdict->dictData),
+                                             PointerGetDatum(d->wrds[i].lexeme),
+                                       Int32GetDatum(strlen(d->wrds[i].lexeme)),
+                                                        PointerGetDatum(NULL)));
+   
+           if (!ptr)
+               elog(ERROR, "thesaurus word-sample \"%s\" isn't recognized by subdictionary (rule %d)",
+                    d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
+           else if (!(ptr->lexeme))
+               elog(ERROR, "thesaurus word-sample \"%s\" is recognized as stop-word, use \"?\" for stop words instead (rule %d)",
+                    d->wrds[i].lexeme, d->wrds[i].entries->idsubst + 1);
+           else
             {
-               TSLexeme   *remptr = ptr + 1;
-               int         tnvar = 1;
-               int         curvar = ptr->nvariant;
-
-               /* compute n words in one variant */
-               while (remptr->lexeme)
+               while (ptr->lexeme)
                 {
-                   if (remptr->nvariant != (remptr - 1)->nvariant)
-                       break;
-                   tnvar++;
-                   remptr++;
-               }
-
-               remptr = ptr;
-               while (remptr->lexeme && remptr->nvariant == curvar)
-               {
-                   newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
-                   remptr++;
+                   TSLexeme   *remptr = ptr + 1;
+                   int         tnvar = 1;
+                   int         curvar = ptr->nvariant;
+   
+                   /* compute n words in one variant */
+                   while (remptr->lexeme)
+                   {
+                       if (remptr->nvariant != (remptr - 1)->nvariant)
+                           break;
+                       tnvar++;
+                       remptr++;
+                   }
+   
+                   remptr = ptr;
+                   while (remptr->lexeme && remptr->nvariant == curvar)
+                   {
+                       newwrds = addCompiledLexeme(newwrds, &nnw, &tnm, remptr, d->wrds[i].entries, tnvar);
+                       remptr++;
+                   }
+   
+                   ptr = remptr;
                 }
-
-               ptr = remptr;
             }
         }
  
diff --git a/src/backend/tsearch/thesaurus_sample.ths b/src/backend/tsearch/thesaurus_sample.ths

index 77a32a75d056242c0816b7aa0a515c065c4a7ede..0b4857ec33403b59ad3ebb2a42388cae7d7886c1 100644 (file)
--- a/src/backend/tsearch/thesaurus_sample.ths
+++ b/src/backend/tsearch/thesaurus_sample.ths
@@ -14,4 +14,5 @@ two : *2
  supernovae stars : *sn
  supernovae : *sn
  booking tickets : order invitation cards
-# booking the tickets : order invitation Cards
+booking ? tickets : order invitation Cards
+
diff --git a/src/test/regress/expected/tsdicts.out b/src/test/regress/expected/tsdicts.out

index 3520baceac71007d5da591a0d2b66679f0291de6..4b8929361a85e09e6fc30f4bddce7a0612b06f07 100644 (file)
--- a/src/test/regress/expected/tsdicts.out
+++ b/src/test/regress/expected/tsdicts.out
@@ -311,8 +311,8 @@ SELECT to_tsvector('thesaurus_tst', 'Supernovae star is very new star and usuall
  (1 row)
  
  SELECT to_tsvector('thesaurus_tst', 'Booking tickets is looking like a booking a tickets');
-                             to_tsvector                             
----------------------------------------------------------------------
- 'book':8 'card':3 'like':6 'look':5 'invit':2 'order':1 'ticket':10
+                      to_tsvector                      
+-------------------------------------------------------
+ 'card':3,10 'like':6 'look':5 'invit':2,9 'order':1,8
  (1 row)
author	Bruce Momjian
	Sat, 10 Nov 2007 15:39:34 +0000 (15:39 +0000)
committer	Bruce Momjian
	Sat, 10 Nov 2007 15:39:34 +0000 (15:39 +0000)
doc/src/sgml/textsearch.sgml		patch | blob | blame | history
src/backend/tsearch/dict_thesaurus.c		patch | blob | blame | history
src/backend/tsearch/thesaurus_sample.ths		patch | blob | blame | history
src/test/regress/expected/tsdicts.out		patch | blob | blame | history