Document filtering dictionaries in textsearch.sgml.

author Tom Lane

Wed, 25 Aug 2010 21:42:55 +0000 (21:42 +0000)

committer Tom Lane

Wed, 25 Aug 2010 21:42:55 +0000 (21:42 +0000)
author Tom Lane
Wed, 25 Aug 2010 21:42:55 +0000 (21:42 +0000)
committer Tom Lane
Wed, 25 Aug 2010 21:42:55 +0000 (21:42 +0000)
diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml

index fb7f2050917d2f3b74dc11af5a259c9bd45798e6..60fac102df797f8b5cee78cebb9e6bcaf3345906 100644 (file)
--- a/doc/src/sgml/textsearch.sgml
+++ b/doc/src/sgml/textsearch.sgml
@@ -1,4 +1,4 @@
-
+
  
  
   Full Text Search
@@ -112,7 +112,7 @@
       as a sorted array of normalized lexemes. Along with the lexemes it is
       often desirable to store positional information to use for
       proximity ranking, so that a document that
-     contains a more dense region of query words is 
+     contains a more dense region of query words is
       assigned a higher rank than one with scattered query words.
      
     
@@ -1151,13 +1151,13 @@ MaxFragments=0, FragmentDelimiter=" ... "
  
  SELECT ts_headline('english',
    'The most common type of search
-is to find all documents containing given query terms 
+is to find all documents containing given query terms
  and return them in order of their similarity to the
  query.',
    to_tsquery('query & similarity'));
                          ts_headline                         
  ------------------------------------------------------------
- containing given <b>query</b> terms 
+ containing given <b>query</b> terms
   and return them in order of their <b>similarity</b> to the
   <b>query</b>.
  
@@ -1166,7 +1166,7 @@ SELECT ts_headline('english',
  is to find all documents containing given query terms
  and return them in order of their similarity to the
  query.',
-  to_tsquery('query & similarity'), 
+  to_tsquery('query & similarity'),
    'StartSel = <, StopSel = >');
                        ts_headline                      
  -------------------------------------------------------
@@ -2064,6 +2064,14 @@ SELECT alias, description, token FROM ts_debug('http://example.com/stuff/index.h
        (notice that one token can produce more than one lexeme)
       
      
+    
+     
+      a single lexeme with the TSL_FILTER flag set, to replace
+      the original token with a new token to be passed to subsequent
+      dictionaries (a dictionary that does this is called a
+      filtering dictionary)
+     
+    
      
       
        an empty array if the dictionary knows the token, but it is a stop word
@@ -2096,6 +2104,13 @@ SELECT alias, description, token FROM ts_debug('http://example.com/stuff/index.h
     until some dictionary recognizes it as a known word.  If it is identified
     as a stop word, or if no dictionary recognizes the token, it will be
     discarded and not indexed or searched for.
+   Normally, the first dictionary that returns a non-NULL
+   output determines the result, and any remaining dictionaries are not
+   consulted; but a filtering dictionary can replace the given word
+   with a modified word, which is then passed to subsequent dictionaries.
+  
+
+  
     The general rule for configuring a list of dictionaries
     is to place first the most narrow, most specific dictionary, then the more
     general dictionaries, finishing with a very general dictionary, like
@@ -2112,6 +2127,16 @@ ALTER TEXT SEARCH CONFIGURATION astro_en
  
    
  
+  
+   A filtering dictionary can be placed anywhere in the list, except at the
+   end where it'd be useless.  Filtering dictionaries are useful to partially
+   normalize words to simplify the task of later dictionaries.  For example,
+   a filtering dictionary could be used to remove accents from accented
+   letters, as is done by the
+   contrib/unaccent
+   extension module.
+  
+
    
     Stop Words
  
@@ -2184,7 +2209,7 @@ CREATE TEXT SEARCH DICTIONARY public.simple_dict (
      Here, english is the base name of a file of stop words.
      The file's full name will be
      $SHAREDIR/tsearch_data/english.stop,
-    where $SHAREDIR means the 
+    where $SHAREDIR means the
      PostgreSQL installation's shared-data directory,
      often /usr/local/share/postgresql (use pg_config
      --sharedir to determine it if you're not sure).
@@ -2295,17 +2320,39 @@ SELECT * FROM ts_debug('english', 'Paris');
   asciiword | Word, all ASCII | Paris | {my_synonym,english_stem} | my_synonym | {paris}
  
     
-   
+
     
-    An asterisk (*) at the end of definition word indicates 
-    that definition word is a prefix, and to_tsquery() 
-    function will transform that definition to the prefix search format (see 
-    <xref linkend="textsearch-parsing-queries">). 
-    Notice that it is ignored in to_tsvector().
+    The only parameter required by the synonym template is
+    SYNONYMS, which is the base name of its configuration file
+    — my_synonyms in the above example.
+    The file's full name will be
+    $SHAREDIR/tsearch_data/my_synonyms.syn
+    (where $SHAREDIR means the
+    PostgreSQL installation's shared-data directory).
+    The file format is just one line
+    per word to be substituted, with the word followed by its synonym,
+    separated by white space.  Blank lines and trailing spaces are ignored.
+   
+
+   
+    The synonym template also has an optional parameter
+    CaseSensitive, which defaults to false.  When
+    CaseSensitive is false, words in the synonym file
+    are folded to lower case, as are input tokens.  When it is
+    true, words and tokens are not folded to lower case,
+    but are compared as-is.
     
  
     
-    Contents of $SHAREDIR/tsearch_data/synonym_sample.syn:
+    An asterisk (*) can be placed at the end of a synonym
+    in the configuration file.  This indicates that the synonym is a prefix.
+    The asterisk is ignored when the entry is used in
+    to_tsvector(), but when it is used in
+    to_tsquery(), the result will be a query item with
+    the prefix match marker (see
+    <xref linkend="textsearch-parsing-queries">).
+    For example, suppose we have these entries in
+    $SHAREDIR/tsearch_data/synonym_sample.syn:
  
  postgres        pgsql
  postgresql      pgsql
@@ -2313,67 +2360,42 @@ postgre pgsql
  gogle   googl
  indices index*
  
-   
-
-   
-    Results:
+    Then we will get these results:
  
-=# CREATE TEXT SEARCH DICTIONARY syn (template=synonym, synonyms='synonym_sample');
-=# SELECT ts_lexize('syn','indices');
+mydb=# CREATE TEXT SEARCH DICTIONARY syn (template=synonym, synonyms='synonym_sample');
+mydb=# SELECT ts_lexize('syn','indices');
   ts_lexize
  -----------
   {index}
  (1 row)
  
-=# CREATE TEXT SEARCH CONFIGURATION tst (copy=simple);
-=# ALTER TEXT SEARCH CONFIGURATION tst ALTER MAPPING FOR asciiword WITH syn;
-=# SELECT to_tsquery('tst','indices');
+mydb=# CREATE TEXT SEARCH CONFIGURATION tst (copy=simple);
+mydb=# ALTER TEXT SEARCH CONFIGURATION tst ALTER MAPPING FOR asciiword WITH syn;
+mydb=# SELECT to_tsvector('tst','indices');
+ to_tsvector
+-------------
+ 'index':1
+(1 row)
+
+mydb=# SELECT to_tsquery('tst','indices');
   to_tsquery
  ------------
   'index':*
  (1 row)
  
-=# SELECT 'indexes are very useful'::tsvector;
+mydb=# SELECT 'indexes are very useful'::tsvector;
              tsvector             
  ---------------------------------
   'are' 'indexes' 'useful' 'very'
  (1 row)
  
-=# SELECT 'indexes are very useful'::tsvector @@ to_tsquery('tst','indices');
+mydb=# SELECT 'indexes are very useful'::tsvector @@ to_tsquery('tst','indices');
   ?column?
  ----------
   t
  (1 row)
-
-=# SELECT to_tsvector('tst','indices');
- to_tsvector
--------------
- 'index':1
-(1 row)
  
     
-
-   
-    The only parameter required by the synonym template is
-    SYNONYMS, which is the base name of its configuration file
-    — my_synonyms in the above example.
-    The file's full name will be
-    $SHAREDIR/tsearch_data/my_synonyms.syn
-    (where $SHAREDIR means the
-    PostgreSQL installation's shared-data directory).
-    The file format is just one line
-    per word to be substituted, with the word followed by its synonym,
-    separated by white space.  Blank lines and trailing spaces are ignored.
-   
-
-   
-    The synonym template also has an optional parameter
-    CaseSensitive, which defaults to false.  When
-    CaseSensitive is false, words in the synonym file
-    are folded to lower case, as are input tokens.  When it is
-    true, words and tokens are not folded to lower case,
-    but are compared as-is.
-   
    
  
    
diff --git a/doc/src/sgml/unaccent.sgml b/doc/src/sgml/unaccent.sgml

index 6c73c3f298664d5f3a14d28ee80f0abd03189cd7..135fcdb6dc68e86e9b7ee127f653b4e6aebedb40 100644 (file)
--- a/doc/src/sgml/unaccent.sgml
+++ b/doc/src/sgml/unaccent.sgml
@@ -1,4 +1,4 @@
-
+
  
  
   unaccent
@@ -75,8 +75,10 @@
    
     Running the installation script unaccent.sql creates a text
     search template unaccent and a dictionary unaccent
-   based on it, with default parameters.  You can alter the
-   parameters, for example
+   based on it.  The unaccent dictionary has the default
+   parameter setting RULES='unaccent', which makes it immediately
+   usable with the standard unaccent.rules file.
+   If you wish, you can alter the parameter, for example
  
  
  mydb=# ALTER TEXT SEARCH DICTIONARY unaccent (RULES='my_rules');
author	Tom Lane
	Wed, 25 Aug 2010 21:42:55 +0000 (21:42 +0000)
committer	Tom Lane
	Wed, 25 Aug 2010 21:42:55 +0000 (21:42 +0000)
doc/src/sgml/textsearch.sgml		patch | blob | blame | history
doc/src/sgml/unaccent.sgml		patch | blob | blame | history