+url="http://snowball.tartarus.org">Snowball site for more
+information). Full text searching contains a large number of stemmers for
+many languages. The only option that is accepted by a snowball stemmer is the
+location of a file with stop words. It can be defined using the
+ALTER TEXT SEARCH DICTIONARY command.
+
+ALTER TEXT SEARCH DICTIONARY en_stem
+ SET OPTION 'StopFile=english-utf8.stop, Language=english';
+
+
+
+Relative paths given in OPTION are resolved relative to
+share/dicts_data:
+ALTER TEXT SEARCH DICTIONARY en_stem SET OPTION 'english.stop';
+
+
+
+The Snowball dictionary recognizes everything, so it is best
+to place it at the end of the dictionary stack. It is useless to place it
+before any other dictionary, because a lexeme will never pass through its
+stemmer to a later dictionary.
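+For example, in the mapping created later in this chapter the stemmer is
+listed last:
+ALTER TEXT SEARCH CONFIGURATION pg ALTER MAPPING FOR lword
+    WITH pg_dict, en_ispell, en_stem;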
+
+
+
+
+
+
Dictionary Testing
+
+The lexize function facilitates dictionary testing:
+
+
+
+
+
+
+
+
+
+lexize(dict_name text, lexeme text) returns text[]
+
+
+
+
+Returns an array of lexemes if the input lexeme
+is known to the dictionary dict_name, an empty
+array if the lexeme is known to the dictionary but is a stop word, or
+NULL if it is an unknown word.
+
+SELECT lexize('en_stem', 'stars');
+ lexize
+--------
+ {star}
+SELECT lexize('en_stem', 'a');
+ lexize
+--------
+ {}
+
+
+
+
+
+
+
+
+The lexize function expects a single
+lexeme, not arbitrary text. Below is an example:
+SELECT lexize('thesaurus_astro','supernovae stars') is null;
+ ?column?
+----------
+ t
+
+The thesaurus dictionary thesaurus_astro does know the
+phrase supernovae stars, but lexize fails because it does not
+parse the input text; it treats it as a single lexeme. Use
+plainto_tsquery and to_tsvector to test thesaurus
+dictionaries:
+SELECT plainto_tsquery('supernovae stars');
+ plainto_tsquery
+-----------------
+ 'sn'
+
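+Similarly, to_tsvector shows the thesaurus substitution (a sketch, assuming
+the same configuration has thesaurus_astro in its mappings; exact positions
+may vary):
+SELECT to_tsvector('supernovae stars');
+ to_tsvector
+-------------
+ 'sn':1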
+
+
+
+
+
+
+
Configuration Example
+
+A full text configuration specifies all options necessary to transform a
+document into a tsvector: the parser breaks text into tokens,
+and the dictionaries transform each token into a lexeme. Every call to
+to_tsvector() and to_tsquery()
+needs a configuration to perform its processing. To facilitate management
+of full text searching objects, a set of SQL commands
+is available, and there are several psql commands that display information
+about full text searching objects.
+
+
+The GUC variable default_text_search_config
+(optionally schema-qualified) defines the name of the currently
+active configuration. It can be set in
+postgresql.conf or with the SET command.
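+For example, to make the built-in english configuration the default,
+postgresql.conf could contain:
+default_text_search_config = 'pg_catalog.english'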
+
+
+Predefined full text searching objects are available in the
+pg_catalog schema. If you need a custom configuration
+you can create a new full text searching object and modify it using SQL
+commands.
+
+New full text searching objects are created in the current schema by default
+(usually the public schema), but a schema-qualified
+name can be used to create objects in a specified schema. An object is owned
+by the current user; ownership can be changed using the ALTER TEXT
+SEARCH OWNER command.
+
+
+As an example, we will create a configuration
+pg which starts as a duplicate of the
+english configuration. To be safe, we do this in a transaction:
+BEGIN;
+
+CREATE TEXT SEARCH CONFIGURATION public.pg LIKE english WITH MAP;
+
+
+
+We will use a PostgreSQL-specific synonym dictionary
+and store it in the share/dicts_data directory. The
+dictionary file looks like this:
+postgres pg
+pgsql pg
+postgresql pg
+
+
+CREATE TEXT SEARCH DICTIONARY pg_dict
+ TEMPLATE synonym
+ OPTION 'pg_dict.txt';
+
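+The new dictionary can be checked immediately with the lexize
+function described earlier; each synonym should map to pg:
+SELECT lexize('pg_dict', 'postgresql');
+ lexize
+--------
+ {pg}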
+
+
+
+Then register the ispell dictionary en_ispell using
+the ispell_template template:
+
+CREATE TEXT SEARCH DICTIONARY en_ispell
+ TEMPLATE ispell_template
+ OPTION 'DictFile="english-utf8.dict",
+ AffFile="english-utf8.aff",
+ StopFile="english-utf8.stop"';
+
+
+
+Use the same stop word list for the
+Snowball stemmer en_stem,
+which is available by default:
+
+ALTER TEXT SEARCH DICTIONARY en_stem SET OPTION 'english-utf8.stop';
+
+
+
+Modify the mappings for Latin words for configuration pg:
+
+ALTER TEXT SEARCH CONFIGURATION pg ALTER MAPPING FOR lword, lhword, lpart_hword
+ WITH pg_dict, en_ispell, en_stem;
+
+
+
+We choose not to index or search some token types:
+
+ALTER TEXT SEARCH CONFIGURATION pg DROP MAPPING FOR email, url, sfloat, uri, float;
+
+
+
+Now, we can test our configuration:
+SELECT * FROM ts_debug('public.pg', '
+PostgreSQL, the highly scalable, SQL compliant, open source object-relational
+database management system, is now undergoing beta testing of the next
+version of our software: PostgreSQL 8.2.
+');
+
+COMMIT;
+
+
+
+With the dictionaries and mappings set up, suppose we have a table
+pgweb which contains 11239 documents from the
+PostgreSQL web site. Only relevant columns
+are shown:
+=> \d pgweb
+ Table "public.pgweb"
+ Column | Type | Modifiers
+-----------+-------------------+-----------
+ tid | integer | not null
+ path | character varying | not null
+ body | character varying |
+ title | character varying |
+ dlm | integer |
+
+
+
+The next step is to set the session to use the new configuration, which was
+created in the public schema:
+=> \dF public.*
+List of fulltext configurations
+ Schema | Name | Description
+--------+------+-------------
+ public | pg   |
+
+SET default_text_search_config = 'public.pg';
+SET
+
+SHOW default_text_search_config;
+ default_text_search_config
+----------------------------
+ public.pg
+
+
+
+
+
+
+
Managing Multiple Configurations
+
+If you are using the same text search configuration for the entire cluster,
+just set the value in postgresql.conf. If you are using a single
+text search configuration for an entire database, use ALTER
+DATABASE ... SET.
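+For example, to make our pg configuration the default for a single
+database (mydb is a hypothetical database name):
+ALTER DATABASE mydb SET default_text_search_config = 'public.pg';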
+
+
+However, if you need to use several text search configurations in the same
+database you must be careful to reference the proper text search
+configuration. This can be done by either setting
+default_text_search_config in each session or supplying the
+configuration name in every function call, e.g. to_tsquery('pg',
+'friend'), to_tsvector('pg', col). If you are using an expression index,
+you must also be sure to use the proper text search configuration every
+time an INSERT or UPDATE is executed, because these
+will modify the index; alternatively, you can embed the configuration name into the
+expression index, e.g.:
+CREATE INDEX pgweb_idx ON pgweb USING gin(to_tsvector('pg', textcat(title, body)));
+
+If you do that, make sure you specify the configuration name in the
+WHERE clause as well so the expression index will be used.
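+For instance, a search that can use the expression index above might look
+like this (a sketch, reusing the pgweb table from the previous section):
+SELECT title
+FROM pgweb
+WHERE to_tsvector('pg', textcat(title, body)) @@ to_tsquery('pg', 'friend');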
+
+
+
+
+
+
+
+
GiST and GIN Index Types
+
+
+ full text
+
+
+
+There are two kinds of indexes which can be used to speed up full text
+search operators.
+Note that indexes are not mandatory for full text searching.
+
+
+
+
+
+
+
+GIST
+
+
+
+
+CREATE INDEX name ON table USING gist(column);
+
+
+
+
+Creates a GiST (Generalized Search Tree)-based index.
+
+
+
+
+
+
+
+
+GIN
+
+
+
+
+CREATE INDEX name ON table USING gin(column);
+
+
+
+
+Creates a GIN (Generalized Inverted Index)-based index.
+column is a
+TSVECTOR, TEXT,
+VARCHAR, or CHAR-type column.
+
+
+
+
+
+
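+For example, either kind of index can be built on the apod
+table used in the examples below, assuming its textsearch
+column is of type TSVECTOR:
+CREATE INDEX textsearch_gidx ON apod USING gist(textsearch);
+CREATE INDEX textsearch_idx ON apod USING gin(textsearch);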
+
+
+A GiST index is lossy, meaning it is necessary
+to consult the heap to check for false hits.
+PostgreSQL does this automatically; note the
+Filter: line in the example below:
+EXPLAIN SELECT * FROM apod WHERE textsearch @@ to_tsquery('supernovae');
+ QUERY PLAN
+-------------------------------------------------------------------------
+ Index Scan using textsearch_gidx on apod (cost=0.00..12.29 rows=2 width=1469)
+ Index Cond: (textsearch @@ '''supernova'''::tsquery)
+ Filter: (textsearch @@ '''supernova'''::tsquery)
+
+GiST index lossiness happens because each document is represented by a
+fixed-length signature. The signature is generated by hashing (crc32) each
+word into a random bit in an n-bit string and all words combine to produce
+an n-bit document signature. Because of hashing there is a chance that
+some words hash to the same position and could result in a false hit.
+Signatures calculated for each document in a collection are stored in an
+RD-tree (Russian Doll tree), invented by Hellerstein,
+which is an adaptation of R-tree for sets. In our case
+the transitive containment relation is realized by
+superimposed coding (Knuth, 1973) of signatures, i.e., a parent is the
+result of 'OR'-ing the bit-strings of all children. This is a second
+factor of lossiness. It is clear that parents tend to be full of
+'1's (degenerate) and become quite useless because of their
+limited selectivity. Searching is performed as a bit comparison of a
+signature representing the query and an RD-tree entry.
+If all '1's of both signatures are in the same position we
+say that this branch probably matches the query, but if there is even one
+discrepancy we can definitely reject this branch.
+
+
+Lossiness causes serious performance degradation, since random access to
+heap records is slow; this limits the usefulness of GiST
+indexes. The likelihood of false hits depends on several factors, such as
+the number of unique words, so using dictionaries to reduce this number
+is recommended.
+
+
+Actually, this is not the whole story. GiST indexes have an optimization
+for storing small tsvectors (less than TOAST_INDEX_TARGET
+bytes, currently 512 bytes). On leaf pages small tsvectors are stored unchanged,
+while longer ones are represented by their signatures, which introduces
+some lossiness. Unfortunately, the existing index API does not allow for
+a return value to say whether it found an exact value (tsvector) or whether
+the result needs to be checked. This is why the GiST index is
+currently marked as lossy. We hope to improve this in the future.
+
+
+GIN indexes are not lossy but their performance depends logarithmically on
+the number of unique words.
+
+
+There is one side-effect of the non-lossiness of a GIN index when using
+query labels/weights, like 'supernovae:a'. A GIN index
+has all the information necessary to determine a match, so the heap is
+not accessed. However, label information is not stored in the index, so
+if the query has label information the heap must be accessed.
+Therefore, a special full text search operator @@@
+was created which forces the use of the heap to get information about
+labels. GiST indexes are lossy so the heap is always read and there is
+no need for a special operator. In the example below,
+textsearch_idx is a GIN index:
+EXPLAIN SELECT * FROM apod WHERE textsearch @@@ to_tsquery('supernovae:a');
+ QUERY PLAN
+------------------------------------------------------------------------
+ Index Scan using textsearch_idx on apod (cost=0.00..12.30 rows=2 width=1469)
+ Index Cond: (textsearch @@@ '''supernova'':A'::tsquery)
+ Filter: (textsearch @@@ '''supernova'':A'::tsquery)
+
+
+
+
+In choosing which index type to use, GiST or GIN, consider these differences:
+
+GIN index lookups are about three times faster than GiST
+
+GIN indexes take about three times longer to build than GiST
+
+GIN indexes are about ten times slower to update than GiST
+
+GIN indexes are two-to-three times larger than GiST
+
+
+
+
+In summary, GIN indexes are best for static data because
+lookups are faster. For dynamic data, GiST indexes are
+faster to update. Specifically, GiST indexes are very
+good for dynamic data and fast if the number of unique words (lexemes) is
+under 100,000, while GIN handles 100,000+ lexemes better
+but is slower to update.
+
+
+Partitioning of big collections and the proper use of GiST and GIN indexes
+allows the implementation of very fast searches with online update.
+Partitioning can be done at the database level using table inheritance
+and constraint_exclusion, or by distributing documents over
+servers and collecting search results using the contrib/dblink
+extension module. The latter is possible because ranking functions use
+only local information.
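+A minimal sketch of the inheritance approach (all table, column, and
+constraint names here are hypothetical):
+-- parent table and one partition holding a year of documents
+CREATE TABLE messages (posted date, body text, textsearch tsvector);
+CREATE TABLE messages_2006 (
+    CHECK (posted >= '2006-01-01' AND posted < '2007-01-01')
+) INHERITS (messages);
+CREATE INDEX messages_2006_idx ON messages_2006 USING gin(textsearch);
+
+-- with constraint exclusion enabled, a query qualified by "posted"
+-- scans only the partitions whose CHECK constraints can match
+SET constraint_exclusion = on;
+SELECT * FROM messages
+WHERE textsearch @@ to_tsquery('supernovae') AND posted >= '2006-06-01';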
+
+
+
+
+
+
Limitations
+
+The current limitations of Full Text Searching are:
+
+
The length of each lexeme must be less than 2K bytes
+
The length of a tsvector (lexemes + positions) must be less than 1 megabyte
+
The number of lexemes must be less than 2^64
+
Positional information must be non-negative and less than 16,383
+
No more than 256 positions per lexeme
+
The number of nodes (lexemes + operators) in a tsquery must be less than 32,768
+
+
+
+For comparison, the PostgreSQL 8.1 documentation
+consists of 10,441 unique words, a total of 335,420 words, and the most frequent word
+'postgresql' is mentioned 6,127 times in 655 documents.
+
+
+Another example: the PostgreSQL mailing list archives
+contain 910,989 unique words with 57,491,343 lexemes in 461,020 messages.
+
+
+
+
+
+
+Information about full text searching objects can be obtained
+in psql using a set of commands:
+
+\dF{,d,p}+ PATTERN
+
+An optional + produces more details.
+
+The optional parameter PATTERN should be the name of
+a full text searching object, optionally schema-qualified. If
+PATTERN is not specified then information about all
+visible objects will be displayed. PATTERN can be a
+regular expression, applied separately to schema
+names and object names. The following examples illustrate this:
+=> \dF *fulltext*
+ List of fulltext configurations
+ Schema | Name | Description
+--------+--------------+-------------
+ public | fulltext_cfg |
+
+
+=> \dF *.fulltext*
+ List of fulltext configurations
+ Schema | Name | Description
+----------+--------------+-------------
+ fulltext | fulltext_cfg |
+ public   | fulltext_cfg |
+
+
+
+
+
+
+\dF[+] [PATTERN]
+
+
+ List full text searching configurations (add "+" for more detail)
+
+ By default (without PATTERN), information about
+ all visible full text configurations will be
+ displayed.
+
+=> \dF russian
+ List of fulltext configurations
+ Schema | Name | Description
+------------+---------+-----------------------------------
+ pg_catalog | russian | default configuration for Russian
+
+=> \dF+ russian
+Configuration "pg_catalog.russian"
+Parser name: "pg_catalog.default"
+Locale: 'ru_RU.UTF-8' (default)
+ Token | Dictionaries
+--------------+-------------------------
+ email | pg_catalog.simple
+ file | pg_catalog.simple
+ float | pg_catalog.simple
+ host | pg_catalog.simple
+ hword | pg_catalog.ru_stem_utf8
+ int | pg_catalog.simple
+ lhword | public.tz_simple
+ lpart_hword | public.tz_simple
+ lword | public.tz_simple
+ nlhword | pg_catalog.ru_stem_utf8
+ nlpart_hword | pg_catalog.ru_stem_utf8
+ nlword | pg_catalog.ru_stem_utf8
+ part_hword | pg_catalog.simple
+ sfloat | pg_catalog.simple
+ uint | pg_catalog.simple
+ uri | pg_catalog.simple
+ url | pg_catalog.simple
+ version | pg_catalog.simple
+ word | pg_catalog.ru_stem_utf8
+
+
+
+
+
+
+\dFd[+] [PATTERN]
+
+ List full text dictionaries (add "+" for more detail).
+
+ By default (without PATTERN), information about
+ all visible dictionaries will be displayed.
+
+=> \dFd
+ List of fulltext dictionaries
+ Schema | Name | Description
+------------+------------+-----------------------------------------------------------
+ pg_catalog | danish | Snowball stemmer for danish language
+ pg_catalog | dutch | Snowball stemmer for dutch language
+ pg_catalog | english | Snowball stemmer for english language
+ pg_catalog | finnish | Snowball stemmer for finnish language
+ pg_catalog | french | Snowball stemmer for french language
+ pg_catalog | german | Snowball stemmer for german language
+ pg_catalog | hungarian | Snowball stemmer for hungarian language
+ pg_catalog | italian | Snowball stemmer for italian language
+ pg_catalog | norwegian | Snowball stemmer for norwegian language
+ pg_catalog | portuguese | Snowball stemmer for portuguese language
+ pg_catalog | romanian | Snowball stemmer for romanian language
+ pg_catalog | russian | Snowball stemmer for russian language
+ pg_catalog | simple | simple dictionary: just lower case and check for stopword
+ pg_catalog | spanish | Snowball stemmer for spanish language
+ pg_catalog | swedish | Snowball stemmer for swedish language
+ pg_catalog | turkish | Snowball stemmer for turkish language
+
+
+
+
+
+
+
+\dFp[+] [PATTERN]
+
+ List full text parsers (add "+" for more detail)
+
+ By default (without PATTERN), information about
+ all visible full text parsers will be displayed.
+
+=> \dFp
+ List of fulltext parsers
+ Schema | Name | Description
+------------+---------+---------------------
+ pg_catalog | default | default word parser
+(1 row)
+=> \dFp+
+ Fulltext parser "pg_catalog.default"
+ Method | Function | Description
+-------------------+---------------------------+-------------
+ Start parse | pg_catalog.prsd_start |
+ Get next token | pg_catalog.prsd_nexttoken |
+ End parse | pg_catalog.prsd_end |
+ Get headline | pg_catalog.prsd_headline |
+ Get lexeme's type | pg_catalog.prsd_lextype |
+
+ Token's types for parser "pg_catalog.default"
+ Token name | Description
+--------------+-----------------------------------
+ blank | Space symbols
+ email | Email
+ entity | HTML Entity
+ file | File or path name
+ float | Decimal notation
+ host | Host
+ hword | Hyphenated word
+ int | Signed integer
+ lhword | Latin hyphenated word
+ lpart_hword | Latin part of hyphenated word
+ lword | Latin word
+ nlhword | Non-latin hyphenated word
+ nlpart_hword | Non-latin part of hyphenated word
+ nlword | Non-latin word
+ part_hword | Part of hyphenated word
+ protocol | Protocol head
+ sfloat | Scientific notation
+ tag | HTML Tag
+ uint | Unsigned integer
+ uri | URI
+ url | URL
+ version | VERSION
+ word | Word
+(23 rows)
+
+
+
+
+
+
+
+
+
+
+
+
Debugging
+
+The ts_debug function allows easy testing of a full text searching
+configuration.
+
+
+
+ts_debug(conf_name text, document text) returns SETOF tsdebug
+
+
+ts_debug displays information about every token of
+document as produced by the
+parser and processed by the configured dictionaries, using the configuration
+specified by conf_name.
+
+The tsdebug type is defined as:
+CREATE TYPE tsdebug AS (
+ "Alias" text,
+ "Description" text,
+ "Token" text,
+ "Dicts list" text[],
+ "Lexized token" text
+
+
+
+For a demonstration of how the ts_debug function works, we
+first create a public.english configuration and an
+ispell dictionary for the English language. You can skip these steps and
+experiment with the standard english configuration instead.
+
+CREATE TEXT SEARCH CONFIGURATION public.english LIKE pg_catalog.english WITH MAP AS DEFAULT;
+CREATE TEXT SEARCH DICTIONARY en_ispell
+ TEMPLATE ispell_template
+ OPTION 'DictFile="/usr/local/share/dicts/ispell/english-utf8.dict",
+ AffFile="/usr/local/share/dicts/ispell/english-utf8.aff",
+ StopFile="/usr/local/share/dicts/english.stop"';
+ALTER TEXT SEARCH MAPPING ON public.english FOR lword WITH en_ispell,en_stem;
+
+
+SELECT * FROM ts_debug('public.english','The Brightest supernovaes');
+ Alias | Description | Token | Dicts list | Lexized token
+-------+---------------+-------------+---------------------------------------+---------------------------------
+ lword | Latin word | The | {public.en_ispell,pg_catalog.en_stem} | public.en_ispell: {}
+ blank | Space symbols | | |
+ lword | Latin word | Brightest | {public.en_ispell,pg_catalog.en_stem} | public.en_ispell: {bright}
+ blank | Space symbols | | |
+ lword | Latin word | supernovaes | {public.en_ispell,pg_catalog.en_stem} | pg_catalog.en_stem: {supernova}
+(5 rows)
+
+In this example, the word Brightest was recognized by the
+parser as a Latin word (alias lword)
+and came through the dictionaries public.en_ispell and
+pg_catalog.en_stem. It was recognized by
+public.en_ispell, which reduced it to the word
+bright. The word supernovaes is unknown
+to the public.en_ispell dictionary so it was passed to
+the next dictionary, and, fortunately, was recognized (in fact,
+public.en_stem is a stemming dictionary and recognizes
+everything; that is why it was placed at the end of the dictionary stack).
+
+
+The word The was recognized by the public.en_ispell
+dictionary as a stop word and will not be indexed.
+
+
+You can always explicitly specify which columns you want to see:
+SELECT "Alias", "Token", "Lexized token"
+FROM ts_debug('public.english','The Brightest supernovaes');
+ Alias | Token | Lexized token
+-------+-------------+---------------------------------
+ lword | The | public.en_ispell: {}
+ blank | |
+ lword | Brightest | public.en_ispell: {bright}
+ blank | |
+ lword | supernovaes | pg_catalog.en_stem: {supernova}
+(5 rows)
+
+
+
+
+
+
+
Example of Creating a Rule-Based Dictionary
+
+The motivation for this example dictionary is to control the indexing of
+integers (signed and unsigned), and, consequently, to minimize the number
+of unique words, which greatly affects the performance of searching.
+
+
+The dictionary accepts two options:
+
+
+The MAXLEN parameter specifies the maximum length of the
+number considered as a 'good' integer. The default value is 6.
+
+
+The REJECTLONG parameter specifies whether a 'long' integer
+should be indexed or treated as a stop word. If
+REJECTLONG=FALSE (default),
+the dictionary returns the first MAXLEN digits of the
+integer. If
+REJECTLONG=TRUE, the dictionary
+considers a long integer as a stop word.
+
+
+
+
+
+
+A similar idea can be applied to the indexing of decimal numbers, for
+example, in the DecDict dictionary. The dictionary
+accepts two options: the MAXLENFRAC parameter specifies
+the maximum length of the fractional part considered as a 'good' decimal.
+The default value is 3. The REJECTLONG parameter
+controls whether a decimal number with a 'long' fractional part should be indexed
+or treated as a stop word. If
+REJECTLONG=FALSE (default),
+the dictionary returns the decimal number with its fractional part
+truncated to MAXLENFRAC digits. If
+REJECTLONG=TRUE, the dictionary
+considers the number as a stop word. Notice that
+REJECTLONG=FALSE allows the indexing
+of 'shortened' numbers and search results will contain documents with
+shortened numbers.
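+If such a dictionary were installed under the (hypothetical) name
+decdict, it would behave like this with the default settings:
+SELECT lexize('decdict', '3.14159');
+ lexize
+---------
+ {3.141}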
+
+
+
+Examples:
+SELECT lexize('intdict', '11234567890');
+ lexize
+----------
+ {112345}
+
+
+Now, we want to ignore long integers:
+
+ALTER TEXT SEARCH DICTIONARY intdict SET OPTION 'MAXLEN=6, REJECTLONG=TRUE';
+SELECT lexize('intdict', '11234567890');
+ lexize
+--------
+ {}
+
+
+
+Create a contrib/dict_intdict directory with the files
+dict_tmpl.c, Makefile, and dict_intdict.sql.in, then build and install the module:
+make && make install
+psql DBNAME < dict_intdict.sql
+
+
+
+This is the dict_tmpl.c file:
+
+
+#include "postgres.h"
+#include "utils/builtins.h"
+#include "fmgr.h"
+
+#ifdef PG_MODULE_MAGIC
+PG_MODULE_MAGIC;
+#endif
+
+#include "utils/ts_locale.h"
+#include "utils/ts_public.h"
+#include "utils/ts_utils.h"
+
+ typedef struct {
+ int maxlen;
+ bool rejectlong;
+ } DictInt;
+
+
+ PG_FUNCTION_INFO_V1(dinit_intdict);
+ Datum dinit_intdict(PG_FUNCTION_ARGS);
+
+ Datum
+ dinit_intdict(PG_FUNCTION_ARGS) {
+ DictInt *d = (DictInt*)malloc( sizeof(DictInt) );
+ Map *cfg, *pcfg;
+ text *in;
+
+ if (!d)
+ elog(ERROR, "No memory");
+ memset(d, 0, sizeof(DictInt));
+
+ /* Your INIT code */
+/* defaults */
+ d->maxlen = 6;
+ d->rejectlong = false;
+
+ if ( PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL ) /* no options */
+ PG_RETURN_POINTER(d);
+
+ in = PG_GETARG_TEXT_P(0);
+ parse_keyvalpairs(in, &cfg);
+ PG_FREE_IF_COPY(in, 0);
+ pcfg=cfg;
+
+ while (pcfg->key)
+ {
+ if (strcasecmp("MAXLEN", pcfg->key) == 0)
+ d->maxlen=atoi(pcfg->value);
+ else if ( strcasecmp("REJECTLONG", pcfg->key) == 0)
+ {
+ if ( strcasecmp("true", pcfg->value) == 0 )
+ d->rejectlong=true;
+ else if ( strcasecmp("false", pcfg->value) == 0)
+ d->rejectlong=false;
+ else
+ elog(ERROR,"Unknown value: %s => %s", pcfg->key, pcfg->value);
+ }
+ else
+ elog(ERROR,"Unknown option: %s => %s", pcfg->key, pcfg->value);
+
+ pfree(pcfg->key);
+ pfree(pcfg->value);
+ pcfg++;
+ }
+ pfree(cfg);
+
+ PG_RETURN_POINTER(d);
+ }
+
+PG_FUNCTION_INFO_V1(dlexize_intdict);
+Datum dlexize_intdict(PG_FUNCTION_ARGS);
+Datum
+dlexize_intdict(PG_FUNCTION_ARGS)
+{
+ DictInt *d = (DictInt*)PG_GETARG_POINTER(0);
+ char *in = (char*)PG_GETARG_POINTER(1);
+ char *txt = pnstrdup(in, PG_GETARG_INT32(2));
+ TSLexeme *res = palloc(sizeof(TSLexeme) * 2);
+
+ /* Your lexize code goes here */
+ res[1].lexeme = NULL;
+
+ if (PG_GETARG_INT32(2) > d->maxlen)
+ {
+ if (d->rejectlong)
+ { /* stop, return void array */
+ pfree(txt);
+ res[0].lexeme = NULL;
+ }
+ else
+ { /* cut integer */
+ txt[d->maxlen] = '\0';
+ res[0].lexeme = txt;
+ }
+ }
+ else
+ res[0].lexeme = txt;
+
+ PG_RETURN_POINTER(res);
+}
+
+
+This is the Makefile:
+subdir = contrib/dict_intdict
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+
+MODULE_big = dict_intdict
+OBJS = dict_tmpl.o
+DATA_built = dict_intdict.sql
+DOCS =
+
+include $(top_srcdir)/contrib/contrib-global.mk
+
+
+
+This is the dict_intdict.sql.in file:
+SET default_text_search_config = 'english';
+
+BEGIN;
+
+CREATE OR REPLACE FUNCTION dinit_intdict(internal)
+RETURNS internal
+AS 'MODULE_PATHNAME'
+LANGUAGE 'C';
+
+CREATE OR REPLACE FUNCTION dlexize_intdict(internal,internal,internal,internal)
+RETURNS internal
+AS 'MODULE_PATHNAME'
+LANGUAGE 'C'
+WITH (isstrict);
+
+CREATE TEXT SEARCH DICTIONARY intdict
+ LEXIZE 'dlexize_intdict' INIT 'dinit_intdict'
+ OPTION 'MAXLEN=6,REJECTLONG = false';
+
+COMMENT ON TEXT SEARCH DICTIONARY intdict IS 'Dictionary for Integers';
+
+END;
+
+
+
+
+
+
+
Example of Creating a Parser
+
+
+The SQL command CREATE TEXT SEARCH PARSER creates
+a parser for full text searching. In our example we will implement
+a simple parser which recognizes space-delimited words and
+has only two token types: (3, word, Word) and (12, blank, Space symbols). These identifiers
+were chosen to keep compatibility with the default headline() function,
+since we do not implement our own version.
+
+
+To implement a parser one needs to create a minimum of four functions.
+
+
+
+
+
+
+
+START = start_function
+
+
+
+Initialize the parser. Arguments are a pointer to the parsed text and its
+length.
+
+Returns a pointer to the internal structure of the parser. Note that it should
+be malloced or palloced in the
+TopMemoryContext. We name it ParserState.
+
+
+
+
+
+
+
+GETTOKEN = gettoken_function
+
+
+
+Returns the next token.
+Arguments are ParserState *, char **, int *.
+
+This procedure will be called repeatedly until it returns a token type of zero.
+
+
+
+
+
+
+
+END = end_function,
+
+
+
+This void function will be called after parsing is finished. It should free
+the resources allocated by the parser (the ParserState). The argument
+is ParserState *.
+
+
+
+
+
+
+
+LEXTYPES = lextypes_function
+
+
+
+Returns an array containing the id, alias, and description of the token types
+used by the parser. See LexDescr in src/include/utils/ts_public.h.
+
+
+
+
+
+
+Below is the source code of our test parser, organized as a contrib module.
+
+
+Testing:
+SELECT * FROM parse('testparser','That''s my first own parser');
+ tokid | token
+-------+--------
+ 3 | That's
+ 12 |
+ 3 | my
+ 12 |
+ 3 | first
+ 12 |
+ 3 | own
+ 12 |
+ 3 | parser
+SELECT to_tsvector('testcfg','That''s my first own parser');
+ to_tsvector
+-------------------------------------------------
+ 'my':2 'own':4 'first':3 'parser':5 'that''s':1
+SELECT headline('testcfg','Supernovae stars are the brightest phenomena in galaxies', to_tsquery('testcfg', 'star'));
+ headline
+-----------------------------------------------------------------
+ Supernovae <b>stars</b> are the brightest phenomena in galaxies
+
+
+
+
+This test parser is an example adapted from a tutorial by Valli: the parser
+HOWTO at
+http://www.sai.msu.su/~megera/postgres/gist/tsearch/V2/docs/HOWTO-parser-tsearch2.html.
+
+
+To compile the example just do:
+make
+make install
+psql regression < test_parser.sql
+
+
+
+This is the test_parser.c file:
+
+#include "postgres.h"
+#include "fmgr.h"
+
+#ifdef PG_MODULE_MAGIC
+PG_MODULE_MAGIC;
+#endif
+
+/*
+ * types
+ */
+
+/* self-defined type */
+typedef struct {
+ char * buffer; /* text to parse */
+ int len; /* length of the text in buffer */
+ int pos; /* position of the parser */
+} ParserState;
+
+/* copy-paste from wparser.h of tsearch2 */
+typedef struct {
+ int lexid;
+ char *alias;
+ char *descr;
+} LexDescr;
+
+/*
+ * prototypes
+ */
+PG_FUNCTION_INFO_V1(testprs_start);
+Datum testprs_start(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(testprs_getlexeme);
+Datum testprs_getlexeme(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(testprs_end);
+Datum testprs_end(PG_FUNCTION_ARGS);
+
+PG_FUNCTION_INFO_V1(testprs_lextype);
+Datum testprs_lextype(PG_FUNCTION_ARGS);
+
+/*
+ * functions
+ */
+Datum testprs_start(PG_FUNCTION_ARGS)
+{
+ ParserState *pst = (ParserState *) palloc(sizeof(ParserState));
+ pst->buffer = (char *) PG_GETARG_POINTER(0);
+ pst->len = PG_GETARG_INT32(1);
+ pst->pos = 0;
+
+ PG_RETURN_POINTER(pst);
+}
+
+Datum testprs_getlexeme(PG_FUNCTION_ARGS)
+{
+ ParserState *pst = (ParserState *) PG_GETARG_POINTER(0);
+ char **t = (char **) PG_GETARG_POINTER(1);
+ int *tlen = (int *) PG_GETARG_POINTER(2);
+ int type;
+
+ *tlen = pst->pos;
+ *t = pst->buffer + pst->pos;
+
+ if (pst->pos < pst->len && (pst->buffer)[pst->pos] == ' ')
+ {
+ /* blank type */
+ type = 12;
+ /* skip to the next non-white-space character; check bounds first */
+ while (pst->pos < pst->len &&
+ (pst->buffer)[pst->pos] == ' ')
+ (pst->pos)++;
+ } else {
+ /* word type */
+ type = 3;
+ /* skip to the next white-space character; check bounds first */
+ while (pst->pos < pst->len &&
+ (pst->buffer)[pst->pos] != ' ')
+ (pst->pos)++;
+ }
+
+ *tlen = pst->pos - *tlen;
+
+ /* we are finished if (*tlen == 0) */
+ if (*tlen == 0)
+ type=0;
+
+ PG_RETURN_INT32(type);
+}
+Datum testprs_end(PG_FUNCTION_ARGS)
+{
+ ParserState *pst = (ParserState *) PG_GETARG_POINTER(0);
+ pfree(pst);
+ PG_RETURN_VOID();
+}
+
+Datum testprs_lextype(PG_FUNCTION_ARGS)
+{
+ /*
+ Remarks:
+ - we have to return the blanks for headline reason
+ - we use the same lexids like Teodor in the default
+ word parser; in this way we can reuse the headline
+ function of the default word parser.
+ */
+ LexDescr *descr = (LexDescr *) palloc(sizeof(LexDescr) * (2+1));
+
+ /* there are only two types in this parser */
+ descr[0].lexid = 3;
+ descr[0].alias = pstrdup("word");
+ descr[0].descr = pstrdup("Word");
+ descr[1].lexid = 12;
+ descr[1].alias = pstrdup("blank");
+ descr[1].descr = pstrdup("Space symbols");
+ descr[2].lexid = 0;
+
+ PG_RETURN_POINTER(descr);
+}
+
+
+
+This is the Makefile:
+
+override CPPFLAGS := -I. $(CPPFLAGS)
+
+MODULE_big = test_parser
+OBJS = test_parser.o
+
+DATA_built = test_parser.sql
+DATA =
+DOCS = README.test_parser
+REGRESS = test_parser
+
+
+ifdef USE_PGXS
+PGXS := $(shell pg_config --pgxs)
+include $(PGXS)
+else
+subdir = contrib/test_parser
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
+
+
+This is the test_parser.sql.in file:
+
+SET default_text_search_config = 'english';
+
+BEGIN;
+
+CREATE FUNCTION testprs_start(internal,int4)
+RETURNS internal
+AS 'MODULE_PATHNAME'
+LANGUAGE 'C' with (isstrict);
+
+CREATE FUNCTION testprs_getlexeme(internal,internal,internal)
+RETURNS internal
+AS 'MODULE_PATHNAME'
+LANGUAGE 'C' with (isstrict);
+
+CREATE FUNCTION testprs_end(internal)
+RETURNS void
+AS 'MODULE_PATHNAME'
+LANGUAGE 'C' with (isstrict);
+
+CREATE FUNCTION testprs_lextype(internal)
+RETURNS internal
+AS 'MODULE_PATHNAME'
+LANGUAGE 'C' with (isstrict);
+
+
+CREATE TEXT SEARCH PARSER testparser
+ START 'testprs_start'
+ GETTOKEN 'testprs_getlexeme'
+ END 'testprs_end'
+ LEXTYPES 'testprs_lextype'
+;
+
+CREATE TEXT SEARCH CONFIGURATION testcfg PARSER 'testparser';
+ALTER TEXT SEARCH CONFIGURATION testcfg ADD MAPPING FOR word WITH simple;
+
+END;
+
+
+
+
+
+
+