Rework word_similarity documentation, make it close to actual algorithm.

author Teodor Sigaev

Wed, 21 Mar 2018 11:37:51 +0000 (14:37 +0300)

committer Teodor Sigaev

Wed, 21 Mar 2018 11:37:51 +0000 (14:37 +0300)
author Teodor Sigaev
Wed, 21 Mar 2018 11:37:51 +0000 (14:37 +0300)
committer Teodor Sigaev
Wed, 21 Mar 2018 11:37:51 +0000 (14:37 +0300)
diff --git a/contrib/pg_trgm/trgm_op.c b/contrib/pg_trgm/trgm_op.c

index 368e7c8941d08252169dd616a0274ca02b78fe88..32adecc9b82d1619b13f9922076427bd86d2dbd8 100644 (file)
--- a/contrib/pg_trgm/trgm_op.c
+++ b/contrib/pg_trgm/trgm_op.c
@@ -456,7 +456,7 @@ iterate_word_similarity(int *trg2indexes,
             lastpos[trgindex] = i;
         }
  
-       /* Adjust lower bound if this trigram is present in required substring */
+       /* Adjust upper bound if this trigram is present in required substring */
         if (found[trgindex])
         {
             int         prev_lower,
@@ -473,7 +473,7 @@ iterate_word_similarity(int *trg2indexes,
  
             smlr_cur = CALCSML(count, ulen1, ulen2);
  
-           /* Also try to adjust upper bound for greater similarity */
+           /* Also try to adjust lower bound for greater similarity */
             tmp_count = count;
             tmp_ulen2 = ulen2;
             prev_lower = lower;
diff --git a/doc/src/sgml/pgtrgm.sgml b/doc/src/sgml/pgtrgm.sgml

index 775a7b8be7933166d1e644265d0dadbbfae19d81..2613c7a3d5c40a9895bd8e4638c0a2e8acc88e49 100644 (file)
--- a/doc/src/sgml/pgtrgm.sgml
+++ b/doc/src/sgml/pgtrgm.sgml
@@ -99,12 +99,10 @@
        
        real
        
-       Returns a number that indicates how similar the first string
-       to the most similar word of the second string. The function searches in
-       the second string a most similar word not a most similar substring.  The
-       range of the result is zero (indicating that the two strings are
-       completely dissimilar) to one (indicating that the first string is
-       identical to one of the words of the second string).
+       Returns a number that indicates the greatest similarity between
+       the set of trigrams in the first string and any continuous extent
+       of an ordered set of trigrams in the second string.  For details, see
+       the explanation below.
        
       
       
@@ -131,6 +129,34 @@
     
    
  
+  
+   Consider the following example:
+
+
+# SELECT word_similarity('word', 'two words');
+ word_similarity
+-----------------
+             0.8
+(1 row)
+
+
+   In the first string, the set of trigrams is
+   {"  w"," wo","ord","wor","rd "}.
+   In the second string, the ordered set of trigrams is
+   {"  t"," tw","two","wo ","  w"," wo","wor","ord","rds","ds "}.
+   The most similar extent of an ordered set of trigrams in the second string
+   is {"  w"," wo","wor","ord"}, and the similarity is
+   0.8.
+  
+
+  
+   This function returns a value that can be approximately understood as the
+   greatest similarity between the first string and any substring of the second
+   string.  However, this function does not add padding to the boundaries of
+   the extent.  Thus, a whole word match gets a higher score than a match with
+   a part of the word.
+  
+
    
     <filename>pg_trgm</filename> Operators
     
@@ -156,10 +182,11 @@
        text <% text
        boolean
        
-       Returns true if its first argument has the similar word in
-       the second argument and they have a similarity that is greater than the
-       current word similarity threshold set by
-       pg_trgm.word_similarity_threshold parameter.
+       Returns true if the similarity between the trigram
+       set in the first argument and a continuous extent of an ordered trigram
+       set in the second argument is greater than the current word similarity
+       threshold set by pg_trgm.word_similarity_threshold
+       parameter.
        
       
       
@@ -302,10 +329,11 @@ SELECT t, word_similarity('word', t) AS sml
    WHERE 'word' <% t
    ORDER BY sml DESC, t;
  
-   This will return all values in the text column that have a word
-   which sufficiently similar to word, sorted from best
-   match to worst.  The index will be used to make this a fast operation
-   even over very large data sets.
+   This will return all values in the text column for which there is a
+   continuous extent in the corresponding ordered trigram set that is
+   sufficiently similar to the trigram set of word,
+   sorted from best match to worst.  The index will be used to make this
+   a fast operation even over very large data sets.
author	Teodor Sigaev
	Wed, 21 Mar 2018 11:37:51 +0000 (14:37 +0300)
committer	Teodor Sigaev
	Wed, 21 Mar 2018 11:37:51 +0000 (14:37 +0300)
contrib/pg_trgm/trgm_op.c		patch \| blob \| blame \| history
doc/src/sgml/pgtrgm.sgml		patch \| blob \| blame \| history