1. Fix problem with lost precision in rank with OR-ed lexemes
author: Teodor Sigaev
Fri, 28 Oct 2005 13:05:06 +0000 (13:05 +0000)
committer: Teodor Sigaev
Fri, 28 Oct 2005 13:05:06 +0000 (13:05 +0000)
2. Allow tsquery_in to input void tsquery: resolve dump/restore problem with tsquery

contrib/tsearch2/expected/tsearch2.out
contrib/tsearch2/query.c
contrib/tsearch2/rank.c

index 6c266a29ac6c4191f3b573df1a1a345611869127..a7ac240ef9eb04e6076cf48153841554e193c3a9 100644 (file)
@@ -746,21 +746,21 @@ select count(*) FROM test_tsvector WHERE a @@ to_tsquery('copyright');
 (1 row)
 
 select rank(' a:1 s:2C d g'::tsvector, 'a | s');
- rank 
-------
- 0.28
+   rank    
+-----------
+ 0.0911891
 (1 row)
 
 select rank(' a:1 s:2B d g'::tsvector, 'a | s');
- rank 
-------
- 0.46
+   rank   
+----------
+ 0.151982
 (1 row)
 
 select rank(' a:1 s:2 d g'::tsvector, 'a | s');
- rank 
-------
- 0.19
+   rank    
+-----------
+ 0.0607927
 (1 row)
 
 select rank(' a:1 s:2C d g'::tsvector, 'a & s');
index d8b8d4c80d55135e60d7bfbd98a758dba4f0f906..013f0031965e40014e77c4a900a90435559096db 100644 (file)
@@ -55,6 +55,7 @@ Datum     to_tsquery_current(PG_FUNCTION_ARGS);
 /* parser's states */
 #define WAITOPERAND 1
 #define WAITOPERATOR   2
+#define WAITFIRSTOPERAND 3
 
 /*
  * node of query tree, also used
@@ -137,6 +138,7 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
    {
        switch (state->state)
        {
+           case WAITFIRSTOPERAND:
            case WAITOPERAND:
                if (*(state->buf) == '!')
                {
@@ -159,14 +161,16 @@ gettoken_query(QPRS_STATE * state, int4 *val, int4 *lenval, char **strval, int2
                else if (*(state->buf) != ' ')
                {
                    state->valstate.prsbuf = state->buf;
-                   state->state = WAITOPERATOR;
                    if (gettoken_tsvector(&(state->valstate)))
                    {
                        *strval = state->valstate.word;
                        *lenval = state->valstate.curpos - state->valstate.word;
                        state->buf = get_weight(state->valstate.prsbuf, weight);
+                       state->state = WAITOPERATOR;
                        return VAL;
                    }
+                   else if ( state->state == WAITFIRSTOPERAND ) 
+                       return END;
                    else
                        ereport(ERROR,
                                (errcode(ERRCODE_SYNTAX_ERROR),
@@ -596,7 +600,7 @@ static QUERYTYPE *
 
    /* init state */
    state.buf = buf;
-   state.state = WAITOPERAND;
+   state.state = WAITFIRSTOPERAND;
    state.count = 0;
    state.num = 0;
    state.str = NULL;
@@ -616,10 +620,13 @@ static QUERYTYPE *
    /* parse query & make polish notation (postfix, but in reverse order) */
    makepol(&state, pushval);
    pfree(state.valstate.word);
-   if (!state.num)
-       ereport(ERROR,
-               (errcode(ERRCODE_SYNTAX_ERROR),
-                errmsg("empty query")));
+   if (!state.num) {
+       elog(NOTICE, "Query doesn't contain lexem(s)");
+       query = (QUERYTYPE*)palloc( HDRSIZEQT );
+       query->len = HDRSIZEQT;
+       query->size = 0;
+       return query; 
+   }
 
    /* make finish struct */
    commonlen = COMPUTESIZE(state.num, state.sumlen);
@@ -905,6 +912,10 @@ to_tsquery(PG_FUNCTION_ARGS)
    PG_FREE_IF_COPY(in, 1);
 
    query = queryin(str, pushval_morph, PG_GETARG_INT32(0));
+   
+   if ( query->size == 0 )
+       PG_RETURN_POINTER(query);
+
    res = clean_fakeval_v2(GETQUERY(query), &len);
    if (!res)
    {
index 081b0840875aa360270c963277f792bbb1868313..40bec1f48407be28356ef0c60313e144f755ed39 100644 (file)
@@ -257,7 +257,7 @@ calc_rank_or(float *w, tsvector * t, QUERYTYPE * q)
    int4        dimt,
                j,
                i;
-   float       res = -1.0;
+   float       res = 0.0;
    ITEM      **item;
    int         size = q->size;
 
@@ -266,6 +266,8 @@ calc_rank_or(float *w, tsvector * t, QUERYTYPE * q)
 
    for (i = 0; i < size; i++)
    {
+       float resj,wjm;
+       int4  jm;
        entry = find_wordentry(t, q, item[i]);
        if (!entry)
            continue;
@@ -281,14 +283,27 @@ calc_rank_or(float *w, tsvector * t, QUERYTYPE * q)
            post = POSNULL + 1;
        }
 
-       for (j = 0; j < dimt; j++)
-       {
-           if (res < 0)
-               res = wpos(post[j]);
-           else
-               res = 1.0 - (1.0 - res) * (1.0 - wpos(post[j]));
-       }
+                resj = 0.0;
+                wjm = -1.0;
+                jm = 0;
+                for (j = 0; j < dimt; j++)
+                {
+                        resj = resj + wpos(post[j])/((j+1)*(j+1));
+                        if ( wpos(post[j]) > wjm ) {
+                                wjm = wpos(post[j]);
+                                jm  = j;
+                        }
+                }
+/* 
+        limit (sum(1/i^2),i->inf) = pi^2/6 (~1.64493406685, the divisor used below)
+        resj = sum(wi/i^2),i=1,noccurrence,
+        wi - should be sorted in descending order; 
+        we don't sort for now, we just give the maximum weight the first (1/1^2) coefficient. This should be corrected
+       Oleg Bartunov
+*/
+                res = res + ( wjm + resj - wjm/((jm+1)*(jm+1)))/1.64493406685; 
    }
+   res = res /size;
    pfree(item);
    return res;
 }