August 13, 2002
author Bruce Momjian
Thu, 15 Aug 2002 03:02:08 +0000 (03:02 +0000)
committer Bruce Momjian
Thu, 15 Aug 2002 03:02:08 +0000 (03:02 +0000)
         Use parser of OpenFTS v0.33.

--
Teodor Sigaev

contrib/tsearch/README.tsearch
contrib/tsearch/deflex.h
contrib/tsearch/expected/tsearch.out
contrib/tsearch/morph.c
contrib/tsearch/parser.l

index c63ae91edd096cfbf4646e565f4cd1c41b08e85e..a57df55eea79f39d9f4a7bcb9a39185680266152 100644 (file)
@@ -4,6 +4,11 @@ a searchable data type (textual) with indexed access.
 All work was done by Teodor Sigaev ([email protected]) and Oleg Bartunov
 
+CHANGES:
+
+August 13, 2002
+   Use parser of OpenFTS v0.33.
+
 IMPORTANT NOTICE:
 
 This is a first step of our work on integration of OpenFTS
index f9d6847167988e8a11aae2226b134757ecf65b02..17c4fdf1ec3e765bfb71bb0f98c85b6c3805512e 100644 (file)
@@ -2,28 +2,33 @@
 #define __DEFLEX_H__
 
 /* rememder !!!! */
-#define LASTNUM        19
+#define LASTNUM        23
 
 #define LATWORD        1
-#define NONLATINWORD   2
+#define CYRWORD        2
 #define UWORD      3
 #define EMAIL      4
 #define FURL       5
 #define HOST       6
-#define FLOAT      7
-#define FINT       8
-#define PARTWORD   9
-#define NONLATINPARTWORD   10
-#define LATPARTWORD        11
-#define SPACE      12
-#define SYMTAG     13
-#define HTTP       14
-#define DEFISWORD  15
-#define DEFISLATWORD   16
-#define DEFISNONLATINWORD  17
+#define SCIENTIFIC 7
+#define VERSIONNUMBER  8
+#define PARTHYPHENWORD     9   
+#define CYRPARTHYPHENWORD  10  
+#define LATPARTHYPHENWORD  11  
+#define SPACE      12
+#define TAG        13
+#define HTTP       14
+#define HYPHENWORD 15
+#define LATHYPHENWORD  16
+#define CYRHYPHENWORD  17
 #define URI        18
 #define FILEPATH   19
+#define DECIMAL        20
+#define SIGNEDINT  21
+#define UNSIGNEDINT    22
+#define HTMLENTITY 23
 
 extern const char *descr[];
 
 #endif
+
index f75b429bcbb436f671a14fe6017753cd5fb0931d..0b12765d8f6cf0335703f54fe0c2007685867ad2 100644 (file)
@@ -689,9 +689,9 @@ SELECT count(*) FROM test_txtidx WHERE a ## '(eq|yt)&(wR|qh)';
 select txt2txtidx('345 [email protected] \' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/?  ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 [email protected] qwe-wer asdf qwer jf sdjk ewr1> ewri2 ">
 /usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234 
  wow  < jqw <> qwerty');
-                                                                                                                                                                                                                                                                                                                                            txt2txtidx                                                                                                                                                                                                                                                                                                                                             
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
- 'ad' 'dw' 'jf' '234' '345' '4.2' '455' 'jqw' 'qwe' 'wer' 'wow' 'asdf' 'ewr1' 'qwer' 'sdjk' '5.005' 'ewri2' 'qwqwe' 'wefjn' 'gist.c' 'gist.h' 'qwerti' '234.435' ':8100/?' 'qwe-wer' 'readlin' 'www.com' '+4.0e-10' 'gist.h.c' 'rewt/ewr' '[email protected]' '/?ad=qwe&dw' '/wqe-324/ewr' 'aew.werc.ewr' '1aew.werc.ewr' '2aew.werc.ewr' '3aew.werc.ewr' '4aew.werc.ewr' '5aew.werc.ewr' '6aew.werc.ewr' '7aew.werc.ewr' '/usr/local/fff' '/awdf/dwqe/4325' ':8100/?ad=qwe&dw' '[email protected]' '5aew.werc.ewr:8100/?' ':8100/?ad=qwe&dw=%20%32' 'aew.werc.ewr/?ad=qwe&dw' '1aew.werc.ewr/?ad=qwe&dw' '3aew.werc.ewr/?ad=qwe&dw' '6aew.werc.ewr:8100/?ad=qwe&dw' '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32'
+                                                                                                                                                                                                                                                                                                                                                   txt2txtidx                                                                                                                                                                                                                                                                                                                                                   
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ 'ad' 'dw' 'jf' '234' '345' '4.2' '455' 'jqw' 'qwe' 'wer' 'wow' 'asdf' 'ewr1' 'qwer' 'sdjk' '5.005' 'ewri2' 'qwqwe' 'wefjn' 'gist.c' 'gist.h' 'qwerti' '234.435' ':8100/?' 'qwe-wer' 'readlin' 'www.com' '+4.0e-10' 'gist.h.c' 'rewt/ewr' '[email protected]' 'readline-4' '/?ad=qwe&dw' '/wqe-324/ewr' 'aew.werc.ewr' '1aew.werc.ewr' '2aew.werc.ewr' '3aew.werc.ewr' '4aew.werc.ewr' '5aew.werc.ewr' '6aew.werc.ewr' '7aew.werc.ewr' '/usr/local/fff' '/awdf/dwqe/4325' ':8100/?ad=qwe&dw' '[email protected]' '5aew.werc.ewr:8100/?' ':8100/?ad=qwe&dw=%20%32' 'aew.werc.ewr/?ad=qwe&dw' '1aew.werc.ewr/?ad=qwe&dw' '3aew.werc.ewr/?ad=qwe&dw' '6aew.werc.ewr:8100/?ad=qwe&dw' '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32'
 (1 row)
 
 select txtidxsize(txt2txtidx('345 qw'));
@@ -705,7 +705,7 @@ select txtidxsize(txt2txtidx('345 [email protected] \' http://www.com/ http://aew.werc.e
  wow  < jqw <> qwerty'));
  txtidxsize 
 ------------
-         52
+         53
 (1 row)
 
 insert into test_txtidx (a) values ('345 qwerty');
index 60797b07e92441c7f218c16d775e22e658a249ed..b29a3f6779dbe57e786f04312a7fb72d7fcb928d 100644 (file)
@@ -75,19 +75,23 @@ static MAPDICT mapdict[] = {
    {NODICT, NODICT},           /* EMAIL        */
    {NODICT, NODICT},           /* FURL         */
    {NODICT, NODICT},           /* HOST         */
-   {NODICT, NODICT},           /* FLOAT        */
-   {NODICT, NODICT},           /* FINT         */
-   {BYLOCALE, DEFAULTDICT},    /* PARTWORD     */
-   {BYLOCALE, NODICT},         /* NONLATINPARTWORD */
-   {DEFAULTDICT, NODICT},      /* LATPARTWORD      */
+   {NODICT, NODICT},           /* SCIENTIFIC       */
+   {NODICT, NODICT},           /* VERSIONNUMBER        */
+   {BYLOCALE, DEFAULTDICT},    /* PARTHYPHENWORD       */
+   {BYLOCALE, NODICT},         /* CYRPARTHYPHENWORD */
+   {DEFAULTDICT, NODICT},      /* LATPARTHYPHENWORD        */
    {STOPLEXEM, NODICT},        /* SPACE        */
-   {STOPLEXEM, NODICT},        /* SYMTAG       */
+   {STOPLEXEM, NODICT},        /* TAG      */
    {STOPLEXEM, NODICT},        /* HTTP         */
-   {BYLOCALE, DEFAULTDICT},    /* DEFISWORD        */
-   {DEFAULTDICT, NODICT},      /* DEFISLATWORD     */
-   {BYLOCALE, NODICT},         /* DEFISNONLATINWORD    */
+   {BYLOCALE, DEFAULTDICT},    /* HYPHENWORD       */
+   {DEFAULTDICT, NODICT},      /* LATHYPHENWORD        */
+   {BYLOCALE, NODICT},         /* CYRHYPHENWORD    */
    {NODICT, NODICT},           /* URI          */
-   {NODICT, NODICT}            /* FILEPATH     */
+   {NODICT, NODICT},           /* FILEPATH     */
+   {NODICT, NODICT},           /* DECIMAL      */
+   {NODICT, NODICT},           /* SIGNEDINT        */
+   {NODICT, NODICT},           /* UNSIGNEDINT      */
+   {STOPLEXEM, NODICT}         /* HTMLENTITY       */
 };
 
 static bool inited = false;
index 6081fd4c7bec02bcbada539f43a6d65a3eb7cb5f..f30fbcd4f4608a8b104c1b1ce678e1c7a31af5bf 100644 (file)
@@ -5,18 +5,17 @@
 
 /* postgres allocation function */
 #include "postgres.h"
-#define free   pfree
-#define malloc palloc
+#define free    pfree
+#define malloc  palloc
 #define realloc repalloc
 
 #ifdef strdup
 #undef strdup
 #endif
-#define strdup pstrdup
-
+#define strdup  pstrdup
 
 char *token = NULL;  /* pointer to token */
-char *s     = NULL;  /* for returning full defis-word */
+char *s     = NULL;  /* to return WHOLE hyphenated-word */
 
 YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */
 
@@ -57,21 +56,21 @@ int bytestoread = 0;    /* for limiting read from filehandle */
 %option nounput
 %option noyywrap
 
-
-/* parser's state for parsing defis-word */
+/* parser's state for parsing hyphenated-word */
 %x DELIM  
 /* parser's state for parsing URL*/
 %x URL  
 %x SERVER  
 
-/* parser's state for parsing filepath */
-
+/* parser's state for parsing TAGS */
 %x INTAG
 %x QINTAG
+%x INCOMMENT
+%x INSCRIPT
 
-/* NONLATIN char */
-NONLATINALNUM  [0-9\200-\377]
-NONLATINALPHA  [\200-\377]
+/* cyrillic koi8 char */
+CYRALNUM   [0-9\200-\377]
+CYRALPHA   [\200-\377]
 ALPHA      [a-zA-Z\200-\377]
 ALNUM      [0-9a-zA-Z\200-\377]
 
@@ -81,66 +80,59 @@ URI     [-_[:alnum:]/%,\.;=&?#]+
 
 %%
 
-"<"[[:alpha:]] { BEGIN INTAG;
-   token = tsearch_yytext;
-   tokenlen = tsearch_yyleng;
-   return SYMTAG;
- }
-
-"</"[[:alpha:]] { BEGIN INTAG;
-   token = tsearch_yytext;
-   tokenlen = tsearch_yyleng;
-   return SYMTAG;
- }
+"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; }
 
-"<>" {
+"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" {
+   BEGIN INITIAL; 
+   *tsearch_yytext=' '; *(tsearch_yytext+1) = '\0'; 
    token = tsearch_yytext;
    tokenlen = tsearch_yyleng;
-   return SYMTAG;
+   return SPACE;
 }
 
-"<"[^>[:alpha:]] { 
+"<!--"   { BEGIN INCOMMENT; }
+
+"-->"   { 
+   BEGIN INITIAL;
+   *tsearch_yytext=' '; *(tsearch_yytext+1) = '\0'; 
    token = tsearch_yytext;
    tokenlen = tsearch_yyleng;
    return SPACE;
 }
 
-"\""    { BEGIN QINTAG;
-   token = tsearch_yytext;
-   tokenlen = tsearch_yyleng;
-   return SYMTAG;
- }
 
-"\\\"" {
-   token = tsearch_yytext;
-   tokenlen = tsearch_yyleng;
-   return SYMTAG;
-}
+"<"[\![:alpha:]]   { BEGIN INTAG; }
 
-"\""   { BEGIN INTAG;
-   token = tsearch_yytext;
-   tokenlen = tsearch_yyleng;
-   return SYMTAG;
- }
+"</"[[:alpha:]]   { BEGIN INTAG; }
 
-.|\n   {
+"\""    { BEGIN QINTAG; }
+
+"\\\"" ;
+
+"\""   { BEGIN INTAG; }
+
+">" { 
+   BEGIN INITIAL;
    token = tsearch_yytext;
-   tokenlen = tsearch_yyleng;
-   return SYMTAG;
+   *tsearch_yytext=' '; 
+   token = tsearch_yytext;
+   tokenlen = 1;
+   return TAG;
 }
 
-">" { BEGIN INITIAL;
+.|\n  ;
+
+\&(quot|amp|nbsp|lt|gt)\;   {
    token = tsearch_yytext;
    tokenlen = tsearch_yyleng;
-   return SYMTAG;
- }
+   return HTMLENTITY;
+}
 
-.|\n    {
+\&\#[0-9][0-9]?[0-9]?\; {
    token = tsearch_yytext;
    tokenlen = tsearch_yyleng;
-   return SYMTAG;
+   return HTMLENTITY;
 }
-
  
 [-_\.[:alnum:]]+@{HOSTNAME}  /* Emails */ { 
    token = tsearch_yytext; 
@@ -148,22 +140,34 @@ URI       [-_[:alnum:]/%,\.;=&?#]+
    return EMAIL; 
 }
 
-[0-9]   /* digit's and point (might be a version) */ { 
+[+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+  /* float */   { 
    token = tsearch_yytext; 
    tokenlen = tsearch_yyleng;
-   return FINT; 
+   return SCIENTIFIC; 
+}
+
+[0-9]+\.[0-9]+\.[0-9\.]*[0-9] {
+   token = tsearch_yytext;
+   tokenlen = tsearch_yyleng;
+   return VERSIONNUMBER;
+}
+
+[+-]?[0-9]+\.[0-9]+ {
+   token = tsearch_yytext;
+   tokenlen = tsearch_yyleng;
+   return DECIMAL;
 }
 
-[0-9]+[0-9\.]*[0-9]     /* digit's and point (might be a version) */ { 
+[+-][0-9]+ { 
    token = tsearch_yytext; 
    tokenlen = tsearch_yyleng;
-   return FINT; 
+   return SIGNEDINT; 
 }
 
-[+-]?[0-9\.]+[eE][+-]?[0-9]+  /* float */  { 
+[0-9]+ { 
    token = tsearch_yytext; 
    tokenlen = tsearch_yyleng;
-   return FLOAT; 
+   return UNSIGNEDINT; 
 }
 
 http"://"        { 
@@ -208,52 +212,58 @@ ftp"://"        {
    return FILEPATH;
 }
 
-({NONLATINALNUM}+-)+{NONLATINALPHA}+ /* composite-word */  {
+({CYRALPHA}+-)+{CYRALPHA}+ /* composite-word */    {
    BEGIN DELIM;
    if (s) { free(s); s=NULL; } 
    s = strdup( tsearch_yytext );
    tokenlen = tsearch_yyleng;
    yyless( 0 );
    token = s;
-   return DEFISNONLATINWORD;
+   return CYRHYPHENWORD;
 }
 
-([[:alnum:]]+-)+[[:alpha:]]+ /* composite-word */  {
+([[:alpha:]]+-)+[[:alpha:]]+ /* composite-word */  {
     BEGIN DELIM;
    if (s) { free(s); s=NULL; } 
-   tokenlen = tsearch_yyleng;
    s = strdup( tsearch_yytext );
+   tokenlen = tsearch_yyleng;
    yyless( 0 );
    token = s;
-   return DEFISLATWORD;
+   return LATHYPHENWORD;
 }
 
-({ALNUM}+-)+{ALPHA}+ /* composite-word */  {
+({ALNUM}+-)+{ALNUM}+ /* composite-word */  {
    BEGIN DELIM;
    if (s) { free(s); s=NULL; } 
    s = strdup( tsearch_yytext );
    tokenlen = tsearch_yyleng;
    yyless( 0 );
    token = s;
-   return DEFISWORD;
+   return HYPHENWORD;
+}
+
+\+?[0-9]+\.[0-9]+ {
+   token = tsearch_yytext;
+   tokenlen = tsearch_yyleng;
+   return DECIMAL;
 }
 
-{NONLATINALNUM}+  /* one word in composite-word */  { 
+{CYRALPHA}+  /* one word in composite-word */   { 
    token = tsearch_yytext; 
    tokenlen = tsearch_yyleng;
-   return NONLATINPARTWORD; 
+   return CYRPARTHYPHENWORD; 
 }
 
-[[:alnum:]]+  /* one word in composite-word */  { 
+[[:alpha:]]+  /* one word in composite-word */  { 
    token = tsearch_yytext; 
    tokenlen = tsearch_yyleng;
-   return LATPARTWORD; 
+   return LATPARTHYPHENWORD; 
 }
 
 {ALNUM}+  /* one word in composite-word */  { 
    token = tsearch_yytext; 
    tokenlen = tsearch_yyleng;
-   return PARTWORD; 
+   return PARTHYPHENWORD; 
 }
 
 -  { 
@@ -264,17 +274,16 @@ ftp"://"        {
 
 .|\n /* return in basic state */ {
    BEGIN INITIAL;
-   tokenlen = tsearch_yyleng;
    yyless( 0 );
 }
 
-{NONLATINALNUM}+ /* normal word */ { 
+{CYRALPHA}+ /* normal word */  { 
    token = tsearch_yytext; 
    tokenlen = tsearch_yyleng;
-   return NONLATINWORD; 
+   return CYRWORD; 
 }
 
-[[:alnum:]]+ /* normal word */ { 
+[[:alpha:]]+ /* normal word */ { 
    token = tsearch_yytext; 
    tokenlen = tsearch_yyleng;
    return LATWORD; 
@@ -286,7 +295,13 @@ ftp"://"        {
    return UWORD; 
 }
 
-.|\n {
+[ \r\n\t]+ {
+   token = tsearch_yytext;
+   tokenlen = tsearch_yyleng;
+   return SPACE;
+}
+
+. {
    token = tsearch_yytext;
    tokenlen = tsearch_yyleng;
    return SPACE;