Improve word parser.
author Teodor Sigaev
Mon, 5 Dec 2005 18:13:22 +0000 (18:13 +0000)
committer Teodor Sigaev
Mon, 5 Dec 2005 18:13:22 +0000 (18:13 +0000)
 - improve file and path recognition
 - fix misspelling
 - improve tag recognition

contrib/tsearch2/wordparser/parser.c
contrib/tsearch2/wordparser/parser.h

index 282acf6e363eb05a5e443ca5c064385a8d06d94b..deccdb284ae1e36e52e4338cf14f8d254a45c5b4 100644 (file)
@@ -327,6 +327,7 @@ static TParserStateActionItem actionTPS_Base[] = {
    {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
    {p_iseqC, '&', A_PUSH, TPS_InHTMLEntityFirst, 0, NULL},
    {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
+   {p_iseqC, '.', A_PUSH, TPS_InPathFirst, 0, NULL},
    {NULL, 0, A_NEXT, TPS_InSpace, 0, NULL}
 };
 
@@ -336,6 +337,7 @@ static TParserStateActionItem actionTPS_InUWord[] = {
    {p_isalnum, 0, A_NEXT, TPS_InUWord, 0, NULL},
    {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
    {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
+   {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
    {p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL},
    {NULL, 0, A_BINGO, TPS_Base, UWORD, NULL}
 };
@@ -343,8 +345,8 @@ static TParserStateActionItem actionTPS_InUWord[] = {
 static TParserStateActionItem actionTPS_InLatWord[] = {
    {p_isEOF, 0, A_BINGO, TPS_Base, LATWORD, NULL},
    {p_islatin, 0, A_NEXT, TPS_Null, 0, NULL},
-   {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomen, 0, NULL},
-   {p_iseqC, '.', A_PUSH, TPS_InFileFirst, 0, NULL},
+   {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
+   {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
    {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
    {p_iseqC, '-', A_PUSH, TPS_InHyphenLatWordFirst, 0, NULL},
    {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
@@ -366,7 +368,7 @@ static TParserStateActionItem actionTPS_InCyrWord[] = {
 static TParserStateActionItem actionTPS_InUnsignedInt[] = {
    {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
    {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
-   {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomen, 0, NULL},
+   {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
    {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
    {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
    {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
@@ -500,10 +502,19 @@ static TParserStateActionItem actionTPS_InTagFirst[] = {
    {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    {p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
    {p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
+   {p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
    {p_islatin, 0, A_PUSH, TPS_InTag, 0, NULL},
    {NULL, 0, A_POP, TPS_Null, 0, NULL}
 };
 
+static TParserStateActionItem actionTPS_InXMLBegin[] = {
+   {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+   /* <?xml ... */
+   {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL},
+   {p_iseqC, 'X', A_NEXT, TPS_InTag, 0, NULL},
+   {NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
 static TParserStateActionItem actionTPS_InTagCloseFirst[] = {
    {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    {p_islatin, 0, A_NEXT, TPS_InTag, 0, NULL},
@@ -520,6 +531,11 @@ static TParserStateActionItem actionTPS_InTag[] = {
    {p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
    {p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
    {p_iseqC, '#', A_NEXT, TPS_Null, 0, NULL},
+   {p_iseqC, '/', A_NEXT, TPS_Null, 0, NULL},
+   {p_iseqC, ':', A_NEXT, TPS_Null, 0, NULL},
+   {p_iseqC, '.', A_NEXT, TPS_Null, 0, NULL},
+   {p_iseqC, '&', A_NEXT, TPS_Null, 0, NULL},
+   {p_iseqC, '?', A_NEXT, TPS_Null, 0, NULL},
    {p_iseqC, '%', A_NEXT, TPS_Null, 0, NULL},
    {p_isspace, 0, A_NEXT, TPS_Null, 0, SpecialTags},
    {NULL, 0, A_POP, TPS_Null, 0, NULL}
@@ -551,6 +567,9 @@ static TParserStateActionItem actionTPS_InTagEnd[] = {
 static TParserStateActionItem actionTPS_InCommentFirst[] = {
    {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
    {p_iseqC, '-', A_NEXT, TPS_InCommentLast, 0, NULL},
+   /* <!DOCTYPE ...> */
+   {p_iseqC, 'D', A_NEXT, TPS_InTag, 0, NULL},
+   {p_iseqC, 'd', A_NEXT, TPS_InTag, 0, NULL},
    {NULL, 0, A_POP, TPS_Null, 0, NULL}
 };
 
@@ -583,30 +602,30 @@ static TParserStateActionItem actionTPS_InCommentEnd[] = {
    {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG, NULL}
 };
 
-static TParserStateActionItem actionTPS_InHostFirstDomen[] = {
+static TParserStateActionItem actionTPS_InHostFirstDomain[] = {
    {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
-   {p_islatin, 0, A_NEXT, TPS_InHostDomenSecond, 0, NULL},
+   {p_islatin, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
    {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
    {NULL, 0, A_POP, TPS_Null, 0, NULL}
 };
 
-static TParserStateActionItem actionTPS_InHostDomenSecond[] = {
+static TParserStateActionItem actionTPS_InHostDomainSecond[] = {
    {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
-   {p_islatin, 0, A_NEXT, TPS_InHostDomen, 0, NULL},
+   {p_islatin, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
    {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
    {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
-   {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomen, 0, NULL},
+   {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
    {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
    {NULL, 0, A_POP, TPS_Null, 0, NULL}
 };
 
-static TParserStateActionItem actionTPS_InHostDomen[] = {
+static TParserStateActionItem actionTPS_InHostDomain[] = {
    {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
-   {p_islatin, 0, A_NEXT, TPS_InHostDomen, 0, NULL},
+   {p_islatin, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
    {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
    {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
    {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
-   {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomen, 0, NULL},
+   {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
    {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
    {p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
    {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURIStart, HOST, NULL},
@@ -640,7 +659,7 @@ static TParserStateActionItem actionTPS_InHost[] = {
    {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
    {p_islatin, 0, A_NEXT, TPS_InHost, 0, NULL},
    {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
-   {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomen, 0, NULL},
+   {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
    {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
    {NULL, 0, A_POP, TPS_Null, 0, NULL}
 };
@@ -652,14 +671,32 @@ static TParserStateActionItem actionTPS_InEmail[] = {
 
 static TParserStateActionItem actionTPS_InFileFirst[] = {
    {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
-   {p_islatin, 0, A_CLEAR, TPS_InFile, 0, NULL},
-   {p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
-   {p_iseqC, '.', A_CLEAR, TPS_InFile, 0, NULL},
-   {p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
+   {p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL},
+   {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
+   {p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
+   {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
    {p_iseqC, '?', A_PUSH, TPS_InURIFirst, 0, NULL},
    {NULL, 0, A_POP, TPS_Null, 0, NULL}
 };
 
+static TParserStateActionItem actionTPS_InPathFirst[] = {
+   {p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
+   {p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL},
+   {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
+   {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
+   {p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
+   {p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
+   {NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
+static TParserStateActionItem actionTPS_InPathSecond[] = {
+   {p_isEOF, 0, A_BINGO|A_CLEAR, TPS_Base, FILEPATH, NULL},
+   {p_iseqC, '/', A_NEXT|A_PUSH, TPS_InFileFirst, 0, NULL},
+   {p_iseqC, '/', A_BINGO|A_CLEAR, TPS_Base, FILEPATH, NULL},
+   {p_isspace, 0, A_BINGO|A_CLEAR, TPS_Base, FILEPATH, NULL},
+   {NULL, 0, A_POP, TPS_Null, 0, NULL}
+};
+
 static TParserStateActionItem actionTPS_InFile[] = {
    {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
    {p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL},
@@ -894,6 +931,7 @@ static const TParserStateAction Actions[] = {
    {TPS_InHTMLEntityNum, actionTPS_InHTMLEntityNum},
    {TPS_InHTMLEntityEnd, actionTPS_InHTMLEntityEnd},
    {TPS_InTagFirst, actionTPS_InTagFirst},
+   {TPS_InXMLBegin, actionTPS_InXMLBegin},
    {TPS_InTagCloseFirst, actionTPS_InTagCloseFirst},
    {TPS_InTag, actionTPS_InTag},
    {TPS_InTagEscapeK, actionTPS_InTagEscapeK},
@@ -906,15 +944,17 @@ static const TParserStateAction Actions[] = {
    {TPS_InCloseCommentFirst, actionTPS_InCloseCommentFirst},
    {TPS_InCloseCommentLast, actionTPS_InCloseCommentLast},
    {TPS_InCommentEnd, actionTPS_InCommentEnd},
-   {TPS_InHostFirstDomen, actionTPS_InHostFirstDomen},
-   {TPS_InHostDomenSecond, actionTPS_InHostDomenSecond},
-   {TPS_InHostDomen, actionTPS_InHostDomen},
+   {TPS_InHostFirstDomain, actionTPS_InHostFirstDomain},
+   {TPS_InHostDomainSecond, actionTPS_InHostDomainSecond},
+   {TPS_InHostDomain, actionTPS_InHostDomain},
    {TPS_InPortFirst, actionTPS_InPortFirst},
    {TPS_InPort, actionTPS_InPort},
    {TPS_InHostFirstAN, actionTPS_InHostFirstAN},
    {TPS_InHost, actionTPS_InHost},
    {TPS_InEmail, actionTPS_InEmail},
    {TPS_InFileFirst, actionTPS_InFileFirst},
+   {TPS_InPathFirst, actionTPS_InPathFirst},
+   {TPS_InPathSecond, actionTPS_InPathSecond},
    {TPS_InFile, actionTPS_InFile},
    {TPS_InFileNext, actionTPS_InFileNext},
    {TPS_InURIFirst, actionTPS_InURIFirst},
index 83468d657f00299e82fc35e3c46a2f7b04c5560e..9cdd141efdeab73957c0fd950100460a83ce0513 100644 (file)
@@ -30,6 +30,7 @@ typedef enum
    TPS_InHTMLEntityNum,
    TPS_InHTMLEntityEnd,
    TPS_InTagFirst,
+   TPS_InXMLBegin,
    TPS_InTagCloseFirst,
    TPS_InTag,
    TPS_InTagEscapeK,
@@ -42,15 +43,17 @@ typedef enum
    TPS_InCloseCommentFirst,
    TPS_InCloseCommentLast,
    TPS_InCommentEnd,
-   TPS_InHostFirstDomen,
-   TPS_InHostDomenSecond,
-   TPS_InHostDomen,
+   TPS_InHostFirstDomain,
+   TPS_InHostDomainSecond,
+   TPS_InHostDomain,
    TPS_InPortFirst,
    TPS_InPort,
    TPS_InHostFirstAN,
    TPS_InHost,
    TPS_InEmail,
    TPS_InFileFirst,
+   TPS_InPathFirst,
+   TPS_InPathSecond,
    TPS_InFile,
    TPS_InFileNext,
    TPS_InURIFirst,