+ should always be treated alike.
Now we can see how it works.
SELECT * FROM ts_debug('english','a fat cat sat on a mat - it ate a fat rats');
/*-------------------------------------------------------------------------
*
* wparser_def.c
- * Standard word parser
+ * Default text search parser
*
* Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
*
*
* IDENTIFICATION
- * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.3 2007/09/07 15:09:55 teodor Exp $
+ * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.4 2007/10/23 20:46:12 tgl Exp $
*
*-------------------------------------------------------------------------
*/
#include "utils/builtins.h"
-/* rememder !!!! */
-#define LASTNUM 23
+/* Output token categories */
-#define LATWORD 1
-#define CYRWORD 2
-#define UWORD 3
-#define EMAIL 4
-#define FURL 5
-#define HOST 6
-#define SCIENTIFIC 7
+#define ASCIIWORD 1
+#define WORD_T 2
+#define NUMWORD 3
+#define EMAIL 4
+#define URL_T 5
+#define HOST 6
+#define SCIENTIFIC 7
#define VERSIONNUMBER 8
-#define PARTHYPHENWORD 9
-#define CYRPARTHYPHENWORD 10
-#define LATPARTHYPHENWORD 11
-#define SPACE 12
-#define TAG 13
+#define NUMPARTHWORD 9
+#define PARTHWORD 10
+#define ASCIIPARTHWORD 11
+#define SPACE 12
+#define TAG_T 13
#define PROTOCOL 14
-#define HYPHENWORD 15
-#define LATHYPHENWORD 16
-#define CYRHYPHENWORD 17
-#define URI 18
-#define FILEPATH 19
-#define DECIMAL 20
-#define SIGNEDINT 21
-#define UNSIGNEDINT 22
-#define HTMLENTITY 23
-
-static const char *lex_descr[] = {
+#define NUMHWORD 15
+#define ASCIIHWORD 16
+#define HWORD 17
+#define URI 18
+#define FILEPATH 19
+#define DECIMAL 20
+#define SIGNEDINT 21
+#define UNSIGNEDINT 22
+#define HTMLENTITY 23
+
+#define LASTNUM 23
+
+static const char * const tok_alias[] = {
"",
- "Latin word",
- "Non-latin word",
- "Word",
- "Email",
- "URL",
- "Host",
- "Scientific notation",
- "VERSION",
- "Part of hyphenated word",
- "Non-latin part of hyphenated word",
- "Latin part of hyphenated word",
- "Space symbols",
- "HTML Tag",
- "Protocol head",
- "Hyphenated word",
- "Latin hyphenated word",
- "Non-latin hyphenated word",
- "URI",
- "File or path name",
- "Decimal notation",
- "Signed integer",
- "Unsigned integer",
- "HTML Entity"
-};
-
-static const char *tok_alias[] = {
- "",
- "lword",
- "nlword",
+ "asciiword",
"word",
+ "numword",
"email",
"url",
"host",
"sfloat",
"version",
- "part_hword",
- "nlpart_hword",
- "lpart_hword",
+ "hword_numpart",
+ "hword_part",
+ "hword_asciipart",
"blank",
"tag",
"protocol",
+ "numhword",
+ "asciihword",
"hword",
- "lhword",
- "nlhword",
"uri",
"file",
"float",
"entity"
};
+static const char * const lex_descr[] = {
+ "",
+ "Word, all ASCII",
+ "Word, all letters",
+ "Word, letters and digits",
+ "Email address",
+ "URL",
+ "Host",
+ "Scientific notation",
+ "Version number",
+ "Hyphenated word part, letters and digits",
+ "Hyphenated word part, all letters",
+ "Hyphenated word part, all ASCII",
+ "Space symbols",
+ "HTML tag",
+ "Protocol head",
+ "Hyphenated word, letters and digits",
+ "Hyphenated word, all ASCII",
+ "Hyphenated word, all letters",
+ "URI",
+ "File or path name",
+ "Decimal notation",
+ "Signed integer",
+ "Unsigned integer",
+ "HTML entity"
+};
+
+
+/* Parser states */
+
typedef enum
{
TPS_Base = 0,
- TPS_InUWord,
- TPS_InLatWord,
- TPS_InCyrWord,
+ TPS_InNumWord,
+ TPS_InAsciiWord,
+ TPS_InWord,
TPS_InUnsignedInt,
TPS_InSignedIntFirst,
TPS_InSignedInt,
TPS_InProtocolFirst,
TPS_InProtocolSecond,
TPS_InProtocolEnd,
- TPS_InHyphenLatWordFirst,
- TPS_InHyphenLatWord,
- TPS_InHyphenCyrWordFirst,
- TPS_InHyphenCyrWord,
- TPS_InHyphenUWordFirst,
- TPS_InHyphenUWord,
+ TPS_InHyphenAsciiWordFirst,
+ TPS_InHyphenAsciiWord,
+ TPS_InHyphenWordFirst,
+ TPS_InHyphenWord,
+ TPS_InHyphenNumWordFirst,
+ TPS_InHyphenNumWord,
TPS_InHyphenValueFirst,
TPS_InHyphenValue,
TPS_InHyphenValueExact,
TPS_InParseHyphen,
TPS_InParseHyphenHyphen,
- TPS_InHyphenCyrWordPart,
- TPS_InHyphenLatWordPart,
- TPS_InHyphenUWordPart,
+ TPS_InHyphenWordPart,
+ TPS_InHyphenAsciiWordPart,
+ TPS_InHyphenNumWordPart,
TPS_InHyphenUnsignedInt,
TPS_InHDecimalPartFirst,
TPS_InHDecimalPart,
/* forward declaration */
struct TParser;
-
typedef int (*TParserCharTest) (struct TParser *); /* any p_is* functions
* except p_iseq */
typedef void (*TParserSpecial) (struct TParser *); /* special handler for
TParserSpecial special;
} TParserStateActionItem;
+/* Flag bits in TParserStateActionItem.flags */
+#define A_NEXT 0x0000
+#define A_BINGO 0x0001
+#define A_POP 0x0002
+#define A_PUSH 0x0004
+#define A_RERUN 0x0008
+#define A_CLEAR 0x0010
+#define A_MERGE 0x0020
+#define A_CLRALL 0x0040
+
typedef struct
{
TParserState state;
} TParser;
+
+/* forward decls here */
+static bool TParserGet(TParser * prs);
+
+
static TParserPosition *
newTParserPosition(TParserPosition * prev)
{
return prs;
}
-static bool TParserGet(TParser * prs);
-
static void
TParserClose(TParser * prs)
{
}
/*
- * defining support function, equvalent is* macroses, but
+ * Character-type support functions, equivalent to is* macros, but
* working with any possible encodings and locales. Note,
* that with multibyte encoding and C-locale isw* function may fail
- * or give wrong result. Note 2: multibyte encoding and C-local
+ * or give wrong result. Note 2: multibyte encoding and C-locale
* often are used for Asian languages
*/
}
static int
-p_islatin(TParser * prs)
+p_isasclet(TParser * prs)
{
- return (p_isalpha(prs) && p_isascii(prs)) ? 1 : 0;
+ return (p_isascii(prs) && p_isalpha(prs)) ? 1 : 0;
}
-static int
-p_isnonlatin(TParser * prs)
-{
- return (p_isalpha(prs) && !p_isascii(prs)) ? 1 : 0;
-}
+/* deliberately suppress unused-function complaints for the above */
void _make_compiler_happy(void);
void
_make_compiler_happy(void)
* Table of state/action of parser
*/
-#define A_NEXT 0x0000
-#define A_BINGO 0x0001
-#define A_POP 0x0002
-#define A_PUSH 0x0004
-#define A_RERUN 0x0008
-#define A_CLEAR 0x0010
-#define A_MERGE 0x0020
-#define A_CLRALL 0x0040
-
static TParserStateActionItem actionTPS_Base[] = {
{p_isEOF, 0, A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '<', A_PUSH, TPS_InTagFirst, 0, NULL},
{p_isignore, 0, A_NEXT, TPS_InSpace, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InLatWord, 0, NULL},
- {p_isnonlatin, 0, A_NEXT, TPS_InCyrWord, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InAsciiWord, 0, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
{p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
};
-static TParserStateActionItem actionTPS_InUWord[] = {
- {p_isEOF, 0, A_BINGO, TPS_Base, UWORD, NULL},
- {p_isalnum, 0, A_NEXT, TPS_InUWord, 0, NULL},
+static TParserStateActionItem actionTPS_InNumWord[] = {
+ {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
+ {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
- {p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL},
- {NULL, 0, A_BINGO, TPS_Base, UWORD, NULL}
+ {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
+ {NULL, 0, A_BINGO, TPS_Base, NUMWORD, NULL}
};
-static TParserStateActionItem actionTPS_InLatWord[] = {
- {p_isEOF, 0, A_BINGO, TPS_Base, LATWORD, NULL},
- {p_islatin, 0, A_NEXT, TPS_Null, 0, NULL},
+static TParserStateActionItem actionTPS_InAsciiWord[] = {
+ {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
- {p_iseqC, '-', A_PUSH, TPS_InHyphenLatWordFirst, 0, NULL},
+ {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
{p_iseqC, ':', A_PUSH, TPS_InProtocolFirst, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
- {p_isalnum, 0, A_NEXT, TPS_InUWord, 0, NULL},
- {NULL, 0, A_BINGO, TPS_Base, LATWORD, NULL}
+ {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InWord, 0, NULL},
+ {NULL, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}
};
-static TParserStateActionItem actionTPS_InCyrWord[] = {
- {p_isEOF, 0, A_BINGO, TPS_Base, CYRWORD, NULL},
- {p_isnonlatin, 0, A_NEXT, TPS_Null, 0, NULL},
- {p_isalnum, 0, A_NEXT, TPS_InUWord, 0, NULL},
- {p_iseqC, '-', A_PUSH, TPS_InHyphenCyrWordFirst, 0, NULL},
- {NULL, 0, A_BINGO, TPS_Base, CYRWORD, NULL}
+static TParserStateActionItem actionTPS_InWord[] = {
+ {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
+ {p_isdigit, 0, A_NEXT, TPS_InNumWord, 0, NULL},
+ {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
+ {NULL, 0, A_BINGO, TPS_Base, WORD_T, NULL}
};
static TParserStateActionItem actionTPS_InUnsignedInt[] = {
{p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
- {p_islatin, 0, A_PUSH, TPS_InHost, 0, NULL},
- {p_isalpha, 0, A_NEXT, TPS_InUWord, 0, NULL},
+ {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
};
static TParserStateActionItem actionTPS_InHTMLEntityFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_iseqC, '#', A_NEXT, TPS_InHTMLEntityNumFirst, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
static TParserStateActionItem actionTPS_InHTMLEntity[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL},
{p_iseqC, ';', A_NEXT, TPS_InHTMLEntityEnd, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
{p_iseqC, '/', A_PUSH, TPS_InTagCloseFirst, 0, NULL},
{p_iseqC, '!', A_PUSH, TPS_InCommentFirst, 0, NULL},
{p_iseqC, '?', A_PUSH, TPS_InXMLBegin, 0, NULL},
- {p_islatin, 0, A_PUSH, TPS_InTagName, 0, NULL},
+ {p_isasclet, 0, A_PUSH, TPS_InTagName, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
static TParserStateActionItem actionTPS_InTagCloseFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InTagName, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InTagName, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
{p_iseqC, '/', A_NEXT, TPS_InTagBeginEnd, 0, NULL},
{p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
{p_isspace, 0, A_NEXT, TPS_InTag, 0, SpecialTags},
- {p_islatin, 0, A_NEXT, TPS_Null, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
{p_iseqC, '>', A_NEXT, TPS_InTagEnd, 0, SpecialTags},
{p_iseqC, '\'', A_NEXT, TPS_InTagEscapeK, 0, NULL},
{p_iseqC, '"', A_NEXT, TPS_InTagEscapeKK, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_Null, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '=', A_NEXT, TPS_Null, 0, NULL},
{p_iseqC, '-', A_NEXT, TPS_Null, 0, NULL},
};
static TParserStateActionItem actionTPS_InTagEnd[] = {
- {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG, NULL}
+ {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
};
static TParserStateActionItem actionTPS_InCommentFirst[] = {
};
static TParserStateActionItem actionTPS_InCommentEnd[] = {
- {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG, NULL}
+ {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, TAG_T, NULL}
};
static TParserStateActionItem actionTPS_InHostFirstDomain[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InHostDomainSecond, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
static TParserStateActionItem actionTPS_InHostDomainSecond[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
static TParserStateActionItem actionTPS_InHostDomain[] = {
{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
- {p_islatin, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
{p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
static TParserStateActionItem actionTPS_InHostFirstAN[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InHost, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
static TParserStateActionItem actionTPS_InHost[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InHost, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
static TParserStateActionItem actionTPS_InFileFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '.', A_NEXT, TPS_InPathFirst, 0, NULL},
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
static TParserStateActionItem actionTPS_InFileTwiddle[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '/', A_NEXT, TPS_InFileFirst, 0, NULL},
static TParserStateActionItem actionTPS_InPathFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '.', A_NEXT, TPS_InPathSecond, 0, NULL},
static TParserStateActionItem actionTPS_InFile[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
- {p_islatin, 0, A_NEXT, TPS_InFile, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
static TParserStateActionItem actionTPS_InFileNext[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
- {p_islatin, 0, A_CLEAR, TPS_InFile, 0, NULL},
+ {p_isasclet, 0, A_CLEAR, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_CLEAR, TPS_InFile, 0, NULL},
{p_iseqC, '_', A_CLEAR, TPS_InFile, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
static TParserStateActionItem actionTPS_InFURL[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
- {p_isURI, 0, A_BINGO | A_CLRALL, TPS_Base, FURL, SpecialFURL},
+ {p_isURI, 0, A_BINGO | A_CLRALL, TPS_Base, URL_T, SpecialFURL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, PROTOCOL, NULL}
};
-static TParserStateActionItem actionTPS_InHyphenLatWordFirst[] = {
+static TParserStateActionItem actionTPS_InHyphenAsciiWordFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InHyphenLatWord, 0, NULL},
- {p_isnonlatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
- {p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
+ {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
-static TParserStateActionItem actionTPS_InHyphenLatWord[] = {
- {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, LATHYPHENWORD, SpecialHyphen},
- {p_islatin, 0, A_NEXT, TPS_InHyphenLatWord, 0, NULL},
- {p_isnonlatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
- {p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
- {p_iseqC, '-', A_PUSH, TPS_InHyphenLatWordFirst, 0, NULL},
- {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, LATHYPHENWORD, SpecialHyphen}
+static TParserStateActionItem actionTPS_InHyphenAsciiWord[] = {
+ {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen},
+ {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWord, 0, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
+ {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
+ {p_iseqC, '-', A_PUSH, TPS_InHyphenAsciiWordFirst, 0, NULL},
+ {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, ASCIIHWORD, SpecialHyphen}
};
-static TParserStateActionItem actionTPS_InHyphenCyrWordFirst[] = {
+static TParserStateActionItem actionTPS_InHyphenWordFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
- {p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWord, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
- {p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
+ {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
-static TParserStateActionItem actionTPS_InHyphenCyrWord[] = {
- {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, CYRHYPHENWORD, SpecialHyphen},
- {p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWord, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
- {p_isdigit, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
- {p_iseqC, '-', A_PUSH, TPS_InHyphenCyrWordFirst, 0, NULL},
- {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, CYRHYPHENWORD, SpecialHyphen}
+static TParserStateActionItem actionTPS_InHyphenWord[] = {
+ {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen},
+ {p_isalpha, 0, A_NEXT, TPS_InHyphenWord, 0, NULL},
+ {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
+ {p_iseqC, '-', A_PUSH, TPS_InHyphenWordFirst, 0, NULL},
+ {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HWORD, SpecialHyphen}
};
-static TParserStateActionItem actionTPS_InHyphenUWordFirst[] = {
+static TParserStateActionItem actionTPS_InHyphenNumWordFirst[] = {
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
- {p_isalnum, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
-static TParserStateActionItem actionTPS_InHyphenUWord[] = {
- {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen},
- {p_isalnum, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
- {p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL},
- {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen}
+static TParserStateActionItem actionTPS_InHyphenNumWord[] = {
+ {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
+ {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
+ {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
+ {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
};
static TParserStateActionItem actionTPS_InHyphenValueFirst[] = {
};
static TParserStateActionItem actionTPS_InHyphenValue[] = {
- {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen},
+ {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValue, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL},
- {p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL},
- {p_isalpha, 0, A_NEXT, TPS_InHyphenUWord, 0, NULL},
- {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen}
+ {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWord, 0, NULL},
+ {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
};
static TParserStateActionItem actionTPS_InHyphenValueExact[] = {
- {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen},
+ {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen},
{p_isdigit, 0, A_NEXT, TPS_InHyphenValueExact, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHyphenValueFirst, 0, NULL},
- {p_iseqC, '-', A_PUSH, TPS_InHyphenUWordFirst, 0, NULL},
- {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, HYPHENWORD, SpecialHyphen}
+ {p_iseqC, '-', A_PUSH, TPS_InHyphenNumWordFirst, 0, NULL},
+ {NULL, 0, A_BINGO | A_CLRALL, TPS_InParseHyphen, NUMHWORD, SpecialHyphen}
};
static TParserStateActionItem actionTPS_InParseHyphen[] = {
{p_isEOF, 0, A_RERUN, TPS_Base, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InHyphenLatWordPart, 0, NULL},
- {p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWordPart, 0, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InParseHyphenHyphen, 0, NULL},
{NULL, 0, A_RERUN, TPS_Base, 0, NULL}
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
-static TParserStateActionItem actionTPS_InHyphenCyrWordPart[] = {
- {p_isEOF, 0, A_BINGO, TPS_Base, CYRPARTHYPHENWORD, NULL},
- {p_isnonlatin, 0, A_NEXT, TPS_InHyphenCyrWordPart, 0, NULL},
- {p_islatin, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
- {p_isdigit, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
- {NULL, 0, A_BINGO, TPS_InParseHyphen, CYRPARTHYPHENWORD, NULL}
+static TParserStateActionItem actionTPS_InHyphenWordPart[] = {
+ {p_isEOF, 0, A_BINGO, TPS_Base, PARTHWORD, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
+ {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
+ {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHWORD, NULL}
};
-static TParserStateActionItem actionTPS_InHyphenLatWordPart[] = {
- {p_isEOF, 0, A_BINGO, TPS_Base, LATPARTHYPHENWORD, NULL},
- {p_islatin, 0, A_NEXT, TPS_InHyphenLatWordPart, 0, NULL},
- {p_isnonlatin, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
- {p_isdigit, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
- {NULL, 0, A_BINGO, TPS_InParseHyphen, LATPARTHYPHENWORD, NULL}
+static TParserStateActionItem actionTPS_InHyphenAsciiWordPart[] = {
+ {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIPARTHWORD, NULL},
+ {p_isasclet, 0, A_NEXT, TPS_InHyphenAsciiWordPart, 0, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InHyphenWordPart, 0, NULL},
+ {p_isdigit, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
+ {NULL, 0, A_BINGO, TPS_InParseHyphen, ASCIIPARTHWORD, NULL}
};
-static TParserStateActionItem actionTPS_InHyphenUWordPart[] = {
- {p_isEOF, 0, A_BINGO, TPS_Base, PARTHYPHENWORD, NULL},
- {p_isalnum, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
- {NULL, 0, A_BINGO, TPS_InParseHyphen, PARTHYPHENWORD, NULL}
+static TParserStateActionItem actionTPS_InHyphenNumWordPart[] = {
+ {p_isEOF, 0, A_BINGO, TPS_Base, NUMPARTHWORD, NULL},
+ {p_isalnum, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
+ {NULL, 0, A_BINGO, TPS_InParseHyphen, NUMPARTHWORD, NULL}
};
static TParserStateActionItem actionTPS_InHyphenUnsignedInt[] = {
{p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHyphenUnsignedInt, 0, NULL},
- {p_isalpha, 0, A_NEXT, TPS_InHyphenUWordPart, 0, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InHyphenNumWordPart, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHDecimalPartFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_InParseHyphen, UNSIGNEDINT, NULL}
};
};
/*
- * order should be the same as in typedef enum {} TParserState!!
+ * order must be the same as in typedef enum {} TParserState!!
*/
static const TParserStateAction Actions[] = {
{TPS_Base, actionTPS_Base},
- {TPS_InUWord, actionTPS_InUWord},
- {TPS_InLatWord, actionTPS_InLatWord},
- {TPS_InCyrWord, actionTPS_InCyrWord},
+ {TPS_InNumWord, actionTPS_InNumWord},
+ {TPS_InAsciiWord, actionTPS_InAsciiWord},
+ {TPS_InWord, actionTPS_InWord},
{TPS_InUnsignedInt, actionTPS_InUnsignedInt},
{TPS_InSignedIntFirst, actionTPS_InSignedIntFirst},
{TPS_InSignedInt, actionTPS_InSignedInt},
{TPS_InProtocolFirst, actionTPS_InProtocolFirst},
{TPS_InProtocolSecond, actionTPS_InProtocolSecond},
{TPS_InProtocolEnd, actionTPS_InProtocolEnd},
- {TPS_InHyphenLatWordFirst, actionTPS_InHyphenLatWordFirst},
- {TPS_InHyphenLatWord, actionTPS_InHyphenLatWord},
- {TPS_InHyphenCyrWordFirst, actionTPS_InHyphenCyrWordFirst},
- {TPS_InHyphenCyrWord, actionTPS_InHyphenCyrWord},
- {TPS_InHyphenUWordFirst, actionTPS_InHyphenUWordFirst},
- {TPS_InHyphenUWord, actionTPS_InHyphenUWord},
+ {TPS_InHyphenAsciiWordFirst, actionTPS_InHyphenAsciiWordFirst},
+ {TPS_InHyphenAsciiWord, actionTPS_InHyphenAsciiWord},
+ {TPS_InHyphenWordFirst, actionTPS_InHyphenWordFirst},
+ {TPS_InHyphenWord, actionTPS_InHyphenWord},
+ {TPS_InHyphenNumWordFirst, actionTPS_InHyphenNumWordFirst},
+ {TPS_InHyphenNumWord, actionTPS_InHyphenNumWord},
{TPS_InHyphenValueFirst, actionTPS_InHyphenValueFirst},
{TPS_InHyphenValue, actionTPS_InHyphenValue},
{TPS_InHyphenValueExact, actionTPS_InHyphenValueExact},
{TPS_InParseHyphen, actionTPS_InParseHyphen},
{TPS_InParseHyphenHyphen, actionTPS_InParseHyphenHyphen},
- {TPS_InHyphenCyrWordPart, actionTPS_InHyphenCyrWordPart},
- {TPS_InHyphenLatWordPart, actionTPS_InHyphenLatWordPart},
- {TPS_InHyphenUWordPart, actionTPS_InHyphenUWordPart},
+ {TPS_InHyphenWordPart, actionTPS_InHyphenWordPart},
+ {TPS_InHyphenAsciiWordPart, actionTPS_InHyphenAsciiWordPart},
+ {TPS_InHyphenNumWordPart, actionTPS_InHyphenNumWordPart},
{TPS_InHyphenUnsignedInt, actionTPS_InHyphenUnsignedInt},
{TPS_InHDecimalPartFirst, actionTPS_InHDecimalPartFirst},
{TPS_InHDecimalPart, actionTPS_InHDecimalPart},
{
TParserStateActionItem *item = NULL;
+ Assert(prs->state);
+
if (prs->state->posbyte >= prs->lenstr)
return false;
- Assert(prs->state);
prs->lexeme = prs->str + prs->state->posbyte;
prs->state->pushedAtAction = NULL;
prs->state->state = item->tostate;
/* check for go away */
- if ((item->flags & A_BINGO) || (prs->state->posbyte >= prs->lenstr && (item->flags & A_RERUN) == 0))
+ if ((item->flags & A_BINGO) ||
+ (prs->state->posbyte >= prs->lenstr &&
+ (item->flags & A_RERUN) == 0))
break;
- /* go to begining of loop if we should rerun or we just restore state */
+ /* go to beginning of loop if we should rerun or we just restore state */
if (item->flags & (A_RERUN | A_POP))
continue;
PG_RETURN_VOID();
}
-#define LEAVETOKEN(x) ( (x)==12 )
-#define COMPLEXTOKEN(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 )
-#define ENDPUNCTOKEN(x) ( (x)==12 )
-
+#define LEAVETOKEN(x) ( (x)==SPACE )
+#define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
+#define ENDPUNCTOKEN(x) ( (x)==SPACE )
-#define TS_IDIGNORE(x) ( (x)==13 || (x)==14 || (x)==12 || (x)==23 )
-#define HLIDIGNORE(x) ( (x)==5 || (x)==13 || (x)==15 || (x)==16 || (x)==17 )
-#define HTMLHLIDIGNORE(x) ( (x)==5 || (x)==15 || (x)==16 || (x)==17 )
-#define NONWORDTOKEN(x) ( (x)==12 || HLIDIGNORE(x) )
-#define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==7 || (x)==8 || (x)==20 || (x)==21 || (x)==22 || TS_IDIGNORE(x) )
+#define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==HTMLENTITY )
+#define HLIDIGNORE(x) ( (x)==URL_T || (x)==TAG_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
+#define HTMLHLIDIGNORE(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
+#define NONWORDTOKEN(x) ( (x)==SPACE || HLIDIGNORE(x) )
+#define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
typedef struct
{