Back-patch fix for extraction of fixed prefixes from regular expressions.

author Tom Lane, Tue, 10 Jul 2012 22:00:39 +0000 (18:00 -0400)
committer Tom Lane, Tue, 10 Jul 2012 22:00:39 +0000 (18:00 -0400)
diff --git a/src/backend/regex/Makefile b/src/backend/regex/Makefile

index 21e7fa5329b9384333d6a8c9912be81dd24a4867..74a4c0c89d8efedcb8699dffac465def523431a9 100644 (file)
--- a/src/backend/regex/Makefile
+++ b/src/backend/regex/Makefile
@@ -12,7 +12,7 @@ subdir = src/backend/regex
  top_builddir = ../../..
  include $(top_builddir)/src/Makefile.global
  
-OBJS = regcomp.o regerror.o regexec.o regfree.o
+OBJS = regcomp.o regerror.o regexec.o regfree.o regprefix.o
  
  include $(top_srcdir)/src/backend/common.mk
  
diff --git a/src/backend/regex/README b/src/backend/regex/README

index 89ba6a62ea2f70bfda729f03efedba4b9b6fce9b..c5d21e8c99da1c512379c1702bb7e422355ba9d9 100644 (file)
--- a/src/backend/regex/README
+++ b/src/backend/regex/README
@@ -7,12 +7,13 @@ So this file is an attempt to reverse-engineer some docs.
  General source-file layout
  --------------------------
  
-There are four separately-compilable source files, each exposing exactly
+There are five separately-compilable source files, each exposing exactly
  one exported function:
     regcomp.c: pg_regcomp
     regexec.c: pg_regexec
     regerror.c: pg_regerror
     regfree.c: pg_regfree
+   regprefix.c: pg_regprefix
  (The pg_ prefixes were added by the Postgres project to distinguish this
  library version from any similar one that might be present on a particular
  system.  They'd need to be removed or replaced in any standalone version
@@ -44,6 +45,7 @@ regexec.c     Top-level regex execution code
  rege_dfa.c     DFA creation and execution
  regerror.c     pg_regerror: generate text for a regex error code
  regfree.c      pg_regfree: API to free a no-longer-needed regex_t
+regprefix.c        Code for extracting a common prefix from a regex_t
  
  The locale-specific code is concerned primarily with case-folding and with
  expanding locale-specific character classes, such as [[:alnum:]].  It
diff --git a/src/backend/regex/regc_color.c b/src/backend/regex/regc_color.c

index 2aeb861d9762f0d9f5ec667007ce1b00d22fccde..1c60566fbf57458a1f43a4116432e0636ebcd92d 100644 (file)
--- a/src/backend/regex/regc_color.c
+++ b/src/backend/regex/regc_color.c
@@ -66,8 +66,9 @@ initcm(struct vars * v,
     cd = cm->cd;                /* cm->cd[WHITE] */
     cd->sub = NOSUB;
     cd->arcs = NULL;
-   cd->flags = 0;
+   cd->firstchr = CHR_MIN;
     cd->nchrs = CHR_MAX - CHR_MIN + 1;
+   cd->flags = 0;
  
     /* upper levels of tree */
     for (t = &cm->tree[0], j = NBYTS - 1; j > 0; t = nextt, j--)
@@ -272,6 +273,7 @@ newcolor(struct colormap * cm)
     cd->nchrs = 0;
     cd->sub = NOSUB;
     cd->arcs = NULL;
+   cd->firstchr = CHR_MIN;     /* in case never set otherwise */
     cd->flags = 0;
     cd->block = NULL;
  
@@ -371,6 +373,8 @@ subcolor(struct colormap * cm, chr c)
     if (co == sco)              /* already in an open subcolor */
         return co;              /* rest is redundant */
     cm->cd[co].nchrs--;
+   if (cm->cd[sco].nchrs == 0)
+       cm->cd[sco].firstchr = c;
     cm->cd[sco].nchrs++;
     setcolor(cm, c, sco);
     return sco;
@@ -438,6 +442,11 @@ subrange(struct vars * v,
  
  /*
   * subblock - allocate new subcolors for one tree block of chrs, fill in arcs
+ *
+ * Note: subcolors that are created during execution of this function
+ * will not be given a useful value of firstchr; it'll be left as CHR_MIN.
+ * For the current usage of firstchr in pg_regprefix, this does not matter
+ * because such subcolors won't occur in the common prefix of a regex.
   */
  static void
  subblock(struct vars * v,
diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c

index 66a361ee2ffe7d61c16ac609a9d9be4d249b8497..085842c92b70d74ea969488e0cc50d1466f2e53e 100644 (file)
--- a/src/backend/regex/regc_nfa.c
+++ b/src/backend/regex/regc_nfa.c
@@ -1330,14 +1330,16 @@ compact(struct nfa * nfa,
     for (s = nfa->states; s != NULL; s = s->next)
     {
         nstates++;
-       narcs += 1 + s->nouts + 1;
-       /* 1 as a fake for flags, nouts for arcs, 1 as endmarker */
+       narcs += s->nouts + 1;      /* need one extra for endmarker */
     }
  
+   cnfa->stflags = (char *) MALLOC(nstates * sizeof(char));
     cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *));
     cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc));
-   if (cnfa->states == NULL || cnfa->arcs == NULL)
+   if (cnfa->stflags == NULL || cnfa->states == NULL || cnfa->arcs == NULL)
     {
+       if (cnfa->stflags != NULL)
+           FREE(cnfa->stflags);
         if (cnfa->states != NULL)
             FREE(cnfa->states);
         if (cnfa->arcs != NULL)
@@ -1359,9 +1361,8 @@ compact(struct nfa * nfa,
     for (s = nfa->states; s != NULL; s = s->next)
     {
         assert((size_t) s->no < nstates);
+       cnfa->stflags[s->no] = 0;
         cnfa->states[s->no] = ca;
-       ca->co = 0;             /* clear and skip flags "arc" */
-       ca++;
         first = ca;
         for (a = s->outs; a != NULL; a = a->outchain)
             switch (a->type)
@@ -1392,8 +1393,8 @@ compact(struct nfa * nfa,
  
     /* mark no-progress states */
     for (a = nfa->pre->outs; a != NULL; a = a->outchain)
-       cnfa->states[a->to->no]->co = 1;
-   cnfa->states[nfa->pre->no]->co = 1;
+       cnfa->stflags[a->to->no] = CNFA_NOPROGRESS;
+   cnfa->stflags[nfa->pre->no] = CNFA_NOPROGRESS;
  }
  
  /*
@@ -1433,6 +1434,7 @@ freecnfa(struct cnfa * cnfa)
  {
     assert(cnfa->nstates != 0); /* not empty already */
     cnfa->nstates = 0;
+   FREE(cnfa->stflags);
     FREE(cnfa->states);
     FREE(cnfa->arcs);
  }
@@ -1617,7 +1619,7 @@ dumpcnfa(struct cnfa * cnfa,
         fprintf(f, ", haslacons");
     fprintf(f, "\n");
     for (st = 0; st < cnfa->nstates; st++)
-       dumpcstate(st, cnfa->states[st], cnfa, f);
+       dumpcstate(st, cnfa, f);
     fflush(f);
  }
  #endif
@@ -1629,22 +1631,20 @@ dumpcnfa(struct cnfa * cnfa,
   */
  static void
  dumpcstate(int st,
-          struct carc * ca,
            struct cnfa * cnfa,
            FILE *f)
  {
-   int         i;
+   struct carc * ca;
     int         pos;
  
-   fprintf(f, "%d%s", st, (ca[0].co) ? ":" : ".");
+   fprintf(f, "%d%s", st, (cnfa->stflags[st] & CNFA_NOPROGRESS) ? ":" : ".");
     pos = 1;
-   for (i = 1; ca[i].co != COLORLESS; i++)
+   for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
     {
-       if (ca[i].co < cnfa->ncolors)
-           fprintf(f, "\t[%ld]->%d", (long) ca[i].co, ca[i].to);
+       if (ca->co < cnfa->ncolors)
+           fprintf(f, "\t[%ld]->%d", (long) ca->co, ca->to);
         else
-           fprintf(f, "\t:%ld:->%d", (long) ca[i].co - cnfa->ncolors,
-                   ca[i].to);
+           fprintf(f, "\t:%ld:->%d", (long) (ca->co - cnfa->ncolors), ca->to);
         if (pos == 5)
         {
             fprintf(f, "\n");
@@ -1653,7 +1653,7 @@ dumpcstate(int st,
         else
             pos++;
     }
-   if (i == 1 || pos != 1)
+   if (ca == cnfa->states[st] || pos != 1)
         fprintf(f, "\n");
     fflush(f);
  }
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c

index 57055f04abb807466744daf74d8506b9f0e594b8..ceb6f0f8737e9e34fca9d118669fcc69e796c5f2 100644 (file)
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -162,7 +162,7 @@ static void dumparcs(struct state *, FILE *);
  static int dumprarcs(struct arc *, struct state *, FILE *, int);
  static void dumparc(struct arc *, struct state *, FILE *);
  static void dumpcnfa(struct cnfa *, FILE *);
-static void dumpcstate(int, struct carc *, struct cnfa *, FILE *);
+static void dumpcstate(int, struct cnfa *, FILE *);
  #endif
  /* === regc_cvec.c === */
  static struct cvec *newcvec(int, int);
diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c

index da7a0bf402ff41bace723924fa8a8f0635eb15ec..7a7ba5b89cf28fc5a448a2b3826dfe5b9298db63 100644 (file)
--- a/src/backend/regex/rege_dfa.c
+++ b/src/backend/regex/rege_dfa.c
@@ -457,14 +457,14 @@ miss(struct vars * v,         /* used only for debug flags */
     gotstate = 0;
     for (i = 0; i < d->nstates; i++)
         if (ISBSET(css->states, i))
-           for (ca = cnfa->states[i] + 1; ca->co != COLORLESS; ca++)
+           for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
                 if (ca->co == co)
                 {
                     BSET(d->work, ca->to);
                     gotstate = 1;
                     if (ca->to == cnfa->post)
                         ispost = 1;
-                   if (!cnfa->states[ca->to]->co)
+                   if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
                         noprogress = 0;
                     FDEBUG(("%d -> %d\n", i, ca->to));
                 }
@@ -475,10 +475,9 @@ miss(struct vars * v,          /* used only for debug flags */
         dolacons = 0;
         for (i = 0; i < d->nstates; i++)
             if (ISBSET(d->work, i))
-               for (ca = cnfa->states[i] + 1; ca->co != COLORLESS;
-                    ca++)
+               for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
                 {
-                   if (ca->co <= cnfa->ncolors)
+                   if (ca->co < cnfa->ncolors)
                         continue;       /* NOTE CONTINUE */
                     sawlacons = 1;
                     if (ISBSET(d->work, ca->to))
@@ -489,7 +488,7 @@ miss(struct vars * v,           /* used only for debug flags */
                     dolacons = 1;
                     if (ca->to == cnfa->post)
                         ispost = 1;
-                   if (!cnfa->states[ca->to]->co)
+                   if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
                         noprogress = 0;
                     FDEBUG(("%d :> %d\n", i, ca->to));
                 }
diff --git a/src/backend/regex/regprefix.c b/src/backend/regex/regprefix.c

new file mode 100644 (file)

index 0000000..6f91288
--- /dev/null
+++ b/src/backend/regex/regprefix.c
@@ -0,0 +1,259 @@
+/*-------------------------------------------------------------------------
+ *
+ * regprefix.c
+ *   Extract a common prefix, if any, from a compiled regex.
+ *
+ *
+ * Portions Copyright (c) 2012, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1998, 1999 Henry Spencer
+ *
+ * IDENTIFICATION
+ *   src/backend/regex/regprefix.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "regex/regguts.h"
+
+
+/*
+ * forward declarations
+ */
+static int findprefix(struct cnfa * cnfa, struct colormap * cm,
+                     chr *string, size_t *slength);
+
+
+/*
+ * pg_regprefix - get common prefix for regular expression
+ *
+ * Returns one of:
+ * REG_NOMATCH: there is no common prefix of strings matching the regex
+ * REG_PREFIX: there is a common prefix of strings matching the regex
+ * REG_EXACT: all strings satisfying the regex must match the same string
+ * or a REG_XXX error code
+ *
+ * For REG_PREFIX and REG_EXACT, *string is set to a malloc'd chr array that
+ * the caller must free, holding the prefix or exact value, and *slength to
+ * its length in chrs (not bytes!).  Otherwise *string is NULL, *slength 0.
+ *
+ * This function does not analyze all complex cases (such as lookahead
+ * constraints) exactly.  Therefore it is possible that some strings matching
+ * the reported prefix or exact-match string do not satisfy the regex.  But
+ * it should never be the case that a string satisfying the regex does not
+ * match the reported prefix or exact-match string.
+ */
+int
+pg_regprefix(regex_t *re,
+            chr **string,
+            size_t *slength)
+{
+   struct guts *g;
+   struct cnfa *cnfa;
+   int         st;
+
+   /* sanity checks */
+   if (string == NULL || slength == NULL)
+       return REG_INVARG;
+   *string = NULL;             /* initialize for failure cases */
+   *slength = 0;
+   if (re == NULL || re->re_magic != REMAGIC)
+       return REG_INVARG;
+   if (re->re_csize != sizeof(chr))
+       return REG_MIXED;
+
+   /* Initialize locale-dependent support */
+   pg_set_regex_collation(re->re_collation);
+
+   /* setup */
+   g = (struct guts *) re->re_guts;
+   if (g->info & REG_UIMPOSSIBLE)
+       return REG_NOMATCH;
+
+   /*
+    * This implementation considers only the search NFA for the topmost regex
+    * tree node.  Therefore, constraints such as backrefs are not fully
+    * applied, which is allowed per the function's API spec.
+    */
+   assert(g->tree != NULL);
+   cnfa = &g->tree->cnfa;
+
+   /*
+    * Since a correct NFA should never contain any exit-free loops, it should
+    * not be possible for our traversal to return to a previously visited
+    * NFA state.  Hence we need at most nstates chrs in the output string.
+    */
+   *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
+   if (*string == NULL)
+       return REG_ESPACE;
+
+   /* do it */
+   st = findprefix(cnfa, &g->cmap, *string, slength);
+
+   assert(*slength <= cnfa->nstates);
+
+   /* clean up */
+   if (st != REG_PREFIX && st != REG_EXACT)
+   {
+       FREE(*string);
+       *string = NULL;
+       *slength = 0;
+   }
+
+   return st;
+}
+
+/*
+ * findprefix - extract common prefix from cNFA
+ *
+ * Results are returned into the preallocated chr array string[], with
+ * *slength (which must be preset to zero) incremented for each chr.
+ */
+static int                     /* regprefix return code */
+findprefix(struct cnfa * cnfa,
+          struct colormap * cm,
+          chr *string,
+          size_t *slength)
+{
+   int         st;
+   int         nextst;
+   color       thiscolor;
+   chr         c;
+   struct carc *ca;
+
+   /*
+    * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
+    * anchored left.  If we have both BOS and BOL, they must go to the
+    * same next state.
+    */
+   st = cnfa->pre;
+   nextst = -1;
+   for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+   {
+       if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
+       {
+           if (nextst == -1)
+               nextst = ca->to;
+           else if (nextst != ca->to)
+               return REG_NOMATCH;
+       }
+       else
+           return REG_NOMATCH;
+   }
+   if (nextst == -1)
+       return REG_NOMATCH;
+
+   /*
+    * Scan through successive states, stopping as soon as we find one with
+    * more than one acceptable transition character (either multiple colors
+    * on out-arcs, or a color with more than one member chr).
+    *
+    * We could find a state with multiple out-arcs that are all labeled with
+    * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
+    * In that case we add the chr "c" to the output string but then exit the
+    * loop with nextst == -1.  This leaves a little bit on the table: if the
+    * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
+    * to the prefix.  But chasing multiple parallel state chains doesn't seem
+    * worth the trouble.
+    */
+   do
+   {
+       st = nextst;
+       nextst = -1;
+       thiscolor = COLORLESS;
+       for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+       {
+           /* We ignore lookahead constraints */
+           if (ca->co >= cnfa->ncolors)
+               continue;
+           /* We can also ignore BOS/BOL arcs */
+           if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
+               continue;
+           /* ... but EOS/EOL arcs terminate the search */
+           if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
+           {
+               thiscolor = COLORLESS;
+               break;
+           }
+           if (thiscolor == COLORLESS)
+           {
+               /* First plain outarc */
+               thiscolor = ca->co;
+               nextst = ca->to;
+           }
+           else if (thiscolor == ca->co)
+           {
+               /* Another plain outarc for same color */
+               nextst = -1;
+           }
+           else
+           {
+               /* More than one plain outarc color terminates the search */
+               thiscolor = COLORLESS;
+               break;
+           }
+       }
+       /* Done if we didn't find exactly one color on plain outarcs */
+       if (thiscolor == COLORLESS)
+           break;
+       /* The color must be a singleton */
+       if (cm->cd[thiscolor].nchrs != 1)
+           break;
+
+       /*
+        * Identify the color's sole member chr and add it to the prefix
+        * string.  In general the colormap data structure doesn't provide a
+        * way to find color member chrs, except by trying GETCOLOR() on each
+        * possible chr value, which won't do at all.  However, for the cases
+        * we care about it should be sufficient to test the "firstchr" value,
+        * that is the first chr ever added to the color.  There are cases
+        * where this might no longer be a member of the color (so we do need
+        * to test), but none of them are likely to arise for a character that
+        * is a member of a common prefix.  If we do hit such a corner case,
+        * we just fall out without adding anything to the prefix string.
+        */
+       c = cm->cd[thiscolor].firstchr;
+       if (GETCOLOR(cm, c) != thiscolor)
+           break;
+
+       string[(*slength)++] = c;
+
+       /* Advance to next state, but only if we have a unique next state */
+   } while (nextst != -1);
+
+   /*
+    * If we ended at a state that only has EOS/EOL outarcs leading to the
+    * "post" state, then we have an exact-match string.  Note this is true
+    * even if the string is of zero length.
+    */
+   nextst = -1;
+   for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
+   {
+       if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
+       {
+           if (nextst == -1)
+               nextst = ca->to;
+           else if (nextst != ca->to)
+           {
+               nextst = -1;
+               break;
+           }
+       }
+       else
+       {
+           nextst = -1;
+           break;
+       }
+   }
+   if (nextst == cnfa->post)
+       return REG_EXACT;
+
+   /*
+    * Otherwise, if we were unable to identify any prefix characters, say
+    * NOMATCH --- the pattern is anchored left, but doesn't specify any
+    * particular first character.
+    */
+   if (*slength > 0)
+       return REG_PREFIX;
+
+   return REG_NOMATCH;
+}
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c

index 96c77078c8b51f434ea9ddb9947abb5e93143410..074142e7985499f152e07ab0c4e1c1d0675c4042 100644 (file)
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -1170,3 +1170,68 @@ build_regexp_split_result(regexp_matches_ctx *splitctx)
                                    Int32GetDatum(startpos + 1));
     }
  }
+
+/*
+ * regexp_fixed_prefix - extract fixed prefix, if any, for a regexp
+ *
+ * The result is NULL if there is no fixed prefix, else a palloc'd string in
+ * the database encoding.  *exact is set TRUE for an exact match, else FALSE.
+ */
+char *
+regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation,
+                   bool *exact)
+{
+   char       *result;
+   regex_t    *re;
+   int         cflags;
+   int         re_result;
+   pg_wchar   *str;
+   size_t      slen;
+   size_t      maxlen;
+   char        errMsg[100];
+
+   *exact = false;             /* default result */
+
+   /* Compile RE */
+   cflags = REG_ADVANCED;
+   if (case_insensitive)
+       cflags |= REG_ICASE;
+
+   re = RE_compile_and_cache(text_re, cflags, collation);
+
+   /* Examine it to see if there's a fixed prefix */
+   re_result = pg_regprefix(re, &str, &slen);
+
+   switch (re_result)
+   {
+       case REG_NOMATCH:
+           return NULL;
+
+       case REG_PREFIX:
+           /* continue with wchar conversion */
+           break;
+
+       case REG_EXACT:
+           *exact = true;
+           /* continue with wchar conversion */
+           break;
+
+       default:
+           /* re failed??? */
+           pg_regerror(re_result, re, errMsg, sizeof(errMsg));
+           ereport(ERROR,
+                   (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
+                    errmsg("regular expression failed: %s", errMsg)));
+           break;
+   }
+
+   /* Convert pg_wchar result back to database encoding */
+   maxlen = pg_database_encoding_max_length() * slen + 1;
+   result = (char *) palloc(maxlen);
+   slen = pg_wchar2mb_with_len(str, result, slen);
+   Assert(slen < maxlen);
+
+   free(str);
+
+   return result;
+}
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c

index 7eb64cba4bb039314610f28df35d7c35d40dc0a2..40e1bebac1602a19f31606f529a9dbb0ffd7a043 100644 (file)
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -195,7 +195,8 @@ static Selectivity prefix_selectivity(PlannerInfo *root,
  static Selectivity like_selectivity(const char *patt, int pattlen,
                                     bool case_insensitive);
  static Selectivity regex_selectivity(const char *patt, int pattlen,
-                                    bool case_insensitive);
+                                    bool case_insensitive,
+                                    int fixed_prefix_len);
  static Datum string_to_datum(const char *str, Oid datatype);
  static Const *string_to_const(const char *str, Oid datatype);
  static Const *string_to_bytea_const(const char *str, size_t str_len);
@@ -5255,18 +5256,9 @@ static Pattern_Prefix_Status
  regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
                    Const **prefix_const, Selectivity *rest_selec)
  {
-   char       *match;
-   int         pos,
-               match_pos,
-               prev_pos,
-               prev_match_pos;
-   bool        have_leading_paren;
-   char       *patt;
-   char       *rest;
     Oid         typeid = patt_const->consttype;
-   bool        is_multibyte = (pg_database_encoding_max_length() > 1);
-   pg_locale_t locale = 0;
-   bool        locale_is_c = false;
+   char       *prefix;
+   bool        exact;
  
     /*
      * Should be unnecessary, there are no bytea regex operators defined. As
@@ -5278,185 +5270,54 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
          errmsg("regular-expression matching not supported on type bytea")));
  
-   if (case_insensitive)
-   {
-       /* If case-insensitive, we need locale info */
-       if (lc_ctype_is_c(collation))
-           locale_is_c = true;
-       else if (collation != DEFAULT_COLLATION_OID)
-       {
-           if (!OidIsValid(collation))
-           {
-               /*
-                * This typically means that the parser could not resolve a
-                * conflict of implicit collations, so report it that way.
-                */
-               ereport(ERROR,
-                       (errcode(ERRCODE_INDETERMINATE_COLLATION),
-                        errmsg("could not determine which collation to use for regular expression"),
-                        errhint("Use the COLLATE clause to set the collation explicitly.")));
-           }
-           locale = pg_newlocale_from_collation(collation);
-       }
-   }
-
-   /* the right-hand const is type text for all of these */
-   patt = TextDatumGetCString(patt_const->constvalue);
-
-   /*
-    * Check for ARE director prefix.  It's worth our trouble to recognize
-    * this because similar_escape() used to use it, and some other code might
-    * still use it, to force ARE mode.
-    */
-   pos = 0;
-   if (strncmp(patt, "***:", 4) == 0)
-       pos = 4;
+   /* Use the regexp machinery to extract the prefix, if any */
+   prefix = regexp_fixed_prefix(DatumGetTextPP(patt_const->constvalue),
+                                case_insensitive, collation,
+                                &exact);
  
-   /* Pattern must be anchored left */
-   if (patt[pos] != '^')
+   if (prefix == NULL)
     {
         *prefix_const = NULL;
  
         if (rest_selec != NULL)
-           *rest_selec = regex_selectivity(patt, strlen(patt),
-                                           case_insensitive);
-
-       return Pattern_Prefix_None;
-   }
-   pos++;
-
-   /*
-    * If '|' is present in pattern, then there may be multiple alternatives
-    * for the start of the string.  (There are cases where this isn't so, for
-    * instance if the '|' is inside parens, but detecting that reliably is
-    * too hard.)
-    */
-   if (strchr(patt + pos, '|') != NULL)
-   {
-       *prefix_const = NULL;
+       {
+           char   *patt = TextDatumGetCString(patt_const->constvalue);
  
-       if (rest_selec != NULL)
             *rest_selec = regex_selectivity(patt, strlen(patt),
-                                           case_insensitive);
+                                           case_insensitive,
+                                           0);
+           pfree(patt);
+       }
  
         return Pattern_Prefix_None;
     }
  
-   /* OK, allocate space for pattern */
-   match = palloc(strlen(patt) + 1);
-   prev_match_pos = match_pos = 0;
+   *prefix_const = string_to_const(prefix, typeid);
  
-   /*
-    * We special-case the syntax '^(...)$' because psql uses it.  But beware:
-    * sequences beginning "(?" are not what they seem, unless they're "(?:".
-    * (We must recognize that because of similar_escape().)
-    */
-   have_leading_paren = false;
-   if (patt[pos] == '(' &&
-       (patt[pos + 1] != '?' || patt[pos + 2] == ':'))
-   {
-       have_leading_paren = true;
-       pos += (patt[pos + 1] != '?' ? 1 : 3);
-   }
-
-   /* Scan remainder of pattern */
-   prev_pos = pos;
-   while (patt[pos])
+   if (rest_selec != NULL)
     {
-       int         len;
-
-       /*
-        * Check for characters that indicate multiple possible matches here.
-        * Also, drop out at ')' or '$' so the termination test works right.
-        */
-       if (patt[pos] == '.' ||
-           patt[pos] == '(' ||
-           patt[pos] == ')' ||
-           patt[pos] == '[' ||
-           patt[pos] == '^' ||
-           patt[pos] == '$')
-           break;
-
-       /* Stop if case-varying character (it's sort of a wildcard) */
-       if (case_insensitive &&
-         pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c))
-           break;
-
-       /*
-        * Check for quantifiers.  Except for +, this means the preceding
-        * character is optional, so we must remove it from the prefix too!
-        */
-       if (patt[pos] == '*' ||
-           patt[pos] == '?' ||
-           patt[pos] == '{')
+       if (exact)
         {
-           match_pos = prev_match_pos;
-           pos = prev_pos;
-           break;
+           /* Exact match, so there's no additional selectivity */
+           *rest_selec = 1.0;
         }
-       if (patt[pos] == '+')
+       else
         {
-           pos = prev_pos;
-           break;
-       }
+           char   *patt = TextDatumGetCString(patt_const->constvalue);
  
-       /*
-        * Normally, backslash quotes the next character.  But in AREs,
-        * backslash followed by alphanumeric is an escape, not a quoted
-        * character.  Must treat it as having multiple possible matches.
-        * Note: since only ASCII alphanumerics are escapes, we don't have to
-        * be paranoid about multibyte or collations here.
-        */
-       if (patt[pos] == '\\')
-       {
-           if (isalnum((unsigned char) patt[pos + 1]))
-               break;
-           pos++;
-           if (patt[pos] == '\0')
-               break;
+           *rest_selec = regex_selectivity(patt, strlen(patt),
+                                           case_insensitive,
+                                           strlen(prefix));
+           pfree(patt);
         }
-       /* save position in case we need to back up on next loop cycle */
-       prev_match_pos = match_pos;
-       prev_pos = pos;
-       /* must use encoding-aware processing here */
-       len = pg_mblen(&patt[pos]);
-       memcpy(&match[match_pos], &patt[pos], len);
-       match_pos += len;
-       pos += len;
     }
  
-   match[match_pos] = '\0';
-   rest = &patt[pos];
-
-   if (have_leading_paren && patt[pos] == ')')
-       pos++;
-
-   if (patt[pos] == '$' && patt[pos + 1] == '\0')
-   {
-       *prefix_const = string_to_const(match, typeid);
-
-       if (rest_selec != NULL)
-           *rest_selec = 1.0;
-
-       pfree(patt);
-       pfree(match);
+   pfree(prefix);
  
+   if (exact)
         return Pattern_Prefix_Exact;    /* pattern specifies exact match */
-   }
-
-   *prefix_const = string_to_const(match, typeid);
-
-   if (rest_selec != NULL)
-       *rest_selec = regex_selectivity(rest, strlen(rest),
-                                       case_insensitive);
-
-   pfree(patt);
-   pfree(match);
-
-   if (match_pos > 0)
+   else
         return Pattern_Prefix_Partial;
-
-   return Pattern_Prefix_None;
  }
  
  Pattern_Prefix_Status
@@ -5741,7 +5602,8 @@ regex_selectivity_sub(const char *patt, int pattlen, bool case_insensitive)
  }
  
  static Selectivity
-regex_selectivity(const char *patt, int pattlen, bool case_insensitive)
+regex_selectivity(const char *patt, int pattlen, bool case_insensitive,
+                 int fixed_prefix_len)
  {
     Selectivity sel;
  
@@ -5757,9 +5619,14 @@ regex_selectivity(const char *patt, int pattlen, bool case_insensitive)
         /* no trailing $ */
         sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
         sel *= FULL_WILDCARD_SEL;
-       if (sel > 1.0)
-           sel = 1.0;
     }
+
+   /* If there's a fixed prefix, discount its selectivity */
+   if (fixed_prefix_len > 0)
+       sel /= pow(FIXED_CHAR_SEL, fixed_prefix_len);
+
+   /* Make sure result stays in range */
+   CLAMP_PROBABILITY(sel);
     return sel;
  }
  
diff --git a/src/include/regex/regex.h b/src/include/regex/regex.h

index cec4b837cd15665da8e800ac1729245c21f1b22d..616c2c6450d80443361ca7e4660f52aaa266c9ad 100644 (file)
--- a/src/include/regex/regex.h
+++ b/src/include/regex/regex.h
@@ -156,6 +156,9 @@ typedef struct
  /* two specials for debugging and testing */
  #define REG_ATOI   101         /* convert error-code name to number */
  #define REG_ITOA   102         /* convert error-code number to name */
+/* non-error result codes for pg_regprefix */
+#define REG_PREFIX (-1)        /* identified a common prefix */
+#define REG_EXACT  (-2)        /* identified an exact match */
  
  
  
@@ -164,6 +167,7 @@ typedef struct
   */
  extern int pg_regcomp(regex_t *, const pg_wchar *, size_t, int, Oid);
  extern int pg_regexec(regex_t *, const pg_wchar *, size_t, size_t, rm_detail_t *, size_t, regmatch_t[], int);
+extern int pg_regprefix(regex_t *, pg_wchar **, size_t *);
  extern void pg_regfree(regex_t *);
  extern size_t pg_regerror(int, const regex_t *, char *, size_t);
  extern void pg_set_regex_collation(Oid collation);
diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h

index e8415799ec6da88945c5b93832e39f98b176fb4f..e1e406f4eaa77720d88938cf150348ce903f88d8 100644 (file)
--- a/src/include/regex/regguts.h
+++ b/src/include/regex/regguts.h
@@ -199,19 +199,21 @@ struct colordesc
     color       sub;            /* open subcolor, if any; or free-chain ptr */
  #define  NOSUB  COLORLESS      /* value of "sub" when no open subcolor */
     struct arc *arcs;           /* chain of all arcs of this color */
+   chr         firstchr;       /* char first assigned to this color */
     int         flags;          /* bit values defined next */
  #define  FREECOL 01                /* currently free */
  #define  PSEUDO  02                /* pseudocolor, no real chars */
-#define  UNUSEDCOLOR(cd) ((cd)->flags&FREECOL)
+#define  UNUSEDCOLOR(cd) ((cd)->flags & FREECOL)
     union tree *block;          /* block of solid color, if any */
  };
  
  /*
   * The color map itself
   *
- * Only the "tree" part is used at execution time, and that only via the
- * GETCOLOR() macro.  Possibly that should be separated from the compile-time
- * data.
+ * Much of the data in the colormap struct is only used at compile time.
+ * However, the bulk of the space usage is in the "tree" structure, so it's
+ * not clear that there's much point in converting the rest to a more compact
+ * form when compilation is finished.
   */
  struct colormap
  {
@@ -279,15 +281,14 @@ struct state;
  
  struct arc
  {
-   int         type;
-#define  ARCFREE '\0'
+   int         type;           /* 0 if free, else an NFA arc type code */
     color       co;
     struct state *from;         /* where it's from (and contained within) */
     struct state *to;           /* where it's to */
-   struct arc *outchain;       /* *from's outs chain or free chain */
+   struct arc *outchain;       /* link in *from's outs chain or free chain */
  #define  freechain  outchain
-   struct arc *inchain;        /* *to's ins chain */
-   struct arc *colorchain;     /* color's arc chain */
+   struct arc *inchain;        /* link in *to's ins chain */
+   struct arc *colorchain;     /* link in color's arc chain */
     struct arc *colorchainRev;  /* back-link in color's arc chain */
  };
  
@@ -339,24 +340,38 @@ struct nfa
  
  /*
   * definitions for compacted NFA
+ *
+ * The main space savings in a compacted NFA come from making the arcs as small
+ * as possible.  We store only the transition color and next-state number for
+ * each arc.  The list of out arcs for each state is an array beginning at
+ * cnfa.states[statenumber], and terminated by a dummy carc struct with
+ * co == COLORLESS.
+ *
+ * The non-dummy carc structs are of two types: plain arcs and LACON arcs.
+ * Plain arcs just store the transition color number as "co".  LACON arcs
+ * store the lookahead constraint number plus cnfa.ncolors as "co".  LACON
+ * arcs can be distinguished from plain by testing for co >= cnfa.ncolors.
   */
  struct carc
  {
     color       co;             /* COLORLESS is list terminator */
-   int         to;             /* state number */
+   int         to;             /* next-state number */
  };
  
  struct cnfa
  {
     int         nstates;        /* number of states */
-   int         ncolors;        /* number of colors */
+   int         ncolors;        /* number of colors (max color in use + 1) */
     int         flags;
-#define  HASLACONS  01         /* uses lookahead constraints */
+#define  HASLACONS 01          /* uses lookahead constraints */
     int         pre;            /* setup state number */
     int         post;           /* teardown state number */
     color       bos[2];         /* colors, if any, assigned to BOS and BOL */
     color       eos[2];         /* colors, if any, assigned to EOS and EOL */
+   char       *stflags;        /* vector of per-state flags bytes */
+#define  CNFA_NOPROGRESS   01  /* flag bit for a no-progress state */
     struct carc **states;       /* vector of pointers to outarc lists */
+   /* states[n] are pointers into a single malloc'd array of arcs */
     struct carc *arcs;          /* the area for the lists */
  };
  
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h

index d1e83707606610f65afa8ca4630af73c3f91757d..6ec3cce8226249ba626af4389f6a1ab1b6546a8b 100644 (file)
--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -568,6 +568,8 @@ extern Datum regexp_split_to_table(PG_FUNCTION_ARGS);
  extern Datum regexp_split_to_table_no_flags(PG_FUNCTION_ARGS);
  extern Datum regexp_split_to_array(PG_FUNCTION_ARGS);
  extern Datum regexp_split_to_array_no_flags(PG_FUNCTION_ARGS);
+extern char *regexp_fixed_prefix(text *text_re, bool case_insensitive,
+                                Oid collation, bool *exact);
  
  /* regproc.c */
  extern Datum regprocin(PG_FUNCTION_ARGS);
diff --git a/src/test/regress/expected/regex.out b/src/test/regress/expected/regex.out

index dc0c713b408e4ab46b429e08a855472d0a91ee8c..658538fd4192a7b04062c52fce666897cc94b633 100644 (file)
--- a/src/test/regress/expected/regex.out
+++ b/src/test/regress/expected/regex.out
@@ -90,3 +90,66 @@ select substring('a' from '((a)+)');
   a
  (1 row)
  
+-- Test conversion of regex patterns to indexable conditions
+explain (costs off) select * from pg_proc where proname ~ 'abc';
+            QUERY PLAN             
+-----------------------------------
+ Seq Scan on pg_proc
+   Filter: (proname ~ 'abc'::text)
+(2 rows)
+
+explain (costs off) select * from pg_proc where proname ~ '^abc';
+                              QUERY PLAN                              
+----------------------------------------------------------------------
+ Index Scan using pg_proc_proname_args_nsp_index on pg_proc
+   Index Cond: ((proname >= 'abc'::name) AND (proname < 'abd'::name))
+   Filter: (proname ~ '^abc'::text)
+(3 rows)
+
+explain (costs off) select * from pg_proc where proname ~ '^abc$';
+                         QUERY PLAN                         
+------------------------------------------------------------
+ Index Scan using pg_proc_proname_args_nsp_index on pg_proc
+   Index Cond: (proname = 'abc'::name)
+   Filter: (proname ~ '^abc$'::text)
+(3 rows)
+
+explain (costs off) select * from pg_proc where proname ~ '^abcd*e';
+                              QUERY PLAN                              
+----------------------------------------------------------------------
+ Index Scan using pg_proc_proname_args_nsp_index on pg_proc
+   Index Cond: ((proname >= 'abc'::name) AND (proname < 'abd'::name))
+   Filter: (proname ~ '^abcd*e'::text)
+(3 rows)
+
+explain (costs off) select * from pg_proc where proname ~ '^abc+d';
+                              QUERY PLAN                              
+----------------------------------------------------------------------
+ Index Scan using pg_proc_proname_args_nsp_index on pg_proc
+   Index Cond: ((proname >= 'abc'::name) AND (proname < 'abd'::name))
+   Filter: (proname ~ '^abc+d'::text)
+(3 rows)
+
+explain (costs off) select * from pg_proc where proname ~ '^(abc)(def)';
+                                 QUERY PLAN                                 
+----------------------------------------------------------------------------
+ Index Scan using pg_proc_proname_args_nsp_index on pg_proc
+   Index Cond: ((proname >= 'abcdef'::name) AND (proname < 'abcdeg'::name))
+   Filter: (proname ~ '^(abc)(def)'::text)
+(3 rows)
+
+explain (costs off) select * from pg_proc where proname ~ '^(abc)$';
+                         QUERY PLAN                         
+------------------------------------------------------------
+ Index Scan using pg_proc_proname_args_nsp_index on pg_proc
+   Index Cond: (proname = 'abc'::name)
+   Filter: (proname ~ '^(abc)$'::text)
+(3 rows)
+
+explain (costs off) select * from pg_proc where proname ~ '^(abc)?d';
+               QUERY PLAN               
+----------------------------------------
+ Seq Scan on pg_proc
+   Filter: (proname ~ '^(abc)?d'::text)
+(2 rows)
+
diff --git a/src/test/regress/sql/regex.sql b/src/test/regress/sql/regex.sql

index 9fdcb2f5bd5bfd130018dec96ee792a9a4dfafc2..c29ed05d7688bade1bd6ab1e4eb91603170290af 100644 (file)
--- a/src/test/regress/sql/regex.sql
+++ b/src/test/regress/sql/regex.sql
@@ -24,3 +24,13 @@ select 'abc abc abd' ~ '^(.+)( \1)+$' as f;
  select substring('asd TO foo' from ' TO (([a-z0-9._]+|"([^"]+|"")+")+)');
  select substring('a' from '((a))+');
  select substring('a' from '((a)+)');
+
+-- Test conversion of regex patterns to indexable conditions
+explain (costs off) select * from pg_proc where proname ~ 'abc';
+explain (costs off) select * from pg_proc where proname ~ '^abc';
+explain (costs off) select * from pg_proc where proname ~ '^abc$';
+explain (costs off) select * from pg_proc where proname ~ '^abcd*e';
+explain (costs off) select * from pg_proc where proname ~ '^abc+d';
+explain (costs off) select * from pg_proc where proname ~ '^(abc)(def)';
+explain (costs off) select * from pg_proc where proname ~ '^(abc)$';
+explain (costs off) select * from pg_proc where proname ~ '^(abc)?d';
author	Tom Lane
	Tue, 10 Jul 2012 22:00:39 +0000 (18:00 -0400)
committer	Tom Lane
	Tue, 10 Jul 2012 22:00:39 +0000 (18:00 -0400)
src/backend/regex/Makefile		patch \| blob \| blame \| history
src/backend/regex/README		patch \| blob \| blame \| history
src/backend/regex/regc_color.c		patch \| blob \| blame \| history
src/backend/regex/regc_nfa.c		patch \| blob \| blame \| history
src/backend/regex/regcomp.c		patch \| blob \| blame \| history
src/backend/regex/rege_dfa.c		patch \| blob \| blame \| history
src/backend/regex/regprefix.c	[new file with mode: 0644]	patch \| blob
src/backend/utils/adt/regexp.c		patch \| blob \| blame \| history
src/backend/utils/adt/selfuncs.c		patch \| blob \| blame \| history
src/include/regex/regex.h		patch \| blob \| blame \| history
src/include/regex/regguts.h		patch \| blob \| blame \| history
src/include/utils/builtins.h		patch \| blob \| blame \| history
src/test/regress/expected/regex.out		patch \| blob \| blame \| history
src/test/regress/sql/regex.sql		patch \| blob \| blame \| history