regexec.c

Go to the documentation of this file.
00001 /*
00002  * re_*exec and friends - match REs
00003  *
00004  * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
00005  *
00006  * Development of this software was funded, in part, by Cray Research Inc.,
00007  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
00008  * Corporation, none of whom are responsible for the results.  The author
00009  * thanks all of them.
00010  *
00011  * Redistribution and use in source and binary forms -- with or without
00012  * modification -- are permitted for any purpose, provided that
00013  * redistributions in source form retain this entire copyright notice and
00014  * indicate the origin and nature of any modifications.
00015  *
00016  * I'd appreciate being given credit for this package in the documentation of
00017  * software which uses it, but that is not a requirement.
00018  *
00019  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
00020  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
00021  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
00022  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00023  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00024  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
00025  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
00026  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
00027  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
00028  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00029  */
00030 
00031 #include "regguts.h"
00032 
00033 /*
00034  * Lazy-DFA representation.
00035  */
00036 
00037 struct arcp {                   /* "pointer" to an outarc */
00038     struct sset *ss;
00039     color co;
00040 };
00041 
00042 struct sset {                   /* state set */
00043     unsigned *states;           /* pointer to bitvector */
00044     unsigned hash;              /* hash of bitvector */
00045 #define HASH(bv, nw)    (((nw) == 1) ? *(bv) : hash(bv, nw))
00046 #define HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \
00047         memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0))
00048     int flags;
00049 #define STARTER         01      /* the initial state set */
00050 #define POSTSTATE       02      /* includes the goal state */
00051 #define LOCKED          04      /* locked in cache */
00052 #define NOPROGRESS      010     /* zero-progress state set */
00053     struct arcp ins;            /* chain of inarcs pointing here */
00054     chr *lastseen;              /* last entered on arrival here */
00055     struct sset **outs;         /* outarc vector indexed by color */
00056     struct arcp *inchain;       /* chain-pointer vector for outarcs */
00057 };
00058 
00059 struct dfa {
00060     int nssets;                 /* size of cache */
00061     int nssused;                /* how many entries occupied yet */
00062     int nstates;                /* number of states */
00063     int ncolors;                /* length of outarc and inchain vectors */
00064     int wordsper;               /* length of state-set bitvectors */
00065     struct sset *ssets;         /* state-set cache */
00066     unsigned *statesarea;       /* bitvector storage */
00067     unsigned *work;             /* pointer to work area within statesarea */
00068     struct sset **outsarea;     /* outarc-vector storage */
00069     struct arcp *incarea;       /* inchain storage */
00070     struct cnfa *cnfa;
00071     struct colormap *cm;
00072     chr *lastpost;              /* location of last cache-flushed success */
00073     chr *lastnopr;              /* location of last cache-flushed NOPROGRESS */
00074     struct sset *search;        /* replacement-search-pointer memory */
00075     int cptsmalloced;           /* were the areas individually malloced? */
00076     char *mallocarea;           /* self, or master malloced area, or NULL */
00077 };
00078 
00079 #define WORK    1               /* number of work bitvectors needed */
00080 
00081 /*
00082  * Setup for non-malloc allocation for small cases.
00083  */
00084 
00085 #define FEWSTATES       20      /* must be less than UBITS */
00086 #define FEWCOLORS       15
00087 struct smalldfa {
00088     struct dfa dfa;
00089     struct sset ssets[FEWSTATES*2];
00090     unsigned statesarea[FEWSTATES*2 + WORK];
00091     struct sset *outsarea[FEWSTATES*2 * FEWCOLORS];
00092     struct arcp incarea[FEWSTATES*2 * FEWCOLORS];
00093 };
00094 #define DOMALLOC        ((struct smalldfa *)NULL)       /* force malloc */
00095 
00096 /*
00097  * Internal variables, bundled for easy passing around.
00098  */
00099 
00100 struct vars {
00101     regex_t *re;
00102     struct guts *g;
00103     int eflags;                 /* copies of arguments */
00104     size_t nmatch;
00105     regmatch_t *pmatch;
00106     rm_detail_t *details;
00107     chr *start;                 /* start of string */
00108     chr *stop;                  /* just past end of string */
00109     int err;                    /* error code if any (0 none) */
00110     regoff_t *mem;              /* memory vector for backtracking */
00111     struct smalldfa dfa1;
00112     struct smalldfa dfa2;
00113 };
00114 #define VISERR(vv) ((vv)->err != 0)     /* have we seen an error yet? */
00115 #define ISERR() VISERR(v)
00116 #define VERR(vv,e) (((vv)->err) ? (vv)->err : ((vv)->err = (e)))
00117 #define ERR(e)  VERR(v, e)      /* record an error */
00118 #define NOERR() {if (ISERR()) return v->err;}   /* if error seen, return it */
00119 #define OFF(p)  ((p) - v->start)
00120 #define LOFF(p) ((long)OFF(p))
00121 
00122 /*
00123  * forward declarations
00124  */
00125 /* =====^!^===== begin forwards =====^!^===== */
00126 /* automatically gathered by fwd; do not hand-edit */
00127 /* === regexec.c === */
00128 int exec(regex_t *, CONST chr *, size_t, rm_detail_t *, size_t, regmatch_t [], int);
00129 static int find(struct vars *, struct cnfa *, struct colormap *);
00130 static int cfind(struct vars *, struct cnfa *, struct colormap *);
00131 static int cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **);
00132 static VOID zapsubs(regmatch_t *, size_t);
00133 static VOID zapmem(struct vars *, struct subre *);
00134 static VOID subset(struct vars *, struct subre *, chr *, chr *);
00135 static int dissect(struct vars *, struct subre *, chr *, chr *);
00136 static int condissect(struct vars *, struct subre *, chr *, chr *);
00137 static int altdissect(struct vars *, struct subre *, chr *, chr *);
00138 static int cdissect(struct vars *, struct subre *, chr *, chr *);
00139 static int ccondissect(struct vars *, struct subre *, chr *, chr *);
00140 static int crevdissect(struct vars *, struct subre *, chr *, chr *);
00141 static int cbrdissect(struct vars *, struct subre *, chr *, chr *);
00142 static int caltdissect(struct vars *, struct subre *, chr *, chr *);
00143 /* === rege_dfa.c === */
00144 static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *);
00145 static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *);
00146 static chr *lastcold(struct vars *, struct dfa *);
00147 static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *);
00148 static VOID freedfa(struct dfa *);
00149 static unsigned hash(unsigned *, int);
00150 static struct sset *initialize(struct vars *, struct dfa *, chr *);
00151 static struct sset *miss(struct vars *, struct dfa *, struct sset *, pcolor, chr *, chr *);
00152 static int lacon(struct vars *, struct cnfa *, chr *, pcolor);
00153 static struct sset *getvacant(struct vars *, struct dfa *, chr *, chr *);
00154 static struct sset *pickss(struct vars *, struct dfa *, chr *, chr *);
00155 /* automatically gathered by fwd; do not hand-edit */
00156 /* =====^!^===== end forwards =====^!^===== */
00157 
00158 /*
00159  - exec - match regular expression
00160  ^ int exec(regex_t *, CONST chr *, size_t, rm_detail_t *,
00161  ^                                      size_t, regmatch_t [], int);
00162  */
00163 int
00164 exec(
00165     regex_t *re,
00166     CONST chr *string,
00167     size_t len,
00168     rm_detail_t *details,
00169     size_t nmatch,
00170     regmatch_t pmatch[],
00171     int flags)
00172 {
00173     AllocVars(v);
00174     int st;
00175     size_t n;
00176     int backref;
00177 #define LOCALMAT        20
00178     regmatch_t mat[LOCALMAT];
00179 #define LOCALMEM        40
00180     regoff_t mem[LOCALMEM];
00181 
00182     /*
00183      * Sanity checks.
00184      */
00185 
00186     if (re == NULL || string == NULL || re->re_magic != REMAGIC) {
00187         FreeVars(v);
00188         return REG_INVARG;
00189     }
00190     if (re->re_csize != sizeof(chr)) {
00191         FreeVars(v);
00192         return REG_MIXED;
00193     }
00194 
00195     /*
00196      * Setup.
00197      */
00198 
00199     v->re = re;
00200     v->g = (struct guts *)re->re_guts;
00201     if ((v->g->cflags&REG_EXPECT) && details == NULL) {
00202         FreeVars(v);
00203         return REG_INVARG;
00204     }
00205     if (v->g->info&REG_UIMPOSSIBLE) {
00206         FreeVars(v);
00207         return REG_NOMATCH;
00208     }
00209     backref = (v->g->info&REG_UBACKREF) ? 1 : 0;
00210     v->eflags = flags;
00211     if (v->g->cflags&REG_NOSUB) {
00212         nmatch = 0;             /* override client */
00213     }
00214     v->nmatch = nmatch;
00215     if (backref) {
00216         /*
00217          * Need work area.
00218          */
00219 
00220         if (v->g->nsub + 1 <= LOCALMAT) {
00221             v->pmatch = mat;
00222         } else {
00223             v->pmatch = (regmatch_t *)
00224                     MALLOC((v->g->nsub + 1) * sizeof(regmatch_t));
00225         }
00226         if (v->pmatch == NULL) {
00227             FreeVars(v);
00228             return REG_ESPACE;
00229         }
00230         v->nmatch = v->g->nsub + 1;
00231     } else {
00232         v->pmatch = pmatch;
00233     }
00234     v->details = details;
00235     v->start = (chr *)string;
00236     v->stop = (chr *)string + len;
00237     v->err = 0;
00238     if (backref) {
00239         /*
00240          * Need retry memory.
00241          */
00242 
00243         assert(v->g->ntree >= 0);
00244         n = (size_t)v->g->ntree;
00245         if (n <= LOCALMEM) {
00246             v->mem = mem;
00247         } else {
00248             v->mem = (regoff_t *) MALLOC(n*sizeof(regoff_t));
00249         }
00250         if (v->mem == NULL) {
00251             if (v->pmatch != pmatch && v->pmatch != mat) {
00252                 FREE(v->pmatch);
00253             }
00254             FreeVars(v);
00255             return REG_ESPACE;
00256         }
00257     } else {
00258         v->mem = NULL;
00259     }
00260 
00261     /*
00262      * Do it.
00263      */
00264 
00265     assert(v->g->tree != NULL);
00266     if (backref) {
00267         st = cfind(v, &v->g->tree->cnfa, &v->g->cmap);
00268     } else {
00269         st = find(v, &v->g->tree->cnfa, &v->g->cmap);
00270     }
00271 
00272     /*
00273      * Copy (portion of) match vector over if necessary.
00274      */
00275 
00276     if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0) {
00277         zapsubs(pmatch, nmatch);
00278         n = (nmatch < v->nmatch) ? nmatch : v->nmatch;
00279         memcpy(VS(pmatch), VS(v->pmatch), n*sizeof(regmatch_t));
00280     }
00281 
00282     /*
00283      * Clean up.
00284      */
00285 
00286     if (v->pmatch != pmatch && v->pmatch != mat) {
00287         FREE(v->pmatch);
00288     }
00289     if (v->mem != NULL && v->mem != mem) {
00290         FREE(v->mem);
00291     }
00292     FreeVars(v);
00293     return st;
00294 }
00295 
00296 /*
00297  - find - find a match for the main NFA (no-complications case)
00298  ^ static int find(struct vars *, struct cnfa *, struct colormap *);
00299  */
00300 static int
00301 find(
00302     struct vars *v,
00303     struct cnfa *cnfa,
00304     struct colormap *cm)
00305 {
00306     struct dfa *s;
00307     struct dfa *d;
00308     chr *begin;
00309     chr *end = NULL;
00310     chr *cold;
00311     chr *open;                  /* Open and close of range of possible
00312                                  * starts */
00313     chr *close;
00314     int hitend;
00315     int shorter = (v->g->tree->flags&SHORTER) ? 1 : 0;
00316 
00317     /*
00318      * First, a shot with the search RE.
00319      */
00320 
00321     s = newdfa(v, &v->g->search, cm, &v->dfa1);
00322     assert(!(ISERR() && s != NULL));
00323     NOERR();
00324     MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
00325     cold = NULL;
00326     close = shortest(v, s, v->start, v->start, v->stop, &cold, NULL);
00327     freedfa(s);
00328     NOERR();
00329     if (v->g->cflags&REG_EXPECT) {
00330         assert(v->details != NULL);
00331         if (cold != NULL) {
00332             v->details->rm_extend.rm_so = OFF(cold);
00333         } else {
00334             v->details->rm_extend.rm_so = OFF(v->stop);
00335         }
00336         v->details->rm_extend.rm_eo = OFF(v->stop);     /* unknown */
00337     }
00338     if (close == NULL) {        /* not found */
00339         return REG_NOMATCH;
00340     }
00341     if (v->nmatch == 0) {       /* found, don't need exact location */
00342         return REG_OKAY;
00343     }
00344 
00345     /*
00346      * Find starting point and match.
00347      */
00348 
00349     assert(cold != NULL);
00350     open = cold;
00351     cold = NULL;
00352     MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close)));
00353     d = newdfa(v, cnfa, cm, &v->dfa1);
00354     assert(!(ISERR() && d != NULL));
00355     NOERR();
00356     for (begin = open; begin <= close; begin++) {
00357         MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
00358         if (shorter) {
00359             end = shortest(v, d, begin, begin, v->stop, NULL, &hitend);
00360         } else {
00361             end = longest(v, d, begin, v->stop, &hitend);
00362         }
00363         NOERR();
00364         if (hitend && cold == NULL) {
00365             cold = begin;
00366         }
00367         if (end != NULL) {
00368             break;              /* NOTE BREAK OUT */
00369         }
00370     }
00371     assert(end != NULL);        /* search RE succeeded so loop should */
00372     freedfa(d);
00373 
00374     /*
00375      * And pin down details.
00376      */
00377 
00378     assert(v->nmatch > 0);
00379     v->pmatch[0].rm_so = OFF(begin);
00380     v->pmatch[0].rm_eo = OFF(end);
00381     if (v->g->cflags&REG_EXPECT) {
00382         if (cold != NULL) {
00383             v->details->rm_extend.rm_so = OFF(cold);
00384         } else {
00385             v->details->rm_extend.rm_so = OFF(v->stop);
00386         }
00387         v->details->rm_extend.rm_eo = OFF(v->stop);     /* unknown */
00388     }
00389     if (v->nmatch == 1) {       /* no need for submatches */
00390         return REG_OKAY;
00391     }
00392 
00393     /*
00394      * Submatches.
00395      */
00396 
00397     zapsubs(v->pmatch, v->nmatch);
00398     return dissect(v, v->g->tree, begin, end);
00399 }
00400 
00401 /*
00402  - cfind - find a match for the main NFA (with complications)
00403  ^ static int cfind(struct vars *, struct cnfa *, struct colormap *);
00404  */
00405 static int
00406 cfind(
00407     struct vars *v,
00408     struct cnfa *cnfa,
00409     struct colormap *cm)
00410 {
00411     struct dfa *s;
00412     struct dfa *d;
00413     chr *cold = NULL; /* silence gcc 4 warning */
00414     int ret;
00415 
00416     s = newdfa(v, &v->g->search, cm, &v->dfa1);
00417     NOERR();
00418     d = newdfa(v, cnfa, cm, &v->dfa2);
00419     if (ISERR()) {
00420         assert(d == NULL);
00421         freedfa(s);
00422         return v->err;
00423     }
00424 
00425     ret = cfindloop(v, cnfa, cm, d, s, &cold);
00426 
00427     freedfa(d);
00428     freedfa(s);
00429     NOERR();
00430     if (v->g->cflags&REG_EXPECT) {
00431         assert(v->details != NULL);
00432         if (cold != NULL) {
00433             v->details->rm_extend.rm_so = OFF(cold);
00434         } else {
00435             v->details->rm_extend.rm_so = OFF(v->stop);
00436         }
00437         v->details->rm_extend.rm_eo = OFF(v->stop);     /* unknown */
00438     }
00439     return ret;
00440 }
00441 
00442 /*
00443  - cfindloop - the heart of cfind
00444  ^ static int cfindloop(struct vars *, struct cnfa *, struct colormap *,
00445  ^      struct dfa *, struct dfa *, chr **);
00446  */
00447 static int
00448 cfindloop(
00449     struct vars *v,
00450     struct cnfa *cnfa,
00451     struct colormap *cm,
00452     struct dfa *d,
00453     struct dfa *s,
00454     chr **coldp)                /* where to put coldstart pointer */
00455 {
00456     chr *begin;
00457     chr *end;
00458     chr *cold;
00459     chr *open;                  /* Open and close of range of possible
00460                                  * starts */
00461     chr *close;
00462     chr *estart;
00463     chr *estop;
00464     int er;
00465     int shorter = v->g->tree->flags&SHORTER;
00466     int hitend;
00467 
00468     assert(d != NULL && s != NULL);
00469     cold = NULL;
00470     close = v->start;
00471     do {
00472         MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
00473         close = shortest(v, s, close, close, v->stop, &cold, NULL);
00474         if (close == NULL) {
00475             break;              /* NOTE BREAK */
00476         }
00477         assert(cold != NULL);
00478         open = cold;
00479         cold = NULL;
00480         MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
00481         for (begin = open; begin <= close; begin++) {
00482             MDEBUG(("\ncfind trying at %ld\n", LOFF(begin)));
00483             estart = begin;
00484             estop = v->stop;
00485             for (;;) {
00486                 if (shorter) {
00487                     end = shortest(v, d, begin, estart, estop, NULL, &hitend);
00488                 } else {
00489                     end = longest(v, d, begin, estop, &hitend);
00490                 }
00491                 if (hitend && cold == NULL) {
00492                     cold = begin;
00493                 }
00494                 if (end == NULL) {
00495                     break;      /* NOTE BREAK OUT */
00496                 }
00497 
00498                 MDEBUG(("tentative end %ld\n", LOFF(end)));
00499                 zapsubs(v->pmatch, v->nmatch);
00500                 zapmem(v, v->g->tree);
00501                 er = cdissect(v, v->g->tree, begin, end);
00502                 if (er == REG_OKAY) {
00503                     if (v->nmatch > 0) {
00504                         v->pmatch[0].rm_so = OFF(begin);
00505                         v->pmatch[0].rm_eo = OFF(end);
00506                     }
00507                     *coldp = cold;
00508                     return REG_OKAY;
00509                 }
00510                 if (er != REG_NOMATCH) {
00511                     ERR(er);
00512                     return er;
00513                 }
00514                 if ((shorter) ? end == estop : end == begin) {
00515                     /*
00516                      * No point in trying again.
00517                      */
00518 
00519                     *coldp = cold;
00520                     return REG_NOMATCH;
00521                 }
00522 
00523                 /*
00524                  * Go around and try again
00525                  */
00526 
00527                 if (shorter) {
00528                     estart = end + 1;
00529                 } else {
00530                     estop = end - 1;
00531                 }
00532             }
00533         }
00534     } while (close < v->stop);
00535 
00536     *coldp = cold;
00537     return REG_NOMATCH;
00538 }
00539 
00540 /*
00541  - zapsubs - initialize the subexpression matches to "no match"
00542  ^ static VOID zapsubs(regmatch_t *, size_t);
00543  */
00544 static void
00545 zapsubs(
00546     regmatch_t *p,
00547     size_t n)
00548 {
00549     size_t i;
00550 
00551     for (i = n-1; i > 0; i--) {
00552         p[i].rm_so = -1;
00553         p[i].rm_eo = -1;
00554     }
00555 }
00556 
00557 /*
00558  - zapmem - initialize the retry memory of a subtree to zeros
00559  ^ static VOID zapmem(struct vars *, struct subre *);
00560  */
00561 static void
00562 zapmem(
00563     struct vars *v,
00564     struct subre *t)
00565 {
00566     if (t == NULL) {
00567         return;
00568     }
00569 
00570     assert(v->mem != NULL);
00571     v->mem[t->retry] = 0;
00572     if (t->op == '(') {
00573         assert(t->subno > 0);
00574         v->pmatch[t->subno].rm_so = -1;
00575                 v->pmatch[t->subno].rm_eo = -1;
00576     }
00577 
00578     if (t->left != NULL) {
00579         zapmem(v, t->left);
00580     }
00581     if (t->right != NULL) {
00582         zapmem(v, t->right);
00583     }
00584 }
00585 
00586 /*
00587  - subset - set any subexpression relevant to a successful subre
00588  ^ static VOID subset(struct vars *, struct subre *, chr *, chr *);
00589  */
00590 static void
00591 subset(
00592     struct vars *v,
00593     struct subre *sub,
00594     chr *begin,
00595     chr *end)
00596 {
00597     int n = sub->subno;
00598 
00599     assert(n > 0);
00600     if ((size_t)n >= v->nmatch) {
00601         return;
00602     }
00603 
00604     MDEBUG(("setting %d\n", n));
00605     v->pmatch[n].rm_so = OFF(begin);
00606     v->pmatch[n].rm_eo = OFF(end);
00607 }
00608 
00609 /*
00610  - dissect - determine subexpression matches (uncomplicated case)
00611  ^ static int dissect(struct vars *, struct subre *, chr *, chr *);
00612  */
00613 static int                      /* regexec return code */
00614 dissect(
00615     struct vars *v,
00616     struct subre *t,
00617     chr *begin,                 /* beginning of relevant substring */
00618     chr *end)                   /* end of same */
00619 {
00620     assert(t != NULL);
00621     MDEBUG(("dissect %ld-%ld\n", LOFF(begin), LOFF(end)));
00622 
00623     switch (t->op) {
00624     case '=':                   /* terminal node */
00625         assert(t->left == NULL && t->right == NULL);
00626         return REG_OKAY;        /* no action, parent did the work */
00627         break;
00628     case '|':                   /* alternation */
00629         assert(t->left != NULL);
00630         return altdissect(v, t, begin, end);
00631         break;
00632     case 'b':                   /* back ref -- shouldn't be calling us! */
00633         return REG_ASSERT;
00634         break;
00635     case '.':                   /* concatenation */
00636         assert(t->left != NULL && t->right != NULL);
00637         return condissect(v, t, begin, end);
00638         break;
00639     case '(':                   /* capturing */
00640         assert(t->left != NULL && t->right == NULL);
00641         assert(t->subno > 0);
00642         subset(v, t, begin, end);
00643         return dissect(v, t->left, begin, end);
00644         break;
00645     default:
00646         return REG_ASSERT;
00647         break;
00648     }
00649 }
00650 
00651 /*
00652  - condissect - determine concatenation subexpression matches (uncomplicated)
00653  ^ static int condissect(struct vars *, struct subre *, chr *, chr *);
00654  */
00655 static int                      /* regexec return code */
00656 condissect(
00657     struct vars *v,
00658     struct subre *t,
00659     chr *begin,                 /* beginning of relevant substring */
00660     chr *end)                   /* end of same */
00661 {
00662     struct dfa *d;
00663     struct dfa *d2;
00664     chr *mid;
00665     int i;
00666     int shorter = (t->left->flags&SHORTER) ? 1 : 0;
00667     chr *stop = (shorter) ? end : begin;
00668 
00669     assert(t->op == '.');
00670     assert(t->left != NULL && t->left->cnfa.nstates > 0);
00671     assert(t->right != NULL && t->right->cnfa.nstates > 0);
00672 
00673     d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1);
00674     NOERR();
00675     d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, &v->dfa2);
00676     if (ISERR()) {
00677         assert(d2 == NULL);
00678         freedfa(d);
00679         return v->err;
00680     }
00681 
00682     /*
00683      * Pick a tentative midpoint.
00684      */
00685 
00686     if (shorter) {
00687         mid = shortest(v, d, begin, begin, end, NULL, NULL);
00688     } else {
00689         mid = longest(v, d, begin, end, NULL);
00690     }
00691     if (mid == NULL) {
00692         freedfa(d);
00693         freedfa(d2);
00694         return REG_ASSERT;
00695     }
00696     MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
00697 
00698     /*
00699      * Iterate until satisfaction or failure.
00700      */
00701 
00702     while (longest(v, d2, mid, end, NULL) != end) {
00703         /*
00704          * That midpoint didn't work, find a new one.
00705          */
00706 
00707         if (mid == stop) {
00708             /*
00709              * All possibilities exhausted!
00710              */
00711 
00712             MDEBUG(("no midpoint!\n"));
00713             freedfa(d);
00714             freedfa(d2);
00715             return REG_ASSERT;
00716         }
00717         if (shorter) {
00718             mid = shortest(v, d, begin, mid+1, end, NULL, NULL);
00719         } else {
00720             mid = longest(v, d, begin, mid-1, NULL);
00721         }
00722         if (mid == NULL) {
00723             /*
00724              * Failed to find a new one!
00725              */
00726 
00727             MDEBUG(("failed midpoint!\n"));
00728             freedfa(d);
00729             freedfa(d2);
00730             return REG_ASSERT;
00731         }
00732         MDEBUG(("new midpoint %ld\n", LOFF(mid)));
00733     }
00734 
00735     /*
00736      * Satisfaction.
00737      */
00738 
00739     MDEBUG(("successful\n"));
00740     freedfa(d);
00741     freedfa(d2);
00742     i = dissect(v, t->left, begin, mid);
00743     if (i != REG_OKAY) {
00744         return i;
00745     }
00746     return dissect(v, t->right, mid, end);
00747 }
00748 
00749 /*
00750  - altdissect - determine alternative subexpression matches (uncomplicated)
00751  ^ static int altdissect(struct vars *, struct subre *, chr *, chr *);
00752  */
00753 static int                      /* regexec return code */
00754 altdissect(
00755     struct vars *v,
00756     struct subre *t,
00757     chr *begin,                 /* beginning of relevant substring */
00758     chr *end)                   /* end of same */
00759 {
00760     struct dfa *d;
00761     int i;
00762 
00763     assert(t != NULL);
00764     assert(t->op == '|');
00765 
00766     for (i = 0; t != NULL; t = t->right, i++) {
00767         MDEBUG(("trying %dth\n", i));
00768         assert(t->left != NULL && t->left->cnfa.nstates > 0);
00769         d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1);
00770         if (ISERR()) {
00771             return v->err;
00772         }
00773         if (longest(v, d, begin, end, NULL) == end) {
00774             MDEBUG(("success\n"));
00775             freedfa(d);
00776             return dissect(v, t->left, begin, end);
00777         }
00778         freedfa(d);
00779     }
00780     return REG_ASSERT;          /* none of them matched?!? */
00781 }
00782 
00783 /*
00784  - cdissect - determine subexpression matches (with complications)
00785  * The retry memory stores the offset of the trial midpoint from begin, plus 1
00786  * so that 0 uniquely means "clean slate".
00787  ^ static int cdissect(struct vars *, struct subre *, chr *, chr *);
00788  */
00789 static int                      /* regexec return code */
00790 cdissect(
00791     struct vars *v,
00792     struct subre *t,
00793     chr *begin,                 /* beginning of relevant substring */
00794     chr *end)                   /* end of same */
00795 {
00796     int er;
00797 
00798     assert(t != NULL);
00799     MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op));
00800 
00801     switch (t->op) {
00802     case '=':                   /* terminal node */
00803         assert(t->left == NULL && t->right == NULL);
00804         return REG_OKAY;        /* no action, parent did the work */
00805         break;
00806     case '|':                   /* alternation */
00807         assert(t->left != NULL);
00808         return caltdissect(v, t, begin, end);
00809         break;
00810     case 'b':                   /* back ref -- shouldn't be calling us! */
00811         assert(t->left == NULL && t->right == NULL);
00812         return cbrdissect(v, t, begin, end);
00813         break;
00814     case '.':                   /* concatenation */
00815         assert(t->left != NULL && t->right != NULL);
00816         return ccondissect(v, t, begin, end);
00817         break;
00818     case '(':                   /* capturing */
00819         assert(t->left != NULL && t->right == NULL);
00820         assert(t->subno > 0);
00821         er = cdissect(v, t->left, begin, end);
00822         if (er == REG_OKAY) {
00823             subset(v, t, begin, end);
00824         }
00825         return er;
00826         break;
00827     default:
00828         return REG_ASSERT;
00829         break;
00830     }
00831 }
00832 
00833 /*
00834  - ccondissect - concatenation subexpression matches (with complications)
00835  * The retry memory stores the offset of the trial midpoint from begin, plus 1
00836  * so that 0 uniquely means "clean slate".
00837  ^ static int ccondissect(struct vars *, struct subre *, chr *, chr *);
00838  */
00839 static int                      /* regexec return code */
00840 ccondissect(
00841     struct vars *v,
00842     struct subre *t,
00843     chr *begin,                 /* beginning of relevant substring */
00844     chr *end)                   /* end of same */
00845 {
00846     struct dfa *d;
00847     struct dfa *d2;
00848     chr *mid;
00849     int er;
00850 
00851     assert(t->op == '.');
00852     assert(t->left != NULL && t->left->cnfa.nstates > 0);
00853     assert(t->right != NULL && t->right->cnfa.nstates > 0);
00854 
00855     if (t->left->flags&SHORTER) { /* reverse scan */
00856         return crevdissect(v, t, begin, end);
00857     }
00858 
00859     d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
00860     if (ISERR()) {
00861         return v->err;
00862     }
00863     d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC);
00864     if (ISERR()) {
00865         freedfa(d);
00866         return v->err;
00867     }
00868     MDEBUG(("cconcat %d\n", t->retry));
00869 
00870     /*
00871      * Pick a tentative midpoint.
00872      */
00873 
00874     if (v->mem[t->retry] == 0) {
00875         mid = longest(v, d, begin, end, NULL);
00876         if (mid == NULL) {
00877             freedfa(d);
00878             freedfa(d2);
00879             return REG_NOMATCH;
00880         }
00881         MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
00882         v->mem[t->retry] = (mid - begin) + 1;
00883     } else {
00884         mid = begin + (v->mem[t->retry] - 1);
00885         MDEBUG(("working midpoint %ld\n", LOFF(mid)));
00886     }
00887 
00888     /*
00889      * Iterate until satisfaction or failure.
00890      */
00891 
00892     for (;;) {
00893         /*
00894          * Try this midpoint on for size.
00895          */
00896 
00897         er = cdissect(v, t->left, begin, mid);
00898         if ((er == REG_OKAY) && (longest(v, d2, mid, end, NULL) == end)
00899                 && (er = cdissect(v, t->right, mid, end)) == REG_OKAY) {
00900             break;              /* NOTE BREAK OUT */
00901         }
00902         if ((er != REG_OKAY) && (er != REG_NOMATCH)) {
00903             freedfa(d);
00904             freedfa(d2);
00905             return er;
00906         }
00907 
00908         /*
00909          * That midpoint didn't work, find a new one.
00910          */
00911 
00912         if (mid == begin) {
00913             /*
00914              * All possibilities exhausted.
00915              */
00916 
00917             MDEBUG(("%d no midpoint\n", t->retry));
00918             freedfa(d);
00919             freedfa(d2);
00920             return REG_NOMATCH;
00921         }
00922         mid = longest(v, d, begin, mid-1, NULL);
00923         if (mid == NULL) {
00924             /*
00925              * Failed to find a new one.
00926              */
00927 
00928             MDEBUG(("%d failed midpoint\n", t->retry));
00929             freedfa(d);
00930             freedfa(d2);
00931             return REG_NOMATCH;
00932         }
00933         MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid)));
00934         v->mem[t->retry] = (mid - begin) + 1;
00935         zapmem(v, t->left);
00936         zapmem(v, t->right);
00937     }
00938 
00939     /*
00940      * Satisfaction.
00941      */
00942 
00943     MDEBUG(("successful\n"));
00944     freedfa(d);
00945     freedfa(d2);
00946     return REG_OKAY;
00947 }
00948 
00949 /*
00950  - crevdissect - determine backref shortest-first subexpression matches
00951  * The retry memory stores the offset of the trial midpoint from begin, plus 1
00952  * so that 0 uniquely means "clean slate".
00953  ^ static int crevdissect(struct vars *, struct subre *, chr *, chr *);
00954  */
00955 static int                      /* regexec return code */
00956 crevdissect(
00957     struct vars *v,
00958     struct subre *t,
00959     chr *begin,                 /* beginning of relevant substring */
00960     chr *end)                   /* end of same */
00961 {
00962     struct dfa *d;
00963     struct dfa *d2;
00964     chr *mid;
00965     int er;
00966 
00967     assert(t->op == '.');
00968     assert(t->left != NULL && t->left->cnfa.nstates > 0);
00969     assert(t->right != NULL && t->right->cnfa.nstates > 0);
00970     assert(t->left->flags&SHORTER);
00971 
00972     /*
00973      * Concatenation -- need to split the substring between parts.
00974      */
00975 
00976     d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
00977     if (ISERR()) {
00978         return v->err;
00979     }
00980     d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC);
00981     if (ISERR()) {
00982         freedfa(d);
00983         return v->err;
00984     }
00985     MDEBUG(("crev %d\n", t->retry));
00986 
00987     /*
00988      * Pick a tentative midpoint.
00989      */
00990 
00991     if (v->mem[t->retry] == 0) {
00992         mid = shortest(v, d, begin, begin, end, NULL, NULL);
00993         if (mid == NULL) {
00994             freedfa(d);
00995             freedfa(d2);
00996             return REG_NOMATCH;
00997         }
00998         MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
00999         v->mem[t->retry] = (mid - begin) + 1;
01000     } else {
01001         mid = begin + (v->mem[t->retry] - 1);
01002         MDEBUG(("working midpoint %ld\n", LOFF(mid)));
01003     }
01004 
01005     /*
01006      * Iterate until satisfaction or failure.
01007      */
01008 
01009     for (;;) {
01010         /*
01011          * Try this midpoint on for size.
01012          */
01013 
01014         er = cdissect(v, t->left, begin, mid);
01015         if ((er == REG_OKAY) && (longest(v, d2, mid, end, NULL) == end)
01016                 && (er = cdissect(v, t->right, mid, end)) == REG_OKAY) {
01017             break;              /* NOTE BREAK OUT */
01018         }
01019         if (er != REG_OKAY && er != REG_NOMATCH) {
01020             freedfa(d);
01021             freedfa(d2);
01022             return er;
01023         }
01024 
01025         /*
01026          * That midpoint didn't work, find a new one.
01027          */
01028 
01029         if (mid == end) {
01030             /*
01031              * All possibilities exhausted.
01032              */
01033 
01034             MDEBUG(("%d no midpoint\n", t->retry));
01035             freedfa(d);
01036             freedfa(d2);
01037             return REG_NOMATCH;
01038         }
01039         mid = shortest(v, d, begin, mid+1, end, NULL, NULL);
01040         if (mid == NULL) {
01041             /*
01042              * Failed to find a new one.
01043              */
01044 
01045             MDEBUG(("%d failed midpoint\n", t->retry));
01046             freedfa(d);
01047             freedfa(d2);
01048             return REG_NOMATCH;
01049         }
01050         MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid)));
01051         v->mem[t->retry] = (mid - begin) + 1;
01052         zapmem(v, t->left);
01053         zapmem(v, t->right);
01054     }
01055 
01056     /*
01057      * Satisfaction.
01058      */
01059 
01060     MDEBUG(("successful\n"));
01061     freedfa(d);
01062     freedfa(d2);
01063     return REG_OKAY;
01064 }
01065 
01066 /*
01067  - cbrdissect - determine backref subexpression matches
01068  ^ static int cbrdissect(struct vars *, struct subre *, chr *, chr *);
01069  */
01070 static int                      /* regexec return code */
01071 cbrdissect(
01072     struct vars *v,
01073     struct subre *t,
01074     chr *begin,                 /* beginning of relevant substring */
01075     chr *end)                   /* end of same */
01076 {
01077     int i;
01078     int n = t->subno;
01079     size_t len;
01080     chr *paren;
01081     chr *p;
01082     chr *stop;
01083     int min = t->min;
01084     int max = t->max;
01085 
01086     assert(t != NULL);
01087     assert(t->op == 'b');
01088     assert(n >= 0);
01089     assert((size_t)n < v->nmatch);
01090 
01091     MDEBUG(("cbackref n%d %d{%d-%d}\n", t->retry, n, min, max));
01092 
01093     if (v->pmatch[n].rm_so == -1) {
01094         return REG_NOMATCH;
01095     }
01096     paren = v->start + v->pmatch[n].rm_so;
01097     len = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
01098 
01099     /*
01100      * No room to maneuver -- retries are pointless.
01101      */
01102 
01103     if (v->mem[t->retry]) {
01104         return REG_NOMATCH;
01105     }
01106     v->mem[t->retry] = 1;
01107 
01108     /*
01109      * Special-case zero-length string.
01110      */
01111 
01112     if (len == 0) {
01113         if (begin == end) {
01114             return REG_OKAY;
01115         }
01116         return REG_NOMATCH;
01117     }
01118 
01119     /*
01120      * And too-short string.
01121      */
01122 
01123     assert(end >= begin);
01124     if ((size_t)(end - begin) < len) {
01125         return REG_NOMATCH;
01126     }
01127     stop = end - len;
01128 
01129     /*
01130      * Count occurrences.
01131      */
01132 
01133     i = 0;
01134     for (p = begin; p <= stop && (i < max || max == INFINITY); p += len) {
01135         if ((*v->g->compare)(paren, p, len) != 0) {
01136             break;
01137         }
01138         i++;
01139     }
01140     MDEBUG(("cbackref found %d\n", i));
01141 
01142     /*
01143      * And sort it out.
01144      */
01145 
01146     if (p != end) {             /* didn't consume all of it */
01147         return REG_NOMATCH;
01148     }
01149     if (min <= i && (i <= max || max == INFINITY)) {
01150         return REG_OKAY;
01151     }
01152     return REG_NOMATCH;         /* out of range */
01153 }
01154 
01155 /*
01156  - caltdissect - determine alternative subexpression matches (w. complications)
01157  ^ static int caltdissect(struct vars *, struct subre *, chr *, chr *);
01158  */
01159 static int                      /* regexec return code */
01160 caltdissect(
01161     struct vars *v,
01162     struct subre *t,
01163     chr *begin,                 /* beginning of relevant substring */
01164     chr *end)                   /* end of same */
01165 {
01166     struct dfa *d;
01167     int er;
01168 #define UNTRIED 0               /* not yet tried at all */
01169 #define TRYING  1               /* top matched, trying submatches */
01170 #define TRIED   2               /* top didn't match or submatches exhausted */
01171 
01172     if (t == NULL) {
01173         return REG_NOMATCH;
01174     }
01175     assert(t->op == '|');
01176     if (v->mem[t->retry] == TRIED) {
01177         return caltdissect(v, t->right, begin, end);
01178     }
01179 
01180     MDEBUG(("calt n%d\n", t->retry));
01181     assert(t->left != NULL);
01182 
01183     if (v->mem[t->retry] == UNTRIED) {
01184         d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
01185         if (ISERR()) {
01186             return v->err;
01187         }
01188         if (longest(v, d, begin, end, NULL) != end) {
01189             freedfa(d);
01190             v->mem[t->retry] = TRIED;
01191             return caltdissect(v, t->right, begin, end);
01192         }
01193         freedfa(d);
01194         MDEBUG(("calt matched\n"));
01195         v->mem[t->retry] = TRYING;
01196     }
01197 
01198     er = cdissect(v, t->left, begin, end);
01199     if (er != REG_NOMATCH) {
01200         return er;
01201     }
01202 
01203     v->mem[t->retry] = TRIED;
01204     return caltdissect(v, t->right, begin, end);
01205 }
01206 
01207 #include "rege_dfa.c"
01208 
01209 /*
01210  * Local Variables:
01211  * mode: c
01212  * c-basic-offset: 4
01213  * fill-column: 78
01214  * End:
01215  */



Generated on Wed Mar 12 12:18:10 2008 by  doxygen 1.5.1