regexec.cGo to the documentation of this file.00001 /* 00002 * re_*exec and friends - match REs 00003 * 00004 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. 00005 * 00006 * Development of this software was funded, in part, by Cray Research Inc., 00007 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics 00008 * Corporation, none of whom are responsible for the results. The author 00009 * thanks all of them. 00010 * 00011 * Redistribution and use in source and binary forms -- with or without 00012 * modification -- are permitted for any purpose, provided that 00013 * redistributions in source form retain this entire copyright notice and 00014 * indicate the origin and nature of any modifications. 00015 * 00016 * I'd appreciate being given credit for this package in the documentation of 00017 * software which uses it, but that is not a requirement. 00018 * 00019 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, 00020 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 00021 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 00022 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 00023 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 00024 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 00025 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 00026 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 00027 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 00028 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00029 */ 00030 00031 #include "regguts.h" 00032 00033 /* 00034 * Lazy-DFA representation. 00035 */ 00036 00037 struct arcp { /* "pointer" to an outarc */ 00038 struct sset *ss; 00039 color co; 00040 }; 00041 00042 struct sset { /* state set */ 00043 unsigned *states; /* pointer to bitvector */ 00044 unsigned hash; /* hash of bitvector */ 00045 #define HASH(bv, nw) (((nw) == 1) ? *(bv) : hash(bv, nw)) 00046 #define HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \ 00047 memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0)) 00048 int flags; 00049 #define STARTER 01 /* the initial state set */ 00050 #define POSTSTATE 02 /* includes the goal state */ 00051 #define LOCKED 04 /* locked in cache */ 00052 #define NOPROGRESS 010 /* zero-progress state set */ 00053 struct arcp ins; /* chain of inarcs pointing here */ 00054 chr *lastseen; /* last entered on arrival here */ 00055 struct sset **outs; /* outarc vector indexed by color */ 00056 struct arcp *inchain; /* chain-pointer vector for outarcs */ 00057 }; 00058 00059 struct dfa { 00060 int nssets; /* size of cache */ 00061 int nssused; /* how many entries occupied yet */ 00062 int nstates; /* number of states */ 00063 int ncolors; /* length of outarc and inchain vectors */ 00064 int wordsper; /* length of state-set bitvectors */ 00065 struct sset *ssets; /* state-set cache */ 00066 unsigned *statesarea; /* bitvector storage */ 00067 unsigned *work; /* pointer to work area within statesarea */ 00068 struct sset **outsarea; /* outarc-vector storage */ 00069 struct arcp *incarea; /* inchain storage */ 00070 struct cnfa *cnfa; 00071 struct colormap *cm; 00072 chr *lastpost; /* location of last cache-flushed success */ 00073 chr *lastnopr; /* location of last cache-flushed NOPROGRESS */ 00074 struct sset *search; /* replacement-search-pointer memory */ 00075 int cptsmalloced; /* were the areas individually malloced? */ 00076 char *mallocarea; /* self, or master malloced area, or NULL */ 00077 }; 00078 00079 #define WORK 1 /* number of work bitvectors needed */ 00080 00081 /* 00082 * Setup for non-malloc allocation for small cases. 00083 */ 00084 00085 #define FEWSTATES 20 /* must be less than UBITS */ 00086 #define FEWCOLORS 15 00087 struct smalldfa { 00088 struct dfa dfa; 00089 struct sset ssets[FEWSTATES*2]; 00090 unsigned statesarea[FEWSTATES*2 + WORK]; 00091 struct sset *outsarea[FEWSTATES*2 * FEWCOLORS]; 00092 struct arcp incarea[FEWSTATES*2 * FEWCOLORS]; 00093 }; 00094 #define DOMALLOC ((struct smalldfa *)NULL) /* force malloc */ 00095 00096 /* 00097 * Internal variables, bundled for easy passing around. 00098 */ 00099 00100 struct vars { 00101 regex_t *re; 00102 struct guts *g; 00103 int eflags; /* copies of arguments */ 00104 size_t nmatch; 00105 regmatch_t *pmatch; 00106 rm_detail_t *details; 00107 chr *start; /* start of string */ 00108 chr *stop; /* just past end of string */ 00109 int err; /* error code if any (0 none) */ 00110 regoff_t *mem; /* memory vector for backtracking */ 00111 struct smalldfa dfa1; 00112 struct smalldfa dfa2; 00113 }; 00114 #define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */ 00115 #define ISERR() VISERR(v) 00116 #define VERR(vv,e) (((vv)->err) ? (vv)->err : ((vv)->err = (e))) 00117 #define ERR(e) VERR(v, e) /* record an error */ 00118 #define NOERR() {if (ISERR()) return v->err;} /* if error seen, return it */ 00119 #define OFF(p) ((p) - v->start) 00120 #define LOFF(p) ((long)OFF(p)) 00121 00122 /* 00123 * forward declarations 00124 */ 00125 /* =====^!^===== begin forwards =====^!^===== */ 00126 /* automatically gathered by fwd; do not hand-edit */ 00127 /* === regexec.c === */ 00128 int exec(regex_t *, CONST chr *, size_t, rm_detail_t *, size_t, regmatch_t [], int); 00129 static int find(struct vars *, struct cnfa *, struct colormap *); 00130 static int cfind(struct vars *, struct cnfa *, struct colormap *); 00131 static int cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **); 00132 static VOID zapsubs(regmatch_t *, size_t); 00133 static VOID zapmem(struct vars *, struct subre *); 00134 static VOID subset(struct vars *, struct subre *, chr *, chr *); 00135 static int dissect(struct vars *, struct subre *, chr *, chr *); 00136 static int condissect(struct vars *, struct subre *, chr *, chr *); 00137 static int altdissect(struct vars *, struct subre *, chr *, chr *); 00138 static int cdissect(struct vars *, struct subre *, chr *, chr *); 00139 static int ccondissect(struct vars *, struct subre *, chr *, chr *); 00140 static int crevdissect(struct vars *, struct subre *, chr *, chr *); 00141 static int cbrdissect(struct vars *, struct subre *, chr *, chr *); 00142 static int caltdissect(struct vars *, struct subre *, chr *, chr *); 00143 /* === rege_dfa.c === */ 00144 static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *); 00145 static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *); 00146 static chr *lastcold(struct vars *, struct dfa *); 00147 static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *); 00148 static VOID freedfa(struct dfa *); 00149 static unsigned hash(unsigned *, int); 00150 static struct sset *initialize(struct vars *, struct dfa *, chr *); 00151 static struct sset *miss(struct vars *, struct dfa *, struct sset *, pcolor, chr *, chr *); 00152 static int lacon(struct vars *, struct cnfa *, chr *, pcolor); 00153 static struct sset *getvacant(struct vars *, struct dfa *, chr *, chr *); 00154 static struct sset *pickss(struct vars *, struct dfa *, chr *, chr *); 00155 /* automatically gathered by fwd; do not hand-edit */ 00156 /* =====^!^===== end forwards =====^!^===== */ 00157 00158 /* 00159 - exec - match regular expression 00160 ^ int exec(regex_t *, CONST chr *, size_t, rm_detail_t *, 00161 ^ size_t, regmatch_t [], int); 00162 */ 00163 int 00164 exec( 00165 regex_t *re, 00166 CONST chr *string, 00167 size_t len, 00168 rm_detail_t *details, 00169 size_t nmatch, 00170 regmatch_t pmatch[], 00171 int flags) 00172 { 00173 AllocVars(v); 00174 int st; 00175 size_t n; 00176 int backref; 00177 #define LOCALMAT 20 00178 regmatch_t mat[LOCALMAT]; 00179 #define LOCALMEM 40 00180 regoff_t mem[LOCALMEM]; 00181 00182 /* 00183 * Sanity checks. 00184 */ 00185 00186 if (re == NULL || string == NULL || re->re_magic != REMAGIC) { 00187 FreeVars(v); 00188 return REG_INVARG; 00189 } 00190 if (re->re_csize != sizeof(chr)) { 00191 FreeVars(v); 00192 return REG_MIXED; 00193 } 00194 00195 /* 00196 * Setup. 00197 */ 00198 00199 v->re = re; 00200 v->g = (struct guts *)re->re_guts; 00201 if ((v->g->cflags®_EXPECT) && details == NULL) { 00202 FreeVars(v); 00203 return REG_INVARG; 00204 } 00205 if (v->g->info®_UIMPOSSIBLE) { 00206 FreeVars(v); 00207 return REG_NOMATCH; 00208 } 00209 backref = (v->g->info®_UBACKREF) ? 1 : 0; 00210 v->eflags = flags; 00211 if (v->g->cflags®_NOSUB) { 00212 nmatch = 0; /* override client */ 00213 } 00214 v->nmatch = nmatch; 00215 if (backref) { 00216 /* 00217 * Need work area. 00218 */ 00219 00220 if (v->g->nsub + 1 <= LOCALMAT) { 00221 v->pmatch = mat; 00222 } else { 00223 v->pmatch = (regmatch_t *) 00224 MALLOC((v->g->nsub + 1) * sizeof(regmatch_t)); 00225 } 00226 if (v->pmatch == NULL) { 00227 FreeVars(v); 00228 return REG_ESPACE; 00229 } 00230 v->nmatch = v->g->nsub + 1; 00231 } else { 00232 v->pmatch = pmatch; 00233 } 00234 v->details = details; 00235 v->start = (chr *)string; 00236 v->stop = (chr *)string + len; 00237 v->err = 0; 00238 if (backref) { 00239 /* 00240 * Need retry memory. 00241 */ 00242 00243 assert(v->g->ntree >= 0); 00244 n = (size_t)v->g->ntree; 00245 if (n <= LOCALMEM) { 00246 v->mem = mem; 00247 } else { 00248 v->mem = (regoff_t *) MALLOC(n*sizeof(regoff_t)); 00249 } 00250 if (v->mem == NULL) { 00251 if (v->pmatch != pmatch && v->pmatch != mat) { 00252 FREE(v->pmatch); 00253 } 00254 FreeVars(v); 00255 return REG_ESPACE; 00256 } 00257 } else { 00258 v->mem = NULL; 00259 } 00260 00261 /* 00262 * Do it. 00263 */ 00264 00265 assert(v->g->tree != NULL); 00266 if (backref) { 00267 st = cfind(v, &v->g->tree->cnfa, &v->g->cmap); 00268 } else { 00269 st = find(v, &v->g->tree->cnfa, &v->g->cmap); 00270 } 00271 00272 /* 00273 * Copy (portion of) match vector over if necessary. 00274 */ 00275 00276 if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0) { 00277 zapsubs(pmatch, nmatch); 00278 n = (nmatch < v->nmatch) ? nmatch : v->nmatch; 00279 memcpy(VS(pmatch), VS(v->pmatch), n*sizeof(regmatch_t)); 00280 } 00281 00282 /* 00283 * Clean up. 00284 */ 00285 00286 if (v->pmatch != pmatch && v->pmatch != mat) { 00287 FREE(v->pmatch); 00288 } 00289 if (v->mem != NULL && v->mem != mem) { 00290 FREE(v->mem); 00291 } 00292 FreeVars(v); 00293 return st; 00294 } 00295 00296 /* 00297 - find - find a match for the main NFA (no-complications case) 00298 ^ static int find(struct vars *, struct cnfa *, struct colormap *); 00299 */ 00300 static int 00301 find( 00302 struct vars *v, 00303 struct cnfa *cnfa, 00304 struct colormap *cm) 00305 { 00306 struct dfa *s; 00307 struct dfa *d; 00308 chr *begin; 00309 chr *end = NULL; 00310 chr *cold; 00311 chr *open; /* Open and close of range of possible 00312 * starts */ 00313 chr *close; 00314 int hitend; 00315 int shorter = (v->g->tree->flags&SHORTER) ? 1 : 0; 00316 00317 /* 00318 * First, a shot with the search RE. 00319 */ 00320 00321 s = newdfa(v, &v->g->search, cm, &v->dfa1); 00322 assert(!(ISERR() && s != NULL)); 00323 NOERR(); 00324 MDEBUG(("\nsearch at %ld\n", LOFF(v->start))); 00325 cold = NULL; 00326 close = shortest(v, s, v->start, v->start, v->stop, &cold, NULL); 00327 freedfa(s); 00328 NOERR(); 00329 if (v->g->cflags®_EXPECT) { 00330 assert(v->details != NULL); 00331 if (cold != NULL) { 00332 v->details->rm_extend.rm_so = OFF(cold); 00333 } else { 00334 v->details->rm_extend.rm_so = OFF(v->stop); 00335 } 00336 v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ 00337 } 00338 if (close == NULL) { /* not found */ 00339 return REG_NOMATCH; 00340 } 00341 if (v->nmatch == 0) { /* found, don't need exact location */ 00342 return REG_OKAY; 00343 } 00344 00345 /* 00346 * Find starting point and match. 00347 */ 00348 00349 assert(cold != NULL); 00350 open = cold; 00351 cold = NULL; 00352 MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close))); 00353 d = newdfa(v, cnfa, cm, &v->dfa1); 00354 assert(!(ISERR() && d != NULL)); 00355 NOERR(); 00356 for (begin = open; begin <= close; begin++) { 00357 MDEBUG(("\nfind trying at %ld\n", LOFF(begin))); 00358 if (shorter) { 00359 end = shortest(v, d, begin, begin, v->stop, NULL, &hitend); 00360 } else { 00361 end = longest(v, d, begin, v->stop, &hitend); 00362 } 00363 NOERR(); 00364 if (hitend && cold == NULL) { 00365 cold = begin; 00366 } 00367 if (end != NULL) { 00368 break; /* NOTE BREAK OUT */ 00369 } 00370 } 00371 assert(end != NULL); /* search RE succeeded so loop should */ 00372 freedfa(d); 00373 00374 /* 00375 * And pin down details. 00376 */ 00377 00378 assert(v->nmatch > 0); 00379 v->pmatch[0].rm_so = OFF(begin); 00380 v->pmatch[0].rm_eo = OFF(end); 00381 if (v->g->cflags®_EXPECT) { 00382 if (cold != NULL) { 00383 v->details->rm_extend.rm_so = OFF(cold); 00384 } else { 00385 v->details->rm_extend.rm_so = OFF(v->stop); 00386 } 00387 v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ 00388 } 00389 if (v->nmatch == 1) { /* no need for submatches */ 00390 return REG_OKAY; 00391 } 00392 00393 /* 00394 * Submatches. 00395 */ 00396 00397 zapsubs(v->pmatch, v->nmatch); 00398 return dissect(v, v->g->tree, begin, end); 00399 } 00400 00401 /* 00402 - cfind - find a match for the main NFA (with complications) 00403 ^ static int cfind(struct vars *, struct cnfa *, struct colormap *); 00404 */ 00405 static int 00406 cfind( 00407 struct vars *v, 00408 struct cnfa *cnfa, 00409 struct colormap *cm) 00410 { 00411 struct dfa *s; 00412 struct dfa *d; 00413 chr *cold = NULL; /* silence gcc 4 warning */ 00414 int ret; 00415 00416 s = newdfa(v, &v->g->search, cm, &v->dfa1); 00417 NOERR(); 00418 d = newdfa(v, cnfa, cm, &v->dfa2); 00419 if (ISERR()) { 00420 assert(d == NULL); 00421 freedfa(s); 00422 return v->err; 00423 } 00424 00425 ret = cfindloop(v, cnfa, cm, d, s, &cold); 00426 00427 freedfa(d); 00428 freedfa(s); 00429 NOERR(); 00430 if (v->g->cflags®_EXPECT) { 00431 assert(v->details != NULL); 00432 if (cold != NULL) { 00433 v->details->rm_extend.rm_so = OFF(cold); 00434 } else { 00435 v->details->rm_extend.rm_so = OFF(v->stop); 00436 } 00437 v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */ 00438 } 00439 return ret; 00440 } 00441 00442 /* 00443 - cfindloop - the heart of cfind 00444 ^ static int cfindloop(struct vars *, struct cnfa *, struct colormap *, 00445 ^ struct dfa *, struct dfa *, chr **); 00446 */ 00447 static int 00448 cfindloop( 00449 struct vars *v, 00450 struct cnfa *cnfa, 00451 struct colormap *cm, 00452 struct dfa *d, 00453 struct dfa *s, 00454 chr **coldp) /* where to put coldstart pointer */ 00455 { 00456 chr *begin; 00457 chr *end; 00458 chr *cold; 00459 chr *open; /* Open and close of range of possible 00460 * starts */ 00461 chr *close; 00462 chr *estart; 00463 chr *estop; 00464 int er; 00465 int shorter = v->g->tree->flags&SHORTER; 00466 int hitend; 00467 00468 assert(d != NULL && s != NULL); 00469 cold = NULL; 00470 close = v->start; 00471 do { 00472 MDEBUG(("\ncsearch at %ld\n", LOFF(close))); 00473 close = shortest(v, s, close, close, v->stop, &cold, NULL); 00474 if (close == NULL) { 00475 break; /* NOTE BREAK */ 00476 } 00477 assert(cold != NULL); 00478 open = cold; 00479 cold = NULL; 00480 MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close))); 00481 for (begin = open; begin <= close; begin++) { 00482 MDEBUG(("\ncfind trying at %ld\n", LOFF(begin))); 00483 estart = begin; 00484 estop = v->stop; 00485 for (;;) { 00486 if (shorter) { 00487 end = shortest(v, d, begin, estart, estop, NULL, &hitend); 00488 } else { 00489 end = longest(v, d, begin, estop, &hitend); 00490 } 00491 if (hitend && cold == NULL) { 00492 cold = begin; 00493 } 00494 if (end == NULL) { 00495 break; /* NOTE BREAK OUT */ 00496 } 00497 00498 MDEBUG(("tentative end %ld\n", LOFF(end))); 00499 zapsubs(v->pmatch, v->nmatch); 00500 zapmem(v, v->g->tree); 00501 er = cdissect(v, v->g->tree, begin, end); 00502 if (er == REG_OKAY) { 00503 if (v->nmatch > 0) { 00504 v->pmatch[0].rm_so = OFF(begin); 00505 v->pmatch[0].rm_eo = OFF(end); 00506 } 00507 *coldp = cold; 00508 return REG_OKAY; 00509 } 00510 if (er != REG_NOMATCH) { 00511 ERR(er); 00512 return er; 00513 } 00514 if ((shorter) ? end == estop : end == begin) { 00515 /* 00516 * No point in trying again. 00517 */ 00518 00519 *coldp = cold; 00520 return REG_NOMATCH; 00521 } 00522 00523 /* 00524 * Go around and try again 00525 */ 00526 00527 if (shorter) { 00528 estart = end + 1; 00529 } else { 00530 estop = end - 1; 00531 } 00532 } 00533 } 00534 } while (close < v->stop); 00535 00536 *coldp = cold; 00537 return REG_NOMATCH; 00538 } 00539 00540 /* 00541 - zapsubs - initialize the subexpression matches to "no match" 00542 ^ static VOID zapsubs(regmatch_t *, size_t); 00543 */ 00544 static void 00545 zapsubs( 00546 regmatch_t *p, 00547 size_t n) 00548 { 00549 size_t i; 00550 00551 for (i = n-1; i > 0; i--) { 00552 p[i].rm_so = -1; 00553 p[i].rm_eo = -1; 00554 } 00555 } 00556 00557 /* 00558 - zapmem - initialize the retry memory of a subtree to zeros 00559 ^ static VOID zapmem(struct vars *, struct subre *); 00560 */ 00561 static void 00562 zapmem( 00563 struct vars *v, 00564 struct subre *t) 00565 { 00566 if (t == NULL) { 00567 return; 00568 } 00569 00570 assert(v->mem != NULL); 00571 v->mem[t->retry] = 0; 00572 if (t->op == '(') { 00573 assert(t->subno > 0); 00574 v->pmatch[t->subno].rm_so = -1; 00575 v->pmatch[t->subno].rm_eo = -1; 00576 } 00577 00578 if (t->left != NULL) { 00579 zapmem(v, t->left); 00580 } 00581 if (t->right != NULL) { 00582 zapmem(v, t->right); 00583 } 00584 } 00585 00586 /* 00587 - subset - set any subexpression relevant to a successful subre 00588 ^ static VOID subset(struct vars *, struct subre *, chr *, chr *); 00589 */ 00590 static void 00591 subset( 00592 struct vars *v, 00593 struct subre *sub, 00594 chr *begin, 00595 chr *end) 00596 { 00597 int n = sub->subno; 00598 00599 assert(n > 0); 00600 if ((size_t)n >= v->nmatch) { 00601 return; 00602 } 00603 00604 MDEBUG(("setting %d\n", n)); 00605 v->pmatch[n].rm_so = OFF(begin); 00606 v->pmatch[n].rm_eo = OFF(end); 00607 } 00608 00609 /* 00610 - dissect - determine subexpression matches (uncomplicated case) 00611 ^ static int dissect(struct vars *, struct subre *, chr *, chr *); 00612 */ 00613 static int /* regexec return code */ 00614 dissect( 00615 struct vars *v, 00616 struct subre *t, 00617 chr *begin, /* beginning of relevant substring */ 00618 chr *end) /* end of same */ 00619 { 00620 assert(t != NULL); 00621 MDEBUG(("dissect %ld-%ld\n", LOFF(begin), LOFF(end))); 00622 00623 switch (t->op) { 00624 case '=': /* terminal node */ 00625 assert(t->left == NULL && t->right == NULL); 00626 return REG_OKAY; /* no action, parent did the work */ 00627 break; 00628 case '|': /* alternation */ 00629 assert(t->left != NULL); 00630 return altdissect(v, t, begin, end); 00631 break; 00632 case 'b': /* back ref -- shouldn't be calling us! */ 00633 return REG_ASSERT; 00634 break; 00635 case '.': /* concatenation */ 00636 assert(t->left != NULL && t->right != NULL); 00637 return condissect(v, t, begin, end); 00638 break; 00639 case '(': /* capturing */ 00640 assert(t->left != NULL && t->right == NULL); 00641 assert(t->subno > 0); 00642 subset(v, t, begin, end); 00643 return dissect(v, t->left, begin, end); 00644 break; 00645 default: 00646 return REG_ASSERT; 00647 break; 00648 } 00649 } 00650 00651 /* 00652 - condissect - determine concatenation subexpression matches (uncomplicated) 00653 ^ static int condissect(struct vars *, struct subre *, chr *, chr *); 00654 */ 00655 static int /* regexec return code */ 00656 condissect( 00657 struct vars *v, 00658 struct subre *t, 00659 chr *begin, /* beginning of relevant substring */ 00660 chr *end) /* end of same */ 00661 { 00662 struct dfa *d; 00663 struct dfa *d2; 00664 chr *mid; 00665 int i; 00666 int shorter = (t->left->flags&SHORTER) ? 1 : 0; 00667 chr *stop = (shorter) ? end : begin; 00668 00669 assert(t->op == '.'); 00670 assert(t->left != NULL && t->left->cnfa.nstates > 0); 00671 assert(t->right != NULL && t->right->cnfa.nstates > 0); 00672 00673 d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1); 00674 NOERR(); 00675 d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, &v->dfa2); 00676 if (ISERR()) { 00677 assert(d2 == NULL); 00678 freedfa(d); 00679 return v->err; 00680 } 00681 00682 /* 00683 * Pick a tentative midpoint. 00684 */ 00685 00686 if (shorter) { 00687 mid = shortest(v, d, begin, begin, end, NULL, NULL); 00688 } else { 00689 mid = longest(v, d, begin, end, NULL); 00690 } 00691 if (mid == NULL) { 00692 freedfa(d); 00693 freedfa(d2); 00694 return REG_ASSERT; 00695 } 00696 MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); 00697 00698 /* 00699 * Iterate until satisfaction or failure. 00700 */ 00701 00702 while (longest(v, d2, mid, end, NULL) != end) { 00703 /* 00704 * That midpoint didn't work, find a new one. 00705 */ 00706 00707 if (mid == stop) { 00708 /* 00709 * All possibilities exhausted! 00710 */ 00711 00712 MDEBUG(("no midpoint!\n")); 00713 freedfa(d); 00714 freedfa(d2); 00715 return REG_ASSERT; 00716 } 00717 if (shorter) { 00718 mid = shortest(v, d, begin, mid+1, end, NULL, NULL); 00719 } else { 00720 mid = longest(v, d, begin, mid-1, NULL); 00721 } 00722 if (mid == NULL) { 00723 /* 00724 * Failed to find a new one! 00725 */ 00726 00727 MDEBUG(("failed midpoint!\n")); 00728 freedfa(d); 00729 freedfa(d2); 00730 return REG_ASSERT; 00731 } 00732 MDEBUG(("new midpoint %ld\n", LOFF(mid))); 00733 } 00734 00735 /* 00736 * Satisfaction. 00737 */ 00738 00739 MDEBUG(("successful\n")); 00740 freedfa(d); 00741 freedfa(d2); 00742 i = dissect(v, t->left, begin, mid); 00743 if (i != REG_OKAY) { 00744 return i; 00745 } 00746 return dissect(v, t->right, mid, end); 00747 } 00748 00749 /* 00750 - altdissect - determine alternative subexpression matches (uncomplicated) 00751 ^ static int altdissect(struct vars *, struct subre *, chr *, chr *); 00752 */ 00753 static int /* regexec return code */ 00754 altdissect( 00755 struct vars *v, 00756 struct subre *t, 00757 chr *begin, /* beginning of relevant substring */ 00758 chr *end) /* end of same */ 00759 { 00760 struct dfa *d; 00761 int i; 00762 00763 assert(t != NULL); 00764 assert(t->op == '|'); 00765 00766 for (i = 0; t != NULL; t = t->right, i++) { 00767 MDEBUG(("trying %dth\n", i)); 00768 assert(t->left != NULL && t->left->cnfa.nstates > 0); 00769 d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1); 00770 if (ISERR()) { 00771 return v->err; 00772 } 00773 if (longest(v, d, begin, end, NULL) == end) { 00774 MDEBUG(("success\n")); 00775 freedfa(d); 00776 return dissect(v, t->left, begin, end); 00777 } 00778 freedfa(d); 00779 } 00780 return REG_ASSERT; /* none of them matched?!? */ 00781 } 00782 00783 /* 00784 - cdissect - determine subexpression matches (with complications) 00785 * The retry memory stores the offset of the trial midpoint from begin, plus 1 00786 * so that 0 uniquely means "clean slate". 00787 ^ static int cdissect(struct vars *, struct subre *, chr *, chr *); 00788 */ 00789 static int /* regexec return code */ 00790 cdissect( 00791 struct vars *v, 00792 struct subre *t, 00793 chr *begin, /* beginning of relevant substring */ 00794 chr *end) /* end of same */ 00795 { 00796 int er; 00797 00798 assert(t != NULL); 00799 MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op)); 00800 00801 switch (t->op) { 00802 case '=': /* terminal node */ 00803 assert(t->left == NULL && t->right == NULL); 00804 return REG_OKAY; /* no action, parent did the work */ 00805 break; 00806 case '|': /* alternation */ 00807 assert(t->left != NULL); 00808 return caltdissect(v, t, begin, end); 00809 break; 00810 case 'b': /* back ref -- shouldn't be calling us! */ 00811 assert(t->left == NULL && t->right == NULL); 00812 return cbrdissect(v, t, begin, end); 00813 break; 00814 case '.': /* concatenation */ 00815 assert(t->left != NULL && t->right != NULL); 00816 return ccondissect(v, t, begin, end); 00817 break; 00818 case '(': /* capturing */ 00819 assert(t->left != NULL && t->right == NULL); 00820 assert(t->subno > 0); 00821 er = cdissect(v, t->left, begin, end); 00822 if (er == REG_OKAY) { 00823 subset(v, t, begin, end); 00824 } 00825 return er; 00826 break; 00827 default: 00828 return REG_ASSERT; 00829 break; 00830 } 00831 } 00832 00833 /* 00834 - ccondissect - concatenation subexpression matches (with complications) 00835 * The retry memory stores the offset of the trial midpoint from begin, plus 1 00836 * so that 0 uniquely means "clean slate". 00837 ^ static int ccondissect(struct vars *, struct subre *, chr *, chr *); 00838 */ 00839 static int /* regexec return code */ 00840 ccondissect( 00841 struct vars *v, 00842 struct subre *t, 00843 chr *begin, /* beginning of relevant substring */ 00844 chr *end) /* end of same */ 00845 { 00846 struct dfa *d; 00847 struct dfa *d2; 00848 chr *mid; 00849 int er; 00850 00851 assert(t->op == '.'); 00852 assert(t->left != NULL && t->left->cnfa.nstates > 0); 00853 assert(t->right != NULL && t->right->cnfa.nstates > 0); 00854 00855 if (t->left->flags&SHORTER) { /* reverse scan */ 00856 return crevdissect(v, t, begin, end); 00857 } 00858 00859 d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); 00860 if (ISERR()) { 00861 return v->err; 00862 } 00863 d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC); 00864 if (ISERR()) { 00865 freedfa(d); 00866 return v->err; 00867 } 00868 MDEBUG(("cconcat %d\n", t->retry)); 00869 00870 /* 00871 * Pick a tentative midpoint. 00872 */ 00873 00874 if (v->mem[t->retry] == 0) { 00875 mid = longest(v, d, begin, end, NULL); 00876 if (mid == NULL) { 00877 freedfa(d); 00878 freedfa(d2); 00879 return REG_NOMATCH; 00880 } 00881 MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); 00882 v->mem[t->retry] = (mid - begin) + 1; 00883 } else { 00884 mid = begin + (v->mem[t->retry] - 1); 00885 MDEBUG(("working midpoint %ld\n", LOFF(mid))); 00886 } 00887 00888 /* 00889 * Iterate until satisfaction or failure. 00890 */ 00891 00892 for (;;) { 00893 /* 00894 * Try this midpoint on for size. 00895 */ 00896 00897 er = cdissect(v, t->left, begin, mid); 00898 if ((er == REG_OKAY) && (longest(v, d2, mid, end, NULL) == end) 00899 && (er = cdissect(v, t->right, mid, end)) == REG_OKAY) { 00900 break; /* NOTE BREAK OUT */ 00901 } 00902 if ((er != REG_OKAY) && (er != REG_NOMATCH)) { 00903 freedfa(d); 00904 freedfa(d2); 00905 return er; 00906 } 00907 00908 /* 00909 * That midpoint didn't work, find a new one. 00910 */ 00911 00912 if (mid == begin) { 00913 /* 00914 * All possibilities exhausted. 00915 */ 00916 00917 MDEBUG(("%d no midpoint\n", t->retry)); 00918 freedfa(d); 00919 freedfa(d2); 00920 return REG_NOMATCH; 00921 } 00922 mid = longest(v, d, begin, mid-1, NULL); 00923 if (mid == NULL) { 00924 /* 00925 * Failed to find a new one. 00926 */ 00927 00928 MDEBUG(("%d failed midpoint\n", t->retry)); 00929 freedfa(d); 00930 freedfa(d2); 00931 return REG_NOMATCH; 00932 } 00933 MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid))); 00934 v->mem[t->retry] = (mid - begin) + 1; 00935 zapmem(v, t->left); 00936 zapmem(v, t->right); 00937 } 00938 00939 /* 00940 * Satisfaction. 00941 */ 00942 00943 MDEBUG(("successful\n")); 00944 freedfa(d); 00945 freedfa(d2); 00946 return REG_OKAY; 00947 } 00948 00949 /* 00950 - crevdissect - determine backref shortest-first subexpression matches 00951 * The retry memory stores the offset of the trial midpoint from begin, plus 1 00952 * so that 0 uniquely means "clean slate". 00953 ^ static int crevdissect(struct vars *, struct subre *, chr *, chr *); 00954 */ 00955 static int /* regexec return code */ 00956 crevdissect( 00957 struct vars *v, 00958 struct subre *t, 00959 chr *begin, /* beginning of relevant substring */ 00960 chr *end) /* end of same */ 00961 { 00962 struct dfa *d; 00963 struct dfa *d2; 00964 chr *mid; 00965 int er; 00966 00967 assert(t->op == '.'); 00968 assert(t->left != NULL && t->left->cnfa.nstates > 0); 00969 assert(t->right != NULL && t->right->cnfa.nstates > 0); 00970 assert(t->left->flags&SHORTER); 00971 00972 /* 00973 * Concatenation -- need to split the substring between parts. 00974 */ 00975 00976 d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); 00977 if (ISERR()) { 00978 return v->err; 00979 } 00980 d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC); 00981 if (ISERR()) { 00982 freedfa(d); 00983 return v->err; 00984 } 00985 MDEBUG(("crev %d\n", t->retry)); 00986 00987 /* 00988 * Pick a tentative midpoint. 00989 */ 00990 00991 if (v->mem[t->retry] == 0) { 00992 mid = shortest(v, d, begin, begin, end, NULL, NULL); 00993 if (mid == NULL) { 00994 freedfa(d); 00995 freedfa(d2); 00996 return REG_NOMATCH; 00997 } 00998 MDEBUG(("tentative midpoint %ld\n", LOFF(mid))); 00999 v->mem[t->retry] = (mid - begin) + 1; 01000 } else { 01001 mid = begin + (v->mem[t->retry] - 1); 01002 MDEBUG(("working midpoint %ld\n", LOFF(mid))); 01003 } 01004 01005 /* 01006 * Iterate until satisfaction or failure. 01007 */ 01008 01009 for (;;) { 01010 /* 01011 * Try this midpoint on for size. 01012 */ 01013 01014 er = cdissect(v, t->left, begin, mid); 01015 if ((er == REG_OKAY) && (longest(v, d2, mid, end, NULL) == end) 01016 && (er = cdissect(v, t->right, mid, end)) == REG_OKAY) { 01017 break; /* NOTE BREAK OUT */ 01018 } 01019 if (er != REG_OKAY && er != REG_NOMATCH) { 01020 freedfa(d); 01021 freedfa(d2); 01022 return er; 01023 } 01024 01025 /* 01026 * That midpoint didn't work, find a new one. 01027 */ 01028 01029 if (mid == end) { 01030 /* 01031 * All possibilities exhausted. 01032 */ 01033 01034 MDEBUG(("%d no midpoint\n", t->retry)); 01035 freedfa(d); 01036 freedfa(d2); 01037 return REG_NOMATCH; 01038 } 01039 mid = shortest(v, d, begin, mid+1, end, NULL, NULL); 01040 if (mid == NULL) { 01041 /* 01042 * Failed to find a new one. 01043 */ 01044 01045 MDEBUG(("%d failed midpoint\n", t->retry)); 01046 freedfa(d); 01047 freedfa(d2); 01048 return REG_NOMATCH; 01049 } 01050 MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid))); 01051 v->mem[t->retry] = (mid - begin) + 1; 01052 zapmem(v, t->left); 01053 zapmem(v, t->right); 01054 } 01055 01056 /* 01057 * Satisfaction. 01058 */ 01059 01060 MDEBUG(("successful\n")); 01061 freedfa(d); 01062 freedfa(d2); 01063 return REG_OKAY; 01064 } 01065 01066 /* 01067 - cbrdissect - determine backref subexpression matches 01068 ^ static int cbrdissect(struct vars *, struct subre *, chr *, chr *); 01069 */ 01070 static int /* regexec return code */ 01071 cbrdissect( 01072 struct vars *v, 01073 struct subre *t, 01074 chr *begin, /* beginning of relevant substring */ 01075 chr *end) /* end of same */ 01076 { 01077 int i; 01078 int n = t->subno; 01079 size_t len; 01080 chr *paren; 01081 chr *p; 01082 chr *stop; 01083 int min = t->min; 01084 int max = t->max; 01085 01086 assert(t != NULL); 01087 assert(t->op == 'b'); 01088 assert(n >= 0); 01089 assert((size_t)n < v->nmatch); 01090 01091 MDEBUG(("cbackref n%d %d{%d-%d}\n", t->retry, n, min, max)); 01092 01093 if (v->pmatch[n].rm_so == -1) { 01094 return REG_NOMATCH; 01095 } 01096 paren = v->start + v->pmatch[n].rm_so; 01097 len = v->pmatch[n].rm_eo - v->pmatch[n].rm_so; 01098 01099 /* 01100 * No room to maneuver -- retries are pointless. 01101 */ 01102 01103 if (v->mem[t->retry]) { 01104 return REG_NOMATCH; 01105 } 01106 v->mem[t->retry] = 1; 01107 01108 /* 01109 * Special-case zero-length string. 01110 */ 01111 01112 if (len == 0) { 01113 if (begin == end) { 01114 return REG_OKAY; 01115 } 01116 return REG_NOMATCH; 01117 } 01118 01119 /* 01120 * And too-short string. 01121 */ 01122 01123 assert(end >= begin); 01124 if ((size_t)(end - begin) < len) { 01125 return REG_NOMATCH; 01126 } 01127 stop = end - len; 01128 01129 /* 01130 * Count occurrences. 01131 */ 01132 01133 i = 0; 01134 for (p = begin; p <= stop && (i < max || max == INFINITY); p += len) { 01135 if ((*v->g->compare)(paren, p, len) != 0) { 01136 break; 01137 } 01138 i++; 01139 } 01140 MDEBUG(("cbackref found %d\n", i)); 01141 01142 /* 01143 * And sort it out. 01144 */ 01145 01146 if (p != end) { /* didn't consume all of it */ 01147 return REG_NOMATCH; 01148 } 01149 if (min <= i && (i <= max || max == INFINITY)) { 01150 return REG_OKAY; 01151 } 01152 return REG_NOMATCH; /* out of range */ 01153 } 01154 01155 /* 01156 - caltdissect - determine alternative subexpression matches (w. complications) 01157 ^ static int caltdissect(struct vars *, struct subre *, chr *, chr *); 01158 */ 01159 static int /* regexec return code */ 01160 caltdissect( 01161 struct vars *v, 01162 struct subre *t, 01163 chr *begin, /* beginning of relevant substring */ 01164 chr *end) /* end of same */ 01165 { 01166 struct dfa *d; 01167 int er; 01168 #define UNTRIED 0 /* not yet tried at all */ 01169 #define TRYING 1 /* top matched, trying submatches */ 01170 #define TRIED 2 /* top didn't match or submatches exhausted */ 01171 01172 if (t == NULL) { 01173 return REG_NOMATCH; 01174 } 01175 assert(t->op == '|'); 01176 if (v->mem[t->retry] == TRIED) { 01177 return caltdissect(v, t->right, begin, end); 01178 } 01179 01180 MDEBUG(("calt n%d\n", t->retry)); 01181 assert(t->left != NULL); 01182 01183 if (v->mem[t->retry] == UNTRIED) { 01184 d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); 01185 if (ISERR()) { 01186 return v->err; 01187 } 01188 if (longest(v, d, begin, end, NULL) != end) { 01189 freedfa(d); 01190 v->mem[t->retry] = TRIED; 01191 return caltdissect(v, t->right, begin, end); 01192 } 01193 freedfa(d); 01194 MDEBUG(("calt matched\n")); 01195 v->mem[t->retry] = TRYING; 01196 } 01197 01198 er = cdissect(v, t->left, begin, end); 01199 if (er != REG_NOMATCH) { 01200 return er; 01201 } 01202 01203 v->mem[t->retry] = TRIED; 01204 return caltdissect(v, t->right, begin, end); 01205 } 01206 01207 #include "rege_dfa.c" 01208 01209 /* 01210 * Local Variables: 01211 * mode: c 01212 * c-basic-offset: 4 01213 * fill-column: 78 01214 * End: 01215 */
Generated on Wed Mar 12 12:18:10 2008 by 1.5.1 |