regc_lex.cGo to the documentation of this file.00001 /* 00002 * lexical analyzer 00003 * This file is #included by regcomp.c. 00004 * 00005 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved. 00006 * 00007 * Development of this software was funded, in part, by Cray Research Inc., 00008 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics 00009 * Corporation, none of whom are responsible for the results. The author 00010 * thanks all of them. 00011 * 00012 * Redistribution and use in source and binary forms -- with or without 00013 * modification -- are permitted for any purpose, provided that 00014 * redistributions in source form retain this entire copyright notice and 00015 * indicate the origin and nature of any modifications. 00016 * 00017 * I'd appreciate being given credit for this package in the documentation of 00018 * software which uses it, but that is not a requirement. 00019 * 00020 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, 00021 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY 00022 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL 00023 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, 00024 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, 00025 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 00026 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, 00027 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR 00028 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF 00029 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 00030 */ 00031 00032 /* scanning macros (know about v) */ 00033 #define ATEOS() (v->now >= v->stop) 00034 #define HAVE(n) (v->stop - v->now >= (n)) 00035 #define NEXT1(c) (!ATEOS() && *v->now == CHR(c)) 00036 #define NEXT2(a,b) (HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b)) 00037 #define NEXT3(a,b,c) \ 00038 (HAVE(3) && *v->now == CHR(a) && \ 00039 *(v->now+1) == CHR(b) && \ 00040 *(v->now+2) == CHR(c)) 00041 #define SET(c) (v->nexttype = (c)) 00042 #define SETV(c, n) (v->nexttype = (c), v->nextvalue = (n)) 00043 #define RET(c) return (SET(c), 1) 00044 #define RETV(c, n) return (SETV(c, n), 1) 00045 #define FAILW(e) return (ERR(e), 0) /* ERR does SET(EOS) */ 00046 #define LASTTYPE(t) (v->lasttype == (t)) 00047 00048 /* lexical contexts */ 00049 #define L_ERE 1 /* mainline ERE/ARE */ 00050 #define L_BRE 2 /* mainline BRE */ 00051 #define L_Q 3 /* REG_QUOTE */ 00052 #define L_EBND 4 /* ERE/ARE bound */ 00053 #define L_BBND 5 /* BRE bound */ 00054 #define L_BRACK 6 /* brackets */ 00055 #define L_CEL 7 /* collating element */ 00056 #define L_ECL 8 /* equivalence class */ 00057 #define L_CCL 9 /* character class */ 00058 #define INTOCON(c) (v->lexcon = (c)) 00059 #define INCON(con) (v->lexcon == (con)) 00060 00061 /* construct pointer past end of chr array */ 00062 #define ENDOF(array) ((array) + sizeof(array)/sizeof(chr)) 00063 00064 /* 00065 - lexstart - set up lexical stuff, scan leading options 00066 ^ static VOID lexstart(struct vars *); 00067 */ 00068 static void 00069 lexstart( 00070 struct vars *v) 00071 { 00072 prefixes(v); /* may turn on new type bits etc. */ 00073 NOERR(); 00074 00075 if (v->cflags®_QUOTE) { 00076 assert(!(v->cflags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE))); 00077 INTOCON(L_Q); 00078 } else if (v->cflags®_EXTENDED) { 00079 assert(!(v->cflags®_QUOTE)); 00080 INTOCON(L_ERE); 00081 } else { 00082 assert(!(v->cflags&(REG_QUOTE|REG_ADVF))); 00083 INTOCON(L_BRE); 00084 } 00085 00086 v->nexttype = EMPTY; /* remember we were at the start */ 00087 next(v); /* set up the first token */ 00088 } 00089 00090 /* 00091 - prefixes - implement various special prefixes 00092 ^ static VOID prefixes(struct vars *); 00093 */ 00094 static void 00095 prefixes( 00096 struct vars *v) 00097 { 00098 /* 00099 * Literal string doesn't get any of this stuff. 00100 */ 00101 00102 if (v->cflags®_QUOTE) { 00103 return; 00104 } 00105 00106 /* 00107 * Initial "***" gets special things. 00108 */ 00109 00110 if (HAVE(4) && NEXT3('*', '*', '*')) { 00111 switch (*(v->now + 3)) { 00112 case CHR('?'): /* "***?" error, msg shows version */ 00113 ERR(REG_BADPAT); 00114 return; /* proceed no further */ 00115 break; 00116 case CHR('='): /* "***=" shifts to literal string */ 00117 NOTE(REG_UNONPOSIX); 00118 v->cflags |= REG_QUOTE; 00119 v->cflags &= ~(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE); 00120 v->now += 4; 00121 return; /* and there can be no more prefixes */ 00122 break; 00123 case CHR(':'): /* "***:" shifts to AREs */ 00124 NOTE(REG_UNONPOSIX); 00125 v->cflags |= REG_ADVANCED; 00126 v->now += 4; 00127 break; 00128 default: /* otherwise *** is just an error */ 00129 ERR(REG_BADRPT); 00130 return; 00131 break; 00132 } 00133 } 00134 00135 /* 00136 * BREs and EREs don't get embedded options. 00137 */ 00138 00139 if ((v->cflags®_ADVANCED) != REG_ADVANCED) { 00140 return; 00141 } 00142 00143 /* 00144 * Embedded options (AREs only). 00145 */ 00146 00147 if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2))) { 00148 NOTE(REG_UNONPOSIX); 00149 v->now += 2; 00150 for (; !ATEOS() && iscalpha(*v->now); v->now++) { 00151 switch (*v->now) { 00152 case CHR('b'): /* BREs (but why???) */ 00153 v->cflags &= ~(REG_ADVANCED|REG_QUOTE); 00154 break; 00155 case CHR('c'): /* case sensitive */ 00156 v->cflags &= ~REG_ICASE; 00157 break; 00158 case CHR('e'): /* plain EREs */ 00159 v->cflags |= REG_EXTENDED; 00160 v->cflags &= ~(REG_ADVF|REG_QUOTE); 00161 break; 00162 case CHR('i'): /* case insensitive */ 00163 v->cflags |= REG_ICASE; 00164 break; 00165 case CHR('m'): /* Perloid synonym for n */ 00166 case CHR('n'): /* \n affects ^ $ . [^ */ 00167 v->cflags |= REG_NEWLINE; 00168 break; 00169 case CHR('p'): /* ~Perl, \n affects . [^ */ 00170 v->cflags |= REG_NLSTOP; 00171 v->cflags &= ~REG_NLANCH; 00172 break; 00173 case CHR('q'): /* literal string */ 00174 v->cflags |= REG_QUOTE; 00175 v->cflags &= ~REG_ADVANCED; 00176 break; 00177 case CHR('s'): /* single line, \n ordinary */ 00178 v->cflags &= ~REG_NEWLINE; 00179 break; 00180 case CHR('t'): /* tight syntax */ 00181 v->cflags &= ~REG_EXPANDED; 00182 break; 00183 case CHR('w'): /* weird, \n affects ^ $ only */ 00184 v->cflags &= ~REG_NLSTOP; 00185 v->cflags |= REG_NLANCH; 00186 break; 00187 case CHR('x'): /* expanded syntax */ 00188 v->cflags |= REG_EXPANDED; 00189 break; 00190 default: 00191 ERR(REG_BADOPT); 00192 return; 00193 } 00194 } 00195 if (!NEXT1(')')) { 00196 ERR(REG_BADOPT); 00197 return; 00198 } 00199 v->now++; 00200 if (v->cflags®_QUOTE) { 00201 v->cflags &= ~(REG_EXPANDED|REG_NEWLINE); 00202 } 00203 } 00204 } 00205 00206 /* 00207 - lexnest - "call a subroutine", interpolating string at the lexical level 00208 * Note, this is not a very general facility. There are a number of 00209 * implicit assumptions about what sorts of strings can be subroutines. 00210 ^ static VOID lexnest(struct vars *, const chr *, const chr *); 00211 */ 00212 static void 00213 lexnest( 00214 struct vars *v, 00215 const chr *beginp, /* start of interpolation */ 00216 const chr *endp) /* one past end of interpolation */ 00217 { 00218 assert(v->savenow == NULL); /* only one level of nesting */ 00219 v->savenow = v->now; 00220 v->savestop = v->stop; 00221 v->now = beginp; 00222 v->stop = endp; 00223 } 00224 00225 /* 00226 * string constants to interpolate as expansions of things like \d 00227 */ 00228 00229 static const chr backd[] = { /* \d */ 00230 CHR('['), CHR('['), CHR(':'), 00231 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), 00232 CHR(':'), CHR(']'), CHR(']') 00233 }; 00234 static const chr backD[] = { /* \D */ 00235 CHR('['), CHR('^'), CHR('['), CHR(':'), 00236 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), 00237 CHR(':'), CHR(']'), CHR(']') 00238 }; 00239 static const chr brbackd[] = { /* \d within brackets */ 00240 CHR('['), CHR(':'), 00241 CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'), 00242 CHR(':'), CHR(']') 00243 }; 00244 static const chr backs[] = { /* \s */ 00245 CHR('['), CHR('['), CHR(':'), 00246 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), 00247 CHR(':'), CHR(']'), CHR(']') 00248 }; 00249 static const chr backS[] = { /* \S */ 00250 CHR('['), CHR('^'), CHR('['), CHR(':'), 00251 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), 00252 CHR(':'), CHR(']'), CHR(']') 00253 }; 00254 static const chr brbacks[] = { /* \s within brackets */ 00255 CHR('['), CHR(':'), 00256 CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'), 00257 CHR(':'), CHR(']') 00258 }; 00259 static const chr backw[] = { /* \w */ 00260 CHR('['), CHR('['), CHR(':'), 00261 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), 00262 CHR(':'), CHR(']'), CHR('_'), CHR(']') 00263 }; 00264 static const chr backW[] = { /* \W */ 00265 CHR('['), CHR('^'), CHR('['), CHR(':'), 00266 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), 00267 CHR(':'), CHR(']'), CHR('_'), CHR(']') 00268 }; 00269 static const chr brbackw[] = { /* \w within brackets */ 00270 CHR('['), CHR(':'), 00271 CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'), 00272 CHR(':'), CHR(']'), CHR('_') 00273 }; 00274 00275 /* 00276 - lexword - interpolate a bracket expression for word characters 00277 * Possibly ought to inquire whether there is a "word" character class. 00278 ^ static VOID lexword(struct vars *); 00279 */ 00280 static void 00281 lexword( 00282 struct vars *v) 00283 { 00284 lexnest(v, backw, ENDOF(backw)); 00285 } 00286 00287 /* 00288 - next - get next token 00289 ^ static int next(struct vars *); 00290 */ 00291 static int /* 1 normal, 0 failure */ 00292 next( 00293 struct vars *v) 00294 { 00295 chr c; 00296 00297 /* 00298 * Errors yield an infinite sequence of failures. 00299 */ 00300 00301 if (ISERR()) { 00302 return 0; /* the error has set nexttype to EOS */ 00303 } 00304 00305 /* 00306 * Remember flavor of last token. 00307 */ 00308 00309 v->lasttype = v->nexttype; 00310 00311 /* 00312 * REG_BOSONLY 00313 */ 00314 00315 if (v->nexttype == EMPTY && (v->cflags®_BOSONLY)) { 00316 /* at start of a REG_BOSONLY RE */ 00317 RETV(SBEGIN, 0); /* same as \A */ 00318 } 00319 00320 /* 00321 * If we're nested and we've hit end, return to outer level. 00322 */ 00323 00324 if (v->savenow != NULL && ATEOS()) { 00325 v->now = v->savenow; 00326 v->stop = v->savestop; 00327 v->savenow = v->savestop = NULL; 00328 } 00329 00330 /* 00331 * Skip white space etc. if appropriate (not in literal or []) 00332 */ 00333 00334 if (v->cflags®_EXPANDED) { 00335 switch (v->lexcon) { 00336 case L_ERE: 00337 case L_BRE: 00338 case L_EBND: 00339 case L_BBND: 00340 skip(v); 00341 break; 00342 } 00343 } 00344 00345 /* 00346 * Handle EOS, depending on context. 00347 */ 00348 00349 if (ATEOS()) { 00350 switch (v->lexcon) { 00351 case L_ERE: 00352 case L_BRE: 00353 case L_Q: 00354 RET(EOS); 00355 break; 00356 case L_EBND: 00357 case L_BBND: 00358 FAILW(REG_EBRACE); 00359 break; 00360 case L_BRACK: 00361 case L_CEL: 00362 case L_ECL: 00363 case L_CCL: 00364 FAILW(REG_EBRACK); 00365 break; 00366 } 00367 assert(NOTREACHED); 00368 } 00369 00370 /* 00371 * Okay, time to actually get a character. 00372 */ 00373 00374 c = *v->now++; 00375 00376 /* 00377 * Deal with the easy contexts, punt EREs to code below. 00378 */ 00379 00380 switch (v->lexcon) { 00381 case L_BRE: /* punt BREs to separate function */ 00382 return brenext(v, c); 00383 break; 00384 case L_ERE: /* see below */ 00385 break; 00386 case L_Q: /* literal strings are easy */ 00387 RETV(PLAIN, c); 00388 break; 00389 case L_BBND: /* bounds are fairly simple */ 00390 case L_EBND: 00391 switch (c) { 00392 case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'): 00393 case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'): 00394 case CHR('8'): case CHR('9'): 00395 RETV(DIGIT, (chr)DIGITVAL(c)); 00396 break; 00397 case CHR(','): 00398 RET(','); 00399 break; 00400 case CHR('}'): /* ERE bound ends with } */ 00401 if (INCON(L_EBND)) { 00402 INTOCON(L_ERE); 00403 if ((v->cflags®_ADVF) && NEXT1('?')) { 00404 v->now++; 00405 NOTE(REG_UNONPOSIX); 00406 RETV('}', 0); 00407 } 00408 RETV('}', 1); 00409 } else { 00410 FAILW(REG_BADBR); 00411 } 00412 break; 00413 case CHR('\\'): /* BRE bound ends with \} */ 00414 if (INCON(L_BBND) && NEXT1('}')) { 00415 v->now++; 00416 INTOCON(L_BRE); 00417 RET('}'); 00418 } else { 00419 FAILW(REG_BADBR); 00420 } 00421 break; 00422 default: 00423 FAILW(REG_BADBR); 00424 break; 00425 } 00426 assert(NOTREACHED); 00427 break; 00428 case L_BRACK: /* brackets are not too hard */ 00429 switch (c) { 00430 case CHR(']'): 00431 if (LASTTYPE('[')) { 00432 RETV(PLAIN, c); 00433 } else { 00434 INTOCON((v->cflags®_EXTENDED) ? L_ERE : L_BRE); 00435 RET(']'); 00436 } 00437 break; 00438 case CHR('\\'): 00439 NOTE(REG_UBBS); 00440 if (!(v->cflags®_ADVF)) { 00441 RETV(PLAIN, c); 00442 } 00443 NOTE(REG_UNONPOSIX); 00444 if (ATEOS()) { 00445 FAILW(REG_EESCAPE); 00446 } 00447 (DISCARD)lexescape(v); 00448 switch (v->nexttype) { /* not all escapes okay here */ 00449 case PLAIN: 00450 return 1; 00451 break; 00452 case CCLASS: 00453 switch (v->nextvalue) { 00454 case 'd': 00455 lexnest(v, brbackd, ENDOF(brbackd)); 00456 break; 00457 case 's': 00458 lexnest(v, brbacks, ENDOF(brbacks)); 00459 break; 00460 case 'w': 00461 lexnest(v, brbackw, ENDOF(brbackw)); 00462 break; 00463 default: 00464 FAILW(REG_EESCAPE); 00465 break; 00466 } 00467 00468 /* 00469 * lexnest() done, back up and try again. 00470 */ 00471 00472 v->nexttype = v->lasttype; 00473 return next(v); 00474 break; 00475 } 00476 00477 /* 00478 * Not one of the acceptable escapes. 00479 */ 00480 00481 FAILW(REG_EESCAPE); 00482 break; 00483 case CHR('-'): 00484 if (LASTTYPE('[') || NEXT1(']')) { 00485 RETV(PLAIN, c); 00486 } else { 00487 RETV(RANGE, c); 00488 } 00489 break; 00490 case CHR('['): 00491 if (ATEOS()) { 00492 FAILW(REG_EBRACK); 00493 } 00494 switch (*v->now++) { 00495 case CHR('.'): 00496 INTOCON(L_CEL); 00497 00498 /* 00499 * Might or might not be locale-specific. 00500 */ 00501 00502 RET(COLLEL); 00503 break; 00504 case CHR('='): 00505 INTOCON(L_ECL); 00506 NOTE(REG_ULOCALE); 00507 RET(ECLASS); 00508 break; 00509 case CHR(':'): 00510 INTOCON(L_CCL); 00511 NOTE(REG_ULOCALE); 00512 RET(CCLASS); 00513 break; 00514 default: /* oops */ 00515 v->now--; 00516 RETV(PLAIN, c); 00517 break; 00518 } 00519 assert(NOTREACHED); 00520 break; 00521 default: 00522 RETV(PLAIN, c); 00523 break; 00524 } 00525 assert(NOTREACHED); 00526 break; 00527 case L_CEL: /* collating elements are easy */ 00528 if (c == CHR('.') && NEXT1(']')) { 00529 v->now++; 00530 INTOCON(L_BRACK); 00531 RETV(END, '.'); 00532 } else { 00533 RETV(PLAIN, c); 00534 } 00535 break; 00536 case L_ECL: /* ditto equivalence classes */ 00537 if (c == CHR('=') && NEXT1(']')) { 00538 v->now++; 00539 INTOCON(L_BRACK); 00540 RETV(END, '='); 00541 } else { 00542 RETV(PLAIN, c); 00543 } 00544 break; 00545 case L_CCL: /* ditto character classes */ 00546 if (c == CHR(':') && NEXT1(']')) { 00547 v->now++; 00548 INTOCON(L_BRACK); 00549 RETV(END, ':'); 00550 } else { 00551 RETV(PLAIN, c); 00552 } 00553 break; 00554 default: 00555 assert(NOTREACHED); 00556 break; 00557 } 00558 00559 /* 00560 * That got rid of everything except EREs and AREs. 00561 */ 00562 00563 assert(INCON(L_ERE)); 00564 00565 /* 00566 * Deal with EREs and AREs, except for backslashes. 00567 */ 00568 00569 switch (c) { 00570 case CHR('|'): 00571 RET('|'); 00572 break; 00573 case CHR('*'): 00574 if ((v->cflags®_ADVF) && NEXT1('?')) { 00575 v->now++; 00576 NOTE(REG_UNONPOSIX); 00577 RETV('*', 0); 00578 } 00579 RETV('*', 1); 00580 break; 00581 case CHR('+'): 00582 if ((v->cflags®_ADVF) && NEXT1('?')) { 00583 v->now++; 00584 NOTE(REG_UNONPOSIX); 00585 RETV('+', 0); 00586 } 00587 RETV('+', 1); 00588 break; 00589 case CHR('?'): 00590 if ((v->cflags®_ADVF) && NEXT1('?')) { 00591 v->now++; 00592 NOTE(REG_UNONPOSIX); 00593 RETV('?', 0); 00594 } 00595 RETV('?', 1); 00596 break; 00597 case CHR('{'): /* bounds start or plain character */ 00598 if (v->cflags®_EXPANDED) { 00599 skip(v); 00600 } 00601 if (ATEOS() || !iscdigit(*v->now)) { 00602 NOTE(REG_UBRACES); 00603 NOTE(REG_UUNSPEC); 00604 RETV(PLAIN, c); 00605 } else { 00606 NOTE(REG_UBOUNDS); 00607 INTOCON(L_EBND); 00608 RET('{'); 00609 } 00610 assert(NOTREACHED); 00611 break; 00612 case CHR('('): /* parenthesis, or advanced extension */ 00613 if ((v->cflags®_ADVF) && NEXT1('?')) { 00614 NOTE(REG_UNONPOSIX); 00615 v->now++; 00616 switch (*v->now++) { 00617 case CHR(':'): /* non-capturing paren */ 00618 RETV('(', 0); 00619 break; 00620 case CHR('#'): /* comment */ 00621 while (!ATEOS() && *v->now != CHR(')')) { 00622 v->now++; 00623 } 00624 if (!ATEOS()) { 00625 v->now++; 00626 } 00627 assert(v->nexttype == v->lasttype); 00628 return next(v); 00629 break; 00630 case CHR('='): /* positive lookahead */ 00631 NOTE(REG_ULOOKAHEAD); 00632 RETV(LACON, 1); 00633 break; 00634 case CHR('!'): /* negative lookahead */ 00635 NOTE(REG_ULOOKAHEAD); 00636 RETV(LACON, 0); 00637 break; 00638 default: 00639 FAILW(REG_BADRPT); 00640 break; 00641 } 00642 assert(NOTREACHED); 00643 } 00644 if (v->cflags®_NOSUB) { 00645 RETV('(', 0); /* all parens non-capturing */ 00646 } else { 00647 RETV('(', 1); 00648 } 00649 break; 00650 case CHR(')'): 00651 if (LASTTYPE('(')) { 00652 NOTE(REG_UUNSPEC); 00653 } 00654 RETV(')', c); 00655 break; 00656 case CHR('['): /* easy except for [[:<:]] and [[:>:]] */ 00657 if (HAVE(6) && *(v->now+0) == CHR('[') && 00658 *(v->now+1) == CHR(':') && 00659 (*(v->now+2) == CHR('<') || *(v->now+2) == CHR('>')) && 00660 *(v->now+3) == CHR(':') && 00661 *(v->now+4) == CHR(']') && 00662 *(v->now+5) == CHR(']')) { 00663 c = *(v->now+2); 00664 v->now += 6; 00665 NOTE(REG_UNONPOSIX); 00666 RET((c == CHR('<')) ? '<' : '>'); 00667 } 00668 INTOCON(L_BRACK); 00669 if (NEXT1('^')) { 00670 v->now++; 00671 RETV('[', 0); 00672 } 00673 RETV('[', 1); 00674 break; 00675 case CHR('.'): 00676 RET('.'); 00677 break; 00678 case CHR('^'): 00679 RET('^'); 00680 break; 00681 case CHR('$'): 00682 RET('$'); 00683 break; 00684 case CHR('\\'): /* mostly punt backslashes to code below */ 00685 if (ATEOS()) { 00686 FAILW(REG_EESCAPE); 00687 } 00688 break; 00689 default: /* ordinary character */ 00690 RETV(PLAIN, c); 00691 break; 00692 } 00693 00694 /* 00695 * ERE/ARE backslash handling; backslash already eaten. 00696 */ 00697 00698 assert(!ATEOS()); 00699 if (!(v->cflags®_ADVF)) {/* only AREs have non-trivial escapes */ 00700 if (iscalnum(*v->now)) { 00701 NOTE(REG_UBSALNUM); 00702 NOTE(REG_UUNSPEC); 00703 } 00704 RETV(PLAIN, *v->now++); 00705 } 00706 (DISCARD)lexescape(v); 00707 if (ISERR()) { 00708 FAILW(REG_EESCAPE); 00709 } 00710 if (v->nexttype == CCLASS) {/* fudge at lexical level */ 00711 switch (v->nextvalue) { 00712 case 'd': lexnest(v, backd, ENDOF(backd)); break; 00713 case 'D': lexnest(v, backD, ENDOF(backD)); break; 00714 case 's': lexnest(v, backs, ENDOF(backs)); break; 00715 case 'S': lexnest(v, backS, ENDOF(backS)); break; 00716 case 'w': lexnest(v, backw, ENDOF(backw)); break; 00717 case 'W': lexnest(v, backW, ENDOF(backW)); break; 00718 default: 00719 assert(NOTREACHED); 00720 FAILW(REG_ASSERT); 00721 break; 00722 } 00723 /* lexnest done, back up and try again */ 00724 v->nexttype = v->lasttype; 00725 return next(v); 00726 } 00727 00728 /* 00729 * Otherwise, lexescape has already done the work. 00730 */ 00731 00732 return !ISERR(); 00733 } 00734 00735 /* 00736 - lexescape - parse an ARE backslash escape (backslash already eaten) 00737 * Note slightly nonstandard use of the CCLASS type code. 00738 ^ static int lexescape(struct vars *); 00739 */ 00740 static int /* not actually used, but convenient for RETV */ 00741 lexescape( 00742 struct vars *v) 00743 { 00744 chr c; 00745 static chr alert[] = { 00746 CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t') 00747 }; 00748 static chr esc[] = { 00749 CHR('E'), CHR('S'), CHR('C') 00750 }; 00751 const chr *save; 00752 00753 assert(v->cflags®_ADVF); 00754 00755 assert(!ATEOS()); 00756 c = *v->now++; 00757 if (!iscalnum(c)) { 00758 RETV(PLAIN, c); 00759 } 00760 00761 NOTE(REG_UNONPOSIX); 00762 switch (c) { 00763 case CHR('a'): 00764 RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007'))); 00765 break; 00766 case CHR('A'): 00767 RETV(SBEGIN, 0); 00768 break; 00769 case CHR('b'): 00770 RETV(PLAIN, CHR('\b')); 00771 break; 00772 case CHR('B'): 00773 RETV(PLAIN, CHR('\\')); 00774 break; 00775 case CHR('c'): 00776 NOTE(REG_UUNPORT); 00777 if (ATEOS()) { 00778 FAILW(REG_EESCAPE); 00779 } 00780 RETV(PLAIN, (chr)(*v->now++ & 037)); 00781 break; 00782 case CHR('d'): 00783 NOTE(REG_ULOCALE); 00784 RETV(CCLASS, 'd'); 00785 break; 00786 case CHR('D'): 00787 NOTE(REG_ULOCALE); 00788 RETV(CCLASS, 'D'); 00789 break; 00790 case CHR('e'): 00791 NOTE(REG_UUNPORT); 00792 RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033'))); 00793 break; 00794 case CHR('f'): 00795 RETV(PLAIN, CHR('\f')); 00796 break; 00797 case CHR('m'): 00798 RET('<'); 00799 break; 00800 case CHR('M'): 00801 RET('>'); 00802 break; 00803 case CHR('n'): 00804 RETV(PLAIN, CHR('\n')); 00805 break; 00806 case CHR('r'): 00807 RETV(PLAIN, CHR('\r')); 00808 break; 00809 case CHR('s'): 00810 NOTE(REG_ULOCALE); 00811 RETV(CCLASS, 's'); 00812 break; 00813 case CHR('S'): 00814 NOTE(REG_ULOCALE); 00815 RETV(CCLASS, 'S'); 00816 break; 00817 case CHR('t'): 00818 RETV(PLAIN, CHR('\t')); 00819 break; 00820 case CHR('u'): 00821 c = lexdigits(v, 16, 4, 4); 00822 if (ISERR()) { 00823 FAILW(REG_EESCAPE); 00824 } 00825 RETV(PLAIN, c); 00826 break; 00827 case CHR('U'): 00828 c = lexdigits(v, 16, 8, 8); 00829 if (ISERR()) { 00830 FAILW(REG_EESCAPE); 00831 } 00832 RETV(PLAIN, c); 00833 break; 00834 case CHR('v'): 00835 RETV(PLAIN, CHR('\v')); 00836 break; 00837 case CHR('w'): 00838 NOTE(REG_ULOCALE); 00839 RETV(CCLASS, 'w'); 00840 break; 00841 case CHR('W'): 00842 NOTE(REG_ULOCALE); 00843 RETV(CCLASS, 'W'); 00844 break; 00845 case CHR('x'): 00846 NOTE(REG_UUNPORT); 00847 c = lexdigits(v, 16, 1, 255); /* REs >255 long outside spec */ 00848 if (ISERR()) { 00849 FAILW(REG_EESCAPE); 00850 } 00851 RETV(PLAIN, c); 00852 break; 00853 case CHR('y'): 00854 NOTE(REG_ULOCALE); 00855 RETV(WBDRY, 0); 00856 break; 00857 case CHR('Y'): 00858 NOTE(REG_ULOCALE); 00859 RETV(NWBDRY, 0); 00860 break; 00861 case CHR('Z'): 00862 RETV(SEND, 0); 00863 break; 00864 case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'): 00865 case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'): 00866 case CHR('9'): 00867 save = v->now; 00868 v->now--; /* put first digit back */ 00869 c = lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */ 00870 if (ISERR()) { 00871 FAILW(REG_EESCAPE); 00872 } 00873 00874 /* 00875 * Ugly heuristic (first test is "exactly 1 digit?") 00876 */ 00877 00878 if (v->now - save == 0 || ((int) c > 0 && (int)c <= v->nsubexp)) { 00879 NOTE(REG_UBACKREF); 00880 RETV(BACKREF, (chr)c); 00881 } 00882 00883 /* 00884 * Oops, doesn't look like it's a backref after all... 00885 */ 00886 00887 v->now = save; 00888 00889 /* 00890 * And fall through into octal number. 00891 */ 00892 00893 case CHR('0'): 00894 NOTE(REG_UUNPORT); 00895 v->now--; /* put first digit back */ 00896 c = lexdigits(v, 8, 1, 3); 00897 if (ISERR()) { 00898 FAILW(REG_EESCAPE); 00899 } 00900 RETV(PLAIN, c); 00901 break; 00902 default: 00903 assert(iscalpha(c)); 00904 FAILW(REG_EESCAPE); /* unknown alphabetic escape */ 00905 break; 00906 } 00907 assert(NOTREACHED); 00908 } 00909 00910 /* 00911 - lexdigits - slurp up digits and return chr value 00912 ^ static chr lexdigits(struct vars *, int, int, int); 00913 */ 00914 static chr /* chr value; errors signalled via ERR */ 00915 lexdigits( 00916 struct vars *v, 00917 int base, 00918 int minlen, 00919 int maxlen) 00920 { 00921 uchr n; /* unsigned to avoid overflow misbehavior */ 00922 int len; 00923 chr c; 00924 int d; 00925 CONST uchr ub = (uchr) base; 00926 00927 n = 0; 00928 for (len = 0; len < maxlen && !ATEOS(); len++) { 00929 c = *v->now++; 00930 switch (c) { 00931 case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'): 00932 case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'): 00933 case CHR('8'): case CHR('9'): 00934 d = DIGITVAL(c); 00935 break; 00936 case CHR('a'): case CHR('A'): d = 10; break; 00937 case CHR('b'): case CHR('B'): d = 11; break; 00938 case CHR('c'): case CHR('C'): d = 12; break; 00939 case CHR('d'): case CHR('D'): d = 13; break; 00940 case CHR('e'): case CHR('E'): d = 14; break; 00941 case CHR('f'): case CHR('F'): d = 15; break; 00942 default: 00943 v->now--; /* oops, not a digit at all */ 00944 d = -1; 00945 break; 00946 } 00947 00948 if (d >= base) { /* not a plausible digit */ 00949 v->now--; 00950 d = -1; 00951 } 00952 if (d < 0) { 00953 break; /* NOTE BREAK OUT */ 00954 } 00955 n = n*ub + (uchr)d; 00956 } 00957 if (len < minlen) { 00958 ERR(REG_EESCAPE); 00959 } 00960 00961 return (chr)n; 00962 } 00963 00964 /* 00965 - brenext - get next BRE token 00966 * This is much like EREs except for all the stupid backslashes and the 00967 * context-dependency of some things. 00968 ^ static int brenext(struct vars *, pchr); 00969 */ 00970 static int /* 1 normal, 0 failure */ 00971 brenext( 00972 struct vars *v, 00973 pchr pc) 00974 { 00975 chr c = (chr)pc; 00976 00977 switch (c) { 00978 case CHR('*'): 00979 if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^')) { 00980 RETV(PLAIN, c); 00981 } 00982 RET('*'); 00983 break; 00984 case CHR('['): 00985 if (HAVE(6) && *(v->now+0) == CHR('[') && 00986 *(v->now+1) == CHR(':') && 00987 (*(v->now+2) == CHR('<') || *(v->now+2) == CHR('>')) && 00988 *(v->now+3) == CHR(':') && 00989 *(v->now+4) == CHR(']') && 00990 *(v->now+5) == CHR(']')) { 00991 c = *(v->now+2); 00992 v->now += 6; 00993 NOTE(REG_UNONPOSIX); 00994 RET((c == CHR('<')) ? '<' : '>'); 00995 } 00996 INTOCON(L_BRACK); 00997 if (NEXT1('^')) { 00998 v->now++; 00999 RETV('[', 0); 01000 } 01001 RETV('[', 1); 01002 break; 01003 case CHR('.'): 01004 RET('.'); 01005 break; 01006 case CHR('^'): 01007 if (LASTTYPE(EMPTY)) { 01008 RET('^'); 01009 } 01010 if (LASTTYPE('(')) { 01011 NOTE(REG_UUNSPEC); 01012 RET('^'); 01013 } 01014 RETV(PLAIN, c); 01015 break; 01016 case CHR('$'): 01017 if (v->cflags®_EXPANDED) { 01018 skip(v); 01019 } 01020 if (ATEOS()) { 01021 RET('$'); 01022 } 01023 if (NEXT2('\\', ')')) { 01024 NOTE(REG_UUNSPEC); 01025 RET('$'); 01026 } 01027 RETV(PLAIN, c); 01028 break; 01029 case CHR('\\'): 01030 break; /* see below */ 01031 default: 01032 RETV(PLAIN, c); 01033 break; 01034 } 01035 01036 assert(c == CHR('\\')); 01037 01038 if (ATEOS()) { 01039 FAILW(REG_EESCAPE); 01040 } 01041 01042 c = *v->now++; 01043 switch (c) { 01044 case CHR('{'): 01045 INTOCON(L_BBND); 01046 NOTE(REG_UBOUNDS); 01047 RET('{'); 01048 break; 01049 case CHR('('): 01050 RETV('(', 1); 01051 break; 01052 case CHR(')'): 01053 RETV(')', c); 01054 break; 01055 case CHR('<'): 01056 NOTE(REG_UNONPOSIX); 01057 RET('<'); 01058 break; 01059 case CHR('>'): 01060 NOTE(REG_UNONPOSIX); 01061 RET('>'); 01062 break; 01063 case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'): 01064 case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'): 01065 case CHR('9'): 01066 NOTE(REG_UBACKREF); 01067 RETV(BACKREF, (chr)DIGITVAL(c)); 01068 break; 01069 default: 01070 if (iscalnum(c)) { 01071 NOTE(REG_UBSALNUM); 01072 NOTE(REG_UUNSPEC); 01073 } 01074 RETV(PLAIN, c); 01075 break; 01076 } 01077 01078 assert(NOTREACHED); 01079 } 01080 01081 /* 01082 - skip - skip white space and comments in expanded form 01083 ^ static VOID skip(struct vars *); 01084 */ 01085 static void 01086 skip( 01087 struct vars *v) 01088 { 01089 const chr *start = v->now; 01090 01091 assert(v->cflags®_EXPANDED); 01092 01093 for (;;) { 01094 while (!ATEOS() && iscspace(*v->now)) { 01095 v->now++; 01096 } 01097 if (ATEOS() || *v->now != CHR('#')) { 01098 break; /* NOTE BREAK OUT */ 01099 } 01100 assert(NEXT1('#')); 01101 while (!ATEOS() && *v->now != CHR('\n')) { 01102 v->now++; 01103 } 01104 01105 /* 01106 * Leave the newline to be picked up by the iscspace loop. 01107 */ 01108 } 01109 01110 if (v->now != start) { 01111 NOTE(REG_UNONPOSIX); 01112 } 01113 } 01114 01115 /* 01116 - newline - return the chr for a newline 01117 * This helps confine use of CHR to this source file. 01118 ^ static chr newline(NOPARMS); 01119 */ 01120 static chr 01121 newline(void) 01122 { 01123 return CHR('\n'); 01124 } 01125 01126 /* 01127 - ch - return the chr sequence for regc_locale.c's fake collating element ch 01128 * This helps confine use of CHR to this source file. Beware that the caller 01129 * knows how long the sequence is. 01130 ^ #ifdef REG_DEBUG 01131 ^ static const chr *ch(NOPARMS); 01132 ^ #endif 01133 */ 01134 #ifdef REG_DEBUG 01135 static const chr * 01136 ch(void) 01137 { 01138 static chr chstr[] = { CHR('c'), CHR('h'), CHR('\0') }; 01139 01140 return chstr; 01141 } 01142 #endif 01143 01144 /* 01145 - chrnamed - return the chr known by a given (chr string) name 01146 * The code is a bit clumsy, but this routine gets only such specialized 01147 * use that it hardly matters. 01148 ^ static chr chrnamed(struct vars *, const chr *, const chr *, pchr); 01149 */ 01150 static chr 01151 chrnamed( 01152 struct vars *v, 01153 const chr *startp, /* start of name */ 01154 const chr *endp, /* just past end of name */ 01155 pchr lastresort) /* what to return if name lookup fails */ 01156 { 01157 celt c; 01158 int errsave; 01159 int e; 01160 struct cvec *cv; 01161 01162 errsave = v->err; 01163 v->err = 0; 01164 c = element(v, startp, endp); 01165 e = v->err; 01166 v->err = errsave; 01167 01168 if (e != 0) { 01169 return (chr)lastresort; 01170 } 01171 01172 cv = range(v, c, c, 0); 01173 if (cv->nchrs == 0) { 01174 return (chr)lastresort; 01175 } 01176 return cv->chrs[0]; 01177 } 01178 01179 /* 01180 * Local Variables: 01181 * mode: c 01182 * c-basic-offset: 4 01183 * fill-column: 78 01184 * End: 01185 */
Generated on Wed Mar 12 12:18:10 2008 by 1.5.1 |