regc_lex.c

Go to the documentation of this file.
00001 /*
00002  * lexical analyzer
00003  * This file is #included by regcomp.c.
00004  *
00005  * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
00006  *
00007  * Development of this software was funded, in part, by Cray Research Inc.,
00008  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
00009  * Corporation, none of whom are responsible for the results.  The author
00010  * thanks all of them.
00011  *
00012  * Redistribution and use in source and binary forms -- with or without
00013  * modification -- are permitted for any purpose, provided that
00014  * redistributions in source form retain this entire copyright notice and
00015  * indicate the origin and nature of any modifications.
00016  *
00017  * I'd appreciate being given credit for this package in the documentation of
00018  * software which uses it, but that is not a requirement.
00019  *
00020  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
00021  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
00022  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
00023  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00024  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00025  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
00026  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
00027  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
00028  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
00029  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00030  */
00031 
/*
 * Scanning macros.  All of them expect a "struct vars *v" to be in scope;
 * v->now is the current scan position and v->stop is one past the last
 * character available.
 */

/* true when no characters remain */
#define ATEOS()         (v->now >= v->stop)
/* true when at least n characters remain */
#define HAVE(n)         (v->stop - v->now >= (n))
/* true when the next character exists and is c */
#define NEXT1(c)        (!ATEOS() && v->now[0] == CHR(c))
/* true when the next two characters exist and are a, b */
#define NEXT2(a,b) \
        (HAVE(2) && v->now[0] == CHR(a) && v->now[1] == CHR(b))
/* true when the next three characters exist and are a, b, c */
#define NEXT3(a,b,c) \
        (HAVE(3) && v->now[0] == CHR(a) && \
                v->now[1] == CHR(b) && \
                v->now[2] == CHR(c))

/* record the type (and optionally the value) of the token just scanned */
#define SET(c)          (v->nexttype = (c))
#define SETV(c, n)      (v->nexttype = (c), v->nextvalue = (n))

/* record the token and return success (1) from the enclosing function */
#define RET(c)          return (SET(c), 1)
#define RETV(c, n)      return (SETV(c, n), 1)

/* report error e and return failure (0) from the enclosing function */
#define FAILW(e)        return (ERR(e), 0)      /* ERR does SET(EOS) */

/* true when the previously returned token had type t */
#define LASTTYPE(t)     (v->lasttype == (t))

/*
 * Lexical contexts: which sub-language the scanner is currently inside.
 */
#define L_ERE   1       /* mainline ERE/ARE */
#define L_BRE   2       /* mainline BRE */
#define L_Q     3       /* REG_QUOTE */
#define L_EBND  4       /* ERE/ARE bound */
#define L_BBND  5       /* BRE bound */
#define L_BRACK 6       /* brackets */
#define L_CEL   7       /* collating element */
#define L_ECL   8       /* equivalence class */
#define L_CCL   9       /* character class */

/* switch to context c / test whether the current context is con */
#define INTOCON(c)      (v->lexcon = (c))
#define INCON(con)      (v->lexcon == (con))

/*
 * Construct pointer one past the end of a chr array.  Only meaningful for
 * a real array of chr (not a pointer), since it relies on sizeof.
 */
#define ENDOF(array)    ((array) + sizeof(array)/sizeof(chr))
00063 
00064 /*
00065  - lexstart - set up lexical stuff, scan leading options
00066  ^ static VOID lexstart(struct vars *);
00067  */
00068 static void
00069 lexstart(
00070     struct vars *v)
00071 {
00072     prefixes(v);                /* may turn on new type bits etc. */
00073     NOERR();
00074 
00075     if (v->cflags&REG_QUOTE) {
00076         assert(!(v->cflags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE)));
00077         INTOCON(L_Q);
00078     } else if (v->cflags&REG_EXTENDED) {
00079         assert(!(v->cflags&REG_QUOTE));
00080         INTOCON(L_ERE);
00081     } else {
00082         assert(!(v->cflags&(REG_QUOTE|REG_ADVF)));
00083         INTOCON(L_BRE);
00084     }
00085 
00086     v->nexttype = EMPTY;        /* remember we were at the start */
00087     next(v);                    /* set up the first token */
00088 }
00089 
00090 /*
00091  - prefixes - implement various special prefixes
00092  ^ static VOID prefixes(struct vars *);
00093  */
00094 static void
00095 prefixes(
00096     struct vars *v)
00097 {
00098     /*
00099      * Literal string doesn't get any of this stuff.
00100      */
00101 
00102     if (v->cflags&REG_QUOTE) {
00103         return;
00104     }
00105 
00106     /*
00107      * Initial "***" gets special things.
00108      */
00109 
00110     if (HAVE(4) && NEXT3('*', '*', '*')) {
00111         switch (*(v->now + 3)) {
00112         case CHR('?'):          /* "***?" error, msg shows version */
00113             ERR(REG_BADPAT);
00114             return;             /* proceed no further */
00115             break;
00116         case CHR('='):          /* "***=" shifts to literal string */
00117             NOTE(REG_UNONPOSIX);
00118             v->cflags |= REG_QUOTE;
00119             v->cflags &= ~(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE);
00120             v->now += 4;
00121             return;             /* and there can be no more prefixes */
00122             break;
00123         case CHR(':'):          /* "***:" shifts to AREs */
00124             NOTE(REG_UNONPOSIX);
00125             v->cflags |= REG_ADVANCED;
00126             v->now += 4;
00127             break;
00128         default:                /* otherwise *** is just an error */
00129             ERR(REG_BADRPT);
00130             return;
00131             break;
00132         }
00133     }
00134 
00135     /*
00136      * BREs and EREs don't get embedded options.
00137      */
00138 
00139     if ((v->cflags&REG_ADVANCED) != REG_ADVANCED) {
00140         return;
00141     }
00142 
00143     /*
00144      * Embedded options (AREs only).
00145      */
00146 
00147     if (HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2))) {
00148         NOTE(REG_UNONPOSIX);
00149         v->now += 2;
00150         for (; !ATEOS() && iscalpha(*v->now); v->now++) {
00151             switch (*v->now) {
00152             case CHR('b'):      /* BREs (but why???) */
00153                 v->cflags &= ~(REG_ADVANCED|REG_QUOTE);
00154                 break;
00155             case CHR('c'):      /* case sensitive */
00156                 v->cflags &= ~REG_ICASE;
00157                 break;
00158             case CHR('e'):      /* plain EREs */
00159                 v->cflags |= REG_EXTENDED;
00160                 v->cflags &= ~(REG_ADVF|REG_QUOTE);
00161                 break;
00162             case CHR('i'):      /* case insensitive */
00163                 v->cflags |= REG_ICASE;
00164                 break;
00165             case CHR('m'):      /* Perloid synonym for n */
00166             case CHR('n'):      /* \n affects ^ $ . [^ */
00167                 v->cflags |= REG_NEWLINE;
00168                 break;
00169             case CHR('p'):      /* ~Perl, \n affects . [^ */
00170                 v->cflags |= REG_NLSTOP;
00171                 v->cflags &= ~REG_NLANCH;
00172                 break;
00173             case CHR('q'):      /* literal string */
00174                 v->cflags |= REG_QUOTE;
00175                 v->cflags &= ~REG_ADVANCED;
00176                 break;
00177             case CHR('s'):      /* single line, \n ordinary */
00178                 v->cflags &= ~REG_NEWLINE;
00179                 break;
00180             case CHR('t'):      /* tight syntax */
00181                 v->cflags &= ~REG_EXPANDED;
00182                 break;
00183             case CHR('w'):      /* weird, \n affects ^ $ only */
00184                 v->cflags &= ~REG_NLSTOP;
00185                 v->cflags |= REG_NLANCH;
00186                 break;
00187             case CHR('x'):      /* expanded syntax */
00188                 v->cflags |= REG_EXPANDED;
00189                 break;
00190             default:
00191                 ERR(REG_BADOPT);
00192                 return;
00193             }
00194         }
00195         if (!NEXT1(')')) {
00196             ERR(REG_BADOPT);
00197             return;
00198         }
00199         v->now++;
00200         if (v->cflags&REG_QUOTE) {
00201             v->cflags &= ~(REG_EXPANDED|REG_NEWLINE);
00202         }
00203     }
00204 }
00205 
00206 /*
00207  - lexnest - "call a subroutine", interpolating string at the lexical level
00208  * Note, this is not a very general facility.  There are a number of
00209  * implicit assumptions about what sorts of strings can be subroutines.
00210  ^ static VOID lexnest(struct vars *, const chr *, const chr *);
00211  */
00212 static void
00213 lexnest(
00214     struct vars *v,
00215     const chr *beginp,          /* start of interpolation */
00216     const chr *endp)            /* one past end of interpolation */
00217 {
00218     assert(v->savenow == NULL); /* only one level of nesting */
00219     v->savenow = v->now;
00220     v->savestop = v->stop;
00221     v->now = beginp;
00222     v->stop = endp;
00223 }
00224 
00225 /*
00226  * string constants to interpolate as expansions of things like \d
00227  */
00228 
00229 static const chr backd[] = {    /* \d */
00230     CHR('['), CHR('['), CHR(':'),
00231     CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
00232     CHR(':'), CHR(']'), CHR(']')
00233 };
00234 static const chr backD[] = {    /* \D */
00235     CHR('['), CHR('^'), CHR('['), CHR(':'),
00236     CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
00237     CHR(':'), CHR(']'), CHR(']')
00238 };
00239 static const chr brbackd[] = {  /* \d within brackets */
00240     CHR('['), CHR(':'),
00241     CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
00242     CHR(':'), CHR(']')
00243 };
00244 static const chr backs[] = {    /* \s */
00245     CHR('['), CHR('['), CHR(':'),
00246     CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
00247     CHR(':'), CHR(']'), CHR(']')
00248 };
00249 static const chr backS[] = {    /* \S */
00250     CHR('['), CHR('^'), CHR('['), CHR(':'),
00251     CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
00252     CHR(':'), CHR(']'), CHR(']')
00253 };
00254 static const chr brbacks[] = {  /* \s within brackets */
00255     CHR('['), CHR(':'),
00256     CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
00257     CHR(':'), CHR(']')
00258 };
00259 static const chr backw[] = {    /* \w */
00260     CHR('['), CHR('['), CHR(':'),
00261     CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
00262     CHR(':'), CHR(']'), CHR('_'), CHR(']')
00263 };
00264 static const chr backW[] = {    /* \W */
00265     CHR('['), CHR('^'), CHR('['), CHR(':'),
00266     CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
00267     CHR(':'), CHR(']'), CHR('_'), CHR(']')
00268 };
00269 static const chr brbackw[] = {  /* \w within brackets */
00270     CHR('['), CHR(':'),
00271     CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
00272     CHR(':'), CHR(']'), CHR('_')
00273 };
00274 
00275 /*
00276  - lexword - interpolate a bracket expression for word characters
00277  * Possibly ought to inquire whether there is a "word" character class.
00278  ^ static VOID lexword(struct vars *);
00279  */
00280 static void
00281 lexword(
00282     struct vars *v)
00283 {
00284     lexnest(v, backw, ENDOF(backw));
00285 }
00286 
00287 /*
00288  - next - get next token
00289  ^ static int next(struct vars *);
00290  */
00291 static int                      /* 1 normal, 0 failure */
00292 next(
00293     struct vars *v)
00294 {
00295     chr c;
00296 
00297     /*
00298      * Errors yield an infinite sequence of failures.
00299      */
00300 
00301     if (ISERR()) {
00302         return 0;               /* the error has set nexttype to EOS */
00303     }
00304 
00305     /*
00306      * Remember flavor of last token.
00307      */
00308 
00309     v->lasttype = v->nexttype;
00310 
00311     /*
00312      * REG_BOSONLY
00313      */
00314 
00315     if (v->nexttype == EMPTY && (v->cflags&REG_BOSONLY)) {
00316         /* at start of a REG_BOSONLY RE */
00317         RETV(SBEGIN, 0);        /* same as \A */
00318     }
00319 
00320     /*
00321      * If we're nested and we've hit end, return to outer level.
00322      */
00323 
00324     if (v->savenow != NULL && ATEOS()) {
00325         v->now = v->savenow;
00326         v->stop = v->savestop;
00327         v->savenow = v->savestop = NULL;
00328     }
00329 
00330     /*
00331      * Skip white space etc. if appropriate (not in literal or [])
00332      */
00333 
00334     if (v->cflags&REG_EXPANDED) {
00335         switch (v->lexcon) {
00336         case L_ERE:
00337         case L_BRE:
00338         case L_EBND:
00339         case L_BBND:
00340             skip(v);
00341             break;
00342         }
00343     }
00344 
00345     /*
00346      * Handle EOS, depending on context.
00347      */
00348 
00349     if (ATEOS()) {
00350         switch (v->lexcon) {
00351         case L_ERE:
00352         case L_BRE:
00353         case L_Q:
00354             RET(EOS);
00355             break;
00356         case L_EBND:
00357         case L_BBND:
00358             FAILW(REG_EBRACE);
00359             break;
00360         case L_BRACK:
00361         case L_CEL:
00362         case L_ECL:
00363         case L_CCL:
00364             FAILW(REG_EBRACK);
00365             break;
00366         }
00367         assert(NOTREACHED);
00368     }
00369 
00370     /*
00371      * Okay, time to actually get a character.
00372      */
00373 
00374     c = *v->now++;
00375 
00376     /*
00377      * Deal with the easy contexts, punt EREs to code below.
00378      */
00379 
00380     switch (v->lexcon) {
00381     case L_BRE:                 /* punt BREs to separate function */
00382         return brenext(v, c);
00383         break;
00384     case L_ERE:                 /* see below */
00385         break;
00386     case L_Q:                   /* literal strings are easy */
00387         RETV(PLAIN, c);
00388         break;
00389     case L_BBND:                /* bounds are fairly simple */
00390     case L_EBND:
00391         switch (c) {
00392         case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'):
00393         case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'):
00394         case CHR('8'): case CHR('9'):
00395             RETV(DIGIT, (chr)DIGITVAL(c));
00396             break;
00397         case CHR(','):
00398             RET(',');
00399             break;
00400         case CHR('}'):          /* ERE bound ends with } */
00401             if (INCON(L_EBND)) {
00402                 INTOCON(L_ERE);
00403                 if ((v->cflags&REG_ADVF) && NEXT1('?')) {
00404                     v->now++;
00405                     NOTE(REG_UNONPOSIX);
00406                     RETV('}', 0);
00407                 }
00408                 RETV('}', 1);
00409             } else {
00410                 FAILW(REG_BADBR);
00411             }
00412             break;
00413         case CHR('\\'):         /* BRE bound ends with \} */
00414             if (INCON(L_BBND) && NEXT1('}')) {
00415                 v->now++;
00416                 INTOCON(L_BRE);
00417                 RET('}');
00418             } else {
00419                 FAILW(REG_BADBR);
00420             }
00421             break;
00422         default:
00423             FAILW(REG_BADBR);
00424             break;
00425         }
00426         assert(NOTREACHED);
00427         break;
00428     case L_BRACK:               /* brackets are not too hard */
00429         switch (c) {
00430         case CHR(']'):
00431             if (LASTTYPE('[')) {
00432                 RETV(PLAIN, c);
00433             } else {
00434                 INTOCON((v->cflags&REG_EXTENDED) ? L_ERE : L_BRE);
00435                 RET(']');
00436             }
00437             break;
00438         case CHR('\\'):
00439             NOTE(REG_UBBS);
00440             if (!(v->cflags&REG_ADVF)) {
00441                 RETV(PLAIN, c);
00442             }
00443             NOTE(REG_UNONPOSIX);
00444             if (ATEOS()) {
00445                 FAILW(REG_EESCAPE);
00446             }
00447             (DISCARD)lexescape(v);
00448             switch (v->nexttype) {      /* not all escapes okay here */
00449             case PLAIN:
00450                 return 1;
00451                 break;
00452             case CCLASS:
00453                 switch (v->nextvalue) {
00454                 case 'd':
00455                     lexnest(v, brbackd, ENDOF(brbackd));
00456                     break;
00457                 case 's':
00458                     lexnest(v, brbacks, ENDOF(brbacks));
00459                     break;
00460                 case 'w':
00461                     lexnest(v, brbackw, ENDOF(brbackw));
00462                     break;
00463                 default:
00464                     FAILW(REG_EESCAPE);
00465                     break;
00466                 }
00467 
00468                 /*
00469                  * lexnest() done, back up and try again.
00470                  */
00471 
00472                 v->nexttype = v->lasttype;
00473                 return next(v);
00474                 break;
00475             }
00476 
00477             /*
00478              * Not one of the acceptable escapes.
00479              */
00480 
00481             FAILW(REG_EESCAPE);
00482             break;
00483         case CHR('-'):
00484             if (LASTTYPE('[') || NEXT1(']')) {
00485                 RETV(PLAIN, c);
00486             } else {
00487                 RETV(RANGE, c);
00488             }
00489             break;
00490         case CHR('['):
00491             if (ATEOS()) {
00492                 FAILW(REG_EBRACK);
00493             }
00494             switch (*v->now++) {
00495             case CHR('.'):
00496                 INTOCON(L_CEL);
00497 
00498                 /*
00499                  * Might or might not be locale-specific.
00500                  */
00501 
00502                 RET(COLLEL);
00503                 break;
00504             case CHR('='):
00505                 INTOCON(L_ECL);
00506                 NOTE(REG_ULOCALE);
00507                 RET(ECLASS);
00508                 break;
00509             case CHR(':'):
00510                 INTOCON(L_CCL);
00511                 NOTE(REG_ULOCALE);
00512                 RET(CCLASS);
00513                 break;
00514             default:            /* oops */
00515                 v->now--;
00516                 RETV(PLAIN, c);
00517                 break;
00518             }
00519             assert(NOTREACHED);
00520             break;
00521         default:
00522             RETV(PLAIN, c);
00523             break;
00524         }
00525         assert(NOTREACHED);
00526         break;
00527     case L_CEL:                 /* collating elements are easy */
00528         if (c == CHR('.') && NEXT1(']')) {
00529             v->now++;
00530             INTOCON(L_BRACK);
00531             RETV(END, '.');
00532         } else {
00533             RETV(PLAIN, c);
00534         }
00535         break;
00536     case L_ECL:                 /* ditto equivalence classes */
00537         if (c == CHR('=') && NEXT1(']')) {
00538             v->now++;
00539             INTOCON(L_BRACK);
00540             RETV(END, '=');
00541         } else {
00542             RETV(PLAIN, c);
00543         }
00544         break;
00545     case L_CCL:                 /* ditto character classes */
00546         if (c == CHR(':') && NEXT1(']')) {
00547             v->now++;
00548             INTOCON(L_BRACK);
00549             RETV(END, ':');
00550         } else {
00551             RETV(PLAIN, c);
00552         }
00553         break;
00554     default:
00555         assert(NOTREACHED);
00556         break;
00557     }
00558 
00559     /*
00560      * That got rid of everything except EREs and AREs.
00561      */
00562 
00563     assert(INCON(L_ERE));
00564 
00565     /*
00566      * Deal with EREs and AREs, except for backslashes.
00567      */
00568 
00569     switch (c) {
00570     case CHR('|'):
00571         RET('|');
00572         break;
00573     case CHR('*'):
00574         if ((v->cflags&REG_ADVF) && NEXT1('?')) {
00575             v->now++;
00576             NOTE(REG_UNONPOSIX);
00577             RETV('*', 0);
00578         }
00579         RETV('*', 1);
00580         break;
00581     case CHR('+'):
00582         if ((v->cflags&REG_ADVF) && NEXT1('?')) {
00583             v->now++;
00584             NOTE(REG_UNONPOSIX);
00585             RETV('+', 0);
00586         }
00587         RETV('+', 1);
00588         break;
00589     case CHR('?'):
00590         if ((v->cflags&REG_ADVF) && NEXT1('?')) {
00591             v->now++;
00592             NOTE(REG_UNONPOSIX);
00593             RETV('?', 0);
00594         }
00595         RETV('?', 1);
00596         break;
00597     case CHR('{'):              /* bounds start or plain character */
00598         if (v->cflags&REG_EXPANDED) {
00599             skip(v);
00600         }
00601         if (ATEOS() || !iscdigit(*v->now)) {
00602             NOTE(REG_UBRACES);
00603             NOTE(REG_UUNSPEC);
00604             RETV(PLAIN, c);
00605         } else {
00606             NOTE(REG_UBOUNDS);
00607             INTOCON(L_EBND);
00608             RET('{');
00609         }
00610         assert(NOTREACHED);
00611         break;
00612     case CHR('('):              /* parenthesis, or advanced extension */
00613         if ((v->cflags&REG_ADVF) && NEXT1('?')) {
00614             NOTE(REG_UNONPOSIX);
00615             v->now++;
00616             switch (*v->now++) {
00617             case CHR(':'):      /* non-capturing paren */
00618                 RETV('(', 0);
00619                 break;
00620             case CHR('#'):      /* comment */
00621                 while (!ATEOS() && *v->now != CHR(')')) {
00622                     v->now++;
00623                 }
00624                 if (!ATEOS()) {
00625                     v->now++;
00626                 }
00627                 assert(v->nexttype == v->lasttype);
00628                 return next(v);
00629                 break;
00630             case CHR('='):      /* positive lookahead */
00631                 NOTE(REG_ULOOKAHEAD);
00632                 RETV(LACON, 1);
00633                 break;
00634             case CHR('!'):      /* negative lookahead */
00635                 NOTE(REG_ULOOKAHEAD);
00636                 RETV(LACON, 0);
00637                 break;
00638             default:
00639                 FAILW(REG_BADRPT);
00640                 break;
00641             }
00642             assert(NOTREACHED);
00643         }
00644         if (v->cflags&REG_NOSUB) {
00645             RETV('(', 0);       /* all parens non-capturing */
00646         } else {
00647             RETV('(', 1);
00648         }
00649         break;
00650     case CHR(')'):
00651         if (LASTTYPE('(')) {
00652             NOTE(REG_UUNSPEC);
00653         }
00654         RETV(')', c);
00655         break;
00656     case CHR('['):              /* easy except for [[:<:]] and [[:>:]] */
00657         if (HAVE(6) &&  *(v->now+0) == CHR('[') &&
00658                 *(v->now+1) == CHR(':') &&
00659                 (*(v->now+2) == CHR('<') || *(v->now+2) == CHR('>')) &&
00660                 *(v->now+3) == CHR(':') &&
00661                 *(v->now+4) == CHR(']') &&
00662                 *(v->now+5) == CHR(']')) {
00663             c = *(v->now+2);
00664             v->now += 6;
00665             NOTE(REG_UNONPOSIX);
00666             RET((c == CHR('<')) ? '<' : '>');
00667         }
00668         INTOCON(L_BRACK);
00669         if (NEXT1('^')) {
00670             v->now++;
00671             RETV('[', 0);
00672         }
00673         RETV('[', 1);
00674         break;
00675     case CHR('.'):
00676         RET('.');
00677         break;
00678     case CHR('^'):
00679         RET('^');
00680         break;
00681     case CHR('$'):
00682         RET('$');
00683         break;
00684     case CHR('\\'):             /* mostly punt backslashes to code below */
00685         if (ATEOS()) {
00686             FAILW(REG_EESCAPE);
00687         }
00688         break;
00689     default:            /* ordinary character */
00690         RETV(PLAIN, c);
00691         break;
00692     }
00693 
00694     /*
00695      * ERE/ARE backslash handling; backslash already eaten.
00696      */
00697 
00698     assert(!ATEOS());
00699     if (!(v->cflags&REG_ADVF)) {/* only AREs have non-trivial escapes */
00700         if (iscalnum(*v->now)) {
00701             NOTE(REG_UBSALNUM);
00702             NOTE(REG_UUNSPEC);
00703         }
00704         RETV(PLAIN, *v->now++);
00705     }
00706     (DISCARD)lexescape(v);
00707     if (ISERR()) {
00708         FAILW(REG_EESCAPE);
00709     }
00710     if (v->nexttype == CCLASS) {/* fudge at lexical level */
00711         switch (v->nextvalue) {
00712         case 'd':       lexnest(v, backd, ENDOF(backd)); break;
00713         case 'D':       lexnest(v, backD, ENDOF(backD)); break;
00714         case 's':       lexnest(v, backs, ENDOF(backs)); break;
00715         case 'S':       lexnest(v, backS, ENDOF(backS)); break;
00716         case 'w':       lexnest(v, backw, ENDOF(backw)); break;
00717         case 'W':       lexnest(v, backW, ENDOF(backW)); break;
00718         default:
00719             assert(NOTREACHED);
00720             FAILW(REG_ASSERT);
00721             break;
00722         }
00723         /* lexnest done, back up and try again */
00724         v->nexttype = v->lasttype;
00725         return next(v);
00726     }
00727 
00728     /*
00729      * Otherwise, lexescape has already done the work.
00730      */
00731 
00732     return !ISERR();
00733 }
00734 
00735 /*
00736  - lexescape - parse an ARE backslash escape (backslash already eaten)
00737  * Note slightly nonstandard use of the CCLASS type code.
00738  ^ static int lexescape(struct vars *);
00739  */
00740 static int                      /* not actually used, but convenient for RETV */
00741 lexescape(
00742     struct vars *v)
00743 {
00744     chr c;
00745     static chr alert[] = {
00746         CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
00747     };
00748     static chr esc[] = {
00749         CHR('E'), CHR('S'), CHR('C')
00750     };
00751     const chr *save;
00752 
00753     assert(v->cflags&REG_ADVF);
00754 
00755     assert(!ATEOS());
00756     c = *v->now++;
00757     if (!iscalnum(c)) {
00758         RETV(PLAIN, c);
00759     }
00760 
00761     NOTE(REG_UNONPOSIX);
00762     switch (c) {
00763     case CHR('a'):
00764         RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007')));
00765         break;
00766     case CHR('A'):
00767         RETV(SBEGIN, 0);
00768         break;
00769     case CHR('b'):
00770         RETV(PLAIN, CHR('\b'));
00771         break;
00772     case CHR('B'):
00773         RETV(PLAIN, CHR('\\'));
00774         break;
00775     case CHR('c'):
00776         NOTE(REG_UUNPORT);
00777         if (ATEOS()) {
00778             FAILW(REG_EESCAPE);
00779         }
00780         RETV(PLAIN, (chr)(*v->now++ & 037));
00781         break;
00782     case CHR('d'):
00783         NOTE(REG_ULOCALE);
00784         RETV(CCLASS, 'd');
00785         break;
00786     case CHR('D'):
00787         NOTE(REG_ULOCALE);
00788         RETV(CCLASS, 'D');
00789         break;
00790     case CHR('e'):
00791         NOTE(REG_UUNPORT);
00792         RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033')));
00793         break;
00794     case CHR('f'):
00795         RETV(PLAIN, CHR('\f'));
00796         break;
00797     case CHR('m'):
00798         RET('<');
00799         break;
00800     case CHR('M'):
00801         RET('>');
00802         break;
00803     case CHR('n'):
00804         RETV(PLAIN, CHR('\n'));
00805         break;
00806     case CHR('r'):
00807         RETV(PLAIN, CHR('\r'));
00808         break;
00809     case CHR('s'):
00810         NOTE(REG_ULOCALE);
00811         RETV(CCLASS, 's');
00812         break;
00813     case CHR('S'):
00814         NOTE(REG_ULOCALE);
00815         RETV(CCLASS, 'S');
00816         break;
00817     case CHR('t'):
00818         RETV(PLAIN, CHR('\t'));
00819         break;
00820     case CHR('u'):
00821         c = lexdigits(v, 16, 4, 4);
00822         if (ISERR()) {
00823             FAILW(REG_EESCAPE);
00824         }
00825         RETV(PLAIN, c);
00826         break;
00827     case CHR('U'):
00828         c = lexdigits(v, 16, 8, 8);
00829         if (ISERR()) {
00830             FAILW(REG_EESCAPE);
00831         }
00832         RETV(PLAIN, c);
00833         break;
00834     case CHR('v'):
00835         RETV(PLAIN, CHR('\v'));
00836         break;
00837     case CHR('w'):
00838         NOTE(REG_ULOCALE);
00839         RETV(CCLASS, 'w');
00840         break;
00841     case CHR('W'):
00842         NOTE(REG_ULOCALE);
00843         RETV(CCLASS, 'W');
00844         break;
00845     case CHR('x'):
00846         NOTE(REG_UUNPORT);
00847         c = lexdigits(v, 16, 1, 255);   /* REs >255 long outside spec */
00848         if (ISERR()) {
00849             FAILW(REG_EESCAPE);
00850         }
00851         RETV(PLAIN, c);
00852         break;
00853     case CHR('y'):
00854         NOTE(REG_ULOCALE);
00855         RETV(WBDRY, 0);
00856         break;
00857     case CHR('Y'):
00858         NOTE(REG_ULOCALE);
00859         RETV(NWBDRY, 0);
00860         break;
00861     case CHR('Z'):
00862         RETV(SEND, 0);
00863         break;
00864     case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'):
00865     case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'):
00866     case CHR('9'):
00867         save = v->now;
00868         v->now--;               /* put first digit back */
00869         c = lexdigits(v, 10, 1, 255);   /* REs >255 long outside spec */
00870         if (ISERR()) {
00871             FAILW(REG_EESCAPE);
00872         }
00873 
00874         /*
00875          * Ugly heuristic (first test is "exactly 1 digit?")
00876          */
00877 
00878         if (v->now - save == 0 || ((int) c > 0 && (int)c <= v->nsubexp)) {
00879             NOTE(REG_UBACKREF);
00880             RETV(BACKREF, (chr)c);
00881         }
00882 
00883         /*
00884          * Oops, doesn't look like it's a backref after all...
00885          */
00886 
00887         v->now = save;
00888 
00889         /*
00890          * And fall through into octal number.
00891          */
00892 
00893     case CHR('0'):
00894         NOTE(REG_UUNPORT);
00895         v->now--;               /* put first digit back */
00896         c = lexdigits(v, 8, 1, 3);
00897         if (ISERR()) {
00898             FAILW(REG_EESCAPE);
00899         }
00900         RETV(PLAIN, c);
00901         break;
00902     default:
00903         assert(iscalpha(c));
00904         FAILW(REG_EESCAPE);     /* unknown alphabetic escape */
00905         break;
00906     }
00907     assert(NOTREACHED);
00908 }
00909 
00910 /*
00911  - lexdigits - slurp up digits and return chr value
00912  ^ static chr lexdigits(struct vars *, int, int, int);
00913  */
00914 static chr                      /* chr value; errors signalled via ERR */
00915 lexdigits(
00916     struct vars *v,
00917     int base,
00918     int minlen,
00919     int maxlen)
00920 {
00921     uchr n;                     /* unsigned to avoid overflow misbehavior */
00922     int len;
00923     chr c;
00924     int d;
00925     CONST uchr ub = (uchr) base;
00926 
00927     n = 0;
00928     for (len = 0; len < maxlen && !ATEOS(); len++) {
00929         c = *v->now++;
00930         switch (c) {
00931         case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'):
00932         case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'):
00933         case CHR('8'): case CHR('9'):
00934             d = DIGITVAL(c);
00935             break;
00936         case CHR('a'): case CHR('A'): d = 10; break;
00937         case CHR('b'): case CHR('B'): d = 11; break;
00938         case CHR('c'): case CHR('C'): d = 12; break;
00939         case CHR('d'): case CHR('D'): d = 13; break;
00940         case CHR('e'): case CHR('E'): d = 14; break;
00941         case CHR('f'): case CHR('F'): d = 15; break;
00942         default:
00943             v->now--;           /* oops, not a digit at all */
00944             d = -1;
00945             break;
00946         }
00947 
00948         if (d >= base) {        /* not a plausible digit */
00949             v->now--;
00950             d = -1;
00951         }
00952         if (d < 0) {
00953             break;              /* NOTE BREAK OUT */
00954         }
00955         n = n*ub + (uchr)d;
00956     }
00957     if (len < minlen) {
00958         ERR(REG_EESCAPE);
00959     }
00960 
00961     return (chr)n;
00962 }
00963 
00964 /*
00965  - brenext - get next BRE token
00966  * This is much like EREs except for all the stupid backslashes and the
00967  * context-dependency of some things.
00968  ^ static int brenext(struct vars *, pchr);
00969  */
00970 static int                      /* 1 normal, 0 failure */
00971 brenext(
00972     struct vars *v,
00973     pchr pc)
00974 {
00975     chr c = (chr)pc;
00976 
00977     switch (c) {
00978     case CHR('*'):
00979         if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^')) {
00980             RETV(PLAIN, c);
00981         }
00982         RET('*');
00983         break;
00984     case CHR('['):
00985         if (HAVE(6) &&  *(v->now+0) == CHR('[') &&
00986                 *(v->now+1) == CHR(':') &&
00987                 (*(v->now+2) == CHR('<') || *(v->now+2) == CHR('>')) &&
00988                 *(v->now+3) == CHR(':') &&
00989                 *(v->now+4) == CHR(']') &&
00990                 *(v->now+5) == CHR(']')) {
00991             c = *(v->now+2);
00992             v->now += 6;
00993             NOTE(REG_UNONPOSIX);
00994             RET((c == CHR('<')) ? '<' : '>');
00995         }
00996         INTOCON(L_BRACK);
00997         if (NEXT1('^')) {
00998             v->now++;
00999             RETV('[', 0);
01000         }
01001         RETV('[', 1);
01002         break;
01003     case CHR('.'):
01004         RET('.');
01005         break;
01006     case CHR('^'):
01007         if (LASTTYPE(EMPTY)) {
01008             RET('^');
01009         }
01010         if (LASTTYPE('(')) {
01011             NOTE(REG_UUNSPEC);
01012             RET('^');
01013         }
01014         RETV(PLAIN, c);
01015         break;
01016     case CHR('$'):
01017         if (v->cflags&REG_EXPANDED) {
01018             skip(v);
01019         }
01020         if (ATEOS()) {
01021             RET('$');
01022         }
01023         if (NEXT2('\\', ')')) {
01024             NOTE(REG_UUNSPEC);
01025             RET('$');
01026         }
01027         RETV(PLAIN, c);
01028         break;
01029     case CHR('\\'):
01030         break;                  /* see below */
01031     default:
01032         RETV(PLAIN, c);
01033         break;
01034     }
01035 
01036     assert(c == CHR('\\'));
01037 
01038     if (ATEOS()) {
01039         FAILW(REG_EESCAPE);
01040     }
01041 
01042     c = *v->now++;
01043     switch (c) {
01044     case CHR('{'):
01045         INTOCON(L_BBND);
01046         NOTE(REG_UBOUNDS);
01047         RET('{');
01048         break;
01049     case CHR('('):
01050         RETV('(', 1);
01051         break;
01052     case CHR(')'):
01053         RETV(')', c);
01054         break;
01055     case CHR('<'):
01056         NOTE(REG_UNONPOSIX);
01057         RET('<');
01058         break;
01059     case CHR('>'):
01060         NOTE(REG_UNONPOSIX);
01061         RET('>');
01062         break;
01063     case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'):
01064     case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'):
01065     case CHR('9'):
01066         NOTE(REG_UBACKREF);
01067         RETV(BACKREF, (chr)DIGITVAL(c));
01068         break;
01069     default:
01070         if (iscalnum(c)) {
01071             NOTE(REG_UBSALNUM);
01072             NOTE(REG_UUNSPEC);
01073         }
01074         RETV(PLAIN, c);
01075         break;
01076     }
01077 
01078     assert(NOTREACHED);
01079 }
01080 
01081 /*
01082  - skip - skip white space and comments in expanded form
01083  ^ static VOID skip(struct vars *);
01084  */
01085 static void
01086 skip(
01087     struct vars *v)
01088 {
01089     const chr *start = v->now;
01090 
01091     assert(v->cflags&REG_EXPANDED);
01092 
01093     for (;;) {
01094         while (!ATEOS() && iscspace(*v->now)) {
01095             v->now++;
01096         }
01097         if (ATEOS() || *v->now != CHR('#')) {
01098             break;              /* NOTE BREAK OUT */
01099         }
01100         assert(NEXT1('#'));
01101         while (!ATEOS() && *v->now != CHR('\n')) {
01102             v->now++;
01103         }
01104 
01105         /*
01106          * Leave the newline to be picked up by the iscspace loop.
01107          */
01108     }
01109 
01110     if (v->now != start) {
01111         NOTE(REG_UNONPOSIX);
01112     }
01113 }
01114 
01115 /*
01116  - newline - return the chr for a newline
01117  * This helps confine use of CHR to this source file.
01118  ^ static chr newline(NOPARMS);
01119  */
01120 static chr
01121 newline(void)
01122 {
01123     return CHR('\n');
01124 }
01125 
/*
 - ch - return the chr sequence for regc_locale.c's fake collating element ch
 * This helps confine use of CHR to this source file.  Beware that the caller
 * knows how long the sequence is.  The sequence is a read-only static table,
 * so the returned pointer stays valid and must not be written through.
 ^ #ifdef REG_DEBUG
 ^ static const chr *ch(NOPARMS);
 ^ #endif
 */
#ifdef REG_DEBUG
static const chr *
ch(void)
{
    /*
     * Declared const to match the const-qualified return type; nothing
     * ever modifies the table.
     */

    static const chr chstr[] = { CHR('c'), CHR('h'), CHR('\0') };

    return chstr;
}
#endif
01143 
01144 /*
01145  - chrnamed - return the chr known by a given (chr string) name
01146  * The code is a bit clumsy, but this routine gets only such specialized
01147  * use that it hardly matters.
01148  ^ static chr chrnamed(struct vars *, const chr *, const chr *, pchr);
01149  */
01150 static chr
01151 chrnamed(
01152     struct vars *v,
01153     const chr *startp,          /* start of name */
01154     const chr *endp,            /* just past end of name */
01155     pchr lastresort)            /* what to return if name lookup fails */
01156 {
01157     celt c;
01158     int errsave;
01159     int e;
01160     struct cvec *cv;
01161 
01162     errsave = v->err;
01163     v->err = 0;
01164     c = element(v, startp, endp);
01165     e = v->err;
01166     v->err = errsave;
01167 
01168     if (e != 0) {
01169         return (chr)lastresort;
01170     }
01171 
01172     cv = range(v, c, c, 0);
01173     if (cv->nchrs == 0) {
01174         return (chr)lastresort;
01175     }
01176     return cv->chrs[0];
01177 }
01178 
01179 /*
01180  * Local Variables:
01181  * mode: c
01182  * c-basic-offset: 4
01183  * fill-column: 78
01184  * End:
01185  */



Generated on Wed Mar 12 12:18:10 2008 by  doxygen 1.5.1