tclRegexp.c

Go to the documentation of this file.
00001 /*
00002  * tclRegexp.c --
00003  *
00004  *      This file contains the public interfaces to the Tcl regular expression
00005  *      mechanism.
00006  *
00007  * Copyright (c) 1998 by Sun Microsystems, Inc.
00008  * Copyright (c) 1998-1999 by Scriptics Corporation.
00009  *
00010  * See the file "license.terms" for information on usage and redistribution of
00011  * this file, and for a DISCLAIMER OF ALL WARRANTIES.
00012  *
00013  * RCS: @(#) $Id: tclRegexp.c,v 1.28 2007/12/13 15:23:20 dgp Exp $
00014  */
00015 
00016 #include "tclInt.h"
00017 #include "tclRegexp.h"
00018 
00019 /*
00020  *----------------------------------------------------------------------
00021  * The routines in this file use Henry Spencer's regular expression package
00022  * contained in the following additional source files:
00023  *
00024  *      regc_color.c    regc_cvec.c     regc_lex.c
00025  *      regc_nfa.c      regcomp.c       regcustom.h
00026  *      rege_dfa.c      regerror.c      regerrs.h
00027  *      regex.h         regexec.c       regfree.c
00028  *      regfronts.c     regguts.h
00029  *
00030  * Copyright (c) 1998 Henry Spencer.  All rights reserved.
00031  *
00032  * Development of this software was funded, in part, by Cray Research Inc.,
00033  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
00034  * Corporation, none of whom are responsible for the results. The author
00035  * thanks all of them.
00036  *
00037  * Redistribution and use in source and binary forms -- with or without
00038  * modification -- are permitted for any purpose, provided that
00039  * redistributions in source form retain this entire copyright notice and
00040  * indicate the origin and nature of any modifications.
00041  *
00042  * I'd appreciate being given credit for this package in the documentation of
00043  * software which uses it, but that is not a requirement.
00044  *
00045  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
00046  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
00047  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
00048  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
00049  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
00050  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
00051  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
00052  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
00053  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
00054  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
00055  *
00056  * *** NOTE: this code has been altered slightly for use in Tcl: ***
00057  * *** 1. Names have been changed, e.g. from re_comp to          ***
00058  * ***    TclRegComp, to avoid clashes with other                ***
00059  * ***    regexp implementations used by applications.           ***
00060  */
00061 
00062 /*
00063  * Thread local storage used to maintain a per-thread cache of compiled
00064  * regular expressions.
00065  */
00066 
00067 #define NUM_REGEXPS 30
00068 
00069 typedef struct ThreadSpecificData {
00070     int initialized;            /* Set to 1 when the module is initialized. */
00071     char *patterns[NUM_REGEXPS];/* Strings corresponding to compiled regular
00072                                  * expression patterns. NULL means that this
00073                                  * slot isn't used. Malloc-ed. */
00074     int patLengths[NUM_REGEXPS];/* Number of non-null characters in
00075                                  * corresponding entry in patterns. -1 means
00076                                  * entry isn't used. */
00077     struct TclRegexp *regexps[NUM_REGEXPS];
00078                                 /* Compiled forms of above strings. Also
00079                                  * malloc-ed, or NULL if not in use yet. */
00080 } ThreadSpecificData;
00081 
00082 static Tcl_ThreadDataKey dataKey;
00083 
00084 /*
00085  * Declarations for functions used only in this file.
00086  */
00087 
00088 static TclRegexp *      CompileRegexp(Tcl_Interp *interp, const char *pattern,
00089                             int length, int flags);
00090 static void             DupRegexpInternalRep(Tcl_Obj *srcPtr,
00091                             Tcl_Obj *copyPtr);
00092 static void             FinalizeRegexp(ClientData clientData);
00093 static void             FreeRegexp(TclRegexp *regexpPtr);
00094 static void             FreeRegexpInternalRep(Tcl_Obj *objPtr);
00095 static int              RegExpExecUniChar(Tcl_Interp *interp, Tcl_RegExp re,
00096                             const Tcl_UniChar *uniString, int numChars,
00097                             int nmatches, int flags);
00098 static int              SetRegexpFromAny(Tcl_Interp *interp, Tcl_Obj *objPtr);
00099 
00100 /*
00101  * The regular expression Tcl object type. This serves as a cache of the
00102  * compiled form of the regular expression.
00103  */
00104 
00105 Tcl_ObjType tclRegexpType = {
00106     "regexp",                           /* name */
00107     FreeRegexpInternalRep,              /* freeIntRepProc */
00108     DupRegexpInternalRep,               /* dupIntRepProc */
00109     NULL,                               /* updateStringProc */
00110     SetRegexpFromAny                    /* setFromAnyProc */
00111 };
00112 
00113 /*
00114  *----------------------------------------------------------------------
00115  *
00116  * Tcl_RegExpCompile --
00117  *
00118  *      Compile a regular expression into a form suitable for fast matching.
00119  *      This function is DEPRECATED in favor of the object version of the
00120  *      command.
00121  *
00122  * Results:
00123  *      The return value is a pointer to the compiled form of string, suitable
00124  *      for passing to Tcl_RegExpExec. This compiled form is only valid up
00125  *      until the next call to this function, so don't keep these around for a
00126  *      long time! If an error occurred while compiling the pattern, then NULL
00127  *      is returned and an error message is left in the interp's result.
00128  *
00129  * Side effects:
00130  *      Updates the cache of compiled regexps.
00131  *
00132  *----------------------------------------------------------------------
00133  */
00134 
00135 Tcl_RegExp
00136 Tcl_RegExpCompile(
00137     Tcl_Interp *interp,         /* For use in error reporting and to access
00138                                  * the interp regexp cache. */
00139     const char *pattern)        /* String for which to produce compiled
00140                                  * regular expression. */
00141 {
00142     return (Tcl_RegExp) CompileRegexp(interp, pattern, (int) strlen(pattern),
00143             REG_ADVANCED);
00144 }
00145 
00146 /*
00147  *----------------------------------------------------------------------
00148  *
00149  * Tcl_RegExpExec --
00150  *
00151  *      Execute the regular expression matcher using a compiled form of a
00152  *      regular expression and save information about any match that is found.
00153  *
00154  * Results:
00155  *      If an error occurs during the matching operation then -1 is returned
00156  *      and the interp's result contains an error message. Otherwise the
00157  *      return value is 1 if a matching range is found and 0 if there is no
00158  *      matching range.
00159  *
00160  * Side effects:
00161  *      None.
00162  *
00163  *----------------------------------------------------------------------
00164  */
00165 
00166 int
00167 Tcl_RegExpExec(
00168     Tcl_Interp *interp,         /* Interpreter to use for error reporting. */
00169     Tcl_RegExp re,              /* Compiled regular expression; must have been
00170                                  * returned by previous call to
00171                                  * Tcl_GetRegExpFromObj. */
00172     const char *text,           /* Text against which to match re. */
00173     const char *start)          /* If text is part of a larger string, this
00174                                  * identifies beginning of larger string, so
00175                                  * that "^" won't match. */
00176 {
00177     int flags, result, numChars;
00178     TclRegexp *regexp = (TclRegexp *)re;
00179     Tcl_DString ds;
00180     const Tcl_UniChar *ustr;
00181 
00182     /*
00183      * If the starting point is offset from the beginning of the buffer, then
00184      * we need to tell the regexp engine not to match "^".
00185      */
00186 
00187     if (text > start) {
00188         flags = REG_NOTBOL;
00189     } else {
00190         flags = 0;
00191     }
00192 
00193     /*
00194      * Remember the string for use by Tcl_RegExpRange().
00195      */
00196 
00197     regexp->string = text;
00198     regexp->objPtr = NULL;
00199 
00200     /*
00201      * Convert the string to Unicode and perform the match.
00202      */
00203 
00204     Tcl_DStringInit(&ds);
00205     ustr = Tcl_UtfToUniCharDString(text, -1, &ds);
00206     numChars = Tcl_DStringLength(&ds) / sizeof(Tcl_UniChar);
00207     result = RegExpExecUniChar(interp, re, ustr, numChars, -1 /* nmatches */,
00208             flags);
00209     Tcl_DStringFree(&ds);
00210 
00211     return result;
00212 }
00213 
00214 /*
00215  *---------------------------------------------------------------------------
00216  *
00217  * Tcl_RegExpRange --
00218  *
00219  *      Returns pointers describing the range of a regular expression match,
00220  *      or one of the subranges within the match.
00221  *
00222  * Results:
00223  *      The variables at *startPtr and *endPtr are modified to hold the
00224  *      addresses of the endpoints of the range given by index. If the
00225  *      specified range doesn't exist then NULLs are returned.
00226  *
00227  * Side effects:
00228  *      None.
00229  *
00230  *---------------------------------------------------------------------------
00231  */
00232 
00233 void
00234 Tcl_RegExpRange(
00235     Tcl_RegExp re,              /* Compiled regular expression that has been
00236                                  * passed to Tcl_RegExpExec. */
00237     int index,                  /* 0 means give the range of the entire match,
00238                                  * > 0 means give the range of a matching
00239                                  * subrange. */
00240     const char **startPtr,      /* Store address of first character in
00241                                  * (sub-)range here. */
00242     const char **endPtr)        /* Store address of character just after last
00243                                  * in (sub-)range here. */
00244 {
00245     TclRegexp *regexpPtr = (TclRegexp *) re;
00246     const char *string;
00247 
00248     if ((size_t) index > regexpPtr->re.re_nsub) {
00249         *startPtr = *endPtr = NULL;
00250     } else if (regexpPtr->matches[index].rm_so < 0) {
00251         *startPtr = *endPtr = NULL;
00252     } else {
00253         if (regexpPtr->objPtr) {
00254             string = TclGetString(regexpPtr->objPtr);
00255         } else {
00256             string = regexpPtr->string;
00257         }
00258         *startPtr = Tcl_UtfAtIndex(string, regexpPtr->matches[index].rm_so);
00259         *endPtr = Tcl_UtfAtIndex(string, regexpPtr->matches[index].rm_eo);
00260     }
00261 }
00262 
00263 /*
00264  *---------------------------------------------------------------------------
00265  *
00266  * RegExpExecUniChar --
00267  *
00268  *      Execute the regular expression matcher using a compiled form of a
00269  *      regular expression and save information about any match that is found.
00270  *
00271  * Results:
00272  *      If an error occurs during the matching operation then -1 is returned
00273  *      and an error message is left in interp's result. Otherwise the return
00274  *      value is 1 if a matching range was found or 0 if there was no matching
00275  *      range.
00276  *
00277  * Side effects:
00278  *      None.
00279  *
00280  *----------------------------------------------------------------------
00281  */
00282 
00283 static int
00284 RegExpExecUniChar(
00285     Tcl_Interp *interp,         /* Interpreter to use for error reporting. */
00286     Tcl_RegExp re,              /* Compiled regular expression; returned by a
00287                                  * previous call to Tcl_GetRegExpFromObj */
00288     const Tcl_UniChar *wString, /* String against which to match re. */
00289     int numChars,               /* Length of Tcl_UniChar string (must be
00290                                  * >=0). */
00291     int nmatches,               /* How many subexpression matches (counting
00292                                  * the whole match as subexpression 0) are of
00293                                  * interest. -1 means "don't know". */
00294     int flags)                  /* Regular expression flags. */
00295 {
00296     int status;
00297     TclRegexp *regexpPtr = (TclRegexp *) re;
00298     size_t last = regexpPtr->re.re_nsub + 1;
00299     size_t nm = last;
00300 
00301     if (nmatches >= 0 && (size_t) nmatches < nm) {
00302         nm = (size_t) nmatches;
00303     }
00304 
00305     status = TclReExec(&regexpPtr->re, wString, (size_t) numChars,
00306             &regexpPtr->details, nm, regexpPtr->matches, flags);
00307 
00308     /*
00309      * Check for errors.
00310      */
00311 
00312     if (status != REG_OKAY) {
00313         if (status == REG_NOMATCH) {
00314             return 0;
00315         }
00316         if (interp != NULL) {
00317             TclRegError(interp, "error while matching regular expression: ",
00318                     status);
00319         }
00320         return -1;
00321     }
00322     return 1;
00323 }
00324 
00325 /*
00326  *---------------------------------------------------------------------------
00327  *
00328  * TclRegExpRangeUniChar --
00329  *
00330  *      Returns pointers describing the range of a regular expression match,
00331  *      or one of the subranges within the match, or the hypothetical range
00332  *      represented by the rm_extend field of the rm_detail_t.
00333  *
00334  * Results:
00335  *      The variables at *startPtr and *endPtr are modified to hold the
00336  *      offsets of the endpoints of the range given by index. If the specified
00337  *      range doesn't exist then -1s are supplied.
00338  *
00339  * Side effects:
00340  *      None.
00341  *
00342  *---------------------------------------------------------------------------
00343  */
00344 
00345 void
00346 TclRegExpRangeUniChar(
00347     Tcl_RegExp re,              /* Compiled regular expression that has been
00348                                  * passed to Tcl_RegExpExec. */
00349     int index,                  /* 0 means give the range of the entire match,
00350                                  * > 0 means give the range of a matching
00351                                  * subrange, -1 means the range of the
00352                                  * rm_extend field. */
00353     int *startPtr,              /* Store address of first character in
00354                                  * (sub-)range here. */
00355     int *endPtr)                /* Store address of character just after last
00356                                  * in (sub-)range here. */
00357 {
00358     TclRegexp *regexpPtr = (TclRegexp *) re;
00359 
00360     if ((regexpPtr->flags&REG_EXPECT) && index == -1) {
00361         *startPtr = regexpPtr->details.rm_extend.rm_so;
00362         *endPtr = regexpPtr->details.rm_extend.rm_eo;
00363     } else if ((size_t) index > regexpPtr->re.re_nsub) {
00364         *startPtr = -1;
00365         *endPtr = -1;
00366     } else {
00367         *startPtr = regexpPtr->matches[index].rm_so;
00368         *endPtr = regexpPtr->matches[index].rm_eo;
00369     }
00370 }
00371 
00372 /*
00373  *----------------------------------------------------------------------
00374  *
00375  * Tcl_RegExpMatch --
00376  *
00377  *      See if a string matches a regular expression.
00378  *
00379  * Results:
00380  *      If an error occurs during the matching operation then -1 is returned
00381  *      and the interp's result contains an error message. Otherwise the
00382  *      return value is 1 if "text" matches "pattern" and 0 otherwise.
00383  *
00384  * Side effects:
00385  *      None.
00386  *
00387  *----------------------------------------------------------------------
00388  */
00389 
00390 int
00391 Tcl_RegExpMatch(
00392     Tcl_Interp *interp,         /* Used for error reporting. May be NULL. */
00393     const char *text,           /* Text to search for pattern matches. */
00394     const char *pattern)        /* Regular expression to match against text. */
00395 {
00396     Tcl_RegExp re;
00397 
00398     re = Tcl_RegExpCompile(interp, pattern);
00399     if (re == NULL) {
00400         return -1;
00401     }
00402     return Tcl_RegExpExec(interp, re, text, text);
00403 }
00404 
00405 /*
00406  *----------------------------------------------------------------------
00407  *
00408  * Tcl_RegExpExecObj --
00409  *
00410  *      Execute a precompiled regexp against the given object.
00411  *
00412  * Results:
00413  *      If an error occurs during the matching operation then -1 is returned
00414  *      and the interp's result contains an error message. Otherwise the
00415  *      return value is 1 if "string" matches "pattern" and 0 otherwise.
00416  *
00417  * Side effects:
00418  *      Converts the object to a Unicode object.
00419  *
00420  *----------------------------------------------------------------------
00421  */
00422 
00423 int
00424 Tcl_RegExpExecObj(
00425     Tcl_Interp *interp,         /* Interpreter to use for error reporting. */
00426     Tcl_RegExp re,              /* Compiled regular expression; must have been
00427                                  * returned by previous call to
00428                                  * Tcl_GetRegExpFromObj. */
00429     Tcl_Obj *textObj,           /* Text against which to match re. */
00430     int offset,                 /* Character index that marks where matching
00431                                  * should begin. */
00432     int nmatches,               /* How many subexpression matches (counting
00433                                  * the whole match as subexpression 0) are of
00434                                  * interest. -1 means all of them. */
00435     int flags)                  /* Regular expression execution flags. */
00436 {
00437     TclRegexp *regexpPtr = (TclRegexp *) re;
00438     Tcl_UniChar *udata;
00439     int length;
00440     int reflags = regexpPtr->flags;
00441 #define TCL_REG_GLOBOK_FLAGS (TCL_REG_ADVANCED | TCL_REG_NOSUB | TCL_REG_NOCASE)
00442 
00443     /*
00444      * Take advantage of the equivalent glob pattern, if one exists.
00445      * This is possible based only on the right mix of incoming flags (0)
00446      * and regexp compile flags.
00447      */
00448     if ((offset == 0) && (nmatches == 0) && (flags == 0)
00449             && !(reflags & ~TCL_REG_GLOBOK_FLAGS)
00450             && (regexpPtr->globObjPtr != NULL)) {
00451         int nocase = (reflags & TCL_REG_NOCASE) ? TCL_MATCH_NOCASE : 0;
00452 
00453         /*
00454          * Pass to TclStringMatchObj for obj-specific handling.
00455          * XXX: Currently doesn't take advantage of exact-ness that
00456          * XXX: TclReToGlob tells us about
00457          */
00458 
00459         return TclStringMatchObj(textObj, regexpPtr->globObjPtr, nocase);
00460     }
00461 
00462     /*
00463      * Save the target object so we can extract strings from it later.
00464      */
00465 
00466     regexpPtr->string = NULL;
00467     regexpPtr->objPtr = textObj;
00468 
00469     udata = Tcl_GetUnicodeFromObj(textObj, &length);
00470 
00471     if (offset > length) {
00472         offset = length;
00473     }
00474     udata += offset;
00475     length -= offset;
00476 
00477     return RegExpExecUniChar(interp, re, udata, length, nmatches, flags);
00478 }
00479 
00480 /*
00481  *----------------------------------------------------------------------
00482  *
00483  * Tcl_RegExpMatchObj --
00484  *
00485  *      See if an object matches a regular expression.
00486  *
00487  * Results:
00488  *      If an error occurs during the matching operation then -1 is returned
00489  *      and the interp's result contains an error message. Otherwise the
00490  *      return value is 1 if "text" matches "pattern" and 0 otherwise.
00491  *
00492  * Side effects:
00493  *      Changes the internal rep of the pattern and string objects.
00494  *
00495  *----------------------------------------------------------------------
00496  */
00497 
00498 int
00499 Tcl_RegExpMatchObj(
00500     Tcl_Interp *interp,         /* Used for error reporting. May be NULL. */
00501     Tcl_Obj *textObj,           /* Object containing the String to search. */
00502     Tcl_Obj *patternObj)        /* Regular expression to match against
00503                                  * string. */
00504 {
00505     Tcl_RegExp re;
00506 
00507     re = Tcl_GetRegExpFromObj(interp, patternObj,
00508             TCL_REG_ADVANCED | TCL_REG_NOSUB);
00509     if (re == NULL) {
00510         return -1;
00511     }
00512     return Tcl_RegExpExecObj(interp, re, textObj, 0 /* offset */,
00513             0 /* nmatches */, 0 /* flags */);
00514 }
00515 
00516 /*
00517  *----------------------------------------------------------------------
00518  *
00519  * Tcl_RegExpGetInfo --
00520  *
00521  *      Retrieve information about the current match.
00522  *
00523  * Results:
00524  *      None.
00525  *
00526  * Side effects:
00527  *      None.
00528  *
00529  *----------------------------------------------------------------------
00530  */
00531 
00532 void
00533 Tcl_RegExpGetInfo(
00534     Tcl_RegExp regexp,          /* Pattern from which to get subexpressions. */
00535     Tcl_RegExpInfo *infoPtr)    /* Match information is stored here. */
00536 {
00537     TclRegexp *regexpPtr = (TclRegexp *) regexp;
00538 
00539     infoPtr->nsubs = regexpPtr->re.re_nsub;
00540     infoPtr->matches = (Tcl_RegExpIndices *) regexpPtr->matches;
00541     infoPtr->extendStart = regexpPtr->details.rm_extend.rm_so;
00542 }
00543 
00544 /*
00545  *----------------------------------------------------------------------
00546  *
00547  * Tcl_GetRegExpFromObj --
00548  *
00549  *      Compile a regular expression into a form suitable for fast matching.
00550  *      This function caches the result in a Tcl_Obj.
00551  *
00552  * Results:
00553  *      The return value is a pointer to the compiled form of string, suitable
00554  *      for passing to Tcl_RegExpExec. If an error occurred while compiling
00555  *      the pattern, then NULL is returned and an error message is left in the
00556  *      interp's result.
00557  *
00558  * Side effects:
00559  *      Updates the native rep of the Tcl_Obj.
00560  *
00561  *----------------------------------------------------------------------
00562  */
00563 
00564 Tcl_RegExp
00565 Tcl_GetRegExpFromObj(
00566     Tcl_Interp *interp,         /* For use in error reporting, and to access
00567                                  * the interp regexp cache. */
00568     Tcl_Obj *objPtr,            /* Object whose string rep contains regular
00569                                  * expression pattern. Internal rep will be
00570                                  * changed to compiled form of this regular
00571                                  * expression. */
00572     int flags)                  /* Regular expression compilation flags. */
00573 {
00574     int length;
00575     TclRegexp *regexpPtr;
00576     char *pattern;
00577 
00578     /*
00579      * This is OK because we only actually interpret this value properly as a
00580      * TclRegexp* when the type is tclRegexpType.
00581      */
00582 
00583     regexpPtr = (TclRegexp *) objPtr->internalRep.otherValuePtr;
00584 
00585     if ((objPtr->typePtr != &tclRegexpType) || (regexpPtr->flags != flags)) {
00586         pattern = TclGetStringFromObj(objPtr, &length);
00587 
00588         regexpPtr = CompileRegexp(interp, pattern, length, flags);
00589         if (regexpPtr == NULL) {
00590             return NULL;
00591         }
00592 
00593         /*
00594          * Add a reference to the regexp so it will persist even if it is
00595          * pushed out of the current thread's regexp cache. This reference
00596          * will be removed when the object's internal rep is freed.
00597          */
00598 
00599         regexpPtr->refCount++;
00600 
00601         /*
00602          * Free the old representation and set our type.
00603          */
00604 
00605         TclFreeIntRep(objPtr);
00606         objPtr->internalRep.otherValuePtr = (void *) regexpPtr;
00607         objPtr->typePtr = &tclRegexpType;
00608     }
00609     return (Tcl_RegExp) regexpPtr;
00610 }
00611 
00612 /*
00613  *----------------------------------------------------------------------
00614  *
00615  * TclRegAbout --
00616  *
00617  *      Return information about a compiled regular expression.
00618  *
00619  * Results:
00620  *      The return value is -1 for failure, 0 for success, although at the
00621  *      moment there's nothing that could fail. On success, a list is left in
00622  *      the interp's result: first element is the subexpression count, second
00623  *      is a list of re_info bit names.
00624  *
00625  * Side effects:
00626  *      None.
00627  *
00628  *----------------------------------------------------------------------
00629  */
00630 
00631 int
00632 TclRegAbout(
00633     Tcl_Interp *interp,         /* For use in variable assignment. */
00634     Tcl_RegExp re)              /* The compiled regular expression. */
00635 {
00636     TclRegexp *regexpPtr = (TclRegexp *) re;
00637     struct infoname {
00638         int bit;
00639         const char *text;
00640     };
00641     static const struct infoname infonames[] = {
00642         {REG_UBACKREF,          "REG_UBACKREF"},
00643         {REG_ULOOKAHEAD,        "REG_ULOOKAHEAD"},
00644         {REG_UBOUNDS,           "REG_UBOUNDS"},
00645         {REG_UBRACES,           "REG_UBRACES"},
00646         {REG_UBSALNUM,          "REG_UBSALNUM"},
00647         {REG_UPBOTCH,           "REG_UPBOTCH"},
00648         {REG_UBBS,              "REG_UBBS"},
00649         {REG_UNONPOSIX,         "REG_UNONPOSIX"},
00650         {REG_UUNSPEC,           "REG_UUNSPEC"},
00651         {REG_UUNPORT,           "REG_UUNPORT"},
00652         {REG_ULOCALE,           "REG_ULOCALE"},
00653         {REG_UEMPTYMATCH,       "REG_UEMPTYMATCH"},
00654         {REG_UIMPOSSIBLE,       "REG_UIMPOSSIBLE"},
00655         {REG_USHORTEST,         "REG_USHORTEST"},
00656         {0,                     NULL}
00657     };
00658     const struct infoname *inf;
00659     Tcl_Obj *infoObj;
00660 
00661     /*
00662      * The reset here guarantees that the interpreter result is empty and
00663      * unshared. This means that we can use Tcl_ListObjAppendElement on the
00664      * result object quite safely.
00665      */
00666 
00667     Tcl_ResetResult(interp);
00668 
00669     /*
00670      * Assume that there will never be more than INT_MAX subexpressions. This
00671      * is a pretty reasonable assumption; the RE engine doesn't scale _that_
00672      * well and Tcl has other limits that constrain things as well...
00673      */
00674 
00675     Tcl_ListObjAppendElement(NULL, Tcl_GetObjResult(interp),
00676             Tcl_NewIntObj((int) regexpPtr->re.re_nsub));
00677 
00678     /*
00679      * Now append a list of all the bit-flags set for the RE.
00680      */
00681 
00682     TclNewObj(infoObj);
00683     for (inf=infonames ; inf->bit != 0 ; inf++) {
00684         if (regexpPtr->re.re_info & inf->bit) {
00685             Tcl_ListObjAppendElement(NULL, infoObj,
00686                     Tcl_NewStringObj(inf->text, -1));
00687         }
00688     }
00689     Tcl_ListObjAppendElement(NULL, Tcl_GetObjResult(interp), infoObj);
00690 
00691     return 0;
00692 }
00693 
00694 /*
00695  *----------------------------------------------------------------------
00696  *
00697  * TclRegError --
00698  *
00699  *      Generate an error message based on the regexp status code.
00700  *
00701  * Results:
00702  *      Places an error in the interpreter.
00703  *
00704  * Side effects:
00705  *      Sets errorCode as well.
00706  *
00707  *----------------------------------------------------------------------
00708  */
00709 
00710 void
00711 TclRegError(
00712     Tcl_Interp *interp,         /* Interpreter for error reporting. */
00713     const char *msg,            /* Message to prepend to error. */
00714     int status)                 /* Status code to report. */
00715 {
00716     char buf[100];              /* ample in practice */
00717     char cbuf[100];             /* lots in practice */
00718     size_t n;
00719     const char *p;
00720 
00721     Tcl_ResetResult(interp);
00722     n = TclReError(status, NULL, buf, sizeof(buf));
00723     p = (n > sizeof(buf)) ? "..." : "";
00724     Tcl_AppendResult(interp, msg, buf, p, NULL);
00725 
00726     sprintf(cbuf, "%d", status);
00727     (void) TclReError(REG_ITOA, NULL, cbuf, sizeof(cbuf));
00728     Tcl_SetErrorCode(interp, "REGEXP", cbuf, buf, NULL);
00729 }
00730 
00731 /*
00732  *----------------------------------------------------------------------
00733  *
00734  * FreeRegexpInternalRep --
00735  *
00736  *      Deallocate the storage associated with a regexp object's internal
00737  *      representation.
00738  *
00739  * Results:
00740  *      None.
00741  *
00742  * Side effects:
00743  *      Frees the compiled regular expression.
00744  *
00745  *----------------------------------------------------------------------
00746  */
00747 
00748 static void
00749 FreeRegexpInternalRep(
00750     Tcl_Obj *objPtr)            /* Regexp object with internal rep to free. */
00751 {
00752     TclRegexp *regexpRepPtr = (TclRegexp *) objPtr->internalRep.otherValuePtr;
00753 
00754     /*
00755      * If this is the last reference to the regexp, free it.
00756      */
00757 
00758     if (--(regexpRepPtr->refCount) <= 0) {
00759         FreeRegexp(regexpRepPtr);
00760     }
00761 }
00762 
00763 /*
00764  *----------------------------------------------------------------------
00765  *
00766  * DupRegexpInternalRep --
00767  *
00768  *      We copy the reference to the compiled regexp and bump its reference
00769  *      count.
00770  *
00771  * Results:
00772  *      None.
00773  *
00774  * Side effects:
00775  *      Increments the reference count of the regexp.
00776  *
00777  *----------------------------------------------------------------------
00778  */
00779 
00780 static void
00781 DupRegexpInternalRep(
00782     Tcl_Obj *srcPtr,            /* Object with internal rep to copy. */
00783     Tcl_Obj *copyPtr)           /* Object with internal rep to set. */
00784 {
00785     TclRegexp *regexpPtr = (TclRegexp *) srcPtr->internalRep.otherValuePtr;
00786 
00787     regexpPtr->refCount++;
00788     copyPtr->internalRep.otherValuePtr = srcPtr->internalRep.otherValuePtr;
00789     copyPtr->typePtr = &tclRegexpType;
00790 }
00791 
00792 /*
00793  *----------------------------------------------------------------------
00794  *
00795  * SetRegexpFromAny --
00796  *
00797  *      Attempt to generate a compiled regular expression for the Tcl object
00798  *      "objPtr".
00799  *
00800  * Results:
00801  *      The return value is TCL_OK or TCL_ERROR. If an error occurs during
00802  *      conversion, an error message is left in the interpreter's result
00803  *      unless "interp" is NULL.
00804  *
00805  * Side effects:
00806  *      If no error occurs, a regular expression is stored as "objPtr"s
00807  *      internal representation.
00808  *
00809  *----------------------------------------------------------------------
00810  */
00811 
00812 static int
00813 SetRegexpFromAny(
00814     Tcl_Interp *interp,         /* Used for error reporting if not NULL. */
00815     Tcl_Obj *objPtr)            /* The object to convert. */
00816 {
00817     if (Tcl_GetRegExpFromObj(interp, objPtr, REG_ADVANCED) == NULL) {
00818         return TCL_ERROR;
00819     }
00820     return TCL_OK;
00821 }
00822 
00823 /*
00824  *---------------------------------------------------------------------------
00825  *
00826  * CompileRegexp --
00827  *
00828  *      Attempt to compile the given regexp pattern. If the compiled regular
00829  *      expression can be found in the per-thread cache, it will be used
00830  *      instead of compiling a new copy.
00831  *
00832  * Results:
00833  *      The return value is a pointer to a newly allocated TclRegexp that
00834  *      represents the compiled pattern, or NULL if the pattern could not be
00835  *      compiled. If NULL is returned, an error message is left in the
00836  *      interp's result.
00837  *
00838  * Side effects:
00839  *      The thread-local regexp cache is updated and a new TclRegexp may be
00840  *      allocated.
00841  *
00842  *----------------------------------------------------------------------
00843  */
00844 
00845 static TclRegexp *
00846 CompileRegexp(
00847     Tcl_Interp *interp,         /* Used for error reporting if not NULL. */
00848     const char *string,         /* The regexp to compile (UTF-8). */
00849     int length,                 /* The length of the string in bytes. */
00850     int flags)                  /* Compilation flags. */
00851 {
00852     TclRegexp *regexpPtr;
00853     const Tcl_UniChar *uniString;
00854     int numChars, status, i, exact;
00855     Tcl_DString stringBuf;
00856     ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey);
00857 
00858     if (!tsdPtr->initialized) {
00859         tsdPtr->initialized = 1;
00860         Tcl_CreateThreadExitHandler(FinalizeRegexp, NULL);
00861     }
00862 
00863     /*
00864      * This routine maintains a second-level regular expression cache in
00865      * addition to the per-object regexp cache. The per-thread cache is needed
00866      * to handle the case where for various reasons the object is lost between
00867      * invocations of the regexp command, but the literal pattern is the same.
00868      */
00869 
00870     /*
00871      * Check the per-thread compiled regexp cache. We can only reuse a regexp
00872      * if it has the same pattern and the same flags.
00873      */
00874 
00875     for (i = 0; (i < NUM_REGEXPS) && (tsdPtr->patterns[i] != NULL); i++) {
00876         if ((length == tsdPtr->patLengths[i])
00877                 && (tsdPtr->regexps[i]->flags == flags)
00878                 && (strcmp(string, tsdPtr->patterns[i]) == 0)) {
00879             /*
00880              * Move the matched pattern to the first slot in the cache and
00881              * shift the other patterns down one position.
00882              */
00883 
00884             if (i != 0) {
00885                 int j;
00886                 char *cachedString;
00887 
00888                 cachedString = tsdPtr->patterns[i];
00889                 regexpPtr = tsdPtr->regexps[i];
00890                 for (j = i-1; j >= 0; j--) {
00891                     tsdPtr->patterns[j+1] = tsdPtr->patterns[j];
00892                     tsdPtr->patLengths[j+1] = tsdPtr->patLengths[j];
00893                     tsdPtr->regexps[j+1] = tsdPtr->regexps[j];
00894                 }
00895                 tsdPtr->patterns[0] = cachedString;
00896                 tsdPtr->patLengths[0] = length;
00897                 tsdPtr->regexps[0] = regexpPtr;
00898             }
00899             return tsdPtr->regexps[0];
00900         }
00901     }
00902 
00903     /*
00904      * This is a new expression, so compile it and add it to the cache.
00905      */
00906 
00907     regexpPtr = (TclRegexp *) ckalloc(sizeof(TclRegexp));
00908     regexpPtr->objPtr = NULL;
00909     regexpPtr->string = NULL;
00910     regexpPtr->details.rm_extend.rm_so = -1;
00911     regexpPtr->details.rm_extend.rm_eo = -1;
00912 
00913     /*
00914      * Get the up-to-date string representation and map to unicode.
00915      */
00916 
00917     Tcl_DStringInit(&stringBuf);
00918     uniString = Tcl_UtfToUniCharDString(string, length, &stringBuf);
00919     numChars = Tcl_DStringLength(&stringBuf) / sizeof(Tcl_UniChar);
00920 
00921     /*
00922      * Compile the string and check for errors.
00923      */
00924 
00925     regexpPtr->flags = flags;
00926     status = TclReComp(&regexpPtr->re, uniString, (size_t) numChars, flags);
00927     Tcl_DStringFree(&stringBuf);
00928 
00929     if (status != REG_OKAY) {
00930         /*
00931          * Clean up and report errors in the interpreter, if possible.
00932          */
00933 
00934         ckfree((char *)regexpPtr);
00935         if (interp) {
00936             TclRegError(interp,
00937                     "couldn't compile regular expression pattern: ", status);
00938         }
00939         return NULL;
00940     }
00941 
00942     /*
00943      * Convert RE to a glob pattern equivalent, if any, and cache it.  If this
00944      * is not possible, then globObjPtr will be NULL.  This is used by
00945      * Tcl_RegExpExecObj to optionally do a fast match (avoids RE engine).
00946      */
00947 
00948     if (TclReToGlob(NULL, string, length, &stringBuf, &exact) == TCL_OK) {
00949         regexpPtr->globObjPtr = Tcl_NewStringObj(Tcl_DStringValue(&stringBuf),
00950                 Tcl_DStringLength(&stringBuf));
00951         Tcl_IncrRefCount(regexpPtr->globObjPtr);
00952         Tcl_DStringFree(&stringBuf);
00953     } else {
00954         regexpPtr->globObjPtr = NULL;
00955     }
00956 
00957     /*
00958      * Allocate enough space for all of the subexpressions, plus one extra for
00959      * the entire pattern.
00960      */
00961 
00962     regexpPtr->matches = (regmatch_t *) ckalloc(
00963             sizeof(regmatch_t) * (regexpPtr->re.re_nsub + 1));
00964 
00965     /*
00966      * Initialize the refcount to one initially, since it is in the cache.
00967      */
00968 
00969     regexpPtr->refCount = 1;
00970 
00971     /*
00972      * Free the last regexp, if necessary, and make room at the head of the
00973      * list for the new regexp.
00974      */
00975 
00976     if (tsdPtr->patterns[NUM_REGEXPS-1] != NULL) {
00977         TclRegexp *oldRegexpPtr = tsdPtr->regexps[NUM_REGEXPS-1];
00978         if (--(oldRegexpPtr->refCount) <= 0) {
00979             FreeRegexp(oldRegexpPtr);
00980         }
00981         ckfree(tsdPtr->patterns[NUM_REGEXPS-1]);
00982     }
00983     for (i = NUM_REGEXPS - 2; i >= 0; i--) {
00984         tsdPtr->patterns[i+1] = tsdPtr->patterns[i];
00985         tsdPtr->patLengths[i+1] = tsdPtr->patLengths[i];
00986         tsdPtr->regexps[i+1] = tsdPtr->regexps[i];
00987     }
00988     tsdPtr->patterns[0] = (char *) ckalloc((unsigned) (length+1));
00989     strcpy(tsdPtr->patterns[0], string);
00990     tsdPtr->patLengths[0] = length;
00991     tsdPtr->regexps[0] = regexpPtr;
00992 
00993     return regexpPtr;
00994 }
00995 
00996 /*
00997  *----------------------------------------------------------------------
00998  *
00999  * FreeRegexp --
01000  *
01001  *      Release the storage associated with a TclRegexp.
01002  *
01003  * Results:
01004  *      None.
01005  *
01006  * Side effects:
01007  *      None.
01008  *
01009  *----------------------------------------------------------------------
01010  */
01011 
01012 static void
01013 FreeRegexp(
01014     TclRegexp *regexpPtr)       /* Compiled regular expression to free. */
01015 {
01016     TclReFree(&regexpPtr->re);
01017     if (regexpPtr->globObjPtr) {
01018         TclDecrRefCount(regexpPtr->globObjPtr);
01019     }
01020     if (regexpPtr->matches) {
01021         ckfree((char *) regexpPtr->matches);
01022     }
01023     ckfree((char *) regexpPtr);
01024 }
01025 
01026 /*
01027  *----------------------------------------------------------------------
01028  *
01029  * FinalizeRegexp --
01030  *
01031  *      Release the storage associated with the per-thread regexp cache.
01032  *
01033  * Results:
01034  *      None.
01035  *
01036  * Side effects:
01037  *      None.
01038  *
01039  *----------------------------------------------------------------------
01040  */
01041 
01042 static void
01043 FinalizeRegexp(
01044     ClientData clientData)      /* Not used. */
01045 {
01046     int i;
01047     TclRegexp *regexpPtr;
01048     ThreadSpecificData *tsdPtr = TCL_TSD_INIT(&dataKey);
01049 
01050     for (i = 0; (i < NUM_REGEXPS) && (tsdPtr->patterns[i] != NULL); i++) {
01051         regexpPtr = tsdPtr->regexps[i];
01052         if (--(regexpPtr->refCount) <= 0) {
01053             FreeRegexp(regexpPtr);
01054         }
01055         ckfree(tsdPtr->patterns[i]);
01056         tsdPtr->patterns[i] = NULL;
01057     }
01058     /*
01059      * We may find ourselves reinitialized if another finalization routine
01060      * invokes regexps.
01061      */
01062     tsdPtr->initialized = 0;
01063 }
01064 
01065 /*
01066  * Local Variables:
01067  * mode: c
01068  * c-basic-offset: 4
01069  * fill-column: 78
01070  * End:
01071  */



Generated on Wed Mar 12 12:18:21 2008 by  doxygen 1.5.1